mlrun 1.6.2rc6__py3-none-any.whl → 1.6.3rc3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of mlrun has been flagged as potentially problematic.

Files changed (57)
  1. mlrun/artifacts/model.py +28 -22
  2. mlrun/common/db/sql_session.py +3 -0
  3. mlrun/common/model_monitoring/helpers.py +4 -2
  4. mlrun/common/schemas/__init__.py +2 -0
  5. mlrun/common/schemas/common.py +40 -0
  6. mlrun/common/schemas/model_monitoring/__init__.py +1 -0
  7. mlrun/common/schemas/model_monitoring/constants.py +21 -5
  8. mlrun/common/schemas/project.py +2 -0
  9. mlrun/config.py +51 -20
  10. mlrun/data_types/data_types.py +4 -0
  11. mlrun/datastore/azure_blob.py +9 -9
  12. mlrun/datastore/base.py +22 -44
  13. mlrun/datastore/google_cloud_storage.py +6 -6
  14. mlrun/datastore/v3io.py +70 -46
  15. mlrun/db/base.py +18 -0
  16. mlrun/db/httpdb.py +41 -36
  17. mlrun/execution.py +3 -3
  18. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
  19. mlrun/frameworks/tf_keras/model_handler.py +7 -7
  20. mlrun/k8s_utils.py +10 -5
  21. mlrun/kfpops.py +19 -10
  22. mlrun/model.py +6 -0
  23. mlrun/model_monitoring/api.py +8 -8
  24. mlrun/model_monitoring/batch.py +1 -1
  25. mlrun/model_monitoring/controller.py +0 -7
  26. mlrun/model_monitoring/features_drift_table.py +6 -0
  27. mlrun/model_monitoring/helpers.py +4 -1
  28. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +13 -13
  29. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
  30. mlrun/model_monitoring/stream_processing.py +50 -36
  31. mlrun/package/packagers/pandas_packagers.py +3 -3
  32. mlrun/package/utils/_archiver.py +3 -1
  33. mlrun/platforms/iguazio.py +6 -65
  34. mlrun/projects/pipelines.py +29 -12
  35. mlrun/projects/project.py +69 -55
  36. mlrun/run.py +2 -0
  37. mlrun/runtimes/base.py +24 -1
  38. mlrun/runtimes/function.py +9 -9
  39. mlrun/runtimes/kubejob.py +5 -3
  40. mlrun/runtimes/local.py +2 -2
  41. mlrun/runtimes/mpijob/abstract.py +6 -6
  42. mlrun/runtimes/pod.py +3 -3
  43. mlrun/runtimes/serving.py +3 -3
  44. mlrun/runtimes/sparkjob/spark3job.py +3 -3
  45. mlrun/serving/remote.py +4 -2
  46. mlrun/utils/async_http.py +3 -3
  47. mlrun/utils/helpers.py +20 -0
  48. mlrun/utils/http.py +3 -3
  49. mlrun/utils/logger.py +2 -2
  50. mlrun/utils/notifications/notification_pusher.py +6 -6
  51. mlrun/utils/version/version.json +2 -2
  52. {mlrun-1.6.2rc6.dist-info → mlrun-1.6.3rc3.dist-info}/METADATA +15 -17
  53. {mlrun-1.6.2rc6.dist-info → mlrun-1.6.3rc3.dist-info}/RECORD +57 -56
  54. {mlrun-1.6.2rc6.dist-info → mlrun-1.6.3rc3.dist-info}/LICENSE +0 -0
  55. {mlrun-1.6.2rc6.dist-info → mlrun-1.6.3rc3.dist-info}/WHEEL +0 -0
  56. {mlrun-1.6.2rc6.dist-info → mlrun-1.6.3rc3.dist-info}/entry_points.txt +0 -0
  57. {mlrun-1.6.2rc6.dist-info → mlrun-1.6.3rc3.dist-info}/top_level.txt +0 -0
mlrun/artifacts/model.py CHANGED
@@ -13,8 +13,9 @@
 # limitations under the License.
 import tempfile
 from os import path
-from typing import List
+from typing import Any
 
+import pandas as pd
 import yaml
 from deprecated import deprecated
 
@@ -68,8 +69,8 @@ class ModelArtifactSpec(ArtifactSpec):
         model_file=None,
         metrics=None,
         paraemeters=None,
-        inputs: List[Feature] = None,
-        outputs: List[Feature] = None,
+        inputs: list[Feature] = None,
+        outputs: list[Feature] = None,
         framework=None,
         algorithm=None,
         feature_vector=None,
@@ -91,8 +92,8 @@ class ModelArtifactSpec(ArtifactSpec):
         self.model_file = model_file
         self.metrics = metrics or {}
         self.parameters = paraemeters or {}
-        self.inputs: List[Feature] = inputs or []
-        self.outputs: List[Feature] = outputs or []
+        self.inputs: list[Feature] = inputs or []
+        self.outputs: list[Feature] = outputs or []
         self.framework = framework
         self.algorithm = algorithm
         self.feature_vector = feature_vector
@@ -101,21 +102,21 @@ class ModelArtifactSpec(ArtifactSpec):
         self.model_target_file = model_target_file
 
     @property
-    def inputs(self) -> List[Feature]:
+    def inputs(self) -> list[Feature]:
         """input feature list"""
         return self._inputs
 
     @inputs.setter
-    def inputs(self, inputs: List[Feature]):
+    def inputs(self, inputs: list[Feature]):
         self._inputs = ObjectList.from_list(Feature, inputs)
 
     @property
-    def outputs(self) -> List[Feature]:
+    def outputs(self) -> list[Feature]:
         """output feature list"""
         return self._outputs
 
     @outputs.setter
-    def outputs(self, outputs: List[Feature]):
+    def outputs(self, outputs: list[Feature]):
         self._outputs = ObjectList.from_list(Feature, outputs)
 
 
@@ -175,22 +176,22 @@ class ModelArtifact(Artifact):
         self._spec = self._verify_dict(spec, "spec", ModelArtifactSpec)
 
     @property
-    def inputs(self) -> List[Feature]:
+    def inputs(self) -> list[Feature]:
         """input feature list"""
         return self.spec.inputs
 
     @inputs.setter
-    def inputs(self, inputs: List[Feature]):
+    def inputs(self, inputs: list[Feature]):
         """input feature list"""
         self.spec.inputs = inputs
 
     @property
-    def outputs(self) -> List[Feature]:
+    def outputs(self) -> list[Feature]:
         """input feature list"""
         return self.spec.outputs
 
     @outputs.setter
-    def outputs(self, outputs: List[Feature]):
+    def outputs(self, outputs: list[Feature]):
         """input feature list"""
         self.spec.outputs = outputs
 
@@ -260,6 +261,7 @@
         """
         subset = df
         inferer = get_infer_interface(subset)
+        numeric_columns = self._extract_numeric_features(df)
         if label_columns:
             if not isinstance(label_columns, list):
                 label_columns = [label_columns]
@@ -273,9 +275,13 @@
             )
         if with_stats:
             self.spec.feature_stats = inferer.get_stats(
-                df, options=InferOptions.Histogram, num_bins=num_bins
+                df[numeric_columns], options=InferOptions.Histogram, num_bins=num_bins
             )
 
+    @staticmethod
+    def _extract_numeric_features(df: pd.DataFrame) -> list[Any]:
+        return [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
+
     @property
     def is_dir(self):
         return True
@@ -445,8 +451,8 @@ class LegacyModelArtifact(LegacyArtifact):
         self.model_file = model_file
         self.parameters = parameters or {}
         self.metrics = metrics or {}
-        self.inputs: List[Feature] = inputs or []
-        self.outputs: List[Feature] = outputs or []
+        self.inputs: list[Feature] = inputs or []
+        self.outputs: list[Feature] = outputs or []
         self.extra_data = extra_data or {}
         self.framework = framework
         self.algorithm = algorithm
@@ -456,21 +462,21 @@ class LegacyModelArtifact(LegacyArtifact):
         self.model_target_file = model_target_file
 
     @property
-    def inputs(self) -> List[Feature]:
+    def inputs(self) -> list[Feature]:
         """input feature list"""
         return self._inputs
 
    @inputs.setter
-    def inputs(self, inputs: List[Feature]):
+    def inputs(self, inputs: list[Feature]):
         self._inputs = ObjectList.from_list(Feature, inputs)
 
     @property
-    def outputs(self) -> List[Feature]:
+    def outputs(self) -> list[Feature]:
         """output feature list"""
         return self._outputs
 
     @outputs.setter
-    def outputs(self, outputs: List[Feature]):
+    def outputs(self, outputs: list[Feature]):
         self._outputs = ObjectList.from_list(Feature, outputs)
 
     def infer_from_df(self, df, label_columns=None, with_stats=True, num_bins=None):
@@ -642,8 +648,8 @@ def update_model(
     parameters: dict = None,
     metrics: dict = None,
     extra_data: dict = None,
-    inputs: List[Feature] = None,
-    outputs: List[Feature] = None,
+    inputs: list[Feature] = None,
+    outputs: list[Feature] = None,
     feature_vector: str = None,
     feature_weights: list = None,
     key_prefix: str = "",
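
The new `_extract_numeric_features` helper above restricts histogram statistics to columns with a numeric dtype. A minimal standalone sketch of the same filtering idea (the sample DataFrame below is illustrative, not taken from the diff):

    import pandas as pd

    def extract_numeric_features(df: pd.DataFrame) -> list:
        # keep only the columns whose dtype is numeric, as the new helper does
        return [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]

    df = pd.DataFrame({"age": [21, 35], "city": ["NY", "LA"], "score": [0.7, 0.9]})
    print(extract_numeric_features(df))  # ['age', 'score']
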
mlrun/common/db/sql_session.py CHANGED
@@ -63,9 +63,12 @@ def _init_engine(dsn=None):
     max_overflow = config.httpdb.db.connections_pool_max_overflow
     if max_overflow is None:
         max_overflow = config.httpdb.max_workers
+
     kwargs = {
         "pool_size": pool_size,
         "max_overflow": max_overflow,
+        "pool_pre_ping": config.httpdb.db.connections_pool_pre_ping,
+        "pool_recycle": config.httpdb.db.connections_pool_recycle,
     }
     engine = create_engine(dsn, **kwargs)
     _engines[dsn] = engine
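
The two new engine keywords map onto SQLAlchemy's standard connection-pool options, backed by the new `connections_pool_pre_ping` and `connections_pool_recycle` entries added in mlrun/config.py below. A minimal sketch of what the resulting `create_engine` call amounts to (the DSN and pool sizes are placeholders):

    from sqlalchemy import create_engine

    engine = create_engine(
        "mysql+pymysql://user:password@db-host:3306/mlrun",  # placeholder DSN
        pool_size=8,
        max_overflow=8,
        pool_pre_ping=True,    # issue a lightweight liveness check on every checkout
        pool_recycle=60 * 60,  # replace connections older than one hour
    )
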
mlrun/common/model_monitoring/helpers.py CHANGED
@@ -82,13 +82,15 @@ def parse_monitoring_stream_path(
     if application_name is None:
         stream_uri = (
             mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
-                project=project
+                project=project, namespace=mlrun.mlconf.namespace
             )
         )
     else:
         stream_uri = (
             mlrun.mlconf.model_endpoint_monitoring.default_http_sink_app.format(
-                project=project, application_name=application_name
+                project=project,
+                application_name=application_name,
+                namespace=mlrun.mlconf.namespace,
             )
         )
     return stream_uri
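
With `{namespace}` added to the sink templates (see the mlrun/config.py change below), the resolved URL follows the configured cluster namespace instead of a hard-coded `mlrun`. A quick illustration of the substitution (the project and namespace values are made up):

    template = (
        "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080"
    )
    print(template.format(project="my-project", namespace="mlrun-dev"))
    # http://nuclio-my-project-model-monitoring-stream.mlrun-dev.svc.cluster.local:8080
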
mlrun/common/schemas/__init__.py CHANGED
@@ -43,6 +43,7 @@ from .clusterization_spec import (
     ClusterizationSpec,
     WaitForChiefToReachOnlineStateFeatureFlag,
 )
+from .common import ImageBuilder
 from .constants import (
     APIStates,
     ClusterizationRole,
@@ -113,6 +114,7 @@ from .model_monitoring import (
     EventFieldType,
     EventKeyMetrics,
     Features,
+    FeatureSetFeatures,
     FeatureValues,
     GrafanaColumn,
     GrafanaDataPoint,
mlrun/common/schemas/common.py ADDED
@@ -0,0 +1,40 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import typing
+
+import pydantic
+
+
+class ImageBuilder(pydantic.BaseModel):
+    functionSourceCode: typing.Optional[str] = None
+    codeEntryType: typing.Optional[str] = None
+    codeEntryAttributes: typing.Optional[str] = None
+    source: typing.Optional[str] = None
+    code_origin: typing.Optional[str] = None
+    origin_filename: typing.Optional[str] = None
+    image: typing.Optional[str] = None
+    base_image: typing.Optional[str] = None
+    commands: typing.Optional[list] = None
+    extra: typing.Optional[str] = None
+    extra_args: typing.Optional[dict] = None
+    builder_env: typing.Optional[dict] = None
+    secret: typing.Optional[str] = None
+    registry: typing.Optional[str] = None
+    load_source_on_run: typing.Optional[bool] = None
+    with_mlrun: typing.Optional[bool] = None
+    auto_build: typing.Optional[bool] = None
+    build_pod: typing.Optional[str] = None
+    requirements: typing.Optional[list] = None
+    source_code_target_dir: typing.Optional[str] = None
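
Every field on the new `ImageBuilder` schema is optional, so a partially specified build section validates and round-trips cleanly. A small usage sketch, assuming the import path matches the new module (the field values are illustrative):

    from mlrun.common.schemas.common import ImageBuilder

    build = ImageBuilder(base_image="mlrun/mlrun", requirements=["scikit-learn"])
    assert build.source is None           # unset fields default to None
    print(build.dict(exclude_none=True))  # only the populated fields are serialized
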
mlrun/common/schemas/model_monitoring/__init__.py CHANGED
@@ -22,6 +22,7 @@ from .constants import (
     EventFieldType,
     EventKeyMetrics,
     EventLiveStats,
+    FeatureSetFeatures,
     FileTargetKind,
     FunctionURI,
     ModelEndpointTarget,
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -77,6 +77,26 @@ class EventFieldType:
     SAMPLE_PARQUET_PATH = "sample_parquet_path"
 
 
+class MonitoringStrEnum(StrEnum):
+    @classmethod
+    def list(cls):
+        return list(map(lambda c: c.value, cls))
+
+
+class FeatureSetFeatures(MonitoringStrEnum):
+    LATENCY = EventFieldType.LATENCY
+    ERROR_COUNT = EventFieldType.ERROR_COUNT
+    METRICS = EventFieldType.METRICS
+
+    @classmethod
+    def time_stamp(cls):
+        return EventFieldType.TIMESTAMP
+
+    @classmethod
+    def entity(cls):
+        return EventFieldType.ENDPOINT_ID
+
+
 class ApplicationEvent:
     APPLICATION_NAME = "application_name"
     CURRENT_STATS = "current_stats"
@@ -89,7 +109,7 @@ class ApplicationEvent:
     OUTPUT_STREAM_URI = "output_stream_uri"
 
 
-class WriterEvent(StrEnum):
+class WriterEvent(MonitoringStrEnum):
     APPLICATION_NAME = "application_name"
     ENDPOINT_ID = "endpoint_id"
     START_INFER_TIME = "start_infer_time"
@@ -101,10 +121,6 @@ class WriterEvent(StrEnum):
     RESULT_EXTRA_DATA = "result_extra_data"
     CURRENT_STATS = "current_stats"
 
-    @classmethod
-    def list(cls):
-        return list(map(lambda c: c.value, cls))
-
 
 class EventLiveStats:
     LATENCY_AVG_5M = "latency_avg_5m"
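
The new `MonitoringStrEnum` base centralizes the `list()` helper that previously lived only on `WriterEvent`, so any monitoring enum can enumerate its string values. A self-contained sketch of the pattern (the member values below are illustrative, not the real `EventFieldType` constants):

    from enum import Enum

    class StrEnum(str, Enum):
        """Stand-in for mlrun's StrEnum (str + Enum mixin)."""

    class MonitoringStrEnum(StrEnum):
        @classmethod
        def list(cls):
            # collect the string value of every member
            return [member.value for member in cls]

    class FeatureSetFeatures(MonitoringStrEnum):
        LATENCY = "latency"
        ERROR_COUNT = "error_count"
        METRICS = "metrics"

    print(FeatureSetFeatures.list())  # ['latency', 'error_count', 'metrics']
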
mlrun/common/schemas/project.py CHANGED
@@ -19,6 +19,7 @@ import pydantic
 
 import mlrun.common.types
 
+from .common import ImageBuilder
 from .object import ObjectKind, ObjectStatus
 
 
@@ -85,6 +86,7 @@ class ProjectSpec(pydantic.BaseModel):
     desired_state: typing.Optional[ProjectDesiredState] = ProjectDesiredState.online
     custom_packagers: typing.Optional[typing.List[typing.Tuple[str, bool]]] = None
     default_image: typing.Optional[str] = None
+    build: typing.Optional[ImageBuilder] = None
 
     class Config:
         extra = pydantic.Extra.allow
mlrun/config.py CHANGED
@@ -288,6 +288,12 @@ default_config = {
        "state": "online",
        "retry_api_call_on_exception": "enabled",
        "http_connection_timeout_keep_alive": 11,
+       # http client used by httpdb
+       "http": {
+           # when True, the client will verify the server's TLS
+           # set to False for backwards compatibility.
+           "verify": False,
+       },
        "db": {
            "commit_retry_timeout": 30,
            "commit_retry_interval": 3,
@@ -306,7 +312,11 @@
                # default is 16MB, max 1G, for more info https://dev.mysql.com/doc/refman/8.0/en/packet-too-large.html
                "max_allowed_packet": 64000000,  # 64MB
            },
-           # None will set this to be equal to the httpdb.max_workers
+           # tests connections for liveness upon each checkout
+           "connections_pool_pre_ping": True,
+           # this setting causes the pool to recycle connections after the given number of seconds has passed
+           "connections_pool_recycle": 60 * 60,
+           # None defaults to httpdb.max_workers
            "connections_pool_size": None,
            "connections_pool_max_overflow": None,
            # below is a db-specific configuration
@@ -434,7 +444,7 @@
        # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
        # git+https://github.com/mlrun/mlrun@development. by default uses the version
        "mlrun_version_specifier": "",
-       "kaniko_image": "gcr.io/kaniko-project/executor:v1.8.0",  # kaniko builder image
+       "kaniko_image": "gcr.io/kaniko-project/executor:v1.21.1",  # kaniko builder image
        "kaniko_init_container_image": "alpine:3.18",
        # image for kaniko init container when docker registry is ECR
        "kaniko_aws_cli_image": "amazon/aws-cli:2.7.10",
@@ -481,8 +491,8 @@
        "offline_storage_path": "model-endpoints/{kind}",
        # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
        # when the user is working in CE environment and has not provided any stream path.
-       "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
-       "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
+       "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
+       "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
        "batch_processing_function_branch": "master",
        "parquet_batching_max_events": 10_000,
        "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
@@ -601,8 +611,9 @@
        },
        "workflows": {
            "default_workflow_runner_name": "workflow-runner-{}",
-           # Default timeout seconds for retrieving workflow id after execution:
-           "timeouts": {"local": 120, "kfp": 30, "remote": 30},
+           # Default timeout seconds for retrieving workflow id after execution
+           # Remote workflow timeout is the maximum between remote and the inner engine timeout
+           "timeouts": {"local": 120, "kfp": 60, "remote": 60 * 5},
        },
        "log_collector": {
            "address": "localhost:8282",
@@ -954,10 +965,10 @@ class Config:
            with_gpu = (
                with_gpu_requests if requirement == "requests" else with_gpu_limits
            )
-           resources[
-               requirement
-           ] = self.get_default_function_pod_requirement_resources(
-               requirement, with_gpu
+           resources[requirement] = (
+               self.get_default_function_pod_requirement_resources(
+                   requirement, with_gpu
+               )
            )
        return resources
 
@@ -1051,7 +1062,7 @@
        target: str = "online",
        artifact_path: str = None,
        application_name: str = None,
-   ) -> str:
+   ) -> typing.Union[str, list[str]]:
        """Get the full path from the configuration based on the provided project and kind.
 
        :param project: Project name.
@@ -1067,7 +1078,8 @@
                                 relative artifact path will be taken from the global MLRun artifact path.
        :param application_name: Application name, None for model_monitoring_stream.
 
-       :return: Full configured path for the provided kind.
+       :return: Full configured path for the provided kind. Can be either a single path
+                or a list of paths in the case of the online model monitoring stream path.
        """
 
        if target != "offline":
@@ -1088,12 +1100,22 @@
                if application_name is None
                else f"{kind}-{application_name.lower()}",
            )
-       return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-           project=project,
-           kind=kind
-           if application_name is None
-           else f"{kind}-{application_name.lower()}",
-       )
+       elif kind == "stream":  # return list for mlrun<1.6.3 BC
+           return [
+               mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
+                   project=project,
+                   kind=kind,
+               ),  # old stream uri (pipelines) for BC ML-6043
+               mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                   project=project,
+                   kind=kind,
+               ),  # new stream uri (projects)
+           ]
+       else:
+           return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
+               project=project,
+               kind=kind,
+           )
 
        # Get the current offline path from the configuration
        file_path = mlrun.mlconf.model_endpoint_monitoring.offline_storage_path.format(
@@ -1340,12 +1362,21 @@ def read_env(env=None, prefix=env_prefix):
    if igz_domain:
        config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
 
-   if config.get("log_level"):
+   if log_level := config.get("log_level"):
        import mlrun.utils.logger
 
        # logger created (because of imports mess) before the config is loaded (in tests), therefore we're changing its
        # level manually
-       mlrun.utils.logger.set_logger_level(config["log_level"])
+       mlrun.utils.logger.set_logger_level(log_level)
+
+   if log_formatter_name := config.get("log_formatter"):
+       import mlrun.utils.logger
+
+       log_formatter = mlrun.utils.create_formatter_instance(
+           mlrun.utils.FormatterKinds(log_formatter_name)
+       )
+       mlrun.utils.logger.get_handler("default").setFormatter(log_formatter)
+
    # The default function pod resource values are of type str; however, when reading from environment variable numbers,
    # it converts them to type int if contains only number, so we want to convert them to str.
    _convert_resources_to_str(config)
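
For `kind == "stream"` the target-path helper now returns a two-element list (legacy and new stream URIs) instead of a single string, so callers written against mlrun<1.6.3 keep working. A sketch of how a caller might normalize the return value (the function name and paths below are illustrative):

    from typing import Union

    def normalize_target_paths(paths: Union[str, list]) -> list:
        # kind == "stream" yields [legacy_uri, new_uri]; every other kind yields one path
        return paths if isinstance(paths, list) else [paths]

    print(normalize_target_paths("v3io:///projects/demo/model-endpoints/events"))
    print(normalize_target_paths(["v3io:///old/stream", "v3io:///new/stream"]))
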
mlrun/data_types/data_types.py CHANGED
@@ -41,6 +41,7 @@ class ValueType(str, Enum):
     BYTES = "bytes"
     STRING = "str"
     DATETIME = "datetime"
+    LIST = "List"
     BYTES_LIST = "List[bytes]"
     STRING_LIST = "List[string]"
     INT32_LIST = "List[int32]"
@@ -48,6 +49,7 @@
     DOUBLE_LIST = "List[float]"
     FLOAT_LIST = "List[float32]"
     BOOL_LIST = "List[bool]"
+    Tuple = "Tuple"
 
 
 def pd_schema_to_value_type(value):
@@ -102,6 +104,8 @@ def python_type_to_value_type(value_type):
         "datetime64[ns]": ValueType.INT64,
         "datetime64[ns, tz]": ValueType.INT64,
         "category": ValueType.STRING,
+        "list": ValueType.LIST,
+        "tuple": ValueType.Tuple,
     }
 
     if type_name in type_map:
mlrun/datastore/azure_blob.py CHANGED
@@ -175,9 +175,9 @@ class AzureBlobStore(DataStore):
 
         if "client_secret" in st or "client_id" in st or "tenant_id" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
-            res[
-                f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"
-            ] = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            )
             if "client_id" in st:
                 res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
                     "client_id"
@@ -188,14 +188,14 @@ class AzureBlobStore(DataStore):
                 ]
             if "tenant_id" in st:
                 tenant_id = st["tenant_id"]
-                res[
-                    f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"
-                ] = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
+                    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                )
 
         if "sas_token" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
-            res[
-                f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"
-            ] = "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            )
             res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
         return res
mlrun/datastore/base.py CHANGED
@@ -27,6 +27,7 @@ import requests
 import urllib3
 from deprecated import deprecated
 
+import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
 from mlrun.utils import StorePrefix, is_ipython, logger
@@ -34,10 +35,6 @@ from mlrun.utils import StorePrefix, is_ipython, logger
 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df
 
-verify_ssl = False
-if not verify_ssl:
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
 
 class FileStats:
     def __init__(self, size, modified, content_type=None):
@@ -643,45 +640,6 @@ def basic_auth_header(user, password):
     return {"Authorization": authstr}
 
 
-def http_get(url, headers=None, auth=None):
-    try:
-        response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.content
-
-
-def http_head(url, headers=None, auth=None):
-    try:
-        response = requests.head(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.headers
-
-
-def http_put(url, data, headers=None, auth=None, session=None):
-    try:
-        put_api = session.put if session else requests.put
-        response = put_api(
-            url, data=data, headers=headers, auth=auth, verify=verify_ssl
-        )
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}") from exc
-
-    mlrun.errors.raise_for_status(response)
-
-
-def http_upload(url, file_path, headers=None, auth=None):
-    with open(file_path, "rb") as data:
-        http_put(url, data, headers, auth)
-
-
 
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
@@ -709,7 +667,7 @@ class HttpStore(DataStore):
         raise ValueError("unimplemented")
 
     def get(self, key, size=None, offset=0):
-        data = http_get(self.url + self._join(key), self._headers, self.auth)
+        data = self._http_get(self.url + self._join(key), self._headers, self.auth)
         if offset:
             data = data[offset:]
         if size:
@@ -729,6 +687,26 @@
             f"schema as it is not secure and is not recommended."
         )
 
+    def _http_get(
+        self,
+        url,
+        headers=None,
+        auth=None,
+    ):
+        # import here to prevent import cycle
+        from mlrun.config import config as mlconf
+
+        verify_ssl = mlconf.httpdb.http.verify
+        try:
+            if not verify_ssl:
+                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+            response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
+        except OSError as exc:
+            raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
+
+        mlrun.errors.raise_for_status(response)
+        return response.content
+
 
 # This wrapper class is designed to extract the 'ds' schema and profile name from URL-formatted paths.
 # Within fsspec, the AbstractFileSystem::_strip_protocol() internal method is used to handle complete URL paths.
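
The module-level helpers that always sent `verify=False` are removed; `HttpStore._http_get` now reads the verification flag from the new `httpdb.http.verify` setting at call time and only silences the urllib3 warning when verification is off. A standalone sketch of the same pattern (the helper name and default value are illustrative):

    import requests
    import urllib3

    def http_get(url: str, verify_ssl: bool = False) -> bytes:
        if not verify_ssl:
            # suppress the insecure-request warning only when verification is disabled
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        try:
            response = requests.get(url, verify=verify_ssl)
        except OSError as exc:
            raise OSError(f"error: cannot connect to {url}: {exc}") from exc
        response.raise_for_status()
        return response.content
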
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key.id"
-            ] = credentials["private_key_id"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key"
-            ] = credentials["private_key"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"