mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/config.py
CHANGED
@@ -27,6 +27,7 @@ import copy
 import json
 import os
 import typing
+import warnings
 from collections.abc import Mapping
 from datetime import timedelta
 from distutils.util import strtobool
@@ -35,6 +36,7 @@ from threading import Lock
 
 import dotenv
 import semver
+import urllib3.exceptions
 import yaml
 
 import mlrun.common.constants
@@ -52,6 +54,11 @@ default_config = {
     "kubernetes": {
         "kubeconfig_path": "",  # local path to kubeconfig file (for development purposes),
         # empty by default as the API already running inside k8s cluster
+        "pagination": {
+            # pagination config for interacting with k8s API
+            "list_pods_limit": 200,
+            "list_crd_objects_limit": 200,
+        },
     },
     "dbpath": "",  # db/api url
     # url to nuclio dashboard api (can be with user & token, e.g. https://username:password@dashboard-url.com)
@@ -64,11 +71,15 @@ default_config = {
     "api_base_version": "v1",
     "version": "",  # will be set to current version
     "images_tag": "",  # tag to use with mlrun images e.g. mlrun/mlrun (defaults to version)
-    "images_registry": "",  # registry to use with mlrun images e.g. quay.io/ (defaults to empty, for dockerhub)
+    # registry to use with mlrun images that start with "mlrun/" e.g. quay.io/ (defaults to empty, for dockerhub)
+    "images_registry": "",
+    # registry to use with non-mlrun images (don't start with "mlrun/") specified in 'images_to_enrich_registry'
+    # defaults to empty, for dockerhub
+    "vendor_images_registry": "",
     # comma separated list of images that are in the specified images_registry, and therefore will be enriched with this
     # registry when used. default to mlrun/* which means any image which is of the mlrun repository (mlrun/mlrun,
     # mlrun/ml-base, etc...)
-    "images_to_enrich_registry": "^mlrun/*",
+    "images_to_enrich_registry": "^mlrun/*,python:3.9",
     "kfp_url": "",
     "kfp_ttl": "14400",  # KFP ttl in sec, after that completed PODs will be deleted
     "kfp_image": "mlrun/mlrun",  # image to use for KFP runner (defaults to mlrun/mlrun)
@@ -104,7 +115,12 @@ default_config = {
             # max number of parallel abort run jobs in runs monitoring
             "concurrent_abort_stale_runs_workers": 10,
             "list_runs_time_period_in_days": 7,  # days
-        }
+        },
+        "projects": {
+            "summaries": {
+                "cache_interval": "30",
+            },
+        },
     },
     "crud": {
         "runs": {
@@ -138,6 +154,11 @@ default_config = {
         "datasets": {
             "max_preview_columns": 100,
         },
+        "limits": {
+            "max_chunk_size": 1024 * 1024 * 1,  # 1MB
+            "max_preview_size": 1024 * 1024 * 10,  # 10MB
+            "max_download_size": 1024 * 1024 * 100,  # 100MB
+        },
     },
     # FIXME: Adding these defaults here so we won't need to patch the "installing component" (provazio-controller) to
     # configure this values on field systems, for newer system this will be configured correctly
@@ -238,7 +259,7 @@ default_config = {
         },
         "application": {
             "default_sidecar_internal_port": 8050,
-            "default_authentication_mode":
+            "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
         },
     },
     # TODO: function defaults should be moved to the function spec config above
@@ -250,7 +271,7 @@ default_config = {
         "remote": "mlrun/mlrun",
         "dask": "mlrun/ml-base",
         "mpijob": "mlrun/mlrun",
-        "application": "python:3.9
+        "application": "python:3.9",
     },
     # see enrich_function_preemption_spec for more info,
     # and mlrun.common.schemas.function.PreemptionModes for available options
@@ -265,6 +286,16 @@ default_config = {
         "url": "",
         "service": "mlrun-api-chief",
        "port": 8080,
+        "feature_gates": {
+            "scheduler": "enabled",
+            "project_sync": "enabled",
+            "cleanup": "enabled",
+            "runs_monitoring": "enabled",
+            "pagination_cache": "enabled",
+            "project_summaries": "enabled",
+            "start_logs": "enabled",
+            "stop_logs": "enabled",
+        },
     },
     "worker": {
         "sync_with_chief": {
@@ -302,7 +333,7 @@ default_config = {
         "http": {
             # when True, the client will verify the server's TLS
             # set to False for backwards compatibility.
-            "verify":
+            "verify": True,
         },
         "db": {
             "commit_retry_timeout": 30,
@@ -433,7 +464,6 @@ default_config = {
     "followers": "",
     # This is used as the interval for the sync loop both when mlrun is leader and follower
     "periodic_sync_interval": "1 minute",
-    "counters_cache_ttl": "2 minutes",
     "project_owners_cache_ttl": "30 seconds",
     # access key to be used when the leader is iguazio and polling is done from it
     "iguazio_access_key": "",
@@ -462,10 +492,10 @@ default_config = {
     # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
     # git+https://github.com/mlrun/mlrun@development. by default uses the version
     "mlrun_version_specifier": "",
-    "kaniko_image": "gcr.io/kaniko-project/executor:v1.
+    "kaniko_image": "gcr.io/kaniko-project/executor:v1.23.2",  # kaniko builder image
     "kaniko_init_container_image": "alpine:3.18",
     # image for kaniko init container when docker registry is ECR
-    "kaniko_aws_cli_image": "amazon/aws-cli:2.
+    "kaniko_aws_cli_image": "amazon/aws-cli:2.17.16",
     # kaniko sometimes fails to get filesystem from image, this is a workaround to retry the process
     # a known issue in Kaniko - https://github.com/GoogleContainerTools/kaniko/issues/1717
     "kaniko_image_fs_extraction_retries": "3",
@@ -509,7 +539,6 @@ default_config = {
     "store_prefixes": {
         "default": "v3io:///users/pipelines/{project}/model-endpoints/{kind}",
         "user_space": "v3io:///projects/{project}/model-endpoints/{kind}",
-        "stream": "",  # TODO: Delete in 1.9.0
         "monitoring_application": "v3io:///users/pipelines/{project}/monitoring-apps/",
     },
     # Offline storage path can be either relative or a full path. This path is used for general offline data
@@ -522,7 +551,6 @@ default_config = {
     "parquet_batching_max_events": 10_000,
     "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
     # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
-    "store_type": "v3io-nosql",  # TODO: Delete in 1.9.0
     "endpoint_store_connection": "",
     # See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options
     "tsdb_connection": "",
@@ -706,7 +734,7 @@ default_config = {
     "grafana_url": "",
     "alerts": {
         # supported modes: "enabled", "disabled".
-        "mode": "
+        "mode": "disabled",
         # maximum number of alerts we allow to be configured.
         # user will get an error when exceeding this
         "max_allowed": 10000,
@@ -768,7 +796,21 @@ class Config:
         for key, value in cfg.items():
             if hasattr(self, key):
                 if isinstance(value, dict):
-                    getattr(self, key).update(value)
+                    # ignore the `skip_errors` flag here
+                    # if the key does not align with what mlrun config expects it is a user
+                    # input error that can lead to unexpected behavior.
+                    # raise the exception to ensure configuration is loaded correctly and do not
+                    # ignore any errors.
+                    config_value = getattr(self, key)
+                    try:
+                        config_value.update(value)
+                    except AttributeError as exc:
+                        if not isinstance(config_value, (dict, Config)):
+                            raise ValueError(
+                                f"Can not update `{key}` config. "
+                                f"Expected a configuration but received {type(value)}"
+                            ) from exc
+                        raise exc
                 else:
                     try:
                         setattr(self, key, value)
@@ -840,7 +882,7 @@ class Config:
                     f"Unable to decode {attribute_path}"
                 )
             parsed_attribute_value = json.loads(decoded_attribute_value)
-            if
+            if not isinstance(parsed_attribute_value, expected_type):
                 raise mlrun.errors.MLRunInvalidArgumentTypeError(
                     f"Expected type {expected_type}, got {type(parsed_attribute_value)}"
                 )
@@ -1032,6 +1074,14 @@ class Config:
                 resource_requirement.pop(gpu)
         return resource_requirement
 
+    def force_api_gateway_ssl_redirect(self):
+        """
+        Get the default value for the ssl_redirect configuration.
+        In Iguazio we always want to redirect to HTTPS, in other cases we don't.
+        :return: True if we should redirect to HTTPS, False otherwise.
+        """
+        return self.is_running_on_iguazio()
+
     def to_dict(self):
         return copy.deepcopy(self._cfg)
@@ -1064,6 +1114,9 @@ class Config:
         # importing here to avoid circular dependency
         import mlrun.db
 
+        # It ensures that SSL verification is set before establishing a connection
+        _configure_ssl_verification(self.httpdb.http.verify)
+
         # when dbpath is set we want to connect to it which will sync configuration from it to the client
         mlrun.db.get_run_db(value, force_reconnect=True)
@@ -1092,10 +1145,10 @@ class Config:
         project: str = "",
         kind: str = "",
         target: str = "online",
-        artifact_path: str = None,
-        function_name: str = None,
+        artifact_path: typing.Optional[str] = None,
+        function_name: typing.Optional[str] = None,
         **kwargs,
-    ) ->
+    ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
         :param project: Project name.
@@ -1111,8 +1164,7 @@ class Config:
                               relative artifact path will be taken from the global MLRun artifact path.
         :param function_name: Application name, None for model_monitoring_stream.
 
-        :return: Full configured path for the provided kind.
-                 or a list of paths in the case of the online model monitoring stream path.
+        :return: Full configured path for the provided kind.
         """
 
         if target != "offline":
@@ -1133,17 +1185,11 @@ class Config:
                 if function_name is None
                 else f"{kind}-{function_name.lower()}",
             )
-        elif kind == "stream":
-            return [
-                mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                    project=project,
-                    kind=kind,
-                ),  # old stream uri (pipelines) for BC ML-6043
-                mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
-                    project=project,
-                    kind=kind,
-                ),  # new stream uri (projects)
-            ]
+        elif kind == "stream":
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project,
+                kind=kind,
+            )
         else:
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
                 project=project,
@@ -1206,12 +1252,11 @@ class Config:
 
         return storage_options
 
-    def is_explicit_ack(self, version=None) -> bool:
-        if not version:
-            version = self.nuclio_version
+    def is_explicit_ack_enabled(self) -> bool:
         return self.httpdb.nuclio.explicit_ack == "enabled" and (
-            not version
-            or semver.VersionInfo.parse(version) >= semver.VersionInfo.parse("1.12.10")
+            not self.nuclio_version
+            or semver.VersionInfo.parse(self.nuclio_version)
+            >= semver.VersionInfo.parse("1.12.10")
         )
@@ -1261,6 +1306,7 @@ def _do_populate(env=None, skip_errors=False):
     if data:
         config.update(data, skip_errors=skip_errors)
 
+    _configure_ssl_verification(config.httpdb.http.verify)
     _validate_config(config)
@@ -1320,6 +1366,16 @@ def _convert_str(value, typ):
     return typ(value)
 
 
+def _configure_ssl_verification(verify_ssl: bool) -> None:
+    """Configure SSL verification warnings based on the setting."""
+    if not verify_ssl:
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    else:
+        # If the user changes the `verify` setting to `True` at runtime using `mlrun.set_env_from_file` after
+        # importing `mlrun`, we need to reload the `mlrun` configuration and enable this warning.
+        warnings.simplefilter("default", urllib3.exceptions.InsecureRequestWarning)
+
+
 def read_env(env=None, prefix=env_prefix):
     """Read configuration from environment"""
     env = os.environ if env is None else env
mlrun/data_types/data_types.py
CHANGED
@@ -70,6 +70,11 @@ def pa_type_to_value_type(type_):
     if isinstance(type_, TimestampType):
         return ValueType.DATETIME
 
+    # pandas category type translates to pyarrow DictionaryType
+    # we need to unpack the value type (ML-7868)
+    if isinstance(type_, pyarrow.DictionaryType):
+        type_ = type_.value_type
+
     type_map = {
         pyarrow.bool_(): ValueType.BOOL,
         pyarrow.int64(): ValueType.INT64,
@@ -139,7 +144,7 @@ def gbq_to_pandas_dtype(gbq_type):
         "BOOL": "bool",
         "FLOAT": "float64",
         "INTEGER": pd.Int64Dtype(),
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns, UTC]",
     }
     return type_map.get(gbq_type, "object")
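
The `DictionaryType` unpacking above matters because a pandas categorical column arrives in pyarrow as a dictionary-encoded array, so the plain `type_map` lookup would miss it. A standalone illustration (pyarrow and pandas only, no mlrun required):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"color": pd.Categorical(["red", "green", "red"])})
table = pa.Table.from_pandas(df)

pa_type = table.schema.field("color").type
print(pa_type)  # dictionary<values=string, indices=int8, ordered=0>

# unpack the underlying value type, as pa_type_to_value_type now does
if isinstance(pa_type, pa.DictionaryType):
    pa_type = pa_type.value_type
print(pa_type)  # string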
mlrun/data_types/spark.py
CHANGED
@@ -20,10 +20,10 @@ import pytz
 from pyspark.sql.functions import to_utc_timestamp
 from pyspark.sql.types import BooleanType, DoubleType, TimestampType
 
+from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
 
 from .data_types import InferOptions, spark_to_value_type
-from .to_pandas import toPandas
 
 try:
     import pyspark.sql.functions as funcs
@@ -75,7 +75,7 @@ def get_df_preview_spark(df, preview_lines=20):
     """capture preview data from spark df"""
     df = df.limit(preview_lines)
 
-    result_dict = toPandas(df).to_dict(orient="split")
+    result_dict = spark_df_to_pandas(df).to_dict(orient="split")
     return [result_dict["columns"], *result_dict["data"]]
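
For reference, `get_df_preview_spark` now routes through `spark_df_to_pandas` but keeps its output shape: the column list followed by the data rows, taken from pandas' `to_dict(orient="split")`. A small sketch, assuming a local SparkSession:

from pyspark.sql import SparkSession

from mlrun.data_types.spark import get_df_preview_spark

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a", 1), ("b", 2)], ["name", "value"])

# first element is the column list, the rest are the rows:
# [['name', 'value'], ['a', 1], ['b', 2]]
print(get_df_preview_spark(df, preview_lines=2))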
mlrun/data_types/to_pandas.py
CHANGED
@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
 
     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def _to_pandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@ def _to_pandas(spark_df):
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -217,22 +212,59 @@ def _to_pandas(spark_df):
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
-    if type(dt) == ByteType:
+    if isinstance(dt, ByteType):
         return np.int8
-    elif type(dt) == ShortType:
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif type(dt) == IntegerType:
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif type(dt) == LongType:
+    elif isinstance(dt, LongType):
         return np.int64
-    elif type(dt) == FloatType:
+    elif isinstance(dt, FloatType):
         return np.float32
-    elif type(dt) == DoubleType:
+    elif isinstance(dt, DoubleType):
        return np.float64
-    elif type(dt) == BooleanType:
+    elif isinstance(dt, BooleanType):
         return bool
-    elif type(dt) == TimestampType:
+    elif isinstance(dt, TimestampType):
         return "datetime64[ns]"
     else:
         return None
+
+
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = _to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return _to_pandas(spark_df)
mlrun/datastore/__init__.py
CHANGED
@@ -117,6 +117,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
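
With the added line, an explicit `endpoint` keyword argument now takes precedence over the endpoint parsed out of the v3io path. A hedged sketch; the path and endpoint values are illustrative:

from mlrun.datastore import get_stream_pusher

# the endpoint kwarg overrides whatever parse_path extracts from the url
pusher = get_stream_pusher(
    "v3io://default-endpoint/projects/my-project/model-endpoints/stream",
    endpoint="https://other-webapi:8444",
)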
mlrun/datastore/alibaba_oss.py
CHANGED
@@ -22,7 +22,7 @@ from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 
 class OSSStore(DataStore):
@@ -53,7 +53,7 @@ class OSSStore(DataStore):
         except ImportError as exc:
             raise ImportError("ALIBABA ossfs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),
@@ -85,6 +85,7 @@ class OSSStore(DataStore):
         return oss.get_object(key).read()
 
     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
         oss.put_object(key, data)
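
The added first line of `put()` suggests payloads are normalized by a shared base-class helper before reaching `oss.put_object` (presumably encoding `str` to `bytes` and validating the `append` flag; `_prepare_put_data` itself is not shown in this diff). A hedged usage sketch; the bucket name is illustrative and credentials are assumed to come from the environment or a datastore profile:

import mlrun

# write a string payload through the OSS datastore; the datastore's put()
# normalizes it before upload
item = mlrun.get_dataitem("oss://my-bucket/demo/hello.txt")
item.put("hello oss")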
|