mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mlrun might be problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
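The three file diffs reproduced below (mlrun/config.py, mlrun/data_types/data_types.py, and mlrun/data_types/spark.py) cover most of the client-visible behavior changes. The new config defaults surface through the usual `mlrun.mlconf` accessor; a minimal sketch, assuming an installed mlrun 1.7.2 (the attribute paths are taken from the diff, and the printed values are the shipped defaults):

    import mlrun

    # new in 1.7.2: pagination defaults for the list APIs
    print(mlrun.mlconf.httpdb.pagination.default_page_size)    # 20

    # new in 1.7.2: alert configuration, disabled by default
    print(mlrun.mlconf.alerts.mode)                            # "disabled"

    # new in 1.7.2: kubernetes list calls are capped per page
    print(mlrun.mlconf.kubernetes.pagination.list_pods_limit)  # 200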
mlrun/config.py
CHANGED
@@ -27,6 +27,7 @@ import copy
 import json
 import os
 import typing
+import warnings
 from collections.abc import Mapping
 from datetime import timedelta
 from distutils.util import strtobool
@@ -35,8 +36,10 @@ from threading import Lock
 
 import dotenv
 import semver
+import urllib3.exceptions
 import yaml
 
+import mlrun.common.constants
 import mlrun.common.schemas
 import mlrun.errors
 
@@ -46,11 +49,17 @@ _load_lock = Lock()
 _none_type = type(None)
 default_env_file = os.getenv("MLRUN_DEFAULT_ENV_FILE", "~/.mlrun.env")
 
+
 default_config = {
     "namespace": "",  # default kubernetes namespace
     "kubernetes": {
         "kubeconfig_path": "",  # local path to kubeconfig file (for development purposes),
         # empty by default as the API already running inside k8s cluster
+        "pagination": {
+            # pagination config for interacting with k8s API
+            "list_pods_limit": 200,
+            "list_crd_objects_limit": 200,
+        },
     },
     "dbpath": "",  # db/api url
     # url to nuclio dashboard api (can be with user & token, e.g. https://username:password@dashboard-url.com)
@@ -63,11 +72,15 @@ default_config = {
     "api_base_version": "v1",
     "version": "",  # will be set to current version
     "images_tag": "",  # tag to use with mlrun images e.g. mlrun/mlrun (defaults to version)
-
+    # registry to use with mlrun images that start with "mlrun/" e.g. quay.io/ (defaults to empty, for dockerhub)
+    "images_registry": "",
+    # registry to use with non-mlrun images (don't start with "mlrun/") specified in 'images_to_enrich_registry'
+    # defaults to empty, for dockerhub
+    "vendor_images_registry": "",
     # comma separated list of images that are in the specified images_registry, and therefore will be enriched with this
     # registry when used. default to mlrun/* which means any image which is of the mlrun repository (mlrun/mlrun,
     # mlrun/ml-base, etc...)
-    "images_to_enrich_registry": "^mlrun
+    "images_to_enrich_registry": "^mlrun/*,python:3.9",
     "kfp_url": "",
     "kfp_ttl": "14400",  # KFP ttl in sec, after that completed PODs will be deleted
     "kfp_image": "mlrun/mlrun",  # image to use for KFP runner (defaults to mlrun/mlrun)
@@ -87,7 +100,7 @@ default_config = {
     "mpijob_crd_version": "",  # mpijob crd version (e.g: "v1alpha1". must be in: mlrun.runtime.MPIJobCRDVersions)
     "ipython_widget": True,
     "log_level": "INFO",
-    # log formatter (options: human | json)
+    # log formatter (options: human | human_extended | json)
     "log_formatter": "human",
     "submit_timeout": "180",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
@@ -103,7 +116,20 @@ default_config = {
             # max number of parallel abort run jobs in runs monitoring
             "concurrent_abort_stale_runs_workers": 10,
             "list_runs_time_period_in_days": 7,  # days
-        }
+        },
+        "projects": {
+            "summaries": {
+                "cache_interval": "30",
+                "feature_gates": {
+                    "artifacts": "enabled",
+                    "schedules": "enabled",
+                    "feature_sets": "enabled",
+                    "models": "enabled",
+                    "runs": "enabled",
+                    "pipelines": "enabled",
+                },
+            },
+        },
     },
     "crud": {
         "runs": {
@@ -137,6 +163,11 @@ default_config = {
         "datasets": {
            "max_preview_columns": 100,
         },
+        "limits": {
+            "max_chunk_size": 1024 * 1024 * 1,  # 1MB
+            "max_preview_size": 1024 * 1024 * 10,  # 10MB
+            "max_download_size": 1024 * 1024 * 100,  # 100MB
+        },
     },
     # FIXME: Adding these defaults here so we won't need to patch the "installing component" (provazio-controller) to
     # configure this values on field systems, for newer system this will be configured correctly
@@ -188,6 +219,7 @@ default_config = {
     "background_tasks": {
         # enabled / disabled
         "timeout_mode": "enabled",
+        "function_deletion_batch_size": 10,
         # timeout in seconds to wait for background task to be updated / finished by the worker responsible for the task
         "default_timeouts": {
             "operations": {
@@ -196,6 +228,7 @@ default_config = {
                 "run_abortion": "600",
                 "abort_grace_period": "10",
                 "delete_project": "900",
+                "delete_function": "900",
             },
             "runtimes": {"dask": "600"},
         },
@@ -226,10 +259,17 @@ default_config = {
                     "executing": "24h",
                 }
             },
+            # When the module is reloaded, the maximum depth recursion configuration for the recursive reload
+            # function is used to prevent infinite loop
+            "reload_max_recursion_depth": 100,
         },
         "databricks": {
            "artifact_directory_path": "/mlrun_databricks_runtime/artifacts_dictionaries"
         },
+        "application": {
+            "default_sidecar_internal_port": 8050,
+            "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
+        },
     },
     # TODO: function defaults should be moved to the function spec config above
     "function_defaults": {
@@ -240,6 +280,7 @@ default_config = {
         "remote": "mlrun/mlrun",
         "dask": "mlrun/ml-base",
         "mpijob": "mlrun/mlrun",
+        "application": "python:3.9",
     },
     # see enrich_function_preemption_spec for more info,
     # and mlrun.common.schemas.function.PreemptionModes for available options
@@ -254,6 +295,16 @@ default_config = {
                 "url": "",
                 "service": "mlrun-api-chief",
                 "port": 8080,
+                "feature_gates": {
+                    "scheduler": "enabled",
+                    "project_sync": "enabled",
+                    "cleanup": "enabled",
+                    "runs_monitoring": "enabled",
+                    "pagination_cache": "enabled",
+                    "project_summaries": "enabled",
+                    "start_logs": "enabled",
+                    "stop_logs": "enabled",
+                },
             },
             "worker": {
                 "sync_with_chief": {
@@ -291,7 +342,7 @@ default_config = {
         "http": {
             # when True, the client will verify the server's TLS
             # set to False for backwards compatibility.
-            "verify":
+            "verify": True,
         },
         "db": {
             "commit_retry_timeout": 30,
@@ -324,7 +375,13 @@ default_config = {
                # optional values (as per https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sql-mode-full):
                #
                # if set to "nil" or "none", nothing would be set
-                "modes":
+                "modes": (
+                    "STRICT_TRANS_TABLES"
+                    ",NO_ZERO_IN_DATE"
+                    ",NO_ZERO_DATE"
+                    ",ERROR_FOR_DIVISION_BY_ZERO"
+                    ",NO_ENGINE_SUBSTITUTION",
+                )
            },
        },
        "jobs": {
@@ -352,10 +409,12 @@ default_config = {
            # is set to ClusterIP
            # ---------------------------------------------------------------------
            # Note: adding a mode requires special handling on
-            # - mlrun.runtimes.constants.NuclioIngressAddTemplatedIngressModes
+            # - mlrun.common.runtimes.constants.NuclioIngressAddTemplatedIngressModes
            # - mlrun.runtimes.nuclio.function.enrich_function_with_ingress
            "add_templated_ingress_host_mode": "never",
            "explicit_ack": "enabled",
+            # size of serving spec to move to config maps
+            "serving_spec_env_cutoff": 0,
        },
        "logs": {
            "decode": {
@@ -414,7 +473,6 @@ default_config = {
            "followers": "",
            # This is used as the interval for the sync loop both when mlrun is leader and follower
            "periodic_sync_interval": "1 minute",
-            "counters_cache_ttl": "2 minutes",
            "project_owners_cache_ttl": "30 seconds",
            # access key to be used when the leader is iguazio and polling is done from it
            "iguazio_access_key": "",
@@ -443,10 +501,10 @@ default_config = {
            # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
            # git+https://github.com/mlrun/mlrun@development. by default uses the version
            "mlrun_version_specifier": "",
-            "kaniko_image": "gcr.io/kaniko-project/executor:v1.
+            "kaniko_image": "gcr.io/kaniko-project/executor:v1.23.2",  # kaniko builder image
            "kaniko_init_container_image": "alpine:3.18",
            # image for kaniko init container when docker registry is ECR
-            "kaniko_aws_cli_image": "amazon/aws-cli:2.
+            "kaniko_aws_cli_image": "amazon/aws-cli:2.17.16",
            # kaniko sometimes fails to get filesystem from image, this is a workaround to retry the process
            # a known issue in Kaniko - https://github.com/GoogleContainerTools/kaniko/issues/1717
            "kaniko_image_fs_extraction_retries": "3",
@@ -473,17 +531,71 @@ default_config = {
            # if set to true, will log a warning for trying to use run db functionality while in nop db mode
            "verbose": True,
        },
+        "pagination": {
+            "default_page_size": 20,
+            "pagination_cache": {
+                "interval": 60,
+                "ttl": 3600,
+                "max_size": 10000,
+            },
+        },
    },
    "model_endpoint_monitoring": {
-        "
-
-
+        "serving_stream": {
+            "v3io": {
+                "shard_count": 2,
+                "retention_period_hours": 24,
+                "num_workers": 1,
+                "min_replicas": 2,
+                "max_replicas": 2,
+            },
+            "kafka": {
+                "partition_count": 8,
+                "replication_factor": 1,
+                "num_workers": 2,
+                "min_replicas": 1,
+                "max_replicas": 4,
+            },
+        },
+        "application_stream_args": {
+            "v3io": {
+                "shard_count": 1,
+                "retention_period_hours": 24,
+                "num_workers": 1,
+                "min_replicas": 1,
+                "max_replicas": 1,
+            },
+            "kafka": {
+                "partition_count": 1,
+                "replication_factor": 1,
+                "num_workers": 1,
+                "min_replicas": 1,
+                "max_replicas": 1,
+            },
+        },
+        "writer_stream_args": {
+            "v3io": {
+                "shard_count": 1,
+                "retention_period_hours": 24,
+                "num_workers": 1,
+                "min_replicas": 1,
+                "max_replicas": 1,
+            },
+            "kafka": {
+                "partition_count": 1,
+                # TODO: add retention period configuration
+                "replication_factor": 1,
+                "num_workers": 1,
+                "min_replicas": 1,
+                "max_replicas": 1,
+            },
+        },
        # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
        # stream, and endpoints.
        "store_prefixes": {
            "default": "v3io:///users/pipelines/{project}/model-endpoints/{kind}",
            "user_space": "v3io:///projects/{project}/model-endpoints/{kind}",
-            "
+            "monitoring_application": "v3io:///users/pipelines/{project}/monitoring-apps/",
        },
        # Offline storage path can be either relative or a full path. This path is used for general offline data
        # storage such as the parquet file which is generated from the monitoring stream function for the drift analysis
@@ -492,12 +604,18 @@ default_config = {
        # when the user is working in CE environment and has not provided any stream path.
        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
        "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
-        "batch_processing_function_branch": "master",
        "parquet_batching_max_events": 10_000,
        "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
-        # See mlrun.model_monitoring.stores.
-        "store_type": "v3io-nosql",
+        # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
        "endpoint_store_connection": "",
+        # See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options
+        "tsdb_connection": "",
+        # See mlrun.common.schemas.model_monitoring.constants.StreamKind for available options
+        "stream_connection": "",
+        "tdengine": {
+            "timeout": 10,
+            "retries": 1,
+        },
    },
    "secret_stores": {
        # Use only in testing scenarios (such as integration tests) to avoid using k8s for secrets (will use in-memory
@@ -533,9 +651,10 @@ default_config = {
    "feature_store": {
        "data_prefixes": {
            "default": "v3io:///projects/{project}/FeatureStore/{name}/{kind}",
-            "nosql": "v3io:///projects/{project}/FeatureStore/{name}/
+            "nosql": "v3io:///projects/{project}/FeatureStore/{name}/nosql",
            # "authority" is optional and generalizes [userinfo "@"] host [":" port]
-            "redisnosql": "redis://{authority}/projects/{project}/FeatureStore/{name}/
+            "redisnosql": "redis://{authority}/projects/{project}/FeatureStore/{name}/nosql",
+            "dsnosql": "ds://{ds_profile_name}/projects/{project}/FeatureStore/{name}/{kind}",
        },
        "default_targets": "parquet,nosql",
        "default_job_image": "mlrun/mlrun",
@@ -610,8 +729,9 @@ default_config = {
    },
    "workflows": {
        "default_workflow_runner_name": "workflow-runner-{}",
-        # Default timeout seconds for retrieving workflow id after execution
-
+        # Default timeout seconds for retrieving workflow id after execution
+        # Remote workflow timeout is the maximum between remote and the inner engine timeout
+        "timeouts": {"local": 120, "kfp": 60, "remote": 60 * 5},
    },
    "log_collector": {
        "address": "localhost:8282",
@@ -628,7 +748,9 @@ default_config = {
        "failed_runs_grace_period": 3600,
        "verbose": True,
        # the number of workers which will be used to trigger the start log collection
-        "concurrent_start_logs_workers":
+        "concurrent_start_logs_workers": 50,
+        # the number of runs for which to start logs on api startup
+        "start_logs_startup_run_limit": 150,
        # the time in hours in which to start log collection from.
        # after upgrade, we might have runs which completed in the mean time or still in non-terminal state and
        # we want to collect their logs in the new log collection method (sidecar)
@@ -670,8 +792,20 @@ default_config = {
        "access_key": "",
    },
    "grafana_url": "",
+    "alerts": {
+        # supported modes: "enabled", "disabled".
+        "mode": "disabled",
+        # maximum number of alerts we allow to be configured.
+        # user will get an error when exceeding this
+        "max_allowed": 10000,
+        # maximum allowed value for count in criteria field inside AlertConfig
+        "max_criteria_count": 100,
+    },
+    "auth_with_client_id": {
+        "enabled": False,
+        "request_timeout": 5,
+    },
 }
-
 _is_running_as_api = None
 
 
@@ -721,7 +855,21 @@ class Config:
        for key, value in cfg.items():
            if hasattr(self, key):
                if isinstance(value, dict):
-
+                    # ignore the `skip_errors` flag here
+                    # if the key does not align with what mlrun config expects it is a user
+                    # input error that can lead to unexpected behavior.
+                    # raise the exception to ensure configuration is loaded correctly and do not
+                    # ignore any errors.
+                    config_value = getattr(self, key)
+                    try:
+                        config_value.update(value)
+                    except AttributeError as exc:
+                        if not isinstance(config_value, (dict, Config)):
+                            raise ValueError(
+                                f"Can not update `{key}` config. "
+                                f"Expected a configuration but received {type(value)}"
+                            ) from exc
+                        raise exc
                else:
                    try:
                        setattr(self, key, value)
@@ -769,6 +917,7 @@ class Config:
    ):
        """
        decodes and loads the config attribute to expected type
+
        :param attribute_path: the path in the default_config e.g. preemptible_nodes.node_selector
        :param expected_type: the object type valid values are : `dict`, `list` etc...
        :return: the expected type instance
@@ -792,7 +941,7 @@ class Config:
                f"Unable to decode {attribute_path}"
            )
        parsed_attribute_value = json.loads(decoded_attribute_value)
-        if
+        if not isinstance(parsed_attribute_value, expected_type):
            raise mlrun.errors.MLRunInvalidArgumentTypeError(
                f"Expected type {expected_type}, got {type(parsed_attribute_value)}"
            )
@@ -894,24 +1043,6 @@ class Config:
                f"is not allowed for iguazio version: {igz_version} < 3.5.1"
            )
 
-    def resolve_kfp_url(self, namespace=None):
-        if config.kfp_url:
-            return config.kfp_url
-        igz_version = self.get_parsed_igz_version()
-        # TODO: When Iguazio 3.4 will deprecate we can remove this line
-        if igz_version and igz_version <= semver.VersionInfo.parse("3.6.0-b1"):
-            if namespace is None:
-                if not config.namespace:
-                    raise mlrun.errors.MLRunNotFoundError(
-                        "For KubeFlow Pipelines to function, a namespace must be configured"
-                    )
-                namespace = config.namespace
-            # When instead of host we provided namespace we tackled this issue
-            # https://github.com/canonical/bundle-kubeflow/issues/412
-            # TODO: When we'll move to kfp 1.4.0 (server side) it should be resolved
-            return f"http://ml-pipeline.{namespace}.svc.cluster.local:8888"
-        return None
-
    def resolve_chief_api_url(self) -> str:
        if self.httpdb.clusterization.chief.url:
            return self.httpdb.clusterization.chief.url
@@ -931,6 +1062,10 @@ class Config:
            self.httpdb.clusterization.chief.url = chief_api_url
        return self.httpdb.clusterization.chief.url
 
+    @staticmethod
+    def internal_labels():
+        return mlrun.common.constants.MLRunInternalLabels.all()
+
    @staticmethod
    def get_storage_auto_mount_params():
        auto_mount_params = {}
@@ -998,6 +1133,14 @@ class Config:
                resource_requirement.pop(gpu)
        return resource_requirement
 
+    def force_api_gateway_ssl_redirect(self):
+        """
+        Get the default value for the ssl_redirect configuration.
+        In Iguazio we always want to redirect to HTTPS, in other cases we don't.
+        :return: True if we should redirect to HTTPS, False otherwise.
+        """
+        return self.is_running_on_iguazio()
+
    def to_dict(self):
        return copy.deepcopy(self._cfg)
 
@@ -1030,6 +1173,9 @@ class Config:
        # importing here to avoid circular dependency
        import mlrun.db
 
+        # It ensures that SSL verification is set before establishing a connection
+        _configure_ssl_verification(self.httpdb.http.verify)
+
        # when dbpath is set we want to connect to it which will sync configuration from it to the client
        mlrun.db.get_run_db(value, force_reconnect=True)
 
@@ -1058,8 +1204,9 @@ class Config:
        project: str = "",
        kind: str = "",
        target: str = "online",
-        artifact_path: str = None,
-        function_name: str = None,
+        artifact_path: typing.Optional[str] = None,
+        function_name: typing.Optional[str] = None,
+        **kwargs,
    ) -> str:
        """Get the full path from the configuration based on the provided project and kind.
 
@@ -1085,8 +1232,7 @@ class Config:
            )
            if store_prefix_dict.get(kind):
                # Target exist in store prefix and has a valid string value
-                return store_prefix_dict[kind].format(project=project)
-
+                return store_prefix_dict[kind].format(project=project, **kwargs)
        if (
            function_name
            and function_name
@@ -1098,10 +1244,16 @@ class Config:
                if function_name is None
                else f"{kind}-{function_name.lower()}",
            )
-
-
-
-
+        elif kind == "stream":
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project,
+                kind=kind,
+            )
+        else:
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
+                project=project,
+                kind=kind,
+            )
 
        # Get the current offline path from the configuration
        file_path = mlrun.mlconf.model_endpoint_monitoring.offline_storage_path.format(
@@ -1159,12 +1311,11 @@ class Config:
 
        return storage_options
 
-    def
-        if not version:
-            version = self.nuclio_version
+    def is_explicit_ack_enabled(self) -> bool:
        return self.httpdb.nuclio.explicit_ack == "enabled" and (
-            not
-            or semver.VersionInfo.parse(
+            not self.nuclio_version
+            or semver.VersionInfo.parse(self.nuclio_version)
+            >= semver.VersionInfo.parse("1.12.10")
        )
 
 
@@ -1214,6 +1365,7 @@ def _do_populate(env=None, skip_errors=False):
    if data:
        config.update(data, skip_errors=skip_errors)
 
+    _configure_ssl_verification(config.httpdb.http.verify)
    _validate_config(config)
 
 
@@ -1273,6 +1425,16 @@ def _convert_str(value, typ):
    return typ(value)
 
 
+def _configure_ssl_verification(verify_ssl: bool) -> None:
+    """Configure SSL verification warnings based on the setting."""
+    if not verify_ssl:
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    else:
+        # If the user changes the `verify` setting to `True` at runtime using `mlrun.set_env_from_file` after
+        # importing `mlrun`, we need to reload the `mlrun` configuration and enable this warning.
+        warnings.simplefilter("default", urllib3.exceptions.InsecureRequestWarning)
+
+
 def read_env(env=None, prefix=env_prefix):
    """Read configuration from environment"""
    env = os.environ if env is None else env
@@ -1358,10 +1520,14 @@ def read_env(env=None, prefix=env_prefix):
    if log_formatter_name := config.get("log_formatter"):
        import mlrun.utils.logger
 
-        log_formatter = mlrun.utils.
+        log_formatter = mlrun.utils.resolve_formatter_by_kind(
            mlrun.utils.FormatterKinds(log_formatter_name)
        )
-        mlrun.utils.logger.get_handler("default")
+        current_handler = mlrun.utils.logger.get_handler("default")
+        current_formatter_name = current_handler.formatter.__class__.__name__
+        desired_formatter_name = log_formatter.__name__
+        if current_formatter_name != desired_formatter_name:
+            current_handler.setFormatter(log_formatter())
 
    # The default function pod resource values are of type str; however, when reading from environment variable numbers,
    # it converts them to type int if contains only number, so we want to convert them to str.
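The `_configure_ssl_verification` hook above is wired into both `_do_populate` and the `dbpath` setter, so the urllib3 warning state tracks `httpdb.http.verify` whenever the configuration is (re)loaded. A minimal sketch of turning verification off for a self-signed API endpoint, assuming the usual `MLRUN_` double-underscore environment convention that `read_env` uses for nested keys:

    import os

    # nested config keys map to MLRUN_-prefixed, double-underscore env vars
    # (assumption: set before importing mlrun so _do_populate picks it up)
    os.environ["MLRUN_HTTPDB__HTTP__VERIFY"] = "false"

    import mlrun

    # with verify=False the client suppresses urllib3's InsecureRequestWarning;
    # flipping it back to true re-enables the warning on the next config reload
    print(mlrun.mlconf.httpdb.http.verify)  # False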
mlrun/data_types/data_types.py
CHANGED
@@ -41,6 +41,7 @@ class ValueType(str, Enum):
     BYTES = "bytes"
     STRING = "str"
     DATETIME = "datetime"
+    LIST = "List"
     BYTES_LIST = "List[bytes]"
     STRING_LIST = "List[string]"
     INT32_LIST = "List[int32]"
@@ -48,6 +49,7 @@ class ValueType(str, Enum):
     DOUBLE_LIST = "List[float]"
     FLOAT_LIST = "List[float32]"
     BOOL_LIST = "List[bool]"
+    Tuple = "Tuple"
 
 
 def pd_schema_to_value_type(value):
@@ -68,6 +70,11 @@ def pa_type_to_value_type(type_):
     if isinstance(type_, TimestampType):
         return ValueType.DATETIME
 
+    # pandas category type translates to pyarrow DictionaryType
+    # we need to unpack the value type (ML-7868)
+    if isinstance(type_, pyarrow.DictionaryType):
+        type_ = type_.value_type
+
     type_map = {
         pyarrow.bool_(): ValueType.BOOL,
         pyarrow.int64(): ValueType.INT64,
@@ -102,6 +109,8 @@ def python_type_to_value_type(value_type):
         "datetime64[ns]": ValueType.INT64,
         "datetime64[ns, tz]": ValueType.INT64,
         "category": ValueType.STRING,
+        "list": ValueType.LIST,
+        "tuple": ValueType.Tuple,
     }
 
     if type_name in type_map:
@@ -115,6 +124,7 @@ def spark_to_value_type(data_type):
         "double": ValueType.DOUBLE,
         "boolean": ValueType.BOOL,
         "timestamp": ValueType.DATETIME,
+        "timestamp_ntz": ValueType.DATETIME,
         "string": ValueType.STRING,
         "array": "list",
         "map": "dict",
@@ -135,7 +145,7 @@ def gbq_to_pandas_dtype(gbq_type):
         "BOOL": "bool",
         "FLOAT": "float64",
         "INTEGER": pd.Int64Dtype(),
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns, UTC]",
     }
     return type_map.get(gbq_type, "object")
 
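The `pa_type_to_value_type` change handles pandas categorical columns, which arrive from pyarrow as `DictionaryType` rather than as their underlying value type. A standalone sketch of the unpacking the new branch performs (pyarrow and pandas only, no mlrun required):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"fruit": pd.Series(["apple", "banana"], dtype="category")})
    type_ = pa.Table.from_pandas(df).schema.field("fruit").type

    # a categorical column becomes dictionary<values=string, indices=int8>
    print(isinstance(type_, pa.DictionaryType))  # True

    # same unpacking as the new branch, before the type_map lookup
    if isinstance(type_, pa.DictionaryType):
        type_ = type_.value_type
    print(type_)  # string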
mlrun/data_types/spark.py
CHANGED
@@ -18,12 +18,12 @@ from os import environ
 import numpy as np
 import pytz
 from pyspark.sql.functions import to_utc_timestamp
-from pyspark.sql.types import BooleanType, DoubleType
+from pyspark.sql.types import BooleanType, DoubleType
 
+from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
 
 from .data_types import InferOptions, spark_to_value_type
-from .to_pandas import toPandas
 
 try:
     import pyspark.sql.functions as funcs
@@ -75,7 +75,7 @@ def get_df_preview_spark(df, preview_lines=20):
     """capture preview data from spark df"""
     df = df.limit(preview_lines)
 
-    result_dict =
+    result_dict = spark_df_to_pandas(df).to_dict(orient="split")
     return [result_dict["columns"], *result_dict["data"]]
 
 
@@ -143,7 +143,8 @@ def get_df_stats_spark(df, options, num_bins=20, sample_size=None):
     timestamp_columns = set()
     boolean_columns = set()
     for field in df_after_type_casts.schema.fields:
-
+        # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
+        is_timestamp = field.dataType.typeName().startswith("timestamp")
         is_boolean = isinstance(field.dataType, BooleanType)
         if is_timestamp:
             df_after_type_casts = df_after_type_casts.withColumn(
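The rewritten check in `get_df_stats_spark` matches on the `typeName()` prefix so that both timestamp flavors are detected and cast. A small sketch, assuming PySpark 3.4+ (where `TimestampNTZType` is available):

    from pyspark.sql.types import StructField, TimestampNTZType, TimestampType

    # both flavors share the "timestamp" prefix that the new check relies on
    for data_type in (TimestampType(), TimestampNTZType()):
        field = StructField("ts", data_type)
        print(field.dataType.typeName())
        # -> "timestamp", then "timestamp_ntz"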