mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Note: this version of mlrun has been flagged as a potentially problematic release.
- mlrun/__init__.py +26 -22
- mlrun/__main__.py +15 -16
- mlrun/alerts/alert.py +150 -15
- mlrun/api/schemas/__init__.py +1 -9
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +62 -19
- mlrun/artifacts/dataset.py +17 -17
- mlrun/artifacts/document.py +454 -0
- mlrun/artifacts/manager.py +28 -18
- mlrun/artifacts/model.py +91 -59
- mlrun/artifacts/plots.py +2 -2
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -1
- mlrun/common/formatters/feature_set.py +2 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +12 -62
- mlrun/common/runtimes/constants.py +25 -4
- mlrun/common/schemas/__init__.py +9 -5
- mlrun/common/schemas/alert.py +114 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +22 -9
- mlrun/common/schemas/auth.py +8 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +4 -4
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +4 -8
- mlrun/common/schemas/model_monitoring/constants.py +127 -46
- mlrun/common/schemas/model_monitoring/grafana.py +18 -12
- mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +142 -0
- mlrun/common/schemas/pipeline.py +3 -3
- mlrun/common/schemas/project.py +26 -18
- mlrun/common/schemas/runs.py +3 -3
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +6 -5
- mlrun/common/types.py +1 -0
- mlrun/config.py +157 -89
- mlrun/data_types/__init__.py +5 -3
- mlrun/data_types/infer.py +13 -3
- mlrun/data_types/spark.py +2 -1
- mlrun/datastore/__init__.py +59 -18
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +19 -24
- mlrun/datastore/datastore.py +10 -4
- mlrun/datastore/datastore_profile.py +178 -45
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +14 -3
- mlrun/datastore/sources.py +89 -92
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/storeytargets.py +51 -16
- mlrun/datastore/targets.py +38 -31
- mlrun/datastore/utils.py +87 -4
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +291 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +286 -100
- mlrun/db/httpdb.py +1562 -490
- mlrun/db/nopdb.py +250 -83
- mlrun/errors.py +6 -2
- mlrun/execution.py +194 -50
- mlrun/feature_store/__init__.py +2 -10
- mlrun/feature_store/api.py +20 -458
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +105 -479
- mlrun/feature_store/feature_vector_utils.py +466 -0
- mlrun/feature_store/retrieval/base.py +15 -11
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/retrieval/storey_merger.py +1 -1
- mlrun/feature_store/steps.py +3 -3
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +31 -31
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/k8s_utils.py +2 -5
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +6 -2
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +8 -4
- mlrun/model.py +132 -46
- mlrun/model_monitoring/__init__.py +3 -5
- mlrun/model_monitoring/api.py +113 -98
- mlrun/model_monitoring/applications/__init__.py +0 -5
- mlrun/model_monitoring/applications/_application_steps.py +81 -50
- mlrun/model_monitoring/applications/base.py +467 -14
- mlrun/model_monitoring/applications/context.py +212 -134
- mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
- mlrun/model_monitoring/applications/evidently/base.py +146 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
- mlrun/model_monitoring/applications/results.py +67 -15
- mlrun/model_monitoring/controller.py +701 -315
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +242 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
- mlrun/model_monitoring/db/tsdb/base.py +243 -49
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
- mlrun/model_monitoring/helpers.py +356 -114
- mlrun/model_monitoring/stream_processing.py +190 -345
- mlrun/model_monitoring/tracking_policy.py +11 -4
- mlrun/model_monitoring/writer.py +49 -90
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +2 -2
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +35 -32
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +30 -30
- mlrun/projects/pipelines.py +116 -47
- mlrun/projects/project.py +1292 -329
- mlrun/render.py +5 -9
- mlrun/run.py +57 -14
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +30 -22
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
- mlrun/runtimes/function_reference.py +5 -2
- mlrun/runtimes/generators.py +3 -2
- mlrun/runtimes/kubejob.py +6 -7
- mlrun/runtimes/mounts.py +574 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -13
- mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
- mlrun/runtimes/nuclio/function.py +127 -70
- mlrun/runtimes/nuclio/serving.py +105 -37
- mlrun/runtimes/pod.py +159 -54
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +22 -12
- mlrun/runtimes/utils.py +7 -6
- mlrun/secrets.py +2 -2
- mlrun/serving/__init__.py +8 -0
- mlrun/serving/merger.py +7 -5
- mlrun/serving/remote.py +35 -22
- mlrun/serving/routers.py +186 -240
- mlrun/serving/server.py +41 -10
- mlrun/serving/states.py +432 -118
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +161 -203
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +35 -22
- mlrun/utils/clones.py +7 -4
- mlrun/utils/helpers.py +511 -58
- mlrun/utils/logger.py +119 -13
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +39 -15
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +16 -8
- mlrun/utils/notifications/notification/webhook.py +24 -8
- mlrun/utils/notifications/notification_pusher.py +191 -200
- mlrun/utils/regex.py +12 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/METADATA +81 -54
- mlrun-1.8.0.dist-info/RECORD +351 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/applications/evidently_base.py +0 -137
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.2rc3.dist-info/RECORD +0 -351
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
mlrun/config.py
CHANGED
@@ -30,7 +30,6 @@ import typing
 import warnings
 from collections.abc import Mapping
 from datetime import timedelta
-from distutils.util import strtobool
 from os.path import expanduser
 from threading import Lock
 
@@ -83,8 +82,8 @@ default_config = {
     "images_to_enrich_registry": "^mlrun/*,python:3.9",
     "kfp_url": "",
     "kfp_ttl": "14400",  # KFP ttl in sec, after that completed PODs will be deleted
-    "kfp_image": "mlrun/mlrun",  # image to use for KFP runner
-    "dask_kfp_image": "mlrun/ml-base",  # image to use for dask KFP runner
+    "kfp_image": "mlrun/mlrun-kfp",  # image to use for KFP runner
+    "dask_kfp_image": "mlrun/ml-base",  # image to use for dask KFP runner
     "igz_version": "",  # the version of the iguazio system the API is running on
     "iguazio_api_url": "",  # the url to iguazio api
     "spark_app_image": "",  # image to use for spark operator app runtime
@@ -102,7 +101,10 @@ default_config = {
     "log_level": "INFO",
     # log formatter (options: human | human_extended | json)
     "log_formatter": "human",
-
+    # custom logger format, workes only with log_formatter: custom
+    # Note that your custom format must include those 4 fields - timestamp, level, message and more
+    "log_format_override": None,
+    "submit_timeout": "280",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
     "runtimes_cleanup_interval": "300",
     "monitoring": {
@@ -120,14 +122,6 @@ default_config = {
         "projects": {
             "summaries": {
                 "cache_interval": "30",
-                "feature_gates": {
-                    "artifacts": "enabled",
-                    "schedules": "enabled",
-                    "feature_sets": "enabled",
-                    "models": "enabled",
-                    "runs": "enabled",
-                    "pipelines": "enabled",
-                },
             },
         },
     },
@@ -140,6 +134,12 @@ default_config = {
            "delete_crd_resources_timeout": "5 minutes",
        },
    },
+    "object_retentions": {
+        "alert_activations": 14 * 7,  # days
+    },
+    # A safety margin to account for delays
+    # This ensures that extra partitions are available beyond the specified retention period
+    "partitions_buffer_multiplier": 3,
     # the grace period (in seconds) that will be given to runtime resources (after they're in terminal state)
     # before deleting them (4 hours)
     "runtime_resources_deletion_grace_period": "14400",
@@ -159,6 +159,7 @@ default_config = {
         # migration from artifacts to artifacts_v2 is done in batches, and requires a state file to keep track of the
         # migration progress.
         "artifact_migration_batch_size": 200,
+        "artifact_migration_v9_batch_size": 30000,
         "artifact_migration_state_file_path": "./db/_artifact_migration_state.json",
         "datasets": {
             "max_preview_columns": 100,
@@ -167,6 +168,7 @@ default_config = {
             "max_chunk_size": 1024 * 1024 * 1,  # 1MB
             "max_preview_size": 1024 * 1024 * 10,  # 10MB
             "max_download_size": 1024 * 1024 * 100,  # 100MB
+            "max_deletions": 200,
         },
     },
     # FIXME: Adding these defaults here so we won't need to patch the "installing component" (provazio-controller) to
@@ -229,8 +231,11 @@ default_config = {
                 "abort_grace_period": "10",
                 "delete_project": "900",
                 "delete_function": "900",
+                "model_endpoint_creation": "600",
+                "model_endpoint_tsdb_leftovers": "900",
             },
             "runtimes": {"dask": "600"},
+            "push_notifications": "60",
         },
     },
     "function": {
@@ -262,6 +267,7 @@ default_config = {
         # When the module is reloaded, the maximum depth recursion configuration for the recursive reload
         # function is used to prevent infinite loop
         "reload_max_recursion_depth": 100,
+        "source_code_max_bytes": 10000,
     },
     "databricks": {
         "artifact_directory_path": "/mlrun_databricks_runtime/artifacts_dictionaries"
@@ -314,7 +320,7 @@ default_config = {
                },
                "request_timeout": 45,  # seconds
            },
-            # see server.api.utils.helpers.ensure_running_on_chief
+            # see server.py.services.api.utils.helpers.ensure_running_on_chief
            "ensure_function_running_on_chief_mode": "enabled",
        },
        "port": 8080,
@@ -480,6 +486,10 @@ default_config = {
            "iguazio_client_job_cache_ttl": "20 minutes",
            "nuclio_project_deletion_verification_timeout": "300 seconds",
            "nuclio_project_deletion_verification_interval": "5 seconds",
+            "summaries": {
+                # Number of days back to include when calculating the project pipeline summary.
+                "list_pipelines_time_period_in_days": 7,
+            },
        },
        # The API needs to know what is its k8s svc url so it could enrich it in the jobs it creates
        "api_url": "",
@@ -532,7 +542,9 @@ default_config = {
            "verbose": True,
        },
        "pagination": {
-            "default_page_size":
+            "default_page_size": 200,
+            "page_limit": 1000000,
+            "page_size_limit": 1000000,
            "pagination_cache": {
                "interval": 60,
                "ttl": 3600,
@@ -541,6 +553,10 @@ default_config = {
        },
    },
    "model_endpoint_monitoring": {
+        # Scaling Rule
+        # The fundamental scaling rule to maintain is: Shards/Partitions = Replicas * Workers
+        # In other words, the number of shards (V3IO) or partitions (Kafka) must be equal to the
+        # total number of worker processes across all pods.
        "serving_stream": {
            "v3io": {
                "shard_count": 2,
@@ -559,33 +575,49 @@ default_config = {
        },
        "application_stream_args": {
            "v3io": {
-                "shard_count":
+                "shard_count": 4,
                "retention_period_hours": 24,
-                "num_workers":
+                "num_workers": 4,
                "min_replicas": 1,
                "max_replicas": 1,
            },
            "kafka": {
-                "partition_count":
+                "partition_count": 4,
                "replication_factor": 1,
-                "num_workers":
+                "num_workers": 4,
                "min_replicas": 1,
                "max_replicas": 1,
            },
        },
        "writer_stream_args": {
            "v3io": {
-                "shard_count":
+                "shard_count": 4,
                "retention_period_hours": 24,
-                "num_workers":
+                "num_workers": 4,
                "min_replicas": 1,
                "max_replicas": 1,
            },
            "kafka": {
-                "partition_count":
+                "partition_count": 4,
                # TODO: add retention period configuration
                "replication_factor": 1,
-                "num_workers":
+                "num_workers": 4,
+                "min_replicas": 1,
+                "max_replicas": 1,
+            },
+        },
+        "controller_stream_args": {
+            "v3io": {
+                "shard_count": 10,
+                "retention_period_hours": 24,
+                "num_workers": 10,
+                "min_replicas": 1,
+                "max_replicas": 1,
+            },
+            "kafka": {
+                "partition_count": 10,
+                "replication_factor": 1,
+                "num_workers": 10,
                "min_replicas": 1,
                "max_replicas": 1,
            },
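
The comment block added under model_endpoint_monitoring states the sizing invariant behind all of these stream defaults: shards (V3IO) or partitions (Kafka) must equal replicas times workers. A quick arithmetic check of the new defaults (an illustrative sketch, not mlrun code):

# Scaling rule from the new config comment:
#   shards/partitions = replicas * workers
def required_shards(max_replicas: int, num_workers: int) -> int:
    return max_replicas * num_workers

# The 1.8.0 defaults are consistent with the rule:
assert required_shards(1, 4) == 4    # application/writer stream args
assert required_shards(1, 10) == 10  # controller stream args
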
@@ -600,22 +632,8 @@ default_config = {
        # Offline storage path can be either relative or a full path. This path is used for general offline data
        # storage such as the parquet file which is generated from the monitoring stream function for the drift analysis
        "offline_storage_path": "model-endpoints/{kind}",
-        # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
-        # when the user is working in CE environment and has not provided any stream path.
-        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
-        "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
        "parquet_batching_max_events": 10_000,
        "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
-        # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
-        "endpoint_store_connection": "",
-        # See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options
-        "tsdb_connection": "",
-        # See mlrun.common.schemas.model_monitoring.constants.StreamKind for available options
-        "stream_connection": "",
-        "tdengine": {
-            "timeout": 10,
-            "retries": 1,
-        },
    },
    "secret_stores": {
        # Use only in testing scenarios (such as integration tests) to avoid using k8s for secrets (will use in-memory
@@ -644,7 +662,7 @@ default_config = {
            "auto_add_project_secrets": True,
            "project_secret_name": "mlrun-project-secrets-{project}",
            "auth_secret_name": "mlrun-auth-secrets.{hashed_access_key}",
-            "env_variable_prefix": "
+            "env_variable_prefix": "",
            "global_function_env_secret_name": None,
        },
    },
@@ -729,6 +747,7 @@ default_config = {
    },
    "workflows": {
        "default_workflow_runner_name": "workflow-runner-{}",
+        "concurrent_delete_worker_count": 20,
        # Default timeout seconds for retrieving workflow id after execution
        # Remote workflow timeout is the maximum between remote and the inner engine timeout
        "timeouts": {"local": 120, "kfp": 60, "remote": 60 * 5},
@@ -794,17 +813,44 @@ default_config = {
    "grafana_url": "",
    "alerts": {
        # supported modes: "enabled", "disabled".
-        "mode": "
+        "mode": "enabled",
        # maximum number of alerts we allow to be configured.
        # user will get an error when exceeding this
-        "max_allowed":
+        "max_allowed": 20000,
        # maximum allowed value for count in criteria field inside AlertConfig
        "max_criteria_count": 100,
+        # interval for periodic events generation job
+        "events_generation_interval": 30,  # seconds
+        # number of alerts to delete in each chunk
+        "chunk_size_during_project_deletion": 100,
+        # maximum allowed alert config cache size in alert's CRUD
+        # for the best performance, it is recommended to set this value to the maximum number of alerts
+        "max_allowed_cache_size": 20000,
+        # default limit for listing alert configs
+        "default_list_alert_configs_limit": 2000,
    },
    "auth_with_client_id": {
        "enabled": False,
        "request_timeout": 5,
    },
+    "services": {
+        # The running service name. One of: "api", "alerts"
+        "service_name": "api",
+        "hydra": {
+            # Comma separated list of services to run on the instance.
+            # Currently, this is only considered when the service_name is "api".
+            # "*" starts all services on the same instance,
+            # other options are considered as running only the api service.
+            "services": "*",
+        },
+    },
+    "notifications": {
+        "smtp": {
+            "config_secret_name": "mlrun-smtp-config",
+            "refresh_interval": "30",
+        }
+    },
+    "system_id": "",
 }
 _is_running_as_api = None
 
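
The new alerts, services, and notifications defaults surface through the usual mlrun.mlconf accessor; a minimal sketch (the attribute paths mirror the keys above, and the printed values are the shipped defaults):

import mlrun

print(mlrun.mlconf.alerts.mode)            # "enabled"
print(mlrun.mlconf.alerts.max_allowed)     # 20000
print(mlrun.mlconf.services.service_name)  # "api"
print(mlrun.mlconf.notifications.smtp.config_secret_name)  # "mlrun-smtp-config"
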
@@ -837,6 +883,14 @@ class Config:
             return self.__class__(val)
         return val
 
+    def __deepcopy__(self, memo):
+        cls = self.__class__
+        # create a new Config without calling __init__ (avoid recursion)
+        result = cls.__new__(cls)
+        # manually deep-copy _cfg
+        object.__setattr__(result, "_cfg", copy.deepcopy(self._cfg, memo))
+        return result
+
     def __setattr__(self, attr, value):
         # in order for the dbpath setter to work
         if attr == "dbpath":
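
With __deepcopy__ defined, copy.deepcopy on a Config no longer recurses through __getattr__ while the new instance is half-initialized; a short usage sketch:

import copy

import mlrun

# Deep-copying a Config (or a nested Config node) now yields an
# independent object; mutating the copy leaves mlrun.mlconf untouched.
cfg_copy = copy.deepcopy(mlrun.mlconf)
cfg_copy.log_level = "DEBUG"
# the original keeps its prior value (the default is "INFO")
print(mlrun.mlconf.log_level)
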
@@ -851,6 +905,22 @@ class Config:
         name = self.__class__.__name__
         return f"{name}({self._cfg!r})"
 
+    def __iter__(self):
+        if isinstance(self._cfg, Mapping):
+            return self._cfg.__iter__()
+
+    def items(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self._cfg.items())
+
+    def keys(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self.data.keys())
+
+    def values(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self.data.values())
+
     def update(self, cfg, skip_errors=False):
         for key, value in cfg.items():
             if hasattr(self, key):
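
The new __iter__/items/keys/values methods make a Config node behave like a read-only mapping, so nested sections can be iterated or materialized without reaching into the private _cfg attribute; for example (a sketch):

import mlrun

# Iterate the keys of a nested config section directly
for key in mlrun.mlconf.httpdb.pagination:
    print(key)  # default_page_size, page_limit, ...

# items() lets a section be materialized as a plain dict
pagination = dict(mlrun.mlconf.httpdb.pagination.items())
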
@@ -1043,6 +1113,17 @@
                 f"is not allowed for iguazio version: {igz_version} < 3.5.1"
             )
 
+    def validate_object_retentions(self):
+        for table_name, retention_days in self.object_retentions.items():
+            if retention_days < 7 and not os.getenv("PARTITION_INTERVAL"):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{table_name} partition interval must be greater than a week"
+                )
+            elif retention_days > 53 * 7:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{table_name} partition interval must be less than a year"
+                )
+
     def resolve_chief_api_url(self) -> str:
         if self.httpdb.clusterization.chief.url:
             return self.httpdb.clusterization.chief.url
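
validate_object_retentions, now invoked from _validate_config (see the hunk further below), bounds every entry of the new object_retentions mapping to between one week and 53 weeks (371 days), unless a PARTITION_INTERVAL environment variable relaxes the lower bound. The shipped default passes both checks:

# Bounds enforced by validate_object_retentions, in days:
#   7 <= retention_days <= 53 * 7 (= 371)
retention_days = 14 * 7  # default for "alert_activations" (98 days)
assert 7 <= retention_days <= 53 * 7
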
@@ -1201,9 +1282,9 @@
 
     def get_model_monitoring_file_target_path(
         self,
-        project: str
-        kind: str
-        target:
+        project: str,
+        kind: str,
+        target: typing.Literal["online", "offline"] = "online",
         artifact_path: typing.Optional[str] = None,
         function_name: typing.Optional[str] = None,
         **kwargs,
@@ -1237,24 +1318,39 @@
             function_name
             and function_name
             != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
+            and function_name
+            != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.APPLICATION_CONTROLLER
         ):
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
                 project=project,
                 kind=kind
                 if function_name is None
-                else f"{kind}-{function_name.lower()}",
+                else f"{kind}-{function_name.lower()}-v1",
             )
-        elif
+        elif (
+            kind == "stream"
+            and function_name
+            != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.APPLICATION_CONTROLLER
+        ):
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
                 project=project,
-                kind=kind,
+                kind=f"{kind}-v1",
             )
-
+        elif (
+            function_name
+            == mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.APPLICATION_CONTROLLER
+            and kind == "stream"
+        ):
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
                 project=project,
-                kind=kind,
+                kind=f"{kind}-{function_name.lower()}-v1",
             )
 
+        return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
+            project=project,
+            kind=kind,
+        )
+
         # Get the current offline path from the configuration
         file_path = mlrun.mlconf.model_endpoint_monitoring.offline_storage_path.format(
             project=project, kind=kind
@@ -1282,35 +1378,6 @@
             ver in mlrun.mlconf.ce.mode for ver in ["lite", "full"]
         )
 
-    def get_s3_storage_options(self) -> dict[str, typing.Any]:
-        """
-        Generate storage options dictionary as required for handling S3 path in fsspec. The model monitoring stream
-        graph uses this method for generating the storage options for S3 parquet target path.
-        :return: A storage options dictionary in which each key-value pair represents a particular configuration,
-                 such as endpoint_url or aws access key.
-        """
-        key = mlrun.get_secret_or_env("AWS_ACCESS_KEY_ID")
-        secret = mlrun.get_secret_or_env("AWS_SECRET_ACCESS_KEY")
-
-        force_non_anonymous = mlrun.get_secret_or_env("S3_NON_ANONYMOUS")
-        profile = mlrun.get_secret_or_env("AWS_PROFILE")
-
-        storage_options = dict(
-            anon=not (force_non_anonymous or (key and secret)),
-            key=key,
-            secret=secret,
-        )
-
-        endpoint_url = mlrun.get_secret_or_env("S3_ENDPOINT_URL")
-        if endpoint_url:
-            client_kwargs = {"endpoint_url": endpoint_url}
-            storage_options["client_kwargs"] = client_kwargs
-
-        if profile:
-            storage_options["profile"] = profile
-
-        return storage_options
-
     def is_explicit_ack_enabled(self) -> bool:
         return self.httpdb.nuclio.explicit_ack == "enabled" and (
             not self.nuclio_version
@@ -1381,9 +1448,12 @@ def _validate_config(config):
         pass
 
     config.verify_security_context_enrichment_mode_is_allowed()
+    config.validate_object_retentions()
 
 
-def _verify_gpu_requests_and_limits(requests_gpu: str = None, limits_gpu: str = None):
+def _verify_gpu_requests_and_limits(
+    requests_gpu: typing.Optional[str] = None, limits_gpu: typing.Optional[str] = None
+):
     # https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/
     if requests_gpu and not limits_gpu:
         raise mlrun.errors.MLRunConflictError(
@@ -1396,7 +1466,7 @@ def _verify_gpu_requests_and_limits(requests_gpu: str = None, limits_gpu: str =
         )
 
 
-def _convert_resources_to_str(config: dict = None):
+def _convert_resources_to_str(config: typing.Optional[dict] = None):
     resources_types = ["cpu", "memory", "gpu"]
     resource_requirements = ["requests", "limits"]
     if not config.get("default_function_pod_resources"):
@@ -1414,17 +1484,6 @@ def _convert_resources_to_str(config: dict = None):
                 resource_requirement[resource_type] = str(value)
 
 
-def _convert_str(value, typ):
-    if typ in (str, _none_type):
-        return value
-
-    if typ is bool:
-        return strtobool(value)
-
-    # e.g. int('8080') → 8080
-    return typ(value)
-
-
 def _configure_ssl_verification(verify_ssl: bool) -> None:
     """Configure SSL verification warnings based on the setting."""
     if not verify_ssl:
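
The dropped _convert_str helper leaned on distutils.util.strtobool, and distutils was removed from the standard library in Python 3.12 (its import is also deleted at the top of this file). If the old conversion is still needed elsewhere, an equivalent is a few lines; a sketch, not part of mlrun, returning bool rather than distutils' 0/1:

def strtobool(value: str) -> bool:
    # same accepted spellings as the removed distutils.util.strtobool
    value = value.strip().lower()
    if value in ("y", "yes", "t", "true", "on", "1"):
        return True
    if value in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")
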
@@ -1532,6 +1591,15 @@ def read_env(env=None, prefix=env_prefix):
     # The default function pod resource values are of type str; however, when reading from environment variable numbers,
     # it converts them to type int if contains only number, so we want to convert them to str.
     _convert_resources_to_str(config)
+
+    # If the environment variable MLRUN_HTTPDB__HTTP__VERIFY is set, we ensure SSL verification settings take precedence
+    # by moving the 'httpdb' configuration to the beginning of the config dictionary.
+    # This ensures that SSL verification is applied before other settings.
+    if "MLRUN_HTTPDB__HTTP__VERIFY" in env:
+        httpdb = config.pop("httpdb", None)
+        if httpdb:
+            config = {"httpdb": httpdb, **config}
+
     return config
 
 
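
The read_env addition only reorders the parsed overrides; the override syntax itself is unchanged — an MLRUN_-prefixed variable with double underscores as the nesting separator. A sketch:

import os

# "__" maps to nesting: this targets config.httpdb.http.verify
os.environ["MLRUN_HTTPDB__HTTP__VERIFY"] = "false"

import mlrun  # on import, read_env() now applies the httpdb subtree first
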
mlrun/data_types/__init__.py
CHANGED
@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
 
 from .data_types import (
     InferOptions,
@@ -29,8 +27,12 @@ class BaseDataInfer:
     get_stats = None
 
 
+def is_spark_dataframe(df) -> bool:
+    return "rdd" in dir(df)
+
+
 def get_infer_interface(df) -> BaseDataInfer:
-    if
+    if is_spark_dataframe(df):
         from .spark import SparkDataInfer
 
         return SparkDataInfer
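
The rewritten dispatch relies on duck typing: is_spark_dataframe probes for an rdd attribute instead of importing pyspark, so pandas-only environments never pay the Spark import cost. A standalone sketch of the same check:

import pandas as pd

def is_spark_dataframe(df) -> bool:
    # Spark DataFrames expose an `rdd` attribute; pandas frames do not
    return "rdd" in dir(df)

assert not is_spark_dataframe(pd.DataFrame({"a": [1]}))
# a pyspark.sql.DataFrame would return True -- without importing pyspark here
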
mlrun/data_types/infer.py
CHANGED
@@ -12,12 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from typing import Optional
+
 import numpy as np
 import packaging.version
 import pandas as pd
 import pyarrow
 from pandas.io.json._table_schema import convert_pandas_type_to_json_field
 
+import mlrun.features
+from mlrun.model import ObjectList
 from mlrun.utils import logger
 
 from .data_types import InferOptions, pa_type_to_value_type, pd_schema_to_value_type
@@ -27,17 +31,19 @@ default_num_bins = 20
 
 def infer_schema_from_df(
     df: pd.DataFrame,
-    features,
+    features: ObjectList,
     entities,
-    timestamp_key: str = None,
+    timestamp_key: Optional[str] = None,
     entity_columns=None,
     options: InferOptions = InferOptions.Null,
+    push_at_start: Optional[bool] = False,
 ):
     """infer feature set schema from dataframe"""
     timestamp_fields = []
     current_entities = list(entities.keys())
     entity_columns = entity_columns or []
     index_columns = dict()
+    temp_features = ObjectList(mlrun.features.Feature)
 
     def upsert_entity(name, value_type):
         if name in current_entities:
@@ -72,10 +78,14 @@
         if column in features.keys():
             features[column].value_type = value_type
         else:
-
+            temp_features[column] = {"name": column, "value_type": value_type}
         if value_type == "datetime" and not is_entity:
             timestamp_fields.append(column)
 
+    features.update_list(
+        object_list=temp_features, push_at_start=push_at_start
+    )  # Push to start of the Object list
+
     index_type = None
     if InferOptions.get_common_options(options, InferOptions.Index):
         # infer types of index fields
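
infer_schema_from_df now accumulates newly discovered columns in a temporary ObjectList and merges them through features.update_list(...), so the new push_at_start flag controls whether inferred features are prepended or appended. A hedged usage sketch (ingestion normally calls this internally; `fset` and `df` are assumed to be an existing FeatureSet and pandas DataFrame):

from mlrun.data_types.infer import infer_schema_from_df

# Assumption: `fset` is an mlrun FeatureSet and `df` a pandas DataFrame.
infer_schema_from_df(
    df,
    fset.spec.features,
    fset.spec.entities,
    push_at_start=True,  # new in 1.8.0: prepend inferred features to the list
)
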
mlrun/data_types/spark.py
CHANGED
@@ -14,6 +14,7 @@
 #
 from datetime import datetime
 from os import environ
+from typing import Optional
 
 import numpy as np
 import pytz
@@ -35,7 +36,7 @@ def infer_schema_from_df_spark(
     df,
     features,
     entities,
-    timestamp_key: str = None,
+    timestamp_key: Optional[str] = None,
     entity_columns=None,
     options: InferOptions = InferOptions.Null,
 ):
mlrun/datastore/__init__.py
CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
-
 __all__ = [
     "DataItem",
     "get_store_resource",
@@ -32,11 +30,21 @@ __all__ = [
     "DatabricksFileSystemDisableCache",
     "DatabricksFileBugFixed",
     "get_stream_pusher",
+    "ConfigProfile",
+    "VectorStoreCollection",
 ]
 
+from urllib.parse import urlparse
+
 import fsspec
+from mergedeep import merge
 
 import mlrun.datastore.wasbfs
+from mlrun.datastore.datastore_profile import (
+    DatastoreProfileKafkaSource,
+    DatastoreProfileKafkaTarget,
+    DatastoreProfileV3io,
+)
 from mlrun.platforms.iguazio import (
     HTTPOutputStream,
     KafkaOutputStream,
@@ -106,23 +114,56 @@ def get_stream_pusher(stream_path: str, **kwargs):
 
     :param stream_path: path/url of stream
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if stream_path.startswith("ds://"):
+        datastore_profile = mlrun.datastore.datastore_profile.datastore_profile_read(
+            stream_path
+        )
+        if isinstance(
+            datastore_profile,
+            (DatastoreProfileKafkaSource, DatastoreProfileKafkaTarget),
+        ):
+            attributes = datastore_profile.attributes()
+            brokers = attributes.pop("brokers", None)
+            # Override the topic with the one in the url (if any)
+            parsed_url = urlparse(stream_path)
+            topic = (
+                parsed_url.path.strip("/")
+                if parsed_url.path
+                else datastore_profile.get_topic()
+            )
+            producer_options = mlrun.datastore.utils.KafkaParameters(
+                attributes
+            ).producer()
+            return KafkaOutputStream(topic, brokers, producer_options=producer_options)
+
+        elif isinstance(datastore_profile, DatastoreProfileV3io):
+            parsed_url = urlparse(stream_path)
+            stream_path = datastore_profile.url(parsed_url.path)
+            endpoint, stream_path = parse_path(stream_path)
+            return OutputStream(stream_path, endpoint=endpoint, **kwargs)
+        else:
+            raise ValueError(
+                f"Unsupported datastore profile type: {type(datastore_profile)}"
+            )
     else:
-
+        kafka_brokers = get_kafka_brokers_from_dict(kwargs)
+        if stream_path.startswith("kafka://") or kafka_brokers:
+            topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
+            return KafkaOutputStream(
+                topic, brokers, kwargs.get("kafka_producer_options")
+            )
+        elif stream_path.startswith("http://") or stream_path.startswith("https://"):
+            return HTTPOutputStream(stream_path=stream_path)
+        elif "://" not in stream_path:
+            return OutputStream(stream_path, **kwargs)
+        elif stream_path.startswith("v3io"):
+            endpoint, stream_path = parse_path(stream_path)
+            endpoint = kwargs.pop("endpoint", None) or endpoint
+            return OutputStream(stream_path, endpoint=endpoint, **kwargs)
+        elif stream_path.startswith("dummy://"):
+            return _DummyStream(**kwargs)
+        else:
+            raise ValueError(f"unsupported stream path {stream_path}")
 
 
 class _DummyStream:
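
get_stream_pusher now resolves ds:// datastore-profile URLs ahead of the legacy kafka://, http(s)://, and v3io dispatch, and any path component of the ds:// URL overrides the profile's topic. A hedged sketch of the new path (the profile name, brokers, and topic below are illustrative, and the registration helper follows mlrun's datastore-profile API):

from mlrun.datastore import get_stream_pusher
from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaTarget,
    register_temporary_client_datastore_profile,
)

# Illustrative profile; field names follow those referenced in the diff
profile = DatastoreProfileKafkaTarget(
    name="my-kafka", brokers="localhost:9092", topic="monitoring"
)
register_temporary_client_datastore_profile(profile)

# "ds://<profile>[/<topic>]" -- the path part overrides the profile topic
pusher = get_stream_pusher("ds://my-kafka/alt-topic")
pusher.push({"event": "hello"})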