mlrun 1.5.0rc11__py3-none-any.whl → 1.5.0rc13__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release. This version of mlrun might be problematic.
- mlrun/__main__.py +31 -2
- mlrun/api/api/endpoints/functions.py +110 -52
- mlrun/api/api/endpoints/model_endpoints.py +0 -56
- mlrun/api/crud/model_monitoring/deployment.py +208 -38
- mlrun/api/crud/model_monitoring/helpers.py +19 -6
- mlrun/api/crud/model_monitoring/model_endpoints.py +14 -31
- mlrun/api/db/sqldb/db.py +3 -1
- mlrun/api/utils/builder.py +2 -4
- mlrun/common/model_monitoring/helpers.py +19 -5
- mlrun/common/schemas/model_monitoring/constants.py +69 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +22 -1
- mlrun/config.py +30 -12
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/datastore_profile.py +2 -2
- mlrun/datastore/sources.py +4 -30
- mlrun/datastore/targets.py +106 -55
- mlrun/db/httpdb.py +20 -6
- mlrun/feature_store/__init__.py +2 -0
- mlrun/feature_store/api.py +3 -31
- mlrun/feature_store/feature_vector.py +1 -1
- mlrun/feature_store/retrieval/base.py +8 -3
- mlrun/launcher/remote.py +3 -3
- mlrun/lists.py +11 -0
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +1 -1
- mlrun/model_monitoring/application.py +313 -0
- mlrun/model_monitoring/batch_application.py +526 -0
- mlrun/model_monitoring/batch_application_handler.py +32 -0
- mlrun/model_monitoring/evidently_application.py +89 -0
- mlrun/model_monitoring/helpers.py +39 -3
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +38 -7
- mlrun/model_monitoring/tracking_policy.py +4 -4
- mlrun/model_monitoring/writer.py +37 -0
- mlrun/projects/pipelines.py +38 -4
- mlrun/projects/project.py +257 -43
- mlrun/run.py +5 -2
- mlrun/runtimes/__init__.py +2 -0
- mlrun/runtimes/function.py +2 -1
- mlrun/utils/helpers.py +12 -0
- mlrun/utils/http.py +3 -0
- mlrun/utils/notifications/notification_pusher.py +22 -8
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/METADATA +5 -5
- {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/RECORD +49 -44
- /mlrun/model_monitoring/{model_monitoring_batch.py → batch.py} +0 -0
- {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/LICENSE +0 -0
- {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/WHEEL +0 -0
- {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/entry_points.txt +0 -0
- {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/top_level.txt +0 -0

mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -77,6 +77,30 @@ class EventFieldType:
     DRIFT_DETECTED_THRESHOLD = "drift_detected_threshold"
     POSSIBLE_DRIFT_THRESHOLD = "possible_drift_threshold"
 
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+
+
+class ApplicationEvent:
+    APPLICATION_NAME = "application_name"
+    CURRENT_STATS = "current_stats"
+    FEATURE_STATS = "feature_stats"
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    SCHEDULE_TIME = "schedule_time"
+    LAST_REQUEST = "last_request"
+    ENDPOINT_ID = "endpoint_id"
+    OUTPUT_STREAM_URI = "output_stream_uri"
+
+
+class WriterEvent:
+    APPLICATION_NAME = "application_name"
+    ENDPOINT_ID = "endpoint_id"
+    SCHEDULE_TIME = "schedule_time"
+    RESULT_NAME = "result_name"
+    RESULT_VALUE = "result_value"
+    RESULT_KIND = "result_kind"
+    RESULT_STATUS = "result_status"
+    RESULT_EXTRA_DATA = "result_extra_data"
+
 
 class EventLiveStats:
     LATENCY_AVG_5M = "latency_avg_5m"
@@ -106,6 +130,7 @@ class ModelEndpointTarget:
 class ProjectSecretKeys:
     ENDPOINT_STORE_CONNECTION = "MODEL_MONITORING_ENDPOINT_STORE_CONNECTION"
     ACCESS_KEY = "MODEL_MONITORING_ACCESS_KEY"
+    PIPELINES_ACCESS_KEY = "MODEL_MONITORING_PIPELINES_ACCESS_KEY"
     KAFKA_BOOTSTRAP_SERVERS = "KAFKA_BOOTSTRAP_SERVERS"
     STREAM_PATH = "STREAM_PATH"
 
@@ -120,6 +145,7 @@ class FileTargetKind:
     EVENTS = "events"
     STREAM = "stream"
     PARQUET = "parquet"
+    BATCH_CONTROLLER_PARQUET = "batch_controller_parquet"
     LOG_STREAM = "log_stream"
 
 
@@ -143,6 +169,22 @@ class PrometheusMetric:
     DRIFT_STATUS = "drift_status"
 
 
+class MonitoringFunctionNames:
+    WRITER = "model-monitoring-writer"
+    BATCH = "model-monitoring-batch"
+    BATCH_APPLICATION = "model-monitoring-batch-application"
+    STREAM = None
+
+    @staticmethod
+    def all():
+        return [
+            MonitoringFunctionNames.WRITER,
+            MonitoringFunctionNames.STREAM,
+            MonitoringFunctionNames.BATCH,
+            MonitoringFunctionNames.BATCH_APPLICATION,
+        ]
+
+
 @dataclass
 class FunctionURI:
     project: str
@@ -208,3 +250,30 @@ class DriftStatus(Enum):
     NO_DRIFT = "NO_DRIFT"
     DRIFT_DETECTED = "DRIFT_DETECTED"
     POSSIBLE_DRIFT = "POSSIBLE_DRIFT"
+
+
+class ResultKindApp(enum.Enum):
+    """
+    Enum for the result kind values
+    """
+
+    data_drift = 0
+    concept_drift = 1
+    model_performance = 2
+    system_performance = 3
+
+
+class ResultStatusApp(enum.Enum):
+    """
+    Enum for the result status values, detected means that the app detected some problem.
+    """
+
+    irrelevant = -1
+    no_detection = 0
+    potential_detection = 1
+    detected = 2
+
+
+class ModelMonitoringAppTag:
+    KEY = "type"
+    VAL = "model-monitoring-application"
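
To make the new constants concrete, here is a hedged sketch of how a monitoring application might assemble a result event for the writer using the classes added above. The field and enum names come from the diff; the payload shape, application name, and endpoint id are illustrative assumptions.

from mlrun.common.schemas.model_monitoring.constants import (
    MonitoringFunctionNames,
    ResultKindApp,
    ResultStatusApp,
    WriterEvent,
)

# Hypothetical payload keyed by the new WriterEvent field names.
event = {
    WriterEvent.APPLICATION_NAME: "my-drift-app",      # assumed app name
    WriterEvent.ENDPOINT_ID: "endpoint-1234",          # assumed endpoint id
    WriterEvent.RESULT_NAME: "data_drift_score",
    WriterEvent.RESULT_VALUE: 0.81,
    WriterEvent.RESULT_KIND: ResultKindApp.data_drift.value,
    WriterEvent.RESULT_STATUS: ResultStatusApp.detected.value,
}

# The new helper lists every monitoring function name (STREAM is None by design).
print(MonitoringFunctionNames.all())
print(event)
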
mlrun/common/schemas/model_monitoring/model_endpoints.py CHANGED
@@ -18,7 +18,7 @@ import json
 import typing
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from pydantic.main import Extra
 
 import mlrun.common.model_monitoring
@@ -100,6 +100,27 @@ class ModelEndpointSpec(ObjectSpec):
             json_parse_values=json_parse_values,
         )
 
+    @validator("monitor_configuration")
+    def set_name(cls, monitor_configuration):
+        return monitor_configuration or {
+            EventFieldType.DRIFT_DETECTED_THRESHOLD: (
+                mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.drift_detected
+            ),
+            EventFieldType.POSSIBLE_DRIFT_THRESHOLD: (
+                mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.possible_drift
+            ),
+        }
+
+    @validator("model_uri")
+    def validate_model_uri(cls, model_uri):
+        """Validate that the model uri includes the required prefix"""
+        prefix, uri = mlrun.datastore.parse_store_uri(model_uri)
+        if prefix and prefix != mlrun.utils.helpers.StorePrefix.Model:
+            return mlrun.datastore.get_store_uri(
+                mlrun.utils.helpers.StorePrefix.Model, uri
+            )
+        return model_uri
+
 
 class Histogram(BaseModel):
     buckets: List[float]
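
The two validators above follow a standard pydantic v1 pattern: fall back to defaults for an empty value and normalize a field in place. A minimal, self-contained sketch (not mlrun's actual class; the thresholds and prefix used here are assumptions) behaves the same way:

from typing import Optional

from pydantic import BaseModel, validator

ASSUMED_DEFAULTS = {"drift_detected_threshold": 0.7, "possible_drift_threshold": 0.5}


class SpecSketch(BaseModel):
    monitor_configuration: Optional[dict] = None
    model_uri: str = ""

    @validator("monitor_configuration")
    def default_thresholds(cls, value):
        # Empty dict / None -> fall back to the assumed default thresholds.
        return value or dict(ASSUMED_DEFAULTS)

    @validator("model_uri")
    def ensure_store_prefix(cls, value):
        # Prepend a model store prefix when it is missing (simplified stand-in
        # for parse_store_uri / get_store_uri).
        return value if value.startswith("store://") else f"store://models/{value}"


spec = SpecSketch(monitor_configuration={}, model_uri="proj/my-model")
print(spec.monitor_configuration)  # the assumed default thresholds
print(spec.model_uri)              # store://models/proj/my-model
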
mlrun/config.py CHANGED
@@ -403,6 +403,7 @@ default_config = {
     },
     "model_endpoint_monitoring": {
         "serving_stream_args": {"shard_count": 1, "retention_period_hours": 24},
+        "application_stream_args": {"shard_count": 3, "retention_period_hours": 24},
         "drift_thresholds": {"default": {"possible_drift": 0.5, "drift_detected": 0.7}},
         # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
         # stream, and endpoints.
@@ -417,6 +418,7 @@ default_config = {
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
         "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10000,
         "parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
@@ -981,20 +983,22 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
+        application_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
-        :param project:
-        :param kind:
-        :param target:
-
-
-
-
-
-
-        :param artifact_path:
-
+        :param project:          Project name.
+        :param kind:             Kind of target path (e.g. events, log_stream, endpoints, etc.)
+        :param target:           Can be either online or offline. If the target is online, then we try to get a specific
+                                 path for the provided kind. If it doesn't exist, use the default path.
+                                 If the target path is offline and the offline path is already a full path in the
+                                 configuration, then the result will be that path as-is. If the offline path is a
+                                 relative path, then the result will be based on the project artifact path and the
+                                 offline relative path. If project artifact path wasn't provided, then we use MLRun
+                                 artifact path instead.
+        :param artifact_path:    Optional artifact path that will be used as a relative path. If not provided, the
+                                 relative artifact path will be taken from the global MLRun artifact path.
+        :param application_name: Application name, None for model_monitoring_stream.
 
         :return: Full configured path for the provided kind.
         """
@@ -1006,8 +1010,22 @@ class Config:
         if store_prefix_dict.get(kind):
             # Target exist in store prefix and has a valid string value
             return store_prefix_dict[kind].format(project=project)
+
+        if (
+            application_name
+            != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
+        ):
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project,
+                kind=kind
+                if application_name is None
+                else f"{kind}-{application_name.lower()}",
+            )
         return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-            project=project,
+            project=project,
+            kind=kind
+            if application_name is None
+            else f"{kind}-{application_name.lower()}",
        )
 
        # Get the current offline path from the configuration
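
The effect of the new application_name parameter is easiest to see on the path template itself: when an application name is given, the kind is suffixed with the lower-cased name before the store prefix is formatted. A standalone sketch of that rule follows; the template string here is an assumption for illustration, not mlrun's actual default.

# Assumed store-prefix template, for illustration only.
USER_SPACE_TEMPLATE = "v3io:///projects/{project}/model-endpoints/{kind}"


def resolve_kind(kind: str, application_name: str = None) -> str:
    # Mirrors the kind-suffix rule introduced above.
    return kind if application_name is None else f"{kind}-{application_name.lower()}"


print(USER_SPACE_TEMPLATE.format(project="demo", kind=resolve_kind("parquet", None)))
# v3io:///projects/demo/model-endpoints/parquet
print(USER_SPACE_TEMPLATE.format(project="demo", kind=resolve_kind("parquet", "MyApp")))
# v3io:///projects/demo/model-endpoints/parquet-myapp
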
mlrun/datastore/__init__.py CHANGED

mlrun/datastore/datastore_profile.py CHANGED
@@ -37,9 +37,9 @@ class DatastoreProfile(pydantic.BaseModel):
 
     @staticmethod
     def generate_secret_key(profile_name: str, project: str):
-        secret_name_separator = "
+        secret_name_separator = "."
         full_key = (
-            "
+            "datastore-profiles"
             + secret_name_separator
             + project
             + secret_name_separator
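
The key layout implied by the change above, sketched in isolation; the trailing component (presumably the profile name) is not visible in this hunk, so it is an assumption here.

def generate_secret_key_sketch(profile_name: str, project: str) -> str:
    # Parts are now joined with "." and prefixed with "datastore-profiles".
    separator = "."
    return separator.join(["datastore-profiles", project, profile_name])


print(generate_secret_key_sketch("my-profile", "my-project"))
# datastore-profiles.my-project.my-profile
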
mlrun/datastore/sources.py CHANGED
@@ -138,7 +138,6 @@ class CSVSource(BaseSourceDriver):
     :parameter path: path to CSV file
     :parameter key_field: the CSV field to be used as the key for events. May be an int (field index) or string
         (field name) if with_header is True. Defaults to None (no key). Can be a list of keys.
-    :parameter time_field: DEPRECATED. Use parse_dates to parse timestamps.
     :parameter schedule: string to configure scheduling of the ingestion job.
     :parameter attributes: additional parameters to pass to storey. For example:
         attributes={"timestamp_format": '%Y%m%d%H'}
@@ -156,29 +155,13 @@ class CSVSource(BaseSourceDriver):
         path: str = None,
         attributes: Dict[str, str] = None,
         key_field: str = None,
-        time_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, List[int], List[str]] = None,
         **kwargs,
     ):
-        super().__init__(
-
-
-        if time_field is not None:
-            warnings.warn(
-                "CSVSource's time_field parameter is deprecated in 1.3.0 and will be removed in 1.5.0. "
-                "Use parse_dates instead.",
-                # TODO: remove in 1.5.0
-                FutureWarning,
-            )
-        if isinstance(parse_dates, (int, str)):
-            parse_dates = [parse_dates]
-
-        if parse_dates is None:
-            parse_dates = [time_field]
-        elif time_field not in parse_dates:
-            parse_dates = copy(parse_dates)
-            parse_dates.append(time_field)
+        super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
+        if parse_dates and not isinstance(parse_dates, list):
+            parse_dates = [parse_dates]
         self._parse_dates = parse_dates
 
     def to_step(self, key_field=None, time_field=None, context=None):
@@ -724,16 +707,7 @@ class DataFrameSource:
 
     support_storey = True
 
-    def __init__(
-        self, df, key_field=None, time_field=None, context=None, iterator=False
-    ):
-        if time_field:
-            warnings.warn(
-                "DataFrameSource's time_field parameter has no effect. "
-                "It is deprecated in 1.3.0 and will be removed in 1.5.0",
-                FutureWarning,
-            )
-
+    def __init__(self, df, key_field=None, context=None, iterator=False):
         self._df = df
         if isinstance(key_field, str):
             self.key_field = [key_field]
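
With time_field removed from CSVSource and DataFrameSource, timestamp parsing is requested through parse_dates, which the new constructor wraps into a list when a single column is given. A hedged usage sketch (file and column names are illustrative):

from mlrun.datastore.sources import CSVSource

# parse_dates replaces the removed time_field parameter; a single column name
# is normalized internally to ["timestamp"].
source = CSVSource(
    name="events",
    path="./events.csv",
    key_field="user_id",
    parse_dates="timestamp",
)
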
mlrun/datastore/targets.py CHANGED
@@ -484,6 +484,7 @@ class BaseStoreTarget(DataTargetBase):
         if hasattr(df, "rdd"):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
+            df = self.prepare_spark_df(df, key_column, timestamp_key, options)
             df.write.mode("overwrite").save(**options)
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
@@ -513,36 +514,41 @@
         dir = os.path.dirname(target_path)
         if dir:
             os.makedirs(dir, exist_ok=True)
-        partition_cols = []
-        if target_path.endswith(".parquet") or target_path.endswith(".pq"):
-            partition_cols = None
         target_df = df
-
-
-
-
-
-        if
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-
+        partition_cols = None  # single parquet file
+        if not target_path.endswith(".parquet") and not target_path.endswith(
+            ".pq"
+        ):  # directory
+            partition_cols = []
+            if timestamp_key and (
+                self.partitioned or self.time_partitioning_granularity
+            ):
+                target_df = df.copy(deep=False)
+                time_partitioning_granularity = self.time_partitioning_granularity
+                if not time_partitioning_granularity and self.partitioned:
+                    time_partitioning_granularity = (
+                        mlrun.utils.helpers.DEFAULT_TIME_PARTITIONING_GRANULARITY
+                    )
+                for unit, fmt in [
+                    ("year", "%Y"),
+                    ("month", "%m"),
+                    ("day", "%d"),
+                    ("hour", "%H"),
+                    ("minute", "%M"),
+                ]:
+                    partition_cols.append(unit)
+                    target_df[unit] = pd.DatetimeIndex(
+                        target_df[timestamp_key]
+                    ).format(date_format=fmt)
+                    if unit == time_partitioning_granularity:
+                        break
+            # Partitioning will be performed on timestamp_key and then on self.partition_cols
+            # (We might want to give the user control on this order as additional functionality)
+            partition_cols += self.partition_cols or []
         storage_options = self._get_store().get_storage_options()
         self._write_dataframe(
             target_df,
-            storage_options,
+            self.storage_options or storage_options,
             target_path,
             partition_cols=partition_cols,
             **kwargs,
@@ -690,7 +696,7 @@ class BaseStoreTarget(DataTargetBase):
         # options used in spark.read.load(**options)
         raise NotImplementedError()
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
         return df
 
     def get_dask_options(self):
@@ -924,6 +930,37 @@ class ParquetTarget(BaseStoreTarget):
             return self.path.endswith(".parquet") or self.path.endswith(".pq")
         return False
 
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
+        # If partitioning by time, add the necessary columns
+        if (
+            timestamp_key
+            and isinstance(spark_options, dict)
+            and "partitionBy" in spark_options
+        ):
+            from pyspark.sql.functions import (
+                dayofmonth,
+                hour,
+                minute,
+                month,
+                second,
+                year,
+            )
+
+            time_unit_to_op = {
+                "year": year,
+                "month": month,
+                "day": dayofmonth,
+                "hour": hour,
+                "minute": minute,
+                "second": second,
+            }
+            timestamp_col = df[timestamp_key]
+            for partition in spark_options["partitionBy"]:
+                if partition not in df.columns and partition in time_unit_to_op:
+                    op = time_unit_to_op[partition]
+                    df = df.withColumn(partition, op(timestamp_col))
+        return df
+
 
 class CSVTarget(BaseStoreTarget):
     kind = TargetTypes.csv
@@ -973,7 +1010,7 @@ class CSVTarget(BaseStoreTarget):
             "header": "true",
         }
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
 
         for col_name, col_type in df.dtypes:
@@ -1067,7 +1104,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
@@ -1139,7 +1176,7 @@ class NoSqlTarget(NoSqlBaseTarget):
         spark_options["columnUpdate"] = True
         return spark_options
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col
 
         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
@@ -1232,7 +1269,7 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         endpoint, uri = self._get_server_endpoint()
         return endpoint
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col
 
         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
@@ -1580,16 +1617,6 @@ class SQLTarget(BaseStoreTarget):
         :param parse_dates : all the field to be parsed as timestamp.
         """
 
-        # Validate sqlalchemy (not installed by default):
-        try:
-            import sqlalchemy
-
-            self.sqlalchemy = sqlalchemy
-        except (ModuleNotFoundError, ImportError) as exc:
-            raise mlrun.errors.MLRunMissingDependencyError(
-                "Using 'SQLTarget' requires sqlalchemy package. Use pip install mlrun[sqlalchemy] to install it."
-            ) from exc
-
         create_according_to_data = False  # TODO: open for user
         if time_fields:
             warnings.warn(
@@ -1696,8 +1723,14 @@
         time_column=None,
         **kwargs,
     ):
+        try:
+            import sqlalchemy
+
+        except (ModuleNotFoundError, ImportError) as exc:
+            self._raise_sqlalchemy_import_error(exc)
+
         db_path, table_name, _, _, _, _ = self._parse_url()
-        engine =
+        engine = sqlalchemy.create_engine(db_path)
         parse_dates: Optional[List[str]] = self.attributes.get("parse_dates")
         with engine.connect() as conn:
             query, parse_dates = _generate_sql_query_with_time_filter(
@@ -1721,6 +1754,12 @@
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
+        try:
+            import sqlalchemy
+
+        except (ModuleNotFoundError, ImportError) as exc:
+            self._raise_sqlalchemy_import_error(exc)
+
         self._create_sql_table()
 
         if hasattr(df, "rdd"):
@@ -1735,7 +1774,7 @@
             _,
         ) = self._parse_url()
         create_according_to_data = bool(create_according_to_data)
-        engine =
+        engine = sqlalchemy.create_engine(
            db_path,
        )
        connection = engine.connect()
@@ -1760,28 +1799,34 @@
            primary_key,
            create_table,
        ) = self._parse_url()
+        try:
+            import sqlalchemy
+
+        except (ModuleNotFoundError, ImportError) as exc:
+            self._raise_sqlalchemy_import_error(exc)
+
        try:
            primary_key = ast.literal_eval(primary_key)
            primary_key_for_check = primary_key
        except Exception:
            primary_key_for_check = [primary_key]
-        engine =
+        engine = sqlalchemy.create_engine(db_path)
        with engine.connect() as conn:
-            metadata =
+            metadata = sqlalchemy.MetaData()
            table_exists = engine.dialect.has_table(conn, table_name)
            if not table_exists and not create_table:
                raise ValueError(f"Table named {table_name} is not exist")
 
            elif not table_exists and create_table:
                TYPE_TO_SQL_TYPE = {
-                    int:
-                    str:
-                    datetime.datetime:
-                    pd.Timestamp:
-                    bool:
-                    float:
-                    datetime.timedelta:
-                    pd.Timedelta:
+                    int: sqlalchemy.Integer,
+                    str: sqlalchemy.String(self.attributes.get("varchar_len")),
+                    datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
+                    pd.Timestamp: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
+                    bool: sqlalchemy.Boolean,
+                    float: sqlalchemy.Float,
+                    datetime.timedelta: sqlalchemy.Interval,
+                    pd.Timedelta: sqlalchemy.Interval,
                }
                # creat new table with the given name
                columns = []
@@ -1790,12 +1835,12 @@
                    if col_type is None:
                        raise TypeError(f"{col_type} unsupported type")
                    columns.append(
-
+                        sqlalchemy.Column(
                            col, col_type, primary_key=(col in primary_key_for_check)
                        )
                    )
 
-
+                sqlalchemy.Table(table_name, metadata, *columns)
                metadata.create_all(engine)
                if_exists = "append"
                self.path = (
@@ -1804,6 +1849,12 @@
        )
        conn.close()
 
+    @staticmethod
+    def _raise_sqlalchemy_import_error(exc):
+        raise mlrun.errors.MLRunMissingDependencyError(
+            "Using 'SQLTarget' requires sqlalchemy package. Use pip install mlrun[sqlalchemy] to install it."
+        ) from exc
+
 
 kind_to_driver = {
     TargetTypes.parquet: ParquetTarget,
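
The ParquetTarget.prepare_spark_df addition derives missing time-partition columns from the timestamp column. The same technique in a standalone sketch (assumes an existing Spark dataframe with a "timestamp" column; not mlrun code):

from pyspark.sql import functions as F


def add_time_partitions(df, timestamp_key, partition_by):
    # Map requested time units to the matching pyspark column functions.
    ops = {
        "year": F.year,
        "month": F.month,
        "day": F.dayofmonth,
        "hour": F.hour,
        "minute": F.minute,
        "second": F.second,
    }
    for unit in partition_by:
        if unit in ops and unit not in df.columns:
            df = df.withColumn(unit, ops[unit](F.col(timestamp_key)))
    return df


# df = add_time_partitions(df, "timestamp", ["year", "month", "day"])
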
mlrun/db/httpdb.py CHANGED
@@ -18,6 +18,7 @@ import tempfile
 import time
 import traceback
 import typing
+import warnings
 from datetime import datetime, timedelta
 from os import path, remove
 from typing import Dict, List, Optional, Union
@@ -1411,6 +1412,8 @@ class HTTPRunDB(RunDBInterface):
         namespace=None,
         artifact_path=None,
         ops=None,
+        # TODO: deprecated, remove in 1.6.0
+        ttl=None,
         cleanup_ttl=None,
     ):
         """Submit a KFP pipeline for execution.
@@ -1423,9 +1426,18 @@ class HTTPRunDB(RunDBInterface):
         :param namespace: Kubernetes namespace to execute the pipeline in.
         :param artifact_path: A path to artifacts used by this pipeline.
         :param ops: Transformers to apply on all ops in the pipeline.
+        :param ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the workflow
+                    and all its resources are deleted) (deprecated, use cleanup_ttl instead)
         :param cleanup_ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the
                             workflow and all its resources are deleted)
         """
+        if ttl:
+            warnings.warn(
+                "'ttl' is deprecated, use 'cleanup_ttl' instead. "
+                "This will be removed in 1.6.0",
+                # TODO: Remove this in 1.6.0
+                FutureWarning,
+            )
 
         if isinstance(pipeline, str):
             pipe_file = pipeline
@@ -1433,7 +1445,7 @@ class HTTPRunDB(RunDBInterface):
             pipe_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=False).name
         conf = new_pipe_metadata(
             artifact_path=artifact_path,
-            cleanup_ttl=cleanup_ttl,
+            cleanup_ttl=cleanup_ttl or ttl,
             op_transformers=ops,
         )
         kfp.compiler.Compiler().compile(
@@ -1471,15 +1483,17 @@ class HTTPRunDB(RunDBInterface):
                 headers=headers,
             )
         except OSError as err:
-            logger.error(
-            raise OSError(f"
+            logger.error("Error: Cannot submit pipeline", err=err_to_str(err))
+            raise OSError(f"Error: Cannot submit pipeline, {err_to_str(err)}")
 
         if not resp.ok:
-            logger.error(
-            raise ValueError(f"
+            logger.error("Failed to submit pipeline", respones_text=resp.text)
+            raise ValueError(f"Failed to submit pipeline, {resp.text}")
 
         resp = resp.json()
-        logger.info(
+        logger.info(
+            "Pipeline submitted successfully", pipeline_name=resp["name"], id=resp["id"]
+        )
         return resp["id"]
 
     def list_pipelines(
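
The ttl handling above is a common deprecation shim: the legacy keyword still works but emits a FutureWarning and is folded into the new one. The pattern in isolation (a sketch, not the mlrun method):

import warnings


def submit_sketch(cleanup_ttl=None, ttl=None):
    if ttl:
        warnings.warn(
            "'ttl' is deprecated, use 'cleanup_ttl' instead. "
            "This will be removed in 1.6.0",
            FutureWarning,
        )
    # The legacy value is only used when the new keyword was not provided.
    return cleanup_ttl or ttl


print(submit_sketch(ttl=3600))          # warns, returns 3600
print(submit_sketch(cleanup_ttl=1800))  # no warning, returns 1800
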
mlrun/feature_store/__init__.py CHANGED
@@ -20,6 +20,7 @@ __all__ = [
     "ingest",
     "preview",
     "deploy_ingestion_service",
+    "deploy_ingestion_service_v2",
     "delete_feature_set",
     "delete_feature_vector",
     "get_feature_set",
@@ -41,6 +42,7 @@ from .api import (
     delete_feature_set,
     delete_feature_vector,
     deploy_ingestion_service,
+    deploy_ingestion_service_v2,
     get_feature_set,
     get_feature_vector,
     get_offline_features,