mlrun 1.5.0rc12__py3-none-any.whl → 1.5.0rc13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__main__.py +31 -2
- mlrun/api/api/endpoints/functions.py +110 -52
- mlrun/api/crud/model_monitoring/deployment.py +208 -38
- mlrun/api/crud/model_monitoring/helpers.py +19 -6
- mlrun/api/crud/model_monitoring/model_endpoints.py +14 -1
- mlrun/api/db/sqldb/db.py +3 -1
- mlrun/api/utils/builder.py +2 -4
- mlrun/common/model_monitoring/helpers.py +19 -5
- mlrun/common/schemas/model_monitoring/constants.py +69 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +10 -0
- mlrun/config.py +30 -12
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/sources.py +4 -30
- mlrun/datastore/targets.py +68 -31
- mlrun/db/httpdb.py +20 -6
- mlrun/feature_store/api.py +3 -31
- mlrun/feature_store/feature_vector.py +1 -1
- mlrun/feature_store/retrieval/base.py +8 -3
- mlrun/launcher/remote.py +3 -3
- mlrun/lists.py +11 -0
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +1 -1
- mlrun/model_monitoring/application.py +313 -0
- mlrun/model_monitoring/batch_application.py +526 -0
- mlrun/model_monitoring/batch_application_handler.py +32 -0
- mlrun/model_monitoring/evidently_application.py +89 -0
- mlrun/model_monitoring/helpers.py +39 -3
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +37 -0
- mlrun/model_monitoring/tracking_policy.py +4 -4
- mlrun/model_monitoring/writer.py +37 -0
- mlrun/projects/pipelines.py +38 -4
- mlrun/projects/project.py +257 -43
- mlrun/run.py +5 -2
- mlrun/runtimes/__init__.py +2 -0
- mlrun/runtimes/function.py +2 -1
- mlrun/utils/helpers.py +12 -0
- mlrun/utils/http.py +3 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/METADATA +5 -5
- {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/RECORD +45 -40
- /mlrun/model_monitoring/{model_monitoring_batch.py → batch.py} +0 -0
- {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/LICENSE +0 -0
- {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/WHEEL +0 -0
- {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/entry_points.txt +0 -0
- {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/top_level.txt +0 -0
mlrun/common/schemas/model_monitoring/constants.py
CHANGED
@@ -77,6 +77,30 @@ class EventFieldType:
     DRIFT_DETECTED_THRESHOLD = "drift_detected_threshold"
     POSSIBLE_DRIFT_THRESHOLD = "possible_drift_threshold"
 
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+
+
+class ApplicationEvent:
+    APPLICATION_NAME = "application_name"
+    CURRENT_STATS = "current_stats"
+    FEATURE_STATS = "feature_stats"
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    SCHEDULE_TIME = "schedule_time"
+    LAST_REQUEST = "last_request"
+    ENDPOINT_ID = "endpoint_id"
+    OUTPUT_STREAM_URI = "output_stream_uri"
+
+
+class WriterEvent:
+    APPLICATION_NAME = "application_name"
+    ENDPOINT_ID = "endpoint_id"
+    SCHEDULE_TIME = "schedule_time"
+    RESULT_NAME = "result_name"
+    RESULT_VALUE = "result_value"
+    RESULT_KIND = "result_kind"
+    RESULT_STATUS = "result_status"
+    RESULT_EXTRA_DATA = "result_extra_data"
+
 
 class EventLiveStats:
     LATENCY_AVG_5M = "latency_avg_5m"
@@ -106,6 +130,7 @@ class ModelEndpointTarget:
 class ProjectSecretKeys:
     ENDPOINT_STORE_CONNECTION = "MODEL_MONITORING_ENDPOINT_STORE_CONNECTION"
     ACCESS_KEY = "MODEL_MONITORING_ACCESS_KEY"
+    PIPELINES_ACCESS_KEY = "MODEL_MONITORING_PIPELINES_ACCESS_KEY"
     KAFKA_BOOTSTRAP_SERVERS = "KAFKA_BOOTSTRAP_SERVERS"
     STREAM_PATH = "STREAM_PATH"
 
@@ -120,6 +145,7 @@ class FileTargetKind:
     EVENTS = "events"
     STREAM = "stream"
     PARQUET = "parquet"
+    BATCH_CONTROLLER_PARQUET = "batch_controller_parquet"
     LOG_STREAM = "log_stream"
 
 
@@ -143,6 +169,22 @@ class PrometheusMetric:
     DRIFT_STATUS = "drift_status"
 
 
+class MonitoringFunctionNames:
+    WRITER = "model-monitoring-writer"
+    BATCH = "model-monitoring-batch"
+    BATCH_APPLICATION = "model-monitoring-batch-application"
+    STREAM = None
+
+    @staticmethod
+    def all():
+        return [
+            MonitoringFunctionNames.WRITER,
+            MonitoringFunctionNames.STREAM,
+            MonitoringFunctionNames.BATCH,
+            MonitoringFunctionNames.BATCH_APPLICATION,
+        ]
+
+
 @dataclass
 class FunctionURI:
     project: str
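The new MonitoringFunctionNames class centralizes the names of the model-monitoring infrastructure functions (STREAM is None because the stream function keeps the project's default name). A minimal usage sketch, assuming the class is importable from the constants module shown in this diff:

    # Sketch: iterate over the monitoring infrastructure function names.
    from mlrun.common.schemas.model_monitoring.constants import MonitoringFunctionNames

    for name in MonitoringFunctionNames.all():
        # STREAM is None, so a caller would substitute the default stream name here
        print(name or "model-monitoring-stream")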
@@ -208,3 +250,30 @@ class DriftStatus(Enum):
     NO_DRIFT = "NO_DRIFT"
     DRIFT_DETECTED = "DRIFT_DETECTED"
     POSSIBLE_DRIFT = "POSSIBLE_DRIFT"
+
+
+class ResultKindApp(enum.Enum):
+    """
+    Enum for the result kind values
+    """
+
+    data_drift = 0
+    concept_drift = 1
+    model_performance = 2
+    system_performance = 3
+
+
+class ResultStatusApp(enum.Enum):
+    """
+    Enum for the result status values, detected means that the app detected some problem.
+    """
+
+    irrelevant = -1
+    no_detection = 0
+    potential_detection = 1
+    detected = 2
+
+
+class ModelMonitoringAppTag:
+    KEY = "type"
+    VAL = "model-monitoring-application"
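WriterEvent defines the field names the model-monitoring writer expects, while ResultKindApp and ResultStatusApp encode what a monitoring application reports. A hedged sketch of a result payload keyed with these constants; the endpoint id, schedule time, and result values are illustrative only:

    # Sketch only: a result payload keyed with the new WriterEvent constants.
    from mlrun.common.schemas.model_monitoring.constants import (
        ResultKindApp,
        ResultStatusApp,
        WriterEvent,
    )

    event = {
        WriterEvent.APPLICATION_NAME: "my-monitoring-app",
        WriterEvent.ENDPOINT_ID: "1234",
        WriterEvent.SCHEDULE_TIME: "2023-08-01T00:00:00Z",
        WriterEvent.RESULT_NAME: "data_drift_score",
        WriterEvent.RESULT_KIND: ResultKindApp.data_drift.value,
        WriterEvent.RESULT_VALUE: 0.42,
        WriterEvent.RESULT_STATUS: ResultStatusApp.potential_detection.value,
        WriterEvent.RESULT_EXTRA_DATA: "{}",
    }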
mlrun/common/schemas/model_monitoring/model_endpoints.py
CHANGED
@@ -111,6 +111,16 @@ class ModelEndpointSpec(ObjectSpec):
         ),
     }
 
+    @validator("model_uri")
+    def validate_model_uri(cls, model_uri):
+        """Validate that the model uri includes the required prefix"""
+        prefix, uri = mlrun.datastore.parse_store_uri(model_uri)
+        if prefix and prefix != mlrun.utils.helpers.StorePrefix.Model:
+            return mlrun.datastore.get_store_uri(
+                mlrun.utils.helpers.StorePrefix.Model, uri
+            )
+        return model_uri
+
 
 class Histogram(BaseModel):
     buckets: List[float]
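The new validator rewrites a model_uri that carries a non-model store prefix so that it uses the model prefix. A rough illustration of the intent, reusing the same mlrun helpers the validator calls; the example URI and its normalized form are assumptions:

    # Illustration of the validator's intent, using mlrun's store-URI helpers.
    import mlrun.datastore
    import mlrun.utils.helpers

    uri = "store://artifacts/my-proj/my-model:latest"  # illustrative input
    prefix, suffix = mlrun.datastore.parse_store_uri(uri)
    if prefix and prefix != mlrun.utils.helpers.StorePrefix.Model:
        uri = mlrun.datastore.get_store_uri(mlrun.utils.helpers.StorePrefix.Model, suffix)
    # uri now carries the model store prefix, e.g. "store://models/my-proj/my-model:latest"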
mlrun/config.py
CHANGED
@@ -403,6 +403,7 @@ default_config = {
     },
     "model_endpoint_monitoring": {
         "serving_stream_args": {"shard_count": 1, "retention_period_hours": 24},
+        "application_stream_args": {"shard_count": 3, "retention_period_hours": 24},
         "drift_thresholds": {"default": {"possible_drift": 0.5, "drift_detected": 0.7}},
         # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
         # stream, and endpoints.
@@ -417,6 +418,7 @@ default_config = {
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
         "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10000,
         "parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
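The new default_http_sink_app template is resolved per monitoring application, the same way default_http_sink is resolved per project. A quick illustration of the substitution; the project and application names are made up:

    # The template from the config above, resolved for a sample project/application.
    template = "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080"
    print(template.format(project="fraud-demo", application_name="my-monitoring-app"))
    # -> http://nuclio-fraud-demo-my-monitoring-app.mlrun.svc.cluster.local:8080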
@@ -981,20 +983,22 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
+        application_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
-        :param project:
-        :param kind:
-        :param target:
-
-
-
-
-
-
-        :param artifact_path:
-
+        :param project: Project name.
+        :param kind: Kind of target path (e.g. events, log_stream, endpoints, etc.)
+        :param target: Can be either online or offline. If the target is online, then we try to get a specific
+                       path for the provided kind. If it doesn't exist, use the default path.
+                       If the target path is offline and the offline path is already a full path in the
+                       configuration, then the result will be that path as-is. If the offline path is a
+                       relative path, then the result will be based on the project artifact path and the
+                       offline relative path. If project artifact path wasn't provided, then we use MLRun
+                       artifact path instead.
+        :param artifact_path: Optional artifact path that will be used as a relative path. If not provided, the
+                              relative artifact path will be taken from the global MLRun artifact path.
+        :param application_name: Application name, None for model_monitoring_stream.
 
         :return: Full configured path for the provided kind.
         """
@@ -1006,8 +1010,22 @@ class Config:
         if store_prefix_dict.get(kind):
             # Target exist in store prefix and has a valid string value
             return store_prefix_dict[kind].format(project=project)
+
+        if (
+            application_name
+            != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
+        ):
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project,
+                kind=kind
+                if application_name is None
+                else f"{kind}-{application_name.lower()}",
+            )
         return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-            project=project,
+            project=project,
+            kind=kind
+            if application_name is None
+            else f"{kind}-{application_name.lower()}",
         )
 
         # Get the current offline path from the configuration
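The effect of the new branch is that any application other than the stream function gets its own store prefix, derived from the kind plus the lower-cased application name. A standalone sketch of just that selection logic, decoupled from mlrun's Config object; the function name here is hypothetical:

    # Standalone sketch of the kind/application_name suffix selection added above.
    def resolve_kind(kind: str, application_name: str = None) -> str:
        # None means the default model-monitoring stream, which keeps the plain kind.
        return kind if application_name is None else f"{kind}-{application_name.lower()}"

    assert resolve_kind("parquet") == "parquet"
    assert resolve_kind("parquet", "MyApp") == "parquet-myapp"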
mlrun/datastore/__init__.py
CHANGED
mlrun/datastore/sources.py
CHANGED
@@ -138,7 +138,6 @@ class CSVSource(BaseSourceDriver):
     :parameter path: path to CSV file
     :parameter key_field: the CSV field to be used as the key for events. May be an int (field index) or string
         (field name) if with_header is True. Defaults to None (no key). Can be a list of keys.
-    :parameter time_field: DEPRECATED. Use parse_dates to parse timestamps.
     :parameter schedule: string to configure scheduling of the ingestion job.
     :parameter attributes: additional parameters to pass to storey. For example:
         attributes={"timestamp_format": '%Y%m%d%H'}
@@ -156,29 +155,13 @@ class CSVSource(BaseSourceDriver):
         path: str = None,
         attributes: Dict[str, str] = None,
         key_field: str = None,
-        time_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, List[int], List[str]] = None,
         **kwargs,
     ):
-        super().__init__(
-
-
-        if time_field is not None:
-            warnings.warn(
-                "CSVSource's time_field parameter is deprecated in 1.3.0 and will be removed in 1.5.0. "
-                "Use parse_dates instead.",
-                # TODO: remove in 1.5.0
-                FutureWarning,
-            )
-        if isinstance(parse_dates, (int, str)):
-            parse_dates = [parse_dates]
-
-        if parse_dates is None:
-            parse_dates = [time_field]
-        elif time_field not in parse_dates:
-            parse_dates = copy(parse_dates)
-            parse_dates.append(time_field)
+        super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
+        if parse_dates and not isinstance(parse_dates, list):
+            parse_dates = [parse_dates]
         self._parse_dates = parse_dates
 
     def to_step(self, key_field=None, time_field=None, context=None):
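With the deprecated time_field parameter removed, timestamp parsing in CSVSource is requested through parse_dates only. A hedged usage sketch; the path and column names are illustrative:

    # Sketch: CSVSource after the removal of time_field; parse_dates does the parsing.
    from mlrun.datastore.sources import CSVSource

    source = CSVSource(
        name="events",
        path="v3io:///projects/my-proj/events.csv",  # illustrative path
        parse_dates=["timestamp"],                   # replaces the old time_field
        key_field="event_id",
    )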
@@ -724,16 +707,7 @@ class DataFrameSource:
 
     support_storey = True
 
-    def __init__(
-        self, df, key_field=None, time_field=None, context=None, iterator=False
-    ):
-        if time_field:
-            warnings.warn(
-                "DataFrameSource's time_field parameter has no effect. "
-                "It is deprecated in 1.3.0 and will be removed in 1.5.0",
-                FutureWarning,
-            )
-
+    def __init__(self, df, key_field=None, context=None, iterator=False):
         self._df = df
         if isinstance(key_field, str):
             self.key_field = [key_field]
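DataFrameSource likewise drops its no-op time_field argument, so construction is now just the dataframe plus an optional key field. A small sketch:

    # Sketch: DataFrameSource after the time_field removal.
    import pandas as pd
    from mlrun.datastore.sources import DataFrameSource

    df = pd.DataFrame({"event_id": [1, 2], "value": [0.1, 0.2]})
    source = DataFrameSource(df, key_field="event_id")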
mlrun/datastore/targets.py
CHANGED
@@ -484,6 +484,7 @@ class BaseStoreTarget(DataTargetBase):
         if hasattr(df, "rdd"):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
+            df = self.prepare_spark_df(df, key_column, timestamp_key, options)
             df.write.mode("overwrite").save(**options)
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
@@ -513,36 +514,41 @@ class BaseStoreTarget(DataTargetBase):
         dir = os.path.dirname(target_path)
         if dir:
             os.makedirs(dir, exist_ok=True)
-        partition_cols = []
-        if target_path.endswith(".parquet") or target_path.endswith(".pq"):
-            partition_cols = None
         target_df = df
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        partition_cols = None  # single parquet file
+        if not target_path.endswith(".parquet") and not target_path.endswith(
+            ".pq"
+        ):  # directory
+            partition_cols = []
+            if timestamp_key and (
+                self.partitioned or self.time_partitioning_granularity
+            ):
+                target_df = df.copy(deep=False)
+                time_partitioning_granularity = self.time_partitioning_granularity
+                if not time_partitioning_granularity and self.partitioned:
+                    time_partitioning_granularity = (
+                        mlrun.utils.helpers.DEFAULT_TIME_PARTITIONING_GRANULARITY
+                    )
+                for unit, fmt in [
+                    ("year", "%Y"),
+                    ("month", "%m"),
+                    ("day", "%d"),
+                    ("hour", "%H"),
+                    ("minute", "%M"),
+                ]:
+                    partition_cols.append(unit)
+                    target_df[unit] = pd.DatetimeIndex(
+                        target_df[timestamp_key]
+                    ).format(date_format=fmt)
+                    if unit == time_partitioning_granularity:
+                        break
+            # Partitioning will be performed on timestamp_key and then on self.partition_cols
+            # (We might want to give the user control on this order as additional functionality)
+            partition_cols += self.partition_cols or []
         storage_options = self._get_store().get_storage_options()
         self._write_dataframe(
             target_df,
-            storage_options,
+            self.storage_options or storage_options,
             target_path,
             partition_cols=partition_cols,
             **kwargs,
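For the pandas write path, directory-style parquet targets again derive time-unit partition columns (year/month/day, down to the configured granularity) from the timestamp key before writing. A hedged usage sketch with a ParquetTarget that requests daily partitioning; the target path is illustrative:

    # Sketch: a partitioned ParquetTarget; the write path above derives
    # year/month/day columns from the timestamp key before writing.
    from mlrun.datastore.targets import ParquetTarget

    target = ParquetTarget(
        name="offline",
        path="v3io:///projects/my-proj/offline/",  # directory => partitioned layout
        partitioned=True,
        time_partitioning_granularity="day",
    )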
@@ -690,7 +696,7 @@ class BaseStoreTarget(DataTargetBase):
         # options used in spark.read.load(**options)
         raise NotImplementedError()
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
         return df
 
     def get_dask_options(self):
@@ -924,6 +930,37 @@ class ParquetTarget(BaseStoreTarget):
             return self.path.endswith(".parquet") or self.path.endswith(".pq")
         return False
 
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
+        # If partitioning by time, add the necessary columns
+        if (
+            timestamp_key
+            and isinstance(spark_options, dict)
+            and "partitionBy" in spark_options
+        ):
+            from pyspark.sql.functions import (
+                dayofmonth,
+                hour,
+                minute,
+                month,
+                second,
+                year,
+            )
+
+            time_unit_to_op = {
+                "year": year,
+                "month": month,
+                "day": dayofmonth,
+                "hour": hour,
+                "minute": minute,
+                "second": second,
+            }
+            timestamp_col = df[timestamp_key]
+            for partition in spark_options["partitionBy"]:
+                if partition not in df.columns and partition in time_unit_to_op:
+                    op = time_unit_to_op[partition]
+                    df = df.withColumn(partition, op(timestamp_col))
+        return df
+
 
 class CSVTarget(BaseStoreTarget):
     kind = TargetTypes.csv
@@ -973,7 +1010,7 @@ class CSVTarget(BaseStoreTarget):
             "header": "true",
         }
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
 
         for col_name, col_type in df.dtypes:
@@ -1067,7 +1104,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
@@ -1139,7 +1176,7 @@ class NoSqlTarget(NoSqlBaseTarget):
         spark_options["columnUpdate"] = True
         return spark_options
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col
 
         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
@@ -1232,7 +1269,7 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         endpoint, uri = self._get_server_endpoint()
         return endpoint
 
-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col
 
         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
mlrun/db/httpdb.py
CHANGED
@@ -18,6 +18,7 @@ import tempfile
 import time
 import traceback
 import typing
+import warnings
 from datetime import datetime, timedelta
 from os import path, remove
 from typing import Dict, List, Optional, Union
@@ -1411,6 +1412,8 @@ class HTTPRunDB(RunDBInterface):
         namespace=None,
         artifact_path=None,
         ops=None,
+        # TODO: deprecated, remove in 1.6.0
+        ttl=None,
         cleanup_ttl=None,
     ):
         """Submit a KFP pipeline for execution.
@@ -1423,9 +1426,18 @@ class HTTPRunDB(RunDBInterface):
         :param namespace: Kubernetes namespace to execute the pipeline in.
         :param artifact_path: A path to artifacts used by this pipeline.
         :param ops: Transformers to apply on all ops in the pipeline.
+        :param ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the workflow
+            and all its resources are deleted) (deprecated, use cleanup_ttl instead)
         :param cleanup_ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the
             workflow and all its resources are deleted)
         """
+        if ttl:
+            warnings.warn(
+                "'ttl' is deprecated, use 'cleanup_ttl' instead. "
+                "This will be removed in 1.6.0",
+                # TODO: Remove this in 1.6.0
+                FutureWarning,
+            )
 
         if isinstance(pipeline, str):
             pipe_file = pipeline
@@ -1433,7 +1445,7 @@ class HTTPRunDB(RunDBInterface):
             pipe_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=False).name
         conf = new_pipe_metadata(
             artifact_path=artifact_path,
-            cleanup_ttl=cleanup_ttl,
+            cleanup_ttl=cleanup_ttl or ttl,
             op_transformers=ops,
         )
         kfp.compiler.Compiler().compile(
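Callers that still pass ttl now get a FutureWarning and the value is folded into cleanup_ttl, so new code should pass cleanup_ttl directly. A hedged caller-side sketch; the submit_pipeline method name and its project argument are taken from mlrun's client API rather than from this hunk:

    # Sketch: migrating a pipeline submission from the deprecated ttl to cleanup_ttl.
    import mlrun

    db = mlrun.get_run_db()  # HTTPRunDB when connected to an MLRun API
    run_id = db.submit_pipeline(
        project="my-proj",
        pipeline="pipeline.yaml",  # a compiled KFP pipeline file
        cleanup_ttl=3600,          # previously: ttl=3600
    )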
@@ -1471,15 +1483,17 @@ class HTTPRunDB(RunDBInterface):
                 headers=headers,
             )
         except OSError as err:
-            logger.error(
-            raise OSError(f"
+            logger.error("Error: Cannot submit pipeline", err=err_to_str(err))
+            raise OSError(f"Error: Cannot submit pipeline, {err_to_str(err)}")
 
         if not resp.ok:
-            logger.error(
-            raise ValueError(f"
+            logger.error("Failed to submit pipeline", respones_text=resp.text)
+            raise ValueError(f"Failed to submit pipeline, {resp.text}")
 
         resp = resp.json()
-        logger.info(
+        logger.info(
+            "Pipeline submitted successfully", pipeline_name=resp["name"], id=resp["id"]
+        )
         return resp["id"]
 
     def list_pipelines(
mlrun/feature_store/api.py
CHANGED
@@ -975,37 +975,9 @@ def _ingest_with_spark(
     )
 
     df_to_write = df
-
-
-
-        from pyspark.sql.functions import (
-            dayofmonth,
-            hour,
-            minute,
-            month,
-            second,
-            year,
-        )
-
-        time_unit_to_op = {
-            "year": year,
-            "month": month,
-            "day": dayofmonth,
-            "hour": hour,
-            "minute": minute,
-            "second": second,
-        }
-        timestamp_col = df_to_write[timestamp_key]
-        for partition in spark_options["partitionBy"]:
-            if (
-                partition not in df_to_write.columns
-                and partition in time_unit_to_op
-            ):
-                op = time_unit_to_op[partition]
-                df_to_write = df_to_write.withColumn(
-                    partition, op(timestamp_col)
-                )
-    df_to_write = target.prepare_spark_df(df_to_write, key_columns)
+    df_to_write = target.prepare_spark_df(
+        df_to_write, key_columns, timestamp_key, spark_options
+    )
     if overwrite:
        df_to_write.write.mode("overwrite").save(**spark_options)
     else:
mlrun/feature_store/feature_vector.py
CHANGED
@@ -631,7 +631,7 @@ class FeatureVector(ModelObj):
         feature_set_fields: list of field (name, alias) per featureset
         """
         processed_features = {}  # dict of name to (featureset, feature object)
-        feature_set_objects = {}
+        feature_set_objects = self.feature_set_objects or {}
         index_keys = []
         feature_set_fields = collections.defaultdict(list)
         features = copy(self.spec.features)
mlrun/feature_store/retrieval/base.py
CHANGED
@@ -136,7 +136,7 @@ class BaseMerger(abc.ABC):
             order_by=order_by,
         )
 
-    def _write_to_offline_target(self):
+    def _write_to_offline_target(self, timestamp_key=None):
         if self._target:
             is_persistent_vector = self.vector.metadata.name is not None
             if not self._target.path and not is_persistent_vector:
@@ -144,7 +144,12 @@ class BaseMerger(abc.ABC):
                     "target path was not specified"
                 )
             self._target.set_resource(self.vector)
-            size = self._target.write_dataframe(
+            size = self._target.write_dataframe(
+                self._result_df,
+                timestamp_key=timestamp_key
+                if not self._drop_indexes and timestamp_key not in self._drop_columns
+                else None,
+            )
             if is_persistent_vector:
                 target_status = self._target.update_resource_status("ready", size=size)
                 logger.info(f"wrote target: {target_status}")
@@ -361,7 +366,7 @@ class BaseMerger(abc.ABC):
         )
         self._order_by(order_by_active)
 
-        self._write_to_offline_target()
+        self._write_to_offline_target(timestamp_key=result_timestamp)
         return OfflineVectorResponse(self)
 
     def init_online_vector_service(
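The offline merger now forwards the result timestamp key to the target write, so offline feature-vector results written to a parquet target can be time partitioned. A hedged sketch; the vector URI and target path are illustrative:

    # Sketch: offline retrieval to a parquet target; the timestamp key of the
    # result is now passed through to the target's write_dataframe call.
    import mlrun.feature_store as fstore
    from mlrun.datastore.targets import ParquetTarget

    resp = fstore.get_offline_features(
        "store://feature-vectors/my-proj/my-vector",  # illustrative vector URI
        target=ParquetTarget(path="v3io:///projects/my-proj/vector-offline/"),
    )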
mlrun/launcher/remote.py
CHANGED
@@ -89,7 +89,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
 
         else:
             raise mlrun.errors.MLRunRuntimeError(
-                "
+                "Function image is not built/ready, set auto_build=True or use .deploy() method first"
             )
 
         if runtime.verbose:
@@ -122,11 +122,11 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
             resp = db.submit_job(run, schedule=schedule)
             if schedule:
                 action = resp.pop("action", "created")
-                logger.info(f"
+                logger.info(f"Task schedule {action}", **resp)
                 return
 
         except (requests.HTTPError, Exception) as err:
-            logger.error(
+            logger.error("Failed remote run", error=mlrun.errors.err_to_str(err))
 
             if isinstance(err, requests.HTTPError):
                 runtime._handle_submit_job_http_error(err)
mlrun/lists.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from copy import copy
 from typing import List
 
@@ -219,6 +220,16 @@ class ArtifactList(list):
         """return as a list of artifact objects"""
         return [dict_to_artifact(artifact) for artifact in self]
 
+    def objects(self) -> List[Artifact]:
+        """return as a list of artifact objects"""
+        warnings.warn(
+            "'objects' is deprecated in 1.3.0 and will be removed in 1.6.0. "
+            "Use 'to_objects' instead.",
+            # TODO: remove in 1.6.0
+            FutureWarning,
+        )
+        return [dict_to_artifact(artifact) for artifact in self]
+
     def dataitems(self) -> List["mlrun.DataItem"]:
         """return as a list of DataItem objects"""
         dataitems = []
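ArtifactList.objects() comes back only as a deprecated alias; to_objects() is the supported call. A small migration sketch, assuming a connected MLRun API and an illustrative project name:

    # Sketch: prefer to_objects() over the deprecated objects() alias.
    import mlrun

    artifacts = mlrun.get_run_db().list_artifacts(project="my-proj")
    artifact_objects = artifacts.to_objects()  # objects() now emits a FutureWarning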
mlrun/model_monitoring/__init__.py
CHANGED
@@ -15,7 +15,6 @@
 # flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
 # for backwards compatibility
 
-
 from .helpers import get_stream_path
 from .model_endpoint import ModelEndpoint
 from .stores import ModelEndpointStore, ModelEndpointStoreType, get_model_endpoint_store
mlrun/model_monitoring/api.py
CHANGED
@@ -28,9 +28,9 @@ from mlrun.common.schemas.model_monitoring import EventFieldType, ModelMonitorin
 from mlrun.data_types.infer import InferOptions, get_df_stats
 from mlrun.utils import logger
 
+from .batch import VirtualDrift
 from .features_drift_table import FeaturesDriftTablePlot
 from .model_endpoint import ModelEndpoint
-from .model_monitoring_batch import VirtualDrift
 
 # A union of all supported dataset types:
 DatasetType = typing.Union[