mlrun 1.8.0rc19__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +37 -3
- mlrun/__main__.py +5 -0
- mlrun/alerts/alert.py +1 -0
- mlrun/artifacts/document.py +78 -36
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/runtimes/constants.py +17 -0
- mlrun/common/schemas/alert.py +3 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/model_monitoring/constants.py +32 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +2 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/config.py +39 -6
- mlrun/datastore/datastore_profile.py +58 -16
- mlrun/datastore/sources.py +7 -1
- mlrun/datastore/vectorstore.py +20 -1
- mlrun/db/base.py +20 -0
- mlrun/db/httpdb.py +97 -10
- mlrun/db/nopdb.py +19 -0
- mlrun/errors.py +4 -0
- mlrun/execution.py +15 -6
- mlrun/frameworks/_common/model_handler.py +0 -2
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +5 -1
- mlrun/model_monitoring/applications/_application_steps.py +3 -1
- mlrun/model_monitoring/controller.py +266 -103
- mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +20 -21
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -34
- mlrun/model_monitoring/helpers.py +16 -10
- mlrun/model_monitoring/stream_processing.py +106 -35
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packagers_manager.py +4 -18
- mlrun/projects/pipelines.py +18 -5
- mlrun/projects/project.py +156 -39
- mlrun/runtimes/nuclio/serving.py +22 -13
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/secrets.py +1 -1
- mlrun/serving/server.py +11 -3
- mlrun/serving/states.py +65 -8
- mlrun/serving/v2_serving.py +67 -44
- mlrun/utils/helpers.py +111 -23
- mlrun/utils/notifications/notification/base.py +6 -1
- mlrun/utils/notifications/notification/slack.py +5 -1
- mlrun/utils/notifications/notification_pusher.py +67 -36
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/METADATA +33 -16
- {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/RECORD +52 -52
- {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/WHEEL +1 -1
- {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py CHANGED

@@ -33,7 +33,7 @@ _TSDB_BE = "tsdb"
 _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
-V3IO_MEPS_LIMIT =
+V3IO_MEPS_LIMIT = 200
 
 
 def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
@@ -135,7 +135,7 @@ class V3IOTSDBConnector(TSDBConnector):
         monitoring_predictions_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
-                kind=mm_schemas.
+                kind=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             )
         )
         (
@@ -145,7 +145,7 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             monitoring_predictions_full_path
         )
-        self.tables[mm_schemas.
+        self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS] = monitoring_predictions_path
 
     def create_tables(self) -> None:
         """
@@ -204,7 +204,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 }
             ],
             name=EventFieldType.LATENCY,
-            after="
+            after="FilterNOP",
             step_name="Aggregates",
             table=".",
             key_field=EventFieldType.ENDPOINT_ID,
@@ -225,8 +225,8 @@ class V3IOTSDBConnector(TSDBConnector):
         graph.add_step(
             "storey.TSDBTarget",
             name="tsdb_predictions",
-            after="
-            path=f"{self.container}/{self.tables[mm_schemas.
+            after="FilterNOP",
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS]}",
             rate="1/s",
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
@@ -234,6 +234,8 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -580,14 +582,18 @@ class V3IOTSDBConnector(TSDBConnector):
         )
 
     @staticmethod
-    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]):
+    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
         if isinstance(endpoint_id, str):
             return f"endpoint_id=='{endpoint_id}'"
         elif isinstance(endpoint_id, list):
             if len(endpoint_id) > V3IO_MEPS_LIMIT:
-
-
+                logger.info(
+                    "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                    "retrieving all the model endpoints from the db.",
+                    limit=V3IO_MEPS_LIMIT,
+                    amount=len(endpoint_id),
                 )
+                return None
             return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
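The new `_get_endpoint_filter` contract is easiest to see in isolation. A minimal standalone sketch (a hypothetical module-level function; the real method lives on `V3IOTSDBConnector` and logs through mlrun's structured logger):

    from typing import Optional, Union

    V3IO_MEPS_LIMIT = 200  # mirrors the new module constant above

    def get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
        if isinstance(endpoint_id, str):
            # a single endpoint id becomes an equality filter expression
            return f"endpoint_id=='{endpoint_id}'"
        if isinstance(endpoint_id, list):
            if len(endpoint_id) > V3IO_MEPS_LIMIT:
                # too many ids for one v3io filter expression; return None so
                # the caller queries without a filter and gets all endpoints
                return None
            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
        raise ValueError(f"Unsupported endpoint_id type: {type(endpoint_id)}")

    assert get_endpoint_filter("ep-1") == "endpoint_id=='ep-1'"
    assert get_endpoint_filter(["a", "b"]) == "endpoint_id IN('a', 'b') "
    assert get_endpoint_filter(["x"] * 201) is None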
@@ -734,10 +740,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 "both or neither of `aggregation_window` and `agg_funcs` must be provided"
             )
         df = self._get_records(
-            table=mm_schemas.
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id=='{endpoint_id}'",
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
@@ -751,10 +757,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
             )
 
-
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -762,7 +768,7 @@ class V3IOTSDBConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[
+                    df[estimated_prediction_count],
                 )
             ),  # pyright: ignore[reportArgumentType]
         )
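As the hunk above shows, the frame read back from the TSDB is keyed by the aggregation wrapper whenever `agg_funcs` is given. A small sketch of that naming rule (the field name is assumed to be the literal value of `EventFieldType.ESTIMATED_PREDICTION_COUNT`):

    from typing import Optional

    def result_column(agg_funcs: Optional[list],
                      field: str = "estimated_prediction_count") -> str:
        # with aggregations the column is e.g. "count(estimated_prediction_count)";
        # without, the raw field name is used as-is
        return f"{agg_funcs[0]}({field})" if agg_funcs else field

    print(result_column(["count"]))  # -> count(estimated_prediction_count)
    print(result_column(None))       # -> estimated_prediction_count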
@@ -773,15 +779,13 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
-            table=mm_schemas.
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -808,9 +812,7 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -818,7 +820,7 @@ class V3IOTSDBConnector(TSDBConnector):
             start=start,
             end=end,
             columns=[mm_schemas.ResultData.RESULT_STATUS],
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["max"],
             group_by="endpoint_id",
         )
@@ -883,17 +885,18 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-
-
-
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        if filter_query:
+            filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
+        else:
+            filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}' z"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.ERRORS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.ERROR_COUNT],
-            filter_query=
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             agg_funcs=["count"],
         )
         if not df.empty:
@@ -912,17 +915,15 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
-            table=mm_schemas.
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["avg"],
         )
         if not df.empty:
mlrun/model_monitoring/helpers.py CHANGED

@@ -109,7 +109,7 @@ def filter_results_by_regex(
             result_name_filters=validated_filters,
         ):
             filtered_metrics_names.append(existing_result_name)
-    return filtered_metrics_names
+    return list(set(filtered_metrics_names))
 
 
 def get_stream_path(
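One behavioral note on the `filter_results_by_regex` change: `list(set(...))` deduplicates the collected names but does not preserve their insertion order, e.g.:

    names = ["drift", "latency", "drift"]
    unique = list(set(names))
    print(sorted(unique))  # ['drift', 'latency'] -- duplicates gone, order not guaranteed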
@@ -117,6 +117,7 @@ def get_stream_path(
     function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
     stream_uri: Optional[str] = None,
     secret_provider: Optional[Callable[[str], str]] = None,
+    profile: Optional[mlrun.datastore.datastore_profile.DatastoreProfile] = None,
 ) -> str:
     """
     Get stream path from the project secret. If wasn't set, take it from the system configurations
@@ -126,20 +127,25 @@ def get_stream_path(
     :param stream_uri:      Stream URI. If provided, it will be used instead of the one from the project's secret.
     :param secret_provider: Optional secret provider to get the connection string secret.
                             If not set, the env vars are used.
+    :param profile:         Optional datastore profile of the stream (V3IO/KafkaSource profile).
     :return:                Monitoring stream path to the relevant application.
     """
 
-
-
-
-    profile = None
+    profile = profile or _get_stream_profile(
+        project=project, secret_provider=secret_provider
+    )
 
     if isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
         stream_uri = "v3io"
-
-
-
-
+    elif isinstance(
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
+    ):
+        stream_uri = f"kafka://{profile.brokers[0]}"
+    else:
+        raise mlrun.errors.MLRunValueError(
+            f"Received an unexpected stream profile type: {type(profile)}\n"
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
+        )
 
     if not stream_uri or stream_uri == "v3io":
         stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
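A minimal sketch of the profile-to-URI resolution the new branch implements, using stand-in dataclasses for the two accepted profile types (the real classes are `DatastoreProfileV3io` and `DatastoreProfileKafkaSource` from `mlrun.datastore.datastore_profile`):

    from dataclasses import dataclass, field

    @dataclass
    class V3ioProfile:  # stand-in for DatastoreProfileV3io
        pass

    @dataclass
    class KafkaSourceProfile:  # stand-in for DatastoreProfileKafkaSource
        brokers: list = field(default_factory=list)

    def resolve_stream_uri(profile) -> str:
        if isinstance(profile, V3ioProfile):
            return "v3io"
        if isinstance(profile, KafkaSourceProfile):
            # only the first broker is baked into the stream URI
            return f"kafka://{profile.brokers[0]}"
        raise ValueError(f"Received an unexpected stream profile type: {type(profile)}")

    print(resolve_stream_uri(V3ioProfile()))                                # -> v3io
    print(resolve_stream_uri(KafkaSourceProfile(brokers=["b1:9092"])))      # -> kafka://b1:9092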
@@ -273,7 +279,7 @@ def _get_profile(
     )
     if not profile_name:
         raise mlrun.errors.MLRunNotFoundError(
-            f"Not found `{profile_name_key}` profile name"
+            f"Not found `{profile_name_key}` profile name for project '{project}'"
         )
     return mlrun.datastore.datastore_profile.datastore_profile_read(
         url=f"ds://{profile_name}", project_name=project, secrets=secret_provider
mlrun/model_monitoring/stream_processing.py CHANGED

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import collections
 import datetime
 import os
 import typing
@@ -29,11 +28,14 @@ import mlrun.model_monitoring.db
 import mlrun.serving.states
 import mlrun.utils
 from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventKind,
     EndpointType,
     EventFieldType,
     FileTargetKind,
     ProjectSecretKeys,
 )
+from mlrun.datastore import parse_kafka_url
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.utils import logger
 
@@ -88,7 +90,9 @@ class EventStreamProcessor:
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
         self.v3io_api = v3io_api or mlrun.mlconf.v3io_api
 
-        self.v3io_access_key = v3io_access_key or
+        self.v3io_access_key = v3io_access_key or mlrun.get_secret_or_env(
+            "V3IO_ACCESS_KEY"
+        )
         self.model_monitoring_access_key = (
             model_monitoring_access_key
             or os.environ.get(ProjectSecretKeys.ACCESS_KEY)
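`mlrun.get_secret_or_env` resolves a key from available secret stores and falls back to environment variables; a rough standalone equivalent of the fallback the new line relies on (simplified, explicit-mapping-then-env only):

    import os
    from typing import Optional

    def get_secret_or_env(key: str, secrets: Optional[dict] = None, default=None):
        # check an explicit secrets mapping first, then the process environment
        if secrets and secrets.get(key):
            return secrets[key]
        return os.environ.get(key, default)

    os.environ["V3IO_ACCESS_KEY"] = "demo-key"
    print(get_secret_or_env("V3IO_ACCESS_KEY"))  # -> demo-key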
@@ -118,6 +122,7 @@ class EventStreamProcessor:
         self,
         fn: mlrun.runtimes.ServingRuntime,
         tsdb_connector: TSDBConnector,
+        controller_stream_uri: str,
     ) -> None:
         """
         Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
@@ -146,6 +151,8 @@ class EventStreamProcessor:
 
         :param fn:                    A serving function.
         :param tsdb_connector:        Time series database connector.
+        :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
+                                      input
         """
 
         graph = typing.cast(
@@ -209,6 +216,20 @@ class EventStreamProcessor:
         )
 
         apply_map_feature_names()
+        # split the graph between event with error vs valid event
+        graph.add_step(
+            "storey.Filter",
+            "FilterNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', " ") != 'nop_event')",
+        )
+        graph.add_step(
+            "storey.Filter",
+            "ForwardNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', " ") == 'nop_event')",
+        )
+
         tsdb_connector.apply_monitoring_stream_steps(
             graph=graph,
             aggregate_windows=self.aggregate_windows,
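The two new steps route each event down exactly one branch: `FilterNOP` passes regular events on to the aggregation/parquet path, while `ForwardNOP` passes only `nop_event` controller heartbeats. A sketch of the complementary predicates, with plain functions standing in for the `storey.Filter` `_fn` expressions:

    def is_regular_event(event: dict) -> bool:
        # FilterNOP: everything except controller nop events
        return event.get("kind", "") != "nop_event"

    def is_nop_event(event: dict) -> bool:
        # ForwardNOP: only controller nop events
        return event.get("kind", "") == "nop_event"

    events = [{"kind": "nop_event"}, {"request": {"id": "1"}}]
    print([e for e in events if is_regular_event(e)])  # [{'request': {'id': '1'}}]
    print([e for e in events if is_nop_event(e)])      # [{'kind': 'nop_event'}]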
@@ -221,7 +242,7 @@ class EventStreamProcessor:
         graph.add_step(
             "ProcessBeforeParquet",
             name="ProcessBeforeParquet",
-            after="
+            after="FilterNOP",
             _fn="(event)",
         )
 
@@ -248,6 +269,44 @@ class EventStreamProcessor:
 
         apply_parquet_target()
 
+        # controller branch
+        def apply_push_controller_stream(stream_uri: str):
+            if stream_uri.startswith("v3io://"):
+                graph.add_step(
+                    ">>",
+                    "controller_stream_v3io",
+                    path=stream_uri,
+                    sharding_func=ControllerEvent.ENDPOINT_ID,
+                    access_key=self.v3io_access_key,
+                    after="ForwardNOP",
+                )
+            elif stream_uri.startswith("kafka://"):
+                topic, brokers = parse_kafka_url(stream_uri)
+                logger.info(
+                    "Controller stream uri for kafka",
+                    stream_uri=stream_uri,
+                    topic=topic,
+                    brokers=brokers,
+                )
+                if isinstance(brokers, list):
+                    path = f"kafka://{brokers[0]}/{topic}"
+                elif isinstance(brokers, str):
+                    path = f"kafka://{brokers}/{topic}"
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        "Brokers must be a list or str check controller stream uri"
+                    )
+                graph.add_step(
+                    ">>",
+                    "controller_stream_kafka",
+                    path=path,
+                    kafka_brokers=brokers,
+                    _sharding_func=ControllerEvent.ENDPOINT_ID,
+                    after="ForwardNOP",
+                )
+
+        apply_push_controller_stream(controller_stream_uri)
+
 
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
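The Kafka branch normalizes the brokers value returned by `parse_kafka_url` (which, per the new code, may be either a list or a single string) into one stream path. That normalization in isolation:

    def build_kafka_path(topic: str, brokers) -> str:
        if isinstance(brokers, list):
            # only the first broker is used in the path
            return f"kafka://{brokers[0]}/{topic}"
        if isinstance(brokers, str):
            return f"kafka://{brokers}/{topic}"
        raise ValueError("Brokers must be a list or str")

    print(build_kafka_path("monitoring", ["b1:9092", "b2:9092"]))  # kafka://b1:9092/monitoring
    print(build_kafka_path("monitoring", "b1:9092"))               # kafka://b1:9092/monitoring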
@@ -313,14 +372,14 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         self.first_request: dict[str, str] = dict()
         self.last_request: dict[str, str] = dict()
 
-        # Number of errors (value) per endpoint (key)
-        self.error_count: dict[str, int] = collections.defaultdict(int)
-
         # Set of endpoints in the current events
         self.endpoints: set[str] = set()
 
     def do(self, full_event):
         event = full_event.body
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            logger.info("Skipped nop event inside of ProcessEndpointEvent", event=event)
+            return storey.Event(body=[event])
         # Getting model version and function uri from event
         # and use them for retrieving the endpoint_id
         function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
@@ -354,10 +413,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         predictions = event.get("resp", {}).get("outputs")
 
         if not self.is_valid(
-
-
-
-            ["when"],
+            validation_function=is_not_none,
+            field=timestamp,
+            dict_path=["when"],
         ):
             return None
 
@@ -369,31 +427,27 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         self.last_request[endpoint_id] = timestamp
 
         if not self.is_valid(
-
-
-
-            ["request", "id"],
+            validation_function=is_not_none,
+            field=request_id,
+            dict_path=["request", "id"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["microsec"],
+            validation_function=is_not_none,
+            field=latency,
+            dict_path=["microsec"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["request", "inputs"],
+            validation_function=is_not_none,
+            field=features,
+            dict_path=["request", "inputs"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["resp", "outputs"],
+            validation_function=is_not_none,
+            field=predictions,
+            dict_path=["resp", "outputs"],
         ):
             return None
 
@@ -430,6 +484,10 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         if not isinstance(feature, list):
             feature = [feature]
 
+        effective_sample_count, estimated_prediction_count = (
+            self._get_effective_and_estimated_counts(event=event)
+        )
+
         events.append(
             {
                 EventFieldType.FUNCTION_URI: function_uri,
@@ -447,12 +505,13 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
                     self.last_request[endpoint_id]
                 ).timestamp(),
-                EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
                 EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
                 EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
                 EventFieldType.ENTITIES: event.get("request", {}).get(
                     EventFieldType.ENTITIES, {}
                 ),
+                EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
+                EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
             }
         )
 
@@ -476,7 +535,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             .flat_dict()
         )
 
-        # If model endpoint found, get first_request
+        # If model endpoint found, get first_request & last_request values
         if endpoint_record:
             first_request = endpoint_record.get(EventFieldType.FIRST_REQUEST)
 
@@ -487,26 +546,34 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         if last_request:
             self.last_request[endpoint_id] = last_request
 
-        error_count = endpoint_record.get(EventFieldType.ERROR_COUNT)
-
-        if error_count:
-            self.error_count[endpoint_id] = int(error_count)
-
         # add endpoint to endpoints set
         self.endpoints.add(endpoint_id)
 
     def is_valid(
         self,
-        endpoint_id: str,
         validation_function,
         field: typing.Any,
         dict_path: list[str],
     ):
         if validation_function(field, dict_path):
             return True
-
+
         return False
 
+    @staticmethod
+    def _get_effective_and_estimated_counts(event):
+        """
+        Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
+        sampling percentage. These values will be stored in the TSDB target.
+        Note that In non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the sampling
+        percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
+        """
+        effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
+        estimated_prediction_count = effective_sample_count * (
+            100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
+        )
+        return effective_sample_count, estimated_prediction_count
+
 
 def is_not_none(field: typing.Any, dict_path: list[str]):
     if field is not None:
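The estimation arithmetic added in `_get_effective_and_estimated_counts` scales each sampled record by the inverse of the sampling percentage. Reproduced standalone, with plain string keys assumed in place of the `EventFieldType` constants:

    def estimate(event: dict) -> tuple:
        effective_sample_count = event.get("effective_sample_count", 1)
        estimated_prediction_count = effective_sample_count * (
            100 / event.get("sampling_percentage", 100)
        )
        return effective_sample_count, estimated_prediction_count

    print(estimate({}))  # (1, 1.0): no sampling info, one event counts as one prediction
    print(estimate({"effective_sample_count": 3, "sampling_percentage": 25}))  # (3, 12.0)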
@@ -569,6 +636,9 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             return None
 
     def do(self, event: dict):
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            logger.info("Skipped nop event inside of MapFeatureNames", event=event)
+            return event
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
 
         feature_values = event[EventFieldType.FEATURES]
@@ -672,6 +742,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             )
         )
         self.first_request[endpoint_id] = True
+
         if attributes_to_update:
             logger.info(
                 "Updating endpoint record",
mlrun/package/context_handler.py CHANGED

@@ -50,7 +50,7 @@ class ContextHandler:
         "numpy",
     ]
     # Optional packagers to be collected at initialization time:
-    _EXTENDED_PACKAGERS = []  # TODO: Create "matplotlib", "plotly",
+    _EXTENDED_PACKAGERS = []  # TODO: Create "matplotlib", "plotly", packagers.
     # Optional packagers from the `mlrun.frameworks` package:
     _MLRUN_FRAMEWORKS_PACKAGERS = []  # TODO: Create frameworks packagers.
     # Default priority values for packagers:
mlrun/package/packagers_manager.py CHANGED

@@ -667,16 +667,9 @@ class PackagersManager:
                     data_item=data_item,
                     instructions={},
                 )
-            except Exception
+            except Exception:
                 # Could not unpack as the reduced type hint, collect the exception and go to the next one:
-
-                    traceback.format_exception(
-                        etype=type(exception),
-                        value=exception,
-                        tb=exception.__traceback__,
-                    )
-                )
-                found_packagers.append((packager, exception_string))
+                found_packagers.append((packager, traceback.format_exc()))
             # Reduce the type hint list and continue:
             possible_type_hints = TypeHintUtils.reduce_type_hint(
                 type_hint=possible_type_hints
@@ -692,15 +685,8 @@ class PackagersManager:
             artifact_type=None,
             instructions={},
         )
-        except Exception
-
-            traceback.format_exception(
-                etype=type(exception),
-                value=exception,
-                tb=exception.__traceback__,
-            )
-            )
-        found_packagers.append((self._default_packager, exception_string))
+        except Exception:
+            found_packagers.append((self._default_packager, traceback.format_exc()))
 
         # The method did not return until this point, raise an error:
         raise MLRunPackageUnpackingError(
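Context for the packagers change: `traceback.format_exception` made its first parameter positional-only in Python 3.10 (the `etype=` keyword no longer works), while `traceback.format_exc()` formats the in-flight exception directly inside an `except` block, which is what the new one-liner relies on:

    import traceback

    try:
        1 / 0
    except Exception:
        # same formatted traceback text the old hand-built call produced,
        # with no need for a bound `as exception` variable
        formatted = traceback.format_exc()

    print(formatted.splitlines()[-1])  # -> ZeroDivisionError: division by zero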
mlrun/projects/pipelines.py CHANGED

@@ -31,7 +31,7 @@ import mlrun_pipelines.patcher
 import mlrun_pipelines.utils
 from mlrun.errors import err_to_str
 from mlrun.utils import (
-
+    get_workflow_url,
     logger,
     normalize_workflow_name,
     retry_until_successful,
@@ -523,11 +523,12 @@ class _PipelineRunner(abc.ABC):
         text = _PipelineRunner._generate_workflow_finished_message(
             run.run_id, errors_counter, run._state
         )
-
         notifiers = notifiers or project.notifiers
         if notifiers:
             notifiers.push(text, "info", runs)
 
+        project.push_pipeline_notification_kfp_runner(run.run_id, run._state, text)
+
         if raise_error:
             raise raise_error
         return state or run._state, errors_counter, text
@@ -620,6 +621,8 @@ class _KFPRunner(_PipelineRunner):
             params.update(notification.secret_params)
         project.notifiers.add_notification(notification.kind, params)
 
+        project.spec.notifications = notifications
+
         run_id = _run_pipeline(
             workflow_handler,
             project=project.metadata.name,
@@ -647,13 +650,23 @@ class _KFPRunner(_PipelineRunner):
                 exc_info=err_to_str(exc),
             )
 
-        #
-
+        # Pushing only relevant notification for the client (ipython and console)
+        project.notifiers.push_pipeline_start_message_from_client(
+            project.metadata.name, pipeline_id=run_id
+        )
+
         if context:
             project.notifiers.push_pipeline_start_message(
                 project.metadata.name,
                 context.uid,
             )
+        else:
+            project.push_pipeline_notification_kfp_runner(
+                run_id,
+                mlrun_pipelines.common.models.RunStatuses.running,
+                f"Workflow {run_id} started in project {project.metadata.name}",
+                notifications,
+            )
         pipeline_context.clear()
         return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
@@ -1212,7 +1225,7 @@ def notify_scheduled_workflow_failure(
     notification_pusher = mlrun.utils.notifications.CustomNotificationPusher(
         ["slack"]
     )
-    url =
+    url = get_workflow_url(project_name, context_uid)
     link = f"<{url}|*view workflow job details*>"
     message = (
         f":x: Failed to run scheduled workflow {workflow_name} "