mlrun 1.7.0rc37__py3-none-any.whl → 1.7.0rc38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/alerts/alert.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +4 -0
- mlrun/common/schemas/notification.py +3 -3
- mlrun/datastore/azure_blob.py +120 -30
- mlrun/feature_store/common.py +6 -11
- mlrun/model.py +5 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +9 -6
- mlrun/model_monitoring/db/tsdb/base.py +121 -1
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +65 -5
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +211 -35
- mlrun/model_monitoring/stream_processing.py +67 -25
- mlrun/projects/operations.py +1 -1
- mlrun/projects/project.py +7 -1
- mlrun/runtimes/__init__.py +15 -8
- mlrun/runtimes/nuclio/application/application.py +45 -5
- mlrun/runtimes/pod.py +2 -2
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +7 -9
- mlrun/serving/v2_serving.py +1 -0
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/METADATA +7 -1
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/RECORD +28 -28
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from datetime import datetime
+from datetime import datetime, timezone
 from io import StringIO
 from typing import Literal, Optional, Union
 
@@ -33,7 +33,7 @@ _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
 
-def _is_no_schema_error(exc: v3io_frames.
+def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
     In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
     Check if the error message contains the relevant string to verify the cause.
@@ -89,6 +89,19 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
 
+        errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
+            project=self.project,
+            kind=mm_schemas.FileTargetKind.ERRORS,
+        )
+        (
+            _,
+            _,
+            errors_path,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            errors_table_full_path
+        )
+        self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
+
         monitoring_application_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
@@ -160,7 +173,6 @@ class V3IOTSDBConnector(TSDBConnector):
         - endpoint_features (Prediction and feature names and values)
         - custom_metrics (user-defined metrics)
         """
-
         # Write latency per prediction, labeled by endpoint ID only
         graph.add_step(
             "storey.TSDBTarget",
@@ -171,7 +183,10 @@
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
-            columns=[
+            columns=[
+                mm_schemas.EventFieldType.LATENCY,
+                mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+            ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
@@ -255,6 +270,40 @@
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")
 
+    def handle_model_error(
+        self,
+        graph,
+        tsdb_batching_max_events: int = 10,
+        tsdb_batching_timeout_secs: int = 60,
+        **kwargs,
+    ) -> None:
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
+            name="error_extractor",
+            after="ForwardError",
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_error",
+            after="error_extractor",
+            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
+            rate="1/s",
+            time_col=mm_schemas.EventFieldType.TIMESTAMP,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            columns=[
+                mm_schemas.EventFieldType.MODEL_ERROR,
+                mm_schemas.EventFieldType.ERROR_COUNT,
+            ],
+            index_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def write_application_event(
         self,
         event: dict,
@@ -437,7 +486,7 @@
                 step=sliding_window_step,
                 **kwargs,
             )
-        except v3io_frames.
+        except v3io_frames.Error as err:
             if _is_no_schema_error(err):
                 return pd.DataFrame()
             else:
@@ -504,10 +553,16 @@
         if type == "metrics":
             table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
             name = mm_schemas.MetricData.METRIC_NAME
+            columns = [mm_schemas.MetricData.METRIC_VALUE]
             df_handler = self.df_to_metrics_values
         elif type == "results":
             table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             name = mm_schemas.ResultData.RESULT_NAME
+            columns = [
+                mm_schemas.ResultData.RESULT_VALUE,
+                mm_schemas.ResultData.RESULT_STATUS,
+                mm_schemas.ResultData.RESULT_KIND,
+            ]
             df_handler = self.df_to_results_values
         else:
             raise ValueError(f"Invalid {type = }")
@@ -517,6 +572,7 @@
             metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
             table_path=table_path,
             name=name,
+            columns=columns,
         )
 
         logger.debug("Querying V3IO TSDB", query=query)
@@ -627,33 +683,153 @@
             ), # pyright: ignore[reportArgumentType]
         )
 
- [contents of the 30 removed lines (630-659) are not rendered in the source diff]
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
+                    f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
+                },
+                inplace=True,
+            )
+            df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
+                mm_schemas.EventFieldType.LAST_REQUEST
+            ].map(
+                lambda last_request: datetime.fromtimestamp(
+                    last_request, tz=timezone.utc
+                )
+            )
+
+        return df.reset_index(drop=True)
+
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.ResultData.RESULT_STATUS],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["max"],
+            group_by="endpoint_id",
+        )
+        if not df.empty:
+            df.columns = [
+                col[len("max(") : -1] if "max(" in col else col for col in df.columns
+            ]
+        return df.reset_index(drop=True)
+
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.METRICS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.MetricData.METRIC_VALUE],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.drop(
+                columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
+            )
+        return df.reset_index(drop=True)
+
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ResultData.RESULT_KIND,
+            ],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
+                },
+                inplace=True,
+            )
+        return df.reset_index(drop=True)
+
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.ERRORS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.ERROR_COUNT],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["count"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
+                },
+                inplace=True,
+            )
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
+
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.LATENCY],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["avg"],
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
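The six getters added above share one pattern: normalize `endpoint_ids` to a list, query `_get_records` with an aggregation, then clean up the aggregate column names. A minimal usage sketch (the project name and endpoint IDs are hypothetical placeholders; assumes the connector can be constructed for an existing V3IO-backed project):

# Sketch only -- "my-project" and the endpoint IDs are hypothetical.
from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

connector = V3IOTSDBConnector(project="my-project")

# A single ID or a list of IDs is accepted; start/end take datetimes or
# TSDB range strings such as "0", "now", and "now-24h".
last_df = connector.get_last_request(endpoint_ids=["ep-1", "ep-2"])
errors_df = connector.get_error_count(endpoint_ids="ep-1")
latency_df = connector.get_avg_latency(endpoint_ids="ep-1", start="now-1h")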
mlrun/model_monitoring/stream_processing.py
CHANGED
@@ -169,11 +169,40 @@ class EventStreamProcessor:
             mlrun.serving.states.RootFlowStep,
             fn.set_topology(mlrun.serving.states.StepKinds.flow),
         )
+        graph.add_step(
+            "ExtractEndpointID",
+            "extract_endpoint",
+            full_event=True,
+        )
+
+        # split the graph between event with error vs valid event
+        graph.add_step(
+            "storey.Filter",
+            "FilterError",
+            after="extract_endpoint",
+            _fn="(event.get('error') is None)",
+        )
+
+        graph.add_step(
+            "storey.Filter",
+            "ForwardError",
+            after="extract_endpoint",
+            _fn="(event.get('error') is not None)",
+        )
+
+        tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+            project=self.project, secret_provider=secret_provider
+        )
+
+        tsdb_connector.handle_model_error(
+            graph,
+        )
 
         # Process endpoint event: splitting into sub-events and validate event data
         def apply_process_endpoint_event():
            graph.add_step(
                "ProcessEndpointEvent",
+                after="FilterError",
                full_event=True,
                project=self.project,
            )
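The two `storey.Filter` steps split the stream into complementary branches: `FilterError` keeps events whose body carries no `error`, while `ForwardError` keeps the rest and feeds `handle_model_error`. The `_fn` strings are predicates evaluated per event; in plain Python the routing is equivalent to this sketch (the event dicts are hypothetical):

# Equivalent routing of the FilterError / ForwardError pair (sketch).
events = [
    {"model": "m1", "error": None},                 # valid event
    {"model": "m2", "error": "inference timeout"},  # hypothetical error payload
]

valid_events = [e for e in events if e.get("error") is None]      # FilterError branch
error_events = [e for e in events if e.get("error") is not None]  # ForwardError branch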
@@ -295,9 +324,6 @@
 
         apply_storey_sample_window()
 
-        tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
-            project=self.project, secret_provider=secret_provider
-        )
         tsdb_connector.apply_monitoring_stream_steps(graph=graph)
 
         # Parquet branch
@@ -386,6 +412,38 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
         return e
 
 
+class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
+    def __init__(self, **kwargs) -> None:
+        """
+        Generate the model endpoint ID based on the event parameters and attach it to the event.
+        """
+        super().__init__(**kwargs)
+
+    def do(self, full_event) -> typing.Union[storey.Event, None]:
+        # Getting model version and function uri from event
+        # and use them for retrieving the endpoint_id
+        function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
+        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
+            return None
+
+        model = full_event.body.get(EventFieldType.MODEL)
+        if not is_not_none(model, [EventFieldType.MODEL]):
+            return None
+
+        version = full_event.body.get(EventFieldType.VERSION)
+        versioned_model = f"{model}:{version}" if version else f"{model}:latest"
+
+        endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
+            function_uri=function_uri,
+            versioned_model=versioned_model,
+        )
+
+        endpoint_id = str(endpoint_id)
+        full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
+        full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
+        return full_event
+
+
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
         """
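`ExtractEndpointID` now computes the endpoint ID once, before the error/valid split, so both branches carry it. The versioned-model naming it relies on is visible above and is easy to check in isolation:

# Sketch of the versioned-model naming used by ExtractEndpointID.
def versioned_model_name(model: str, version: str = "") -> str:
    # Fall back to the "latest" tag when no explicit version is given.
    return f"{model}:{version}" if version else f"{model}:latest"

assert versioned_model_name("churn-model", "3") == "churn-model:3"
assert versioned_model_name("churn-model") == "churn-model:latest"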
@@ -459,28 +517,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def do(self, full_event):
         event = full_event.body
 
-
-
-        function_uri = event
-        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
-            return None
-
-        model = event.get(EventFieldType.MODEL)
-        if not is_not_none(model, [EventFieldType.MODEL]):
-            return None
-
-        version = event.get(EventFieldType.VERSION)
-        versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-        endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
-            function_uri=function_uri,
-            versioned_model=versioned_model,
-        )
-
-        endpoint_id = str(endpoint_id)
-
-        event[EventFieldType.VERSIONED_MODEL] = versioned_model
-        event[EventFieldType.ENDPOINT_ID] = endpoint_id
+        versioned_model = event[EventFieldType.VERSIONED_MODEL]
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        function_uri = event[EventFieldType.FUNCTION_URI]
 
         # In case this process fails, resume state from existing record
         self.resume_state(endpoint_id)
@@ -598,6 +637,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 EventFieldType.PREDICTION: prediction,
                 EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
                 EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
+                EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
+                    self.last_request[endpoint_id]
+                ).timestamp(),
                 EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
                 EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
                 EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
mlrun/projects/operations.py
CHANGED
@@ -189,7 +189,7 @@ def run_function(
     if engine == "kfp":
         if schedule:
             raise mlrun.errors.MLRunInvalidArgumentError(
-                "Scheduling
+                "Scheduling jobs is not supported when running a workflow with the kfp engine."
             )
     return function.as_step(
         name=name, runspec=task, workdir=workdir, outputs=outputs, labels=labels
mlrun/projects/project.py
CHANGED
@@ -600,6 +600,10 @@ def _run_project_setup(
     if hasattr(mod, "setup"):
         try:
             project = getattr(mod, "setup")(project)
+            if not project or not isinstance(project, mlrun.projects.MlrunProject):
+                raise ValueError(
+                    "MLRun project_setup:setup() must return a project object"
+                )
         except Exception as exc:
             logger.error(
                 "Failed to run project_setup script",
@@ -610,7 +614,9 @@
         if save:
             project.save()
     else:
-        logger.warn(
+        logger.warn(
+            f"skipping setup, setup() handler was not found in {path.basename(setup_file_path)}"
+        )
     return project
 
 
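With the added check, a `project_setup.py` whose `setup()` handler forgets to return the project now fails fast instead of silently discarding its changes. A minimal conforming handler might look like this sketch (the `set_function` call and file names are illustrative):

# project_setup.py -- minimal sketch; names are illustrative.
import mlrun


def setup(project: mlrun.projects.MlrunProject) -> mlrun.projects.MlrunProject:
    # Mutate the project as needed...
    project.set_function("trainer.py", name="trainer", kind="job", image="mlrun/mlrun")
    # ...and always return it; _run_project_setup now raises ValueError otherwise.
    return project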
mlrun/runtimes/__init__.py
CHANGED
@@ -30,6 +30,8 @@ __all__ = [
     "MpiRuntimeV1",
 ]
 
+import typing
+
 from mlrun.runtimes.utils import resolve_spark_operator_version
 
 from ..common.runtimes.constants import MPIJobCRDVersions
@@ -181,7 +183,7 @@ class RuntimeKinds:
     ]
 
     @staticmethod
-    def is_log_collectable_runtime(kind: str):
+    def is_log_collectable_runtime(kind: typing.Optional[str]):
         """
         whether log collector can collect logs for that runtime
         :param kind: kind name
@@ -192,13 +194,18 @@
         if RuntimeKinds.is_local_runtime(kind):
             return False
 
-        if
- [removed lines 196-201 are not rendered in the source diff]
+        if (
+            kind
+            not in [
+                # dask implementation is different from other runtimes, because few runs can be run against the same
+                # runtime resource, so collecting logs on that runtime resource won't be correct, the way we collect
+                # logs for dask is by using `log_std` on client side after we execute the code against the cluster,
+                # as submitting the run with the dask client will return the run stdout.
+                # For more information head to `DaskCluster._run`.
+                RuntimeKinds.dask
+            ]
+            + RuntimeKinds.nuclio_runtimes()
+        ):
             return True
 
         return False
mlrun/runtimes/nuclio/application/application.py
CHANGED
@@ -122,6 +122,11 @@ class ApplicationSpec(NuclioSpec):
             state_thresholds=state_thresholds,
             disable_default_http_trigger=disable_default_http_trigger,
         )
+
+        # Override default min/max replicas (don't assume application is stateless)
+        self.min_replicas = min_replicas or 1
+        self.max_replicas = max_replicas or 1
+
         self.internal_application_port = (
             internal_application_port
             or mlrun.mlconf.function.application.default_sidecar_internal_port
@@ -169,7 +174,7 @@ class ApplicationStatus(NuclioStatus):
         self.application_source = application_source or None
         self.sidecar_name = sidecar_name or None
         self.api_gateway_name = api_gateway_name or None
-        self.api_gateway = api_gateway or None
+        self.api_gateway: typing.Optional[APIGateway] = api_gateway or None
         self.url = url or None
 
 
@@ -254,6 +259,15 @@ class ApplicationRuntime(RemoteRuntime):
                 "Application sidecar spec must include a command if args are provided"
             )
 
+    def prepare_image_for_deploy(self):
+        if self.spec.build.source and self.spec.build.load_source_on_run:
+            logger.warning(
+                "Application runtime requires loading the source into the application image. "
+                f"Even though {self.spec.build.load_source_on_run=}, loading on build will be forced."
+            )
+            self.spec.build.load_source_on_run = False
+        super().prepare_image_for_deploy()
+
     def deploy(
         self,
         project="",
@@ -275,6 +289,7 @@
         """
         Deploy function, builds the application image if required (self.requires_build()) or force_build is True,
         Once the image is built, the function is deployed.
+
         :param project: Project name
         :param tag: Function tag
         :param verbose: Set True for verbose logging
@@ -349,9 +364,13 @@
         )
 
     def with_source_archive(
-        self,
+        self,
+        source,
+        workdir=None,
+        pull_at_runtime: bool = False,
+        target_dir: str = None,
     ):
-        """load the code from git/tar/zip archive at
+        """load the code from git/tar/zip archive at build
 
         :param source: valid absolute path or URL to git, zip, or tar file, e.g.
             git://github.com/mlrun/something.git
@@ -359,13 +378,20 @@
             note path source must exist on the image or exist locally when run is local
             (it is recommended to use 'workdir' when source is a filepath instead)
         :param workdir: working dir relative to the archive root (e.g. './subdir') or absolute to the image root
-        :param pull_at_runtime:
+        :param pull_at_runtime: currently not supported, source must be loaded into the image during the build process
         :param target_dir: target dir on runtime pod or repo clone / archive extraction
         """
+        if pull_at_runtime:
+            logger.warning(
+                f"{pull_at_runtime=} is currently not supported for application runtime "
+                "and will be overridden to False",
+                pull_at_runtime=pull_at_runtime,
+            )
+
         self._configure_mlrun_build_with_source(
             source=source,
             workdir=workdir,
-            pull_at_runtime=
+            pull_at_runtime=False,
             target_dir=target_dir,
         )
 
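After this change, `pull_at_runtime=True` on an application runtime is only warned about and then coerced to False, so call sites need not change. A usage sketch (the function name is hypothetical and assumes an application function can be created via new_function; the archive URL is the docstring's own example):

import mlrun

app = mlrun.new_function(name="my-app", kind="application")  # hypothetical function
# The source is always baked into the image at build time for application
# runtimes; pull_at_runtime=True merely logs a warning and is overridden.
app.with_source_archive(
    "git://github.com/mlrun/something.git",
    workdir="./subdir",
    pull_at_runtime=True,
)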
@@ -551,6 +577,13 @@
             args=self.spec.args,
         )
 
+        if self.spec.build.source in [".", "./"]:
+            logger.info(
+                "The application is configured to use the project's source. "
+                "Application runtime requires loading the source into the application image. "
+                "Loading on build will be forced regardless of whether 'pull_at_runtime=True' was configured."
+            )
+
         with_mlrun = self._resolve_build_with_mlrun(with_mlrun)
         return self._build_image(
             builder_env=builder_env,
@@ -580,6 +613,13 @@
         )
         function.spec.nuclio_runtime = mlrun.utils.get_in(spec, "spec.runtime")
 
+        # default the reverse proxy logger level to info
+        logger_sinks_key = "spec.loggerSinks"
+        if not function.spec.config.get(logger_sinks_key):
+            function.set_config(
+                logger_sinks_key, [{"level": "info", "sink": "myStdoutLoggerSink"}]
+            )
+
     def _configure_application_sidecar(self):
         # Save the application image in the status to allow overriding it with the reverse proxy entry point
         if self.spec.image and (
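The reverse-proxy function now gets an `info` logger sink only when `spec.loggerSinks` is not already set, so the default stays overridable. A sketch of raising the level before deploy (assumes `app` is the application function; the sink name mirrors the default above):

# Sketch: override the reverse proxy log level before deploying.
app.set_config("spec.loggerSinks", [{"level": "debug", "sink": "myStdoutLoggerSink"}])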
mlrun/runtimes/pod.py
CHANGED
@@ -1174,9 +1174,9 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
         """
         if node_name:
             self.spec.node_name = node_name
-        if node_selector:
+        if node_selector is not None:
             self.spec.node_selector = node_selector
-        if affinity:
+        if affinity is not None:
             self.spec.affinity = affinity
         if tolerations is not None:
             self.spec.tolerations = tolerations
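Switching from truthiness to `is not None` matters for empty containers: an explicitly passed empty dict previously looked the same as an omitted argument, so an existing node selector could not be cleared. A sketch of the difference:

# Sketch: why `is not None` differs from a truthiness check for dicts.
node_selector: dict = {}  # caller explicitly asks for "no node selector"

updated_old = bool(node_selector)        # False -> old code skipped the assignment
updated_new = node_selector is not None  # True  -> new code applies the empty dict

assert (updated_old, updated_new) == (False, True)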
mlrun/runtimes/remotesparkjob.py
CHANGED
@@ -102,16 +102,13 @@ class RemoteSparkRuntime(KubejobRuntime):
 
     @classmethod
     def deploy_default_image(cls):
-
-        from mlrun.run import new_function
-
-        sj = new_function(
+        sj = mlrun.new_function(
             kind="remote-spark", name="remote-spark-default-image-deploy-temp"
         )
         sj.spec.build.image = cls.default_image
         sj.with_spark_service(spark_service="dummy-spark")
         sj.deploy()
-        get_run_db().delete_function(name=sj.metadata.name)
+        mlrun.get_run_db().delete_function(name=sj.metadata.name)
 
     def is_deployed(self):
         if (