mlrun 1.6.0rc6__py3-none-any.whl → 1.6.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__main__.py +32 -31
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/workflow.py +2 -0
- mlrun/config.py +3 -3
- mlrun/datastore/base.py +9 -3
- mlrun/datastore/datastore.py +10 -7
- mlrun/datastore/datastore_profile.py +19 -2
- mlrun/datastore/dbfs_store.py +6 -6
- mlrun/datastore/s3.py +6 -2
- mlrun/datastore/sources.py +12 -2
- mlrun/datastore/targets.py +43 -20
- mlrun/db/httpdb.py +22 -0
- mlrun/feature_store/feature_set.py +5 -2
- mlrun/feature_store/retrieval/spark_merger.py +7 -1
- mlrun/kfpops.py +1 -1
- mlrun/launcher/client.py +1 -6
- mlrun/launcher/remote.py +5 -3
- mlrun/model.py +2 -2
- mlrun/model_monitoring/batch_application.py +61 -94
- mlrun/package/packager.py +115 -89
- mlrun/package/packagers/default_packager.py +66 -65
- mlrun/package/packagers/numpy_packagers.py +109 -62
- mlrun/package/packagers/pandas_packagers.py +12 -23
- mlrun/package/packagers/python_standard_library_packagers.py +35 -57
- mlrun/package/packagers_manager.py +16 -13
- mlrun/package/utils/_pickler.py +8 -18
- mlrun/package/utils/_supported_format.py +1 -1
- mlrun/projects/pipelines.py +63 -4
- mlrun/projects/project.py +34 -11
- mlrun/runtimes/__init__.py +6 -0
- mlrun/runtimes/base.py +12 -1
- mlrun/runtimes/daskjob.py +73 -5
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -0
- mlrun/runtimes/function.py +53 -4
- mlrun/runtimes/kubejob.py +1 -1
- mlrun/runtimes/local.py +9 -9
- mlrun/runtimes/pod.py +1 -1
- mlrun/runtimes/remotesparkjob.py +1 -0
- mlrun/runtimes/serving.py +11 -1
- mlrun/runtimes/sparkjob/spark3job.py +4 -1
- mlrun/runtimes/utils.py +1 -46
- mlrun/utils/helpers.py +1 -17
- mlrun/utils/notifications/notification_pusher.py +27 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/METADATA +7 -6
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/RECORD +50 -50
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/WHEEL +1 -1
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/LICENSE +0 -0
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
@@ -24,6 +24,7 @@ from typing import Any, Dict, List, Optional, Union
 from urllib.parse import urlparse

 import pandas as pd
+from mergedeep import merge

 import mlrun
 import mlrun.utils.helpers
@@ -293,6 +294,8 @@ def add_target_steps(graph, resource, targets, to_df=False, final_step=None):
         driver = get_target_driver(target, resource)
         table = driver.get_table_object() or table
         driver.update_resource_status()
+        if target.after_step:
+            target.attributes["infer_columns_from_data"] = True
         driver.add_writer_step(
             graph,
             target.after_step or final_step,
@@ -435,17 +438,20 @@ class BaseStoreTarget(DataTargetBase):
             prefix=self.credentials_prefix,
         )

-    def _get_store(self):
+    def _get_store_and_path(self):
         credentials_prefix_secrets = (
             {"CREDENTIALS_PREFIX": self.credentials_prefix}
             if self.credentials_prefix
             else None
         )
-        store, _ = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-        return store
+        if self.get_target_path().startswith("ds://"):
+            return store, store.url + resolved_store_path
+        else:
+            return store, self.get_target_path()

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
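The new `_get_store_and_path` helper returns the datastore object together with a resolved target path: `ds://` datastore-profile URLs are expanded into the store's own URL, while concrete URLs pass through unchanged. A minimal sketch of that resolution rule, assuming a store object exposing a `url` attribute (the `DummyStore` class and URLs below are illustrative, not mlrun API):

    class DummyStore:
        # Stand-in for the object returned by mlrun.store_manager.get_or_create_store
        def __init__(self, url):
            self.url = url

    def resolve_target_path(target_path, store, resolved_store_path):
        # ds:// URLs name a datastore profile; the concrete location is the
        # store's URL plus the path resolved from that profile.
        if target_path.startswith("ds://"):
            return store.url + resolved_store_path
        # Other schemes (s3://, v3io://, local paths) are already concrete.
        return target_path

    store = DummyStore("s3://my-bucket")
    print(resolve_target_path("ds://my-profile/data/set.parquet", store, "/data/set.parquet"))
    # -> s3://my-bucket/data/set.parquet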
@@ -494,17 +500,18 @@ class BaseStoreTarget(DataTargetBase):
             df.write.mode("overwrite").save(**options)
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            storage_options = self._get_store().get_storage_options()
+            store, target_path = self._get_store_and_path()
+            storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
                 if dask_options["format"] == "parquet":
                     df.to_parquet(
-                        generate_path_with_chunk(self, chunk_id),
+                        generate_path_with_chunk(self, chunk_id, target_path),
                         storage_options=storage_options,
                     )
                 elif dask_options["format"] == "csv":
                     df.to_csv(
-                        generate_path_with_chunk(self, chunk_id),
+                        generate_path_with_chunk(self, chunk_id, target_path),
                         storage_options=storage_options,
                     )
                 else:
@@ -514,8 +521,9 @@ class BaseStoreTarget(DataTargetBase):
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            target_path = generate_path_with_chunk(self, chunk_id)
-            file_system = self._get_store().get_filesystem(False)
+            store, target_path = self._get_store_and_path()
+            target_path = generate_path_with_chunk(self, chunk_id, target_path)
+            file_system = store.get_filesystem(False)
             if file_system.protocol == "file":
                 dir = os.path.dirname(target_path)
                 if dir:
@@ -551,10 +559,16 @@ class BaseStoreTarget(DataTargetBase):
             # Partitioning will be performed on timestamp_key and then on self.partition_cols
             # (We might want to give the user control on this order as additional functionality)
             partition_cols += self.partition_cols or []
-
+
+        storage_options = store.get_storage_options()
+        if storage_options and self.storage_options:
+            storage_options = merge(storage_options, self.storage_options)
+        else:
+            storage_options = storage_options or self.storage_options
+
         self._write_dataframe(
             target_df,
-            self.storage_options,
+            storage_options,
             target_path,
             partition_cols=partition_cols,
             **kwargs,
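Store-derived credentials and user-supplied `storage_options` are now deep-merged with mergedeep rather than one side replacing the other. A quick illustration of why a deep merge matters here (option keys are illustrative): `merge` mutates and returns its first argument, later sources win on conflicting keys, and nested dicts are combined instead of being replaced wholesale as a `dict.update`-style merge would do:

    from mergedeep import merge

    store_options = {"anon": False, "client_kwargs": {"region_name": "us-east-1"}}
    user_options = {"client_kwargs": {"endpoint_url": "http://localhost:9000"}}

    combined = merge(store_options, user_options)
    print(combined)
    # {'anon': False, 'client_kwargs': {'region_name': 'us-east-1',
    #                                   'endpoint_url': 'http://localhost:9000'}}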
@@ -603,6 +617,7 @@ class BaseStoreTarget(DataTargetBase):

         driver._resource = resource
         driver.run_id = spec.run_id
+        driver.after_step = spec.after_step
         return driver

     def get_table_object(self):
@@ -673,7 +688,8 @@ class BaseStoreTarget(DataTargetBase):
         raise NotImplementedError()

     def purge(self):
-        self._get_store().rm(self.get_target_path(), recursive=True)
+        store, target_path = self._get_store_and_path()
+        store.rm(target_path, recursive=True)

     def as_df(
         self,
@@ -860,18 +876,25 @@ class ParquetTarget(BaseStoreTarget):
             "update_last_written": featureset_status.update_last_written_for_target
         }

+        store, target_path = self._get_store_and_path()
+
+        storage_options = store.get_storage_options()
+        if storage_options and self.storage_options:
+            storage_options = merge(storage_options, self.storage_options)
+        else:
+            storage_options = storage_options or self.storage_options
+
         graph.add_step(
             name=self.name or "ParquetTarget",
             after=after,
             graph_shape="cylinder",
             class_name="storey.ParquetTarget",
-            path=self.get_target_path(),
+            path=target_path,
             columns=column_list,
             index_cols=tuple_key_columns,
             partition_cols=partition_cols,
             time_field=timestamp_key,
-            storage_options=self.storage_options
-            or self._get_store().get_storage_options(),
+            storage_options=storage_options,
             max_events=self.max_events,
             flush_after_seconds=self.flush_after_seconds,
             **self.attributes,
@@ -1009,17 +1032,17 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-
+        store, target_path = self._get_store_and_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
             graph_shape="cylinder",
             class_name="storey.CSVTarget",
-            path=self.get_target_path(),
+            path=target_path,
             columns=column_list,
             header=True,
             index_cols=key_columns,
-            storage_options=self._get_store().get_storage_options(),
+            storage_options=store.get_storage_options(),
             **self.attributes,
         )

@@ -1923,8 +1946,8 @@ def _get_target_path(driver, resource, run_id_mode=False):
     return f"{data_prefix}/{kind_prefix}/{name}{suffix}"


-def generate_path_with_chunk(target, chunk_id):
-    prefix, suffix = os.path.splitext(target.get_target_path())
+def generate_path_with_chunk(target, chunk_id, path):
+    prefix, suffix = os.path.splitext(path)
     if chunk_id and not target.partitioned and not target.time_partitioning_granularity:
         return f"{prefix}/{chunk_id:0>4}{suffix}"
-    return target.get_target_path()
+    return path
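Since `generate_path_with_chunk` now receives the already-resolved path instead of reading it off the target, its behavior is easy to check in isolation; a small demonstration with a stand-in target object:

    import os
    from types import SimpleNamespace

    def generate_path_with_chunk(target, chunk_id, path):
        prefix, suffix = os.path.splitext(path)
        if chunk_id and not target.partitioned and not target.time_partitioning_granularity:
            return f"{prefix}/{chunk_id:0>4}{suffix}"
        return path

    target = SimpleNamespace(partitioned=False, time_partitioning_granularity=None)
    print(generate_path_with_chunk(target, 7, "s3://bucket/sets/data.parquet"))
    # -> s3://bucket/sets/data/0007.parquet (zero-padded chunk file under the base name)
    print(generate_path_with_chunk(target, 0, "s3://bucket/sets/data.parquet"))
    # -> s3://bucket/sets/data.parquet (no chunk id: the path is returned unchanged)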
mlrun/db/httpdb.py
CHANGED
@@ -3143,6 +3143,21 @@ class HTTPRunDB(RunDBInterface):
             body=dict_to_json(authorization_verification_input.dict()),
         )

+    def list_api_gateways(self, project=None):
+        """
+        Returns a list of Nuclio api gateways
+        :param project: optional str parameter to filter by project, if not passed, default Nuclio's value is taken
+
+        :return: json with the list of Nuclio Api Gateways
+            (json example is here
+            https://github.com/nuclio/nuclio/blob/development/docs/reference/api/README.md#listing-all-api-gateways)
+        """
+        project = project or config.default_project
+        error = "list api gateways"
+        endpoint_path = f"projects/{project}/nuclio/api-gateways"
+        resp = self.api_call("GET", endpoint_path, error)
+        return resp.json()
+
     def trigger_migrations(self) -> Optional[mlrun.common.schemas.BackgroundTask]:
         """Trigger migrations (will do nothing if no migrations are needed) and wait for them to finish if actually
         triggered
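Assuming a reachable MLRun API server configured for Nuclio, the new method is available on the run-database client (the project name below is a placeholder):

    import mlrun

    db = mlrun.get_run_db()  # an HTTPRunDB when MLRUN_DBPATH points at an API server
    gateways = db.list_api_gateways(project="my-project")
    print(gateways)  # raw JSON dict, in the format of Nuclio's list-api-gateways response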
@@ -3238,6 +3253,7 @@ class HTTPRunDB(RunDBInterface):
         source: Optional[str] = None,
         run_name: Optional[str] = None,
         namespace: Optional[str] = None,
+        notifications: typing.List[mlrun.model.Notification] = None,
     ):
         """
         Submitting workflow for a remote execution.
@@ -3250,6 +3266,7 @@ class HTTPRunDB(RunDBInterface):
         :param source: source url of the project
         :param run_name: run name to override the default: 'workflow-runner-<workflow name>'
         :param namespace: kubernetes namespace if other than default
+        :param notifications: list of notifications to send when workflow execution is completed

         :returns: :py:class:`~mlrun.common.schemas.WorkflowResponse`.
         """
@@ -3281,6 +3298,11 @@ class HTTPRunDB(RunDBInterface):
         req["spec"] = workflow_spec
         req["spec"]["image"] = image
         req["spec"]["name"] = workflow_name
+        if notifications:
+            req["notifications"] = [
+                notification.to_dict() for notification in notifications
+            ]
+
         response = self.api_call(
             "POST",
             f"projects/{project}/workflows/{workflow_name}/submit",
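A sketch of passing the new notifications argument when submitting a remote workflow; the notification fields follow mlrun.model.Notification, while the webhook value and any names shown are placeholders:

    import mlrun
    from mlrun.model import Notification

    notification = Notification(
        kind="slack",
        name="workflow-done",
        message="workflow finished",
        severity="info",
        when=["completed", "error"],
        secret_params={"webhook": "https://hooks.slack.com/services/..."},  # placeholder
    )

    db = mlrun.get_run_db()
    # Other submit_workflow arguments (project, name, image, source, ...) omitted:
    # db.submit_workflow(..., notifications=[notification])

As the hunk above shows, each notification is serialized with to_dict() and placed under the request's "notifications" key.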
mlrun/feature_store/feature_set.py
CHANGED

@@ -16,6 +16,7 @@ from datetime import datetime
 from typing import Dict, List, Optional, Union

 import pandas as pd
+import pytz
 from storey import EmitEveryEvent, EmitPolicy

 import mlrun
@@ -929,9 +930,11 @@ class FeatureSet(ModelObj):
         )
         df = self.spec.source.to_dataframe(
             columns=columns,
+            # overwrite `source.start_time` when the source is schedule.
             start_time=start_time
-            or pd.Timestamp.min,
-            end_time=end_time or pd.Timestamp.max,
+            or pd.to_datetime(pd.Timestamp.min, unit="ns").replace(tzinfo=pytz.UTC),
+            end_time=end_time
+            or pd.to_datetime(pd.Timestamp.max, unit="ns").replace(tzinfo=pytz.UTC),
             time_field=time_column,
             **kwargs,
         )
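The bounds default to timezone-aware UTC timestamps because pandas refuses to compare naive and aware datetimes, which is exactly what happens when a scheduled source carries tz-aware event times. A short demonstration of the failure mode and the new default:

    import pandas as pd
    import pytz

    naive_min = pd.Timestamp.min
    aware_now = pd.Timestamp.now(tz="UTC")

    try:
        naive_min < aware_now
    except TypeError as exc:
        print(exc)  # comparing tz-naive and tz-aware timestamps raises

    aware_min = pd.to_datetime(pd.Timestamp.min, unit="ns").replace(tzinfo=pytz.UTC)
    print(aware_min < aware_now)  # True: both sides are UTC-aware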
mlrun/feature_store/retrieval/spark_merger.py
CHANGED

@@ -172,11 +172,17 @@ class SparkFeatureMerger(BaseMerger):
         # when we upgrade pyspark, we should check whether this workaround is still necessary
         # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
         if semver.parse(pd.__version__)["major"] >= 2:
+            import pyspark.sql.functions as pyspark_functions
+
             type_conversion_dict = {}
             for field in df.schema.fields:
                 if str(field.dataType) == "TimestampType":
                     df = df.withColumn(
-                        field.name,
+                        field.name,
+                        pyspark_functions.date_format(
+                            pyspark_functions.to_timestamp(field.name),
+                            "yyyy-MM-dd'T'HH:mm:ss.SSS",
+                        ),
                     )
                     type_conversion_dict[field.name] = "datetime64[ns]"
         df = df.toPandas()
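The workaround renders Spark timestamps as ISO-like strings before toPandas(), then type_conversion_dict casts them back to datetime64[ns] on the pandas side. A minimal sketch of the same round trip, assuming a local pyspark installation (session, column name, and sample value are illustrative):

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([("2024-01-02 03:04:05.678",)], ["ts"]).select(
        F.to_timestamp("ts").alias("ts")
    )

    # Format the timestamp as a string that pandas>=2 parses cleanly...
    df = df.withColumn(
        "ts", F.date_format(F.to_timestamp("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSS")
    )
    # ...then restore the dtype after conversion.
    pdf = df.toPandas().astype({"ts": "datetime64[ns]"})
    print(pdf.dtypes)  # ts    datetime64[ns]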
mlrun/kfpops.py
CHANGED
@@ -93,7 +93,7 @@ def write_kfpmeta(struct):
             val = results[key]
             try:
                 path = "/".join([KFP_ARTIFACTS_DIR, key])
-                logger.info("
+                logger.info("Writing artifact output", path=path, val=val)
                 with open(path, "w") as fp:
                     fp.write(str(val))
             except Exception as exc:
mlrun/launcher/client.py
CHANGED
@@ -52,12 +52,7 @@ class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
         if runtime.kind in mlrun.runtimes.RuntimeKinds.nuclio_runtimes():
             return

-        build = runtime.spec.build
-        require_build = (
-            build.commands
-            or build.requirements
-            or (build.source and not build.load_source_on_run)
-        )
+        require_build = runtime.requires_build()
         image = runtime.spec.image
         # we allow users to not set an image, in that case we'll use the default
         if (
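Both launchers now delegate to a shared runtime.requires_build() predicate instead of duplicating the check inline. A hypothetical mirror of that predicate (not mlrun's exact code) with a stand-in build spec:

    from types import SimpleNamespace

    def requires_build(build) -> bool:
        # A build is needed when there are build commands or requirements, or
        # when source must be baked into the image rather than loaded at runtime.
        return bool(
            build.commands
            or build.requirements
            or (build.source and not build.load_source_on_run)
        )

    build = SimpleNamespace(
        commands=["pip install xgboost"], requirements=None,
        source=None, load_source_on_run=False,
    )
    print(requires_build(build))  # True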
mlrun/launcher/remote.py
CHANGED
@@ -90,9 +90,11 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
             runtime.deploy(skip_deployed=True, show_on_failure=True)

         else:
-
-
-
+            if runtime.requires_build():
+                logger.warning(
+                    "Function image is not built/ready and function requires build - execution will fail. "
+                    "Need to set auto_build=True or use .deploy() method first"
+                )

         if runtime.verbose:
             logger.info(f"runspec:\n{run.to_yaml()}")
mlrun/model.py
CHANGED
@@ -922,7 +922,7 @@ class RunSpec(ModelObj):
         """
         Set the dictionary of k8s states (pod phase) to thresholds time strings.
         The state will be matched against the pod's status. The threshold should be a time string that conforms
-        to timelength python package standards and is at least 1
+        to timelength python package standards and is at least 1 minute (-1 for infinite). If the phase is active
         for longer than the threshold, the run will be marked as aborted and the pod will be deleted.
         See mlconf.function.spec.state_thresholds for the state options and default values.

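For reference, a thresholds dictionary matching the docstring's contract might look as follows; the state names and durations are examples only, and the authoritative list lives in mlconf.function.spec.state_thresholds:

    state_thresholds = {
        "pending_scheduled": "1h",   # abort runs stuck in scheduling for over an hour
        "executing": "24h",          # abort runs executing for over a day
        "image_pull_backoff": "-1",  # -1 disables the threshold (infinite)
    }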
@@ -1433,7 +1433,7 @@ class RunObject(RunTemplate):
         self.logs(watch=False)
         if raise_on_failure and state != mlrun.runtimes.constants.RunStates.completed:
             raise mlrun.errors.MLRunRuntimeError(
-                f"
+                f"Task {self.metadata.name} did not complete (state={state})"
             )

         return state
mlrun/model_monitoring/batch_application.py
CHANGED

@@ -11,25 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+
 import concurrent.futures
 import datetime
 import json
 import os
 import re
-from typing import
+from typing import Callable, Optional, Tuple

-import numpy as np
 import pandas as pd

 import mlrun
-import mlrun.common.helpers
-import mlrun.common.model_monitoring.helpers
-import mlrun.common.schemas.model_monitoring
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
-import mlrun.utils.v3io_clients
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
 from mlrun.model_monitoring.batch import calculate_inputs_statistics
@@ -72,46 +67,39 @@ class BatchApplicationProcessor:

         # Get the batch interval range
         self.batch_dict = context.parameters[
-
+            mm_constants.EventFieldType.BATCH_INTERVALS_DICT
         ]

-        # TODO: This will be removed
+        # TODO: This will be removed once the job params can be parsed with different types
         # Convert batch dict string into a dictionary
         if isinstance(self.batch_dict, str):
             self._parse_batch_dict_str()
         # If provided, only model endpoints in that that list will be analyzed
         self.model_endpoints = context.parameters.get(
-
-        )
-        self.v3io_access_key = os.environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
+            mm_constants.EventFieldType.MODEL_ENDPOINTS, None
         )
+        self.model_monitoring_access_key = self._get_model_monitoring_access_key()
         self.parquet_directory = get_monitoring_parquet_path(
             project=project,
-            kind=
+            kind=mm_constants.FileTargetKind.BATCH_CONTROLLER_PARQUET,
         )
         self.storage_options = None
         if not mlrun.mlconf.is_ce_mode():
-            self._initialize_v3io_configurations(
-                model_monitoring_access_key=self.model_monitoring_access_key
-            )
+            self._initialize_v3io_configurations()
         elif self.parquet_directory.startswith("s3://"):
             self.storage_options = mlrun.mlconf.get_s3_storage_options()

-
-
-
-
-
-
-
-
-
-        self.
-
-        self.v3io_access_key = v3io_access_key or os.environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = model_monitoring_access_key
+    @staticmethod
+    def _get_model_monitoring_access_key() -> Optional[str]:
+        access_key = os.getenv(mm_constants.ProjectSecretKeys.ACCESS_KEY)
+        # allow access key to be empty and don't fetch v3io access key if not needed
+        if access_key is None:
+            access_key = mlrun.mlconf.get_v3io_access_key()
+        return access_key
+
+    def _initialize_v3io_configurations(self) -> None:
+        self.v3io_framesd = mlrun.mlconf.v3io_framesd
+        self.v3io_api = mlrun.mlconf.v3io_api
         self.storage_options = dict(
             v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
         )
@@ -126,9 +114,7 @@ class BatchApplicationProcessor:
             self.project
         ).list_model_monitoring_functions()
         if application:
-            applications_names = np.unique(
-                [app.metadata.name for app in application]
-            ).tolist()
+            applications_names = list({app.metadata.name for app in application})
         else:
             logger.info("There are no monitoring application found in this project")
             applications_names = []
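The numpy-based deduplication was replaced with a plain set comprehension, which also let the numpy import be dropped above; note that unlike np.unique, a set does not return the names sorted:

    from types import SimpleNamespace

    apps = [SimpleNamespace(metadata=SimpleNamespace(name=n)) for n in ("b", "a", "b")]
    names = list({app.metadata.name for app in apps})
    print(sorted(names))  # ['a', 'b'] -- duplicates removed; sort only if order matters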
@@ -144,26 +130,18 @@ class BatchApplicationProcessor:
         futures = []
         for endpoint in endpoints:
             if (
-                endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.ACTIVE
-                ]
-                and endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.MONITORING_MODE
-                ]
-                == mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled.value
+                endpoint[mm_constants.EventFieldType.ACTIVE]
+                and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
+                == mm_constants.ModelMonitoringMode.enabled.value
             ):
                 # Skip router endpoint:
                 if (
-                    int(
-                        endpoint[
-                            mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_TYPE
-                        ]
-                    )
-                    == mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
+                    int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
+                    == mm_constants.EndpointType.ROUTER
                 ):
                     # Router endpoint has no feature stats
                     logger.info(
-                        f"{endpoint[
+                        f"{endpoint[mm_constants.EventFieldType.UID]} is router skipping"
                     )
                     continue
                 future = pool.submit(
@@ -184,10 +162,11 @@ class BatchApplicationProcessor:

         self._delete_old_parquet()

-    @staticmethod
+    @classmethod
     def model_endpoint_process(
+        cls,
         endpoint: dict,
-        applications_names: List[str],
+        applications_names: list[str],
         bath_dict: dict,
         project: str,
         parquet_directory: str,
@@ -207,20 +186,14 @@ class BatchApplicationProcessor:
         :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.

         """
-        endpoint_id = endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
+        endpoint_id = endpoint[mm_constants.EventFieldType.UID]
         try:
             # Getting batch interval start time and end time
-            start_time, end_time =
-                bath_dict
-            )
+            start_time, end_time = cls._get_interval_range(bath_dict)
             m_fs = fstore.get_feature_set(
-                endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
-                ]
+                endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
             )
-            labels = endpoint[
-                mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
-            ]
+            labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
             if labels:
                 if isinstance(labels, str):
                     labels = json.loads(labels)
@@ -232,7 +205,7 @@ class BatchApplicationProcessor:

             try:
                 # get sample data
-                df =
+                df = cls._get_sample_df(
                     m_fs,
                     endpoint_id,
                     end_time,
@@ -245,14 +218,10 @@ class BatchApplicationProcessor:
                 logger.warn(
                     "Not enough model events since the beginning of the batch interval",
                     featureset_name=m_fs.metadata.name,
-                    endpoint=endpoint[
-                        mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                    ],
+                    endpoint=endpoint[mm_constants.EventFieldType.UID],
                     min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
-                    start_time=
-
-                    ),
-                    end_time=str(datetime.datetime.now()),
+                    start_time=start_time,
+                    end_time=end_time,
                 )
                 return

@@ -264,9 +233,7 @@ class BatchApplicationProcessor:
                 logger.warn(
                     "Parquet not found, probably due to not enough model events",
                     # parquet_target=m_fs.status.targets[0].path, TODO:
-                    endpoint=endpoint[
-                        mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                    ],
+                    endpoint=endpoint[mm_constants.EventFieldType.UID],
                     min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                 )
                 return
@@ -282,15 +249,11 @@ class BatchApplicationProcessor:
             m_fs.save()

             # Get the timestamp of the latest request:
-            latest_request = df[
-                mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP
-            ].iloc[-1]
+            latest_request = df[mm_constants.EventFieldType.TIMESTAMP].iloc[-1]

             # Get the feature stats from the model endpoint for reference data
             feature_stats = json.loads(
-                endpoint[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS
-                ]
+                endpoint[mm_constants.EventFieldType.FEATURE_STATS]
             )

             # Get the current stats:
@@ -300,7 +263,7 @@ class BatchApplicationProcessor:
             )

             # create and push data to all applications
-
+            cls._push_to_applications(
                 current_stats,
                 feature_stats,
                 parquet_directory,
@@ -314,22 +277,27 @@ class BatchApplicationProcessor:

         except FileNotFoundError as e:
             logger.error(
-                f"Exception for endpoint {endpoint[
+                f"Exception for endpoint {endpoint[mm_constants.EventFieldType.UID]}"
             )
             return endpoint_id, e

     @staticmethod
-    def _get_interval_range(
+    def _get_interval_range(
+        batch_dict: dict[str, int],
+        now_func: Callable[[], datetime.datetime] = datetime.datetime.now,
+    ) -> Tuple[datetime.datetime, datetime.datetime]:
         """Getting batch interval time range"""
         minutes, hours, days = (
-            batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.MINUTES],
-            batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.HOURS],
-            batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.DAYS],
+            batch_dict[mm_constants.EventFieldType.MINUTES],
+            batch_dict[mm_constants.EventFieldType.HOURS],
+            batch_dict[mm_constants.EventFieldType.DAYS],
+        )
+        end_time = now_func() - datetime.timedelta(
+            seconds=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
         )
-        start_time = datetime.datetime.now() - datetime.timedelta(
+        start_time = end_time - datetime.timedelta(
             minutes=minutes, hours=hours, days=days
         )
-        end_time = datetime.datetime.now()
         return start_time, end_time

     def _parse_batch_dict_str(self):
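The reworked _get_interval_range now shifts end_time back by the parquet batching timeout, so late-arriving rows are not cut off, and threads an injectable now_func clock through the computation, which makes the window deterministic under test. A standalone paraphrase (the flat parameters replace mlrun's batch_dict and config lookup):

    import datetime

    def get_interval_range(minutes, hours, days, timeout_secs,
                           now_func=datetime.datetime.now):
        # End the window timeout_secs before "now"; start it one interval earlier.
        end_time = now_func() - datetime.timedelta(seconds=timeout_secs)
        start_time = end_time - datetime.timedelta(
            minutes=minutes, hours=hours, days=days
        )
        return start_time, end_time

    # Injecting a fixed clock pins the window for tests:
    fixed_now = lambda: datetime.datetime(2024, 1, 1, 12, 0, 0)
    print(get_interval_range(0, 1, 0, timeout_secs=60, now_func=fixed_now))
    # (datetime(2024, 1, 1, 10, 59), datetime(2024, 1, 1, 11, 59))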
@@ -357,7 +325,7 @@ class BatchApplicationProcessor:
             ("minute", "%M"),
         ]:
             schedule_time_str += f"{unit}={schedule_time.strftime(fmt)}/"
-        endpoint_str = f"{mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID}={endpoint_id}"
+        endpoint_str = f"{mm_constants.EventFieldType.ENDPOINT_ID}={endpoint_id}"

         return f"{parquet_directory}/{schedule_time_str}/{endpoint_str}"
@@ -374,10 +342,11 @@ class BatchApplicationProcessor:

         base_directory = get_monitoring_parquet_path(
             project=self.project,
-            kind=
+            kind=mm_constants.FileTargetKind.BATCH_CONTROLLER_PARQUET,
         )
         target = ParquetTarget(path=base_directory)
-        fs = target._get_store().get_filesystem()
+        store, _ = target._get_store_and_path()
+        fs = store.get_filesystem()

         try:
             # List all subdirectories in the base directory
@@ -452,7 +421,7 @@ class BatchApplicationProcessor:
             mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
             mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
                 project=project,
-                application_name=
+                application_name=mm_constants.MonitoringFunctionNames.WRITER,
             ),
         }
         for app_name in applications_names:
@@ -500,9 +469,7 @@ class BatchApplicationProcessor:
            }  # to avoid exception when the taf is not latest
            entity_rows = pd.DataFrame(
                {
-                    mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: [
-                        endpoint_id
-                    ],
+                    mm_constants.EventFieldType.ENDPOINT_ID: [endpoint_id],
                    "scheduled_time": [end_time],
                }
            )
@@ -512,12 +479,12 @@ class BatchApplicationProcessor:
             entity_timestamp_column="scheduled_time",
             start_time=start_time,
             end_time=end_time,
-            timestamp_for_filtering=
+            timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
             target=ParquetTarget(
                 path=parquet_directory,
                 time_partitioning_granularity="minute",
                 partition_cols=[
-
+                    mm_constants.EventFieldType.ENDPOINT_ID,
                 ],
                 storage_options=storage_options,
             ),