mlrun 1.5.0rc12__py3-none-any.whl → 1.5.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (45)
  1. mlrun/__main__.py +31 -2
  2. mlrun/api/api/endpoints/functions.py +110 -52
  3. mlrun/api/crud/model_monitoring/deployment.py +208 -38
  4. mlrun/api/crud/model_monitoring/helpers.py +19 -6
  5. mlrun/api/crud/model_monitoring/model_endpoints.py +14 -1
  6. mlrun/api/db/sqldb/db.py +3 -1
  7. mlrun/api/utils/builder.py +2 -4
  8. mlrun/common/model_monitoring/helpers.py +19 -5
  9. mlrun/common/schemas/model_monitoring/constants.py +69 -0
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +10 -0
  11. mlrun/config.py +30 -12
  12. mlrun/datastore/__init__.py +1 -0
  13. mlrun/datastore/sources.py +4 -30
  14. mlrun/datastore/targets.py +68 -31
  15. mlrun/db/httpdb.py +20 -6
  16. mlrun/feature_store/api.py +3 -31
  17. mlrun/feature_store/feature_vector.py +1 -1
  18. mlrun/feature_store/retrieval/base.py +8 -3
  19. mlrun/launcher/remote.py +3 -3
  20. mlrun/lists.py +11 -0
  21. mlrun/model_monitoring/__init__.py +0 -1
  22. mlrun/model_monitoring/api.py +1 -1
  23. mlrun/model_monitoring/application.py +313 -0
  24. mlrun/model_monitoring/batch_application.py +526 -0
  25. mlrun/model_monitoring/batch_application_handler.py +32 -0
  26. mlrun/model_monitoring/evidently_application.py +89 -0
  27. mlrun/model_monitoring/helpers.py +39 -3
  28. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +37 -0
  29. mlrun/model_monitoring/tracking_policy.py +4 -4
  30. mlrun/model_monitoring/writer.py +37 -0
  31. mlrun/projects/pipelines.py +38 -4
  32. mlrun/projects/project.py +257 -43
  33. mlrun/run.py +5 -2
  34. mlrun/runtimes/__init__.py +2 -0
  35. mlrun/runtimes/function.py +2 -1
  36. mlrun/utils/helpers.py +12 -0
  37. mlrun/utils/http.py +3 -0
  38. mlrun/utils/version/version.json +2 -2
  39. {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/METADATA +5 -5
  40. {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/RECORD +45 -40
  41. /mlrun/model_monitoring/{model_monitoring_batch.py → batch.py} +0 -0
  42. {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/LICENSE +0 -0
  43. {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/WHEEL +0 -0
  44. {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/entry_points.txt +0 -0
  45. {mlrun-1.5.0rc12.dist-info → mlrun-1.5.0rc13.dist-info}/top_level.txt +0 -0
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -77,6 +77,30 @@ class EventFieldType:
     DRIFT_DETECTED_THRESHOLD = "drift_detected_threshold"
     POSSIBLE_DRIFT_THRESHOLD = "possible_drift_threshold"

+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+
+
+class ApplicationEvent:
+    APPLICATION_NAME = "application_name"
+    CURRENT_STATS = "current_stats"
+    FEATURE_STATS = "feature_stats"
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    SCHEDULE_TIME = "schedule_time"
+    LAST_REQUEST = "last_request"
+    ENDPOINT_ID = "endpoint_id"
+    OUTPUT_STREAM_URI = "output_stream_uri"
+
+
+class WriterEvent:
+    APPLICATION_NAME = "application_name"
+    ENDPOINT_ID = "endpoint_id"
+    SCHEDULE_TIME = "schedule_time"
+    RESULT_NAME = "result_name"
+    RESULT_VALUE = "result_value"
+    RESULT_KIND = "result_kind"
+    RESULT_STATUS = "result_status"
+    RESULT_EXTRA_DATA = "result_extra_data"
+

 class EventLiveStats:
     LATENCY_AVG_5M = "latency_avg_5m"
@@ -106,6 +130,7 @@ class ModelEndpointTarget:
 class ProjectSecretKeys:
     ENDPOINT_STORE_CONNECTION = "MODEL_MONITORING_ENDPOINT_STORE_CONNECTION"
     ACCESS_KEY = "MODEL_MONITORING_ACCESS_KEY"
+    PIPELINES_ACCESS_KEY = "MODEL_MONITORING_PIPELINES_ACCESS_KEY"
     KAFKA_BOOTSTRAP_SERVERS = "KAFKA_BOOTSTRAP_SERVERS"
     STREAM_PATH = "STREAM_PATH"

@@ -120,6 +145,7 @@ class FileTargetKind:
     EVENTS = "events"
     STREAM = "stream"
     PARQUET = "parquet"
+    BATCH_CONTROLLER_PARQUET = "batch_controller_parquet"
     LOG_STREAM = "log_stream"


@@ -143,6 +169,22 @@ class PrometheusMetric:
     DRIFT_STATUS = "drift_status"


+class MonitoringFunctionNames:
+    WRITER = "model-monitoring-writer"
+    BATCH = "model-monitoring-batch"
+    BATCH_APPLICATION = "model-monitoring-batch-application"
+    STREAM = None
+
+    @staticmethod
+    def all():
+        return [
+            MonitoringFunctionNames.WRITER,
+            MonitoringFunctionNames.STREAM,
+            MonitoringFunctionNames.BATCH,
+            MonitoringFunctionNames.BATCH_APPLICATION,
+        ]
+
+
 @dataclass
 class FunctionURI:
     project: str
@@ -208,3 +250,30 @@ class DriftStatus(Enum):
     NO_DRIFT = "NO_DRIFT"
     DRIFT_DETECTED = "DRIFT_DETECTED"
     POSSIBLE_DRIFT = "POSSIBLE_DRIFT"
+
+
+class ResultKindApp(enum.Enum):
+    """
+    Enum for the result kind values
+    """
+
+    data_drift = 0
+    concept_drift = 1
+    model_performance = 2
+    system_performance = 3
+
+
+class ResultStatusApp(enum.Enum):
+    """
+    Enum for the result status values, detected means that the app detected some problem.
+    """
+
+    irrelevant = -1
+    no_detection = 0
+    potential_detection = 1
+    detected = 2
+
+
+class ModelMonitoringAppTag:
+    KEY = "type"
+    VAL = "model-monitoring-application"
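
A note on usage: these constants back the model-monitoring application flow introduced in this release (see application.py, batch_application.py and writer.py in the file list). As a rough, hedged illustration, a result payload emitted by a monitoring app could be keyed with the WriterEvent fields and the new enums; the flat-dictionary layout below is an assumption for the example, not the writer's documented schema:

```python
# Hedged sketch: keying an application result with the new constants.
# The flat-dict event layout and all field values are illustrative assumptions.
from mlrun.common.schemas.model_monitoring.constants import (
    MonitoringFunctionNames,
    ResultKindApp,
    ResultStatusApp,
    WriterEvent,
)

result_event = {
    WriterEvent.APPLICATION_NAME: "my-drift-app",       # placeholder app name
    WriterEvent.ENDPOINT_ID: "1234abcd",                # placeholder endpoint id
    WriterEvent.SCHEDULE_TIME: "2023-08-01T00:00:00Z",
    WriterEvent.RESULT_NAME: "data_drift_tvd",
    WriterEvent.RESULT_VALUE: 0.42,
    WriterEvent.RESULT_KIND: ResultKindApp.data_drift.value,
    WriterEvent.RESULT_STATUS: ResultStatusApp.potential_detection.value,
    WriterEvent.RESULT_EXTRA_DATA: "{}",
}

# MonitoringFunctionNames.all() enumerates the monitoring functions; the stream
# function is represented by None.
print(MonitoringFunctionNames.all())
```
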
mlrun/common/schemas/model_monitoring/model_endpoints.py CHANGED
@@ -111,6 +111,16 @@ class ModelEndpointSpec(ObjectSpec):
         ),
     }

+    @validator("model_uri")
+    def validate_model_uri(cls, model_uri):
+        """Validate that the model uri includes the required prefix"""
+        prefix, uri = mlrun.datastore.parse_store_uri(model_uri)
+        if prefix and prefix != mlrun.utils.helpers.StorePrefix.Model:
+            return mlrun.datastore.get_store_uri(
+                mlrun.utils.helpers.StorePrefix.Model, uri
+            )
+        return model_uri
+

 class Histogram(BaseModel):
     buckets: List[float]
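
The new validator normalizes a model endpoint's model_uri so that any store URI ends up under the models prefix. A minimal standalone sketch of the same normalization, using the helpers referenced in the hunk (the example URI is made up):

```python
# Illustrative only: mirror the validator's normalization outside the pydantic model.
# parse_store_uri/get_store_uri come from the hunk above; the URI is a made-up example.
import mlrun.datastore
import mlrun.utils.helpers


def normalize_model_uri(model_uri: str) -> str:
    prefix, uri = mlrun.datastore.parse_store_uri(model_uri)
    if prefix and prefix != mlrun.utils.helpers.StorePrefix.Model:
        # Rewrite e.g. an artifact-prefixed URI under the models prefix
        return mlrun.datastore.get_store_uri(mlrun.utils.helpers.StorePrefix.Model, uri)
    return model_uri


print(normalize_model_uri("store://artifacts/my-project/my-model:latest"))
```
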
mlrun/config.py CHANGED
@@ -403,6 +403,7 @@ default_config = {
     },
     "model_endpoint_monitoring": {
         "serving_stream_args": {"shard_count": 1, "retention_period_hours": 24},
+        "application_stream_args": {"shard_count": 3, "retention_period_hours": 24},
         "drift_thresholds": {"default": {"possible_drift": 0.5, "drift_detected": 0.7}},
         # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
         # stream, and endpoints.
@@ -417,6 +418,7 @@ default_config = {
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
         "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10000,
         "parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
@@ -981,20 +983,22 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
+        application_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.

-        :param project:       Project name.
-        :param kind:          Kind of target path (e.g. events, log_stream, endpoints, etc.)
-        :param target:        Can be either online or offline. If the target is online, then we try to get a specific
-                              path for the provided kind. If it doesn't exist, use the default path.
-                              If the target path is offline and the offline path is already a full path in the
-                              configuration, then the result will be that path as-is. If the offline path is a
-                              relative path, then the result will be based on the project artifact path and the offline
-                              relative path. If project artifact path wasn't provided, then we use MLRun artifact
-                              path instead.
-        :param artifact_path: Optional artifact path that will be used as a relative path. If not provided, the
-                              relative artifact path will be taken from the global MLRun artifact path.
+        :param project:          Project name.
+        :param kind:             Kind of target path (e.g. events, log_stream, endpoints, etc.)
+        :param target:           Can be either online or offline. If the target is online, then we try to get a specific
+                                 path for the provided kind. If it doesn't exist, use the default path.
+                                 If the target path is offline and the offline path is already a full path in the
+                                 configuration, then the result will be that path as-is. If the offline path is a
+                                 relative path, then the result will be based on the project artifact path and the
+                                 offline relative path. If project artifact path wasn't provided, then we use MLRun
+                                 artifact path instead.
+        :param artifact_path:    Optional artifact path that will be used as a relative path. If not provided, the
+                                 relative artifact path will be taken from the global MLRun artifact path.
+        :param application_name: Application name, None for model_monitoring_stream.

         :return: Full configured path for the provided kind.
         """
@@ -1006,8 +1010,22 @@ class Config:
             if store_prefix_dict.get(kind):
                 # Target exist in store prefix and has a valid string value
                 return store_prefix_dict[kind].format(project=project)
+
+            if (
+                application_name
+                != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
+            ):
+                return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                    project=project,
+                    kind=kind
+                    if application_name is None
+                    else f"{kind}-{application_name.lower()}",
+                )
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                project=project, kind=kind
+                project=project,
+                kind=kind
+                if application_name is None
+                else f"{kind}-{application_name.lower()}",
             )

         # Get the current offline path from the configuration
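
Taken together, these config.py hunks make monitoring target paths application-aware: when application_name is passed, the kind is suffixed with the lowercased application name. A hedged sketch of calling the extended helper; the method name (get_model_monitoring_file_target_path on mlrun's Config) is not visible in the hunk and is assumed, and the argument values are placeholders:

```python
# Hedged sketch: the helper extended above is assumed to be
# Config.get_model_monitoring_file_target_path; values are placeholders.
import mlrun

path = mlrun.mlconf.get_model_monitoring_file_target_path(
    project="my-project",
    kind="parquet",
    target="online",
    application_name="my-monitoring-app",  # kind becomes "parquet-my-monitoring-app"
)
print(path)
```
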
mlrun/datastore/__init__.py CHANGED
@@ -31,6 +31,7 @@ __all__ = [
     "RedisStore",
     "DatabricksFileSystemDisableCache",
     "DatabricksFileBugFixed",
+    "get_stream_pusher",
 ]

 import fsspec
mlrun/datastore/sources.py CHANGED
@@ -138,7 +138,6 @@ class CSVSource(BaseSourceDriver):
     :parameter path: path to CSV file
     :parameter key_field: the CSV field to be used as the key for events. May be an int (field index) or string
         (field name) if with_header is True. Defaults to None (no key). Can be a list of keys.
-    :parameter time_field: DEPRECATED. Use parse_dates to parse timestamps.
     :parameter schedule: string to configure scheduling of the ingestion job.
     :parameter attributes: additional parameters to pass to storey. For example:
         attributes={"timestamp_format": '%Y%m%d%H'}
@@ -156,29 +155,13 @@ class CSVSource(BaseSourceDriver):
         path: str = None,
         attributes: Dict[str, str] = None,
         key_field: str = None,
-        time_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, List[int], List[str]] = None,
         **kwargs,
     ):
-        super().__init__(
-            name, path, attributes, key_field, time_field, schedule, **kwargs
-        )
-        if time_field is not None:
-            warnings.warn(
-                "CSVSource's time_field parameter is deprecated in 1.3.0 and will be removed in 1.5.0. "
-                "Use parse_dates instead.",
-                # TODO: remove in 1.5.0
-                FutureWarning,
-            )
-            if isinstance(parse_dates, (int, str)):
-                parse_dates = [parse_dates]
-
-            if parse_dates is None:
-                parse_dates = [time_field]
-            elif time_field not in parse_dates:
-                parse_dates = copy(parse_dates)
-                parse_dates.append(time_field)
+        super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
+        if parse_dates and not isinstance(parse_dates, list):
+            parse_dates = [parse_dates]
         self._parse_dates = parse_dates

     def to_step(self, key_field=None, time_field=None, context=None):
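
With the 1.3.0 deprecation shim removed, CSVSource no longer accepts time_field; timestamp columns are declared through parse_dates only. A short usage sketch (the path and column names are placeholders):

```python
# Sketch: declaring timestamp parsing via parse_dates (time_field is gone in 1.5.0).
# Path and column names are placeholders.
from mlrun.datastore.sources import CSVSource

source = CSVSource(
    name="stocks",
    path="v3io:///projects/my-project/stocks.csv",
    key_field="ticker",
    parse_dates=["timestamp"],  # scalar values are wrapped into a list automatically
)
```
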
@@ -724,16 +707,7 @@ class DataFrameSource:

     support_storey = True

-    def __init__(
-        self, df, key_field=None, time_field=None, context=None, iterator=False
-    ):
-        if time_field:
-            warnings.warn(
-                "DataFrameSource's time_field parameter has no effect. "
-                "It is deprecated in 1.3.0 and will be removed in 1.5.0",
-                FutureWarning,
-            )
-
+    def __init__(self, df, key_field=None, context=None, iterator=False):
         self._df = df
         if isinstance(key_field, str):
             self.key_field = [key_field]
mlrun/datastore/targets.py CHANGED
@@ -484,6 +484,7 @@ class BaseStoreTarget(DataTargetBase):
         if hasattr(df, "rdd"):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
+            df = self.prepare_spark_df(df, key_column, timestamp_key, options)
             df.write.mode("overwrite").save(**options)
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
@@ -513,36 +514,41 @@ class BaseStoreTarget(DataTargetBase):
             dir = os.path.dirname(target_path)
             if dir:
                 os.makedirs(dir, exist_ok=True)
-            partition_cols = []
-            if target_path.endswith(".parquet") or target_path.endswith(".pq"):
-                partition_cols = None
             target_df = df
-            if timestamp_key and (
-                self.partitioned or self.time_partitioning_granularity
-            ):
-                target_df = df.copy(deep=False)
-                time_partitioning_granularity = self.time_partitioning_granularity
-                if not time_partitioning_granularity and self.partitioned:
-                    time_partitioning_granularity = (
-                        mlrun.utils.helpers.DEFAULT_TIME_PARTITIONING_GRANULARITY
-                    )
-                for unit, fmt in [
-                    ("year", "%Y"),
-                    ("month", "%m"),
-                    ("day", "%d"),
-                    ("hour", "%H"),
-                    ("minute", "%M"),
-                ]:
-                    partition_cols.append(unit)
-                    target_df[unit] = pd.DatetimeIndex(target_df[timestamp_key]).format(
-                        date_format=fmt
-                    )
-                    if unit == time_partitioning_granularity:
-                        break
+            partition_cols = None  # single parquet file
+            if not target_path.endswith(".parquet") and not target_path.endswith(
+                ".pq"
+            ):  # directory
+                partition_cols = []
+                if timestamp_key and (
+                    self.partitioned or self.time_partitioning_granularity
+                ):
+                    target_df = df.copy(deep=False)
+                    time_partitioning_granularity = self.time_partitioning_granularity
+                    if not time_partitioning_granularity and self.partitioned:
+                        time_partitioning_granularity = (
+                            mlrun.utils.helpers.DEFAULT_TIME_PARTITIONING_GRANULARITY
+                        )
+                    for unit, fmt in [
+                        ("year", "%Y"),
+                        ("month", "%m"),
+                        ("day", "%d"),
+                        ("hour", "%H"),
+                        ("minute", "%M"),
+                    ]:
+                        partition_cols.append(unit)
+                        target_df[unit] = pd.DatetimeIndex(
+                            target_df[timestamp_key]
+                        ).format(date_format=fmt)
+                        if unit == time_partitioning_granularity:
+                            break
+                # Partitioning will be performed on timestamp_key and then on self.partition_cols
+                # (We might want to give the user control on this order as additional functionality)
+                partition_cols += self.partition_cols or []
             storage_options = self._get_store().get_storage_options()
             self._write_dataframe(
                 target_df,
-                storage_options,
+                self.storage_options or storage_options,
                 target_path,
                 partition_cols=partition_cols,
                 **kwargs,
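
The reworked pandas path now derives time-partition columns only when the target is a directory; a path ending in .parquet or .pq is written as a single unpartitioned file, and user-defined partition_cols are appended after the time units. A standalone sketch of the same idea, using strftime instead of the DatetimeIndex.format call in the hunk, with made-up data:

```python
# Standalone sketch of hive-style time partitioning, equivalent in spirit to the hunk
# above but written with strftime. DataFrame contents and paths are made up.
import pandas as pd

df = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2023-08-01 10:15", "2023-09-02 11:30"]),
        "value": [1, 2],
    }
)

partition_cols = []
for unit, fmt in [("year", "%Y"), ("month", "%m"), ("day", "%d"), ("hour", "%H")]:
    partition_cols.append(unit)
    df[unit] = df["timestamp"].dt.strftime(fmt)
    if unit == "day":  # stop at the configured time_partitioning_granularity
        break

# Writes out_dir/year=2023/month=08/day=01/... style folders (requires pyarrow).
df.to_parquet("out_dir", partition_cols=partition_cols)
```
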
@@ -690,7 +696,7 @@ class BaseStoreTarget(DataTargetBase):
         # options used in spark.read.load(**options)
         raise NotImplementedError()

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
         return df

     def get_dask_options(self):
@@ -924,6 +930,37 @@ class ParquetTarget(BaseStoreTarget):
             return self.path.endswith(".parquet") or self.path.endswith(".pq")
         return False

+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
+        # If partitioning by time, add the necessary columns
+        if (
+            timestamp_key
+            and isinstance(spark_options, dict)
+            and "partitionBy" in spark_options
+        ):
+            from pyspark.sql.functions import (
+                dayofmonth,
+                hour,
+                minute,
+                month,
+                second,
+                year,
+            )
+
+            time_unit_to_op = {
+                "year": year,
+                "month": month,
+                "day": dayofmonth,
+                "hour": hour,
+                "minute": minute,
+                "second": second,
+            }
+            timestamp_col = df[timestamp_key]
+            for partition in spark_options["partitionBy"]:
+                if partition not in df.columns and partition in time_unit_to_op:
+                    op = time_unit_to_op[partition]
+                    df = df.withColumn(partition, op(timestamp_col))
+        return df
+

 class CSVTarget(BaseStoreTarget):
     kind = TargetTypes.csv
@@ -973,7 +1010,7 @@ class CSVTarget(BaseStoreTarget):
             "header": "true",
         }

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs

         for col_name, col_type in df.dtypes:
@@ -1067,7 +1104,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             **self.attributes,
         )

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
@@ -1139,7 +1176,7 @@ class NoSqlTarget(NoSqlBaseTarget):
             spark_options["columnUpdate"] = True
         return spark_options

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col

         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
@@ -1232,7 +1269,7 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         endpoint, uri = self._get_server_endpoint()
         return endpoint

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col

         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
mlrun/db/httpdb.py CHANGED
@@ -18,6 +18,7 @@ import tempfile
 import time
 import traceback
 import typing
+import warnings
 from datetime import datetime, timedelta
 from os import path, remove
 from typing import Dict, List, Optional, Union
@@ -1411,6 +1412,8 @@ class HTTPRunDB(RunDBInterface):
         namespace=None,
         artifact_path=None,
         ops=None,
+        # TODO: deprecated, remove in 1.6.0
+        ttl=None,
         cleanup_ttl=None,
     ):
         """Submit a KFP pipeline for execution.
@@ -1423,9 +1426,18 @@ class HTTPRunDB(RunDBInterface):
         :param namespace: Kubernetes namespace to execute the pipeline in.
         :param artifact_path: A path to artifacts used by this pipeline.
         :param ops: Transformers to apply on all ops in the pipeline.
+        :param ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the workflow
+                    and all its resources are deleted) (deprecated, use cleanup_ttl instead)
         :param cleanup_ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the
                             workflow and all its resources are deleted)
         """
+        if ttl:
+            warnings.warn(
+                "'ttl' is deprecated, use 'cleanup_ttl' instead. "
+                "This will be removed in 1.6.0",
+                # TODO: Remove this in 1.6.0
+                FutureWarning,
+            )

         if isinstance(pipeline, str):
             pipe_file = pipeline
@@ -1433,7 +1445,7 @@ class HTTPRunDB(RunDBInterface):
             pipe_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=False).name
         conf = new_pipe_metadata(
             artifact_path=artifact_path,
-            cleanup_ttl=cleanup_ttl,
+            cleanup_ttl=cleanup_ttl or ttl,
             op_transformers=ops,
         )
         kfp.compiler.Compiler().compile(
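
ttl is now only a deprecation shim: passing it emits a FutureWarning and its value is folded into cleanup_ttl via cleanup_ttl or ttl. A hedged usage sketch; the leading project/pipeline keyword arguments are assumed from mlrun's client API, and the values are placeholders:

```python
# Hedged sketch: prefer cleanup_ttl over the deprecated ttl when submitting a pipeline.
# The project/pipeline keyword arguments are assumed; values are placeholders.
import mlrun

db = mlrun.get_run_db()
run_id = db.submit_pipeline(
    project="my-project",
    pipeline="./pipeline.yaml",
    cleanup_ttl=3600,  # seconds to keep workflow resources after completion
)

# Passing ttl=3600 still works for now, but warns and is slated for removal in 1.6.0.
```
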
@@ -1471,15 +1483,17 @@ class HTTPRunDB(RunDBInterface):
                 headers=headers,
             )
         except OSError as err:
-            logger.error(f"error cannot submit pipeline: {err_to_str(err)}")
-            raise OSError(f"error: cannot cannot submit pipeline, {err_to_str(err)}")
+            logger.error("Error: Cannot submit pipeline", err=err_to_str(err))
+            raise OSError(f"Error: Cannot submit pipeline, {err_to_str(err)}")

         if not resp.ok:
-            logger.error(f"bad resp!!\n{resp.text}")
-            raise ValueError(f"bad submit pipeline response, {resp.text}")
+            logger.error("Failed to submit pipeline", respones_text=resp.text)
+            raise ValueError(f"Failed to submit pipeline, {resp.text}")

         resp = resp.json()
-        logger.info(f"submitted pipeline {resp['name']} id={resp['id']}")
+        logger.info(
+            "Pipeline submitted successfully", pipeline_name=resp["name"], id=resp["id"]
+        )
         return resp["id"]

     def list_pipelines(
mlrun/feature_store/api.py CHANGED
@@ -975,37 +975,9 @@ def _ingest_with_spark(
         )

         df_to_write = df
-
-        # If partitioning by time, add the necessary columns
-        if timestamp_key and "partitionBy" in spark_options:
-            from pyspark.sql.functions import (
-                dayofmonth,
-                hour,
-                minute,
-                month,
-                second,
-                year,
-            )
-
-            time_unit_to_op = {
-                "year": year,
-                "month": month,
-                "day": dayofmonth,
-                "hour": hour,
-                "minute": minute,
-                "second": second,
-            }
-            timestamp_col = df_to_write[timestamp_key]
-            for partition in spark_options["partitionBy"]:
-                if (
-                    partition not in df_to_write.columns
-                    and partition in time_unit_to_op
-                ):
-                    op = time_unit_to_op[partition]
-                    df_to_write = df_to_write.withColumn(
-                        partition, op(timestamp_col)
-                    )
-        df_to_write = target.prepare_spark_df(df_to_write, key_columns)
+        df_to_write = target.prepare_spark_df(
+            df_to_write, key_columns, timestamp_key, spark_options
+        )
         if overwrite:
             df_to_write.write.mode("overwrite").save(**spark_options)
         else:
mlrun/feature_store/feature_vector.py CHANGED
@@ -631,7 +631,7 @@ class FeatureVector(ModelObj):
         feature_set_fields: list of field (name, alias) per featureset
         """
         processed_features = {}  # dict of name to (featureset, feature object)
-        feature_set_objects = {}
+        feature_set_objects = self.feature_set_objects or {}
         index_keys = []
         feature_set_fields = collections.defaultdict(list)
         features = copy(self.spec.features)
mlrun/feature_store/retrieval/base.py CHANGED
@@ -136,7 +136,7 @@ class BaseMerger(abc.ABC):
             order_by=order_by,
         )

-    def _write_to_offline_target(self):
+    def _write_to_offline_target(self, timestamp_key=None):
         if self._target:
             is_persistent_vector = self.vector.metadata.name is not None
             if not self._target.path and not is_persistent_vector:
@@ -144,7 +144,12 @@ class BaseMerger(abc.ABC):
                     "target path was not specified"
                 )
             self._target.set_resource(self.vector)
-            size = self._target.write_dataframe(self._result_df)
+            size = self._target.write_dataframe(
+                self._result_df,
+                timestamp_key=timestamp_key
+                if not self._drop_indexes and timestamp_key not in self._drop_columns
+                else None,
+            )
             if is_persistent_vector:
                 target_status = self._target.update_resource_status("ready", size=size)
                 logger.info(f"wrote target: {target_status}")
@@ -361,7 +366,7 @@ class BaseMerger(abc.ABC):
             )
         self._order_by(order_by_active)

-        self._write_to_offline_target()
+        self._write_to_offline_target(timestamp_key=result_timestamp)
         return OfflineVectorResponse(self)

     def init_online_vector_service(
mlrun/launcher/remote.py CHANGED
@@ -89,7 +89,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):

         else:
             raise mlrun.errors.MLRunRuntimeError(
-                "function image is not built/ready, set auto_build=True or use .deploy() method first"
+                "Function image is not built/ready, set auto_build=True or use .deploy() method first"
             )

         if runtime.verbose:
@@ -122,11 +122,11 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
             resp = db.submit_job(run, schedule=schedule)
             if schedule:
                 action = resp.pop("action", "created")
-                logger.info(f"task schedule {action}", **resp)
+                logger.info(f"Task schedule {action}", **resp)
                 return

         except (requests.HTTPError, Exception) as err:
-            logger.error(f"got remote run err, {mlrun.errors.err_to_str(err)}")
+            logger.error("Failed remote run", error=mlrun.errors.err_to_str(err))

             if isinstance(err, requests.HTTPError):
                 runtime._handle_submit_job_http_error(err)
mlrun/lists.py CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from copy import copy
 from typing import List

@@ -219,6 +220,16 @@ class ArtifactList(list):
         """return as a list of artifact objects"""
         return [dict_to_artifact(artifact) for artifact in self]

+    def objects(self) -> List[Artifact]:
+        """return as a list of artifact objects"""
+        warnings.warn(
+            "'objects' is deprecated in 1.3.0 and will be removed in 1.6.0. "
+            "Use 'to_objects' instead.",
+            # TODO: remove in 1.6.0
+            FutureWarning,
+        )
+        return [dict_to_artifact(artifact) for artifact in self]
+
     def dataitems(self) -> List["mlrun.DataItem"]:
         """return as a list of DataItem objects"""
         dataitems = []
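
ArtifactList.objects() is reintroduced only as a deprecated alias of to_objects(). A small sketch; that list_artifacts() returns an ArtifactList is assumed from mlrun's client API, and the project name is a placeholder:

```python
# Sketch: prefer to_objects() over the deprecated objects().
# The project name is a placeholder; list_artifacts() returning an ArtifactList is assumed.
import mlrun

db = mlrun.get_run_db()
artifacts = db.list_artifacts(project="my-project")

artifact_objects = artifacts.to_objects()  # preferred
legacy_objects = artifacts.objects()       # still works, but emits a FutureWarning
```
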
mlrun/model_monitoring/__init__.py CHANGED
@@ -15,7 +15,6 @@
 # flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
 # for backwards compatibility

-
 from .helpers import get_stream_path
 from .model_endpoint import ModelEndpoint
 from .stores import ModelEndpointStore, ModelEndpointStoreType, get_model_endpoint_store
mlrun/model_monitoring/api.py CHANGED
@@ -28,9 +28,9 @@ from mlrun.common.schemas.model_monitoring import EventFieldType, ModelMonitorin
 from mlrun.data_types.infer import InferOptions, get_df_stats
 from mlrun.utils import logger

+from .batch import VirtualDrift
 from .features_drift_table import FeaturesDriftTablePlot
 from .model_endpoint import ModelEndpoint
-from .model_monitoring_batch import VirtualDrift

 # A union of all supported dataset types:
 DatasetType = typing.Union[
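
This pairs with the rename listed above (mlrun/model_monitoring/model_monitoring_batch.py → batch.py): downstream code importing VirtualDrift should move to the new module path, as in this minimal sketch:

```python
# Sketch: only the import path changes with the module rename.
from mlrun.model_monitoring.batch import VirtualDrift  # was: ...model_monitoring_batch

print(VirtualDrift)
```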