mlrun 1.5.0rc11__py3-none-any.whl → 1.5.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (49)
  1. mlrun/__main__.py +31 -2
  2. mlrun/api/api/endpoints/functions.py +110 -52
  3. mlrun/api/api/endpoints/model_endpoints.py +0 -56
  4. mlrun/api/crud/model_monitoring/deployment.py +208 -38
  5. mlrun/api/crud/model_monitoring/helpers.py +19 -6
  6. mlrun/api/crud/model_monitoring/model_endpoints.py +14 -31
  7. mlrun/api/db/sqldb/db.py +3 -1
  8. mlrun/api/utils/builder.py +2 -4
  9. mlrun/common/model_monitoring/helpers.py +19 -5
  10. mlrun/common/schemas/model_monitoring/constants.py +69 -0
  11. mlrun/common/schemas/model_monitoring/model_endpoints.py +22 -1
  12. mlrun/config.py +30 -12
  13. mlrun/datastore/__init__.py +1 -0
  14. mlrun/datastore/datastore_profile.py +2 -2
  15. mlrun/datastore/sources.py +4 -30
  16. mlrun/datastore/targets.py +106 -55
  17. mlrun/db/httpdb.py +20 -6
  18. mlrun/feature_store/__init__.py +2 -0
  19. mlrun/feature_store/api.py +3 -31
  20. mlrun/feature_store/feature_vector.py +1 -1
  21. mlrun/feature_store/retrieval/base.py +8 -3
  22. mlrun/launcher/remote.py +3 -3
  23. mlrun/lists.py +11 -0
  24. mlrun/model_monitoring/__init__.py +0 -1
  25. mlrun/model_monitoring/api.py +1 -1
  26. mlrun/model_monitoring/application.py +313 -0
  27. mlrun/model_monitoring/batch_application.py +526 -0
  28. mlrun/model_monitoring/batch_application_handler.py +32 -0
  29. mlrun/model_monitoring/evidently_application.py +89 -0
  30. mlrun/model_monitoring/helpers.py +39 -3
  31. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +38 -7
  32. mlrun/model_monitoring/tracking_policy.py +4 -4
  33. mlrun/model_monitoring/writer.py +37 -0
  34. mlrun/projects/pipelines.py +38 -4
  35. mlrun/projects/project.py +257 -43
  36. mlrun/run.py +5 -2
  37. mlrun/runtimes/__init__.py +2 -0
  38. mlrun/runtimes/function.py +2 -1
  39. mlrun/utils/helpers.py +12 -0
  40. mlrun/utils/http.py +3 -0
  41. mlrun/utils/notifications/notification_pusher.py +22 -8
  42. mlrun/utils/version/version.json +2 -2
  43. {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/METADATA +5 -5
  44. {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/RECORD +49 -44
  45. /mlrun/model_monitoring/{model_monitoring_batch.py → batch.py} +0 -0
  46. {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/LICENSE +0 -0
  47. {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/WHEEL +0 -0
  48. {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/entry_points.txt +0 -0
  49. {mlrun-1.5.0rc11.dist-info → mlrun-1.5.0rc13.dist-info}/top_level.txt +0 -0
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -77,6 +77,30 @@ class EventFieldType:
     DRIFT_DETECTED_THRESHOLD = "drift_detected_threshold"
     POSSIBLE_DRIFT_THRESHOLD = "possible_drift_threshold"

+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+
+
+class ApplicationEvent:
+    APPLICATION_NAME = "application_name"
+    CURRENT_STATS = "current_stats"
+    FEATURE_STATS = "feature_stats"
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    SCHEDULE_TIME = "schedule_time"
+    LAST_REQUEST = "last_request"
+    ENDPOINT_ID = "endpoint_id"
+    OUTPUT_STREAM_URI = "output_stream_uri"
+
+
+class WriterEvent:
+    APPLICATION_NAME = "application_name"
+    ENDPOINT_ID = "endpoint_id"
+    SCHEDULE_TIME = "schedule_time"
+    RESULT_NAME = "result_name"
+    RESULT_VALUE = "result_value"
+    RESULT_KIND = "result_kind"
+    RESULT_STATUS = "result_status"
+    RESULT_EXTRA_DATA = "result_extra_data"
+

 class EventLiveStats:
     LATENCY_AVG_5M = "latency_avg_5m"
@@ -106,6 +130,7 @@ class ModelEndpointTarget:
 class ProjectSecretKeys:
     ENDPOINT_STORE_CONNECTION = "MODEL_MONITORING_ENDPOINT_STORE_CONNECTION"
     ACCESS_KEY = "MODEL_MONITORING_ACCESS_KEY"
+    PIPELINES_ACCESS_KEY = "MODEL_MONITORING_PIPELINES_ACCESS_KEY"
     KAFKA_BOOTSTRAP_SERVERS = "KAFKA_BOOTSTRAP_SERVERS"
     STREAM_PATH = "STREAM_PATH"

@@ -120,6 +145,7 @@ class FileTargetKind:
     EVENTS = "events"
     STREAM = "stream"
     PARQUET = "parquet"
+    BATCH_CONTROLLER_PARQUET = "batch_controller_parquet"
     LOG_STREAM = "log_stream"


@@ -143,6 +169,22 @@ class PrometheusMetric:
     DRIFT_STATUS = "drift_status"


+class MonitoringFunctionNames:
+    WRITER = "model-monitoring-writer"
+    BATCH = "model-monitoring-batch"
+    BATCH_APPLICATION = "model-monitoring-batch-application"
+    STREAM = None
+
+    @staticmethod
+    def all():
+        return [
+            MonitoringFunctionNames.WRITER,
+            MonitoringFunctionNames.STREAM,
+            MonitoringFunctionNames.BATCH,
+            MonitoringFunctionNames.BATCH_APPLICATION,
+        ]
+
+
 @dataclass
 class FunctionURI:
     project: str
@@ -208,3 +250,30 @@ class DriftStatus(Enum):
     NO_DRIFT = "NO_DRIFT"
     DRIFT_DETECTED = "DRIFT_DETECTED"
     POSSIBLE_DRIFT = "POSSIBLE_DRIFT"
+
+
+class ResultKindApp(enum.Enum):
+    """
+    Enum for the result kind values
+    """
+
+    data_drift = 0
+    concept_drift = 1
+    model_performance = 2
+    system_performance = 3
+
+
+class ResultStatusApp(enum.Enum):
+    """
+    Enum for the result status values, detected means that the app detected some problem.
+    """
+
+    irrelevant = -1
+    no_detection = 0
+    potential_detection = 1
+    detected = 2
+
+
+class ModelMonitoringAppTag:
+    KEY = "type"
+    VAL = "model-monitoring-application"
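
The new WriterEvent fields together with the ResultKindApp and ResultStatusApp enums describe the result events that monitoring applications emit to the model-monitoring writer. A minimal sketch of assembling such a payload (field values are illustrative; the import path assumes these classes land in mlrun.common.schemas.model_monitoring.constants, as the file list indicates):

    from mlrun.common.schemas.model_monitoring.constants import (
        ResultKindApp,
        ResultStatusApp,
        WriterEvent,
    )

    # Illustrative result record keyed by the WriterEvent field names
    result_event = {
        WriterEvent.APPLICATION_NAME: "my-monitoring-app",
        WriterEvent.ENDPOINT_ID: "1234",
        WriterEvent.SCHEDULE_TIME: "2023-08-01T00:00:00Z",
        WriterEvent.RESULT_NAME: "data_drift_score",
        WriterEvent.RESULT_VALUE: 0.42,
        WriterEvent.RESULT_KIND: ResultKindApp.data_drift.value,
        WriterEvent.RESULT_STATUS: ResultStatusApp.potential_detection.value,
        WriterEvent.RESULT_EXTRA_DATA: "{}",
    }
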
mlrun/common/schemas/model_monitoring/model_endpoints.py CHANGED
@@ -18,7 +18,7 @@ import json
 import typing
 from typing import Any, Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from pydantic.main import Extra

 import mlrun.common.model_monitoring
@@ -100,6 +100,27 @@ class ModelEndpointSpec(ObjectSpec):
             json_parse_values=json_parse_values,
         )

+    @validator("monitor_configuration")
+    def set_name(cls, monitor_configuration):
+        return monitor_configuration or {
+            EventFieldType.DRIFT_DETECTED_THRESHOLD: (
+                mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.drift_detected
+            ),
+            EventFieldType.POSSIBLE_DRIFT_THRESHOLD: (
+                mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.possible_drift
+            ),
+        }
+
+    @validator("model_uri")
+    def validate_model_uri(cls, model_uri):
+        """Validate that the model uri includes the required prefix"""
+        prefix, uri = mlrun.datastore.parse_store_uri(model_uri)
+        if prefix and prefix != mlrun.utils.helpers.StorePrefix.Model:
+            return mlrun.datastore.get_store_uri(
+                mlrun.utils.helpers.StorePrefix.Model, uri
+            )
+        return model_uri
+

 class Histogram(BaseModel):
     buckets: List[float]
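
The monitor_configuration validator above substitutes the configured drift thresholds when no configuration is supplied. A standalone pydantic sketch of the same pattern (not the mlrun class itself; the 0.7/0.5 values are the drift_thresholds defaults from config.py in this diff):

    from typing import Optional

    from pydantic import BaseModel, validator


    class MonitoredSpec(BaseModel):
        monitor_configuration: Optional[dict] = None

        @validator("monitor_configuration")
        def default_thresholds(cls, value):
            # An empty or missing configuration falls back to the configured defaults
            return value or {
                "drift_detected_threshold": 0.7,
                "possible_drift_threshold": 0.5,
            }


    print(MonitoredSpec(monitor_configuration=None).monitor_configuration)
    # {'drift_detected_threshold': 0.7, 'possible_drift_threshold': 0.5}
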
mlrun/config.py CHANGED
@@ -403,6 +403,7 @@ default_config = {
     },
     "model_endpoint_monitoring": {
         "serving_stream_args": {"shard_count": 1, "retention_period_hours": 24},
+        "application_stream_args": {"shard_count": 3, "retention_period_hours": 24},
         "drift_thresholds": {"default": {"possible_drift": 0.5, "drift_detected": 0.7}},
         # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
         # stream, and endpoints.
@@ -417,6 +418,7 @@ default_config = {
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
         "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10000,
         "parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
@@ -981,20 +983,22 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
+        application_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.

-        :param project:       Project name.
-        :param kind:          Kind of target path (e.g. events, log_stream, endpoints, etc.)
-        :param target:        Can be either online or offline. If the target is online, then we try to get a specific
-                              path for the provided kind. If it doesn't exist, use the default path.
-                              If the target path is offline and the offline path is already a full path in the
-                              configuration, then the result will be that path as-is. If the offline path is a
-                              relative path, then the result will be based on the project artifact path and the offline
-                              relative path. If project artifact path wasn't provided, then we use MLRun artifact
-                              path instead.
-        :param artifact_path: Optional artifact path that will be used as a relative path. If not provided, the
-                              relative artifact path will be taken from the global MLRun artifact path.
+        :param project:          Project name.
+        :param kind:             Kind of target path (e.g. events, log_stream, endpoints, etc.)
+        :param target:           Can be either online or offline. If the target is online, then we try to get a specific
+                                 path for the provided kind. If it doesn't exist, use the default path.
+                                 If the target path is offline and the offline path is already a full path in the
+                                 configuration, then the result will be that path as-is. If the offline path is a
+                                 relative path, then the result will be based on the project artifact path and the
+                                 offline relative path. If project artifact path wasn't provided, then we use MLRun
+                                 artifact path instead.
+        :param artifact_path:    Optional artifact path that will be used as a relative path. If not provided, the
+                                 relative artifact path will be taken from the global MLRun artifact path.
+        :param application_name:Application name, None for model_monitoring_stream.

         :return: Full configured path for the provided kind.
         """
@@ -1006,8 +1010,22 @@ class Config:
             if store_prefix_dict.get(kind):
                 # Target exist in store prefix and has a valid string value
                 return store_prefix_dict[kind].format(project=project)
+
+            if (
+                application_name
+                != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
+            ):
+                return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                    project=project,
+                    kind=kind
+                    if application_name is None
+                    else f"{kind}-{application_name.lower()}",
+                )
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                project=project, kind=kind
+                project=project,
+                kind=kind
+                if application_name is None
+                else f"{kind}-{application_name.lower()}",
             )

         # Get the current offline path from the configuration
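
Assuming the enclosing method is Config.get_model_monitoring_file_target_path (the def line is outside the hunk), a hedged usage sketch of the new application_name argument, which switches to the user_space prefix and suffixes the kind with the application name:

    import mlrun

    # Project, kind and application name are illustrative
    path = mlrun.mlconf.get_model_monitoring_file_target_path(
        project="my-project",
        kind="stream",
        target="online",
        application_name="my-app",  # resolves the kind as "stream-my-app"
    )
    print(path)
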
mlrun/datastore/__init__.py CHANGED
@@ -31,6 +31,7 @@ __all__ = [
     "RedisStore",
     "DatabricksFileSystemDisableCache",
     "DatabricksFileBugFixed",
+    "get_stream_pusher",
 ]

 import fsspec
mlrun/datastore/datastore_profile.py CHANGED
@@ -37,9 +37,9 @@ class DatastoreProfile(pydantic.BaseModel):

     @staticmethod
     def generate_secret_key(profile_name: str, project: str):
-        secret_name_separator = "-__-"
+        secret_name_separator = "."
         full_key = (
-            "mlrun.datastore-profiles"
+            "datastore-profiles"
             + secret_name_separator
             + project
             + secret_name_separator
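
The hunk ends mid-expression, so the tail of the concatenation is not shown; assuming it finishes with profile_name, the secret key layout changes roughly as in this standalone sketch:

    def generate_secret_key(profile_name: str, project: str) -> str:
        # New layout; the trailing profile_name segment is an assumption
        sep = "."
        return "datastore-profiles" + sep + project + sep + profile_name


    print(generate_secret_key("my-profile", "my-project"))
    # datastore-profiles.my-project.my-profile
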
mlrun/datastore/sources.py CHANGED
@@ -138,7 +138,6 @@ class CSVSource(BaseSourceDriver):
     :parameter path: path to CSV file
     :parameter key_field: the CSV field to be used as the key for events. May be an int (field index) or string
                           (field name) if with_header is True. Defaults to None (no key). Can be a list of keys.
-    :parameter time_field: DEPRECATED. Use parse_dates to parse timestamps.
     :parameter schedule: string to configure scheduling of the ingestion job.
     :parameter attributes: additional parameters to pass to storey. For example:
                            attributes={"timestamp_format": '%Y%m%d%H'}
@@ -156,29 +155,13 @@ class CSVSource(BaseSourceDriver):
         path: str = None,
         attributes: Dict[str, str] = None,
         key_field: str = None,
-        time_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, List[int], List[str]] = None,
         **kwargs,
     ):
-        super().__init__(
-            name, path, attributes, key_field, time_field, schedule, **kwargs
-        )
-        if time_field is not None:
-            warnings.warn(
-                "CSVSource's time_field parameter is deprecated in 1.3.0 and will be removed in 1.5.0. "
-                "Use parse_dates instead.",
-                # TODO: remove in 1.5.0
-                FutureWarning,
-            )
-            if isinstance(parse_dates, (int, str)):
-                parse_dates = [parse_dates]
-
-            if parse_dates is None:
-                parse_dates = [time_field]
-            elif time_field not in parse_dates:
-                parse_dates = copy(parse_dates)
-                parse_dates.append(time_field)
+        super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
+        if parse_dates and not isinstance(parse_dates, list):
+            parse_dates = [parse_dates]
         self._parse_dates = parse_dates

     def to_step(self, key_field=None, time_field=None, context=None):
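
A hedged usage sketch of the slimmed-down constructor: timestamp columns are now declared through parse_dates (a single value is wrapped into a list), since time_field is gone from the signature; the path and column names are illustrative:

    from mlrun.datastore.sources import CSVSource

    source = CSVSource(
        name="events",
        path="data/events.csv",
        key_field="id",
        parse_dates="timestamp",  # normalized to ["timestamp"] by __init__
    )
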
@@ -724,16 +707,7 @@ class DataFrameSource:

     support_storey = True

-    def __init__(
-        self, df, key_field=None, time_field=None, context=None, iterator=False
-    ):
-        if time_field:
-            warnings.warn(
-                "DataFrameSource's time_field parameter has no effect. "
-                "It is deprecated in 1.3.0 and will be removed in 1.5.0",
-                FutureWarning,
-            )
-
+    def __init__(self, df, key_field=None, context=None, iterator=False):
         self._df = df
         if isinstance(key_field, str):
             self.key_field = [key_field]
mlrun/datastore/targets.py CHANGED
@@ -484,6 +484,7 @@ class BaseStoreTarget(DataTargetBase):
         if hasattr(df, "rdd"):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
+            df = self.prepare_spark_df(df, key_column, timestamp_key, options)
             df.write.mode("overwrite").save(**options)
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
@@ -513,36 +514,41 @@ class BaseStoreTarget(DataTargetBase):
         dir = os.path.dirname(target_path)
         if dir:
             os.makedirs(dir, exist_ok=True)
-        partition_cols = []
-        if target_path.endswith(".parquet") or target_path.endswith(".pq"):
-            partition_cols = None
         target_df = df
-        if timestamp_key and (
-            self.partitioned or self.time_partitioning_granularity
-        ):
-            target_df = df.copy(deep=False)
-            time_partitioning_granularity = self.time_partitioning_granularity
-            if not time_partitioning_granularity and self.partitioned:
-                time_partitioning_granularity = (
-                    mlrun.utils.helpers.DEFAULT_TIME_PARTITIONING_GRANULARITY
-                )
-            for unit, fmt in [
-                ("year", "%Y"),
-                ("month", "%m"),
-                ("day", "%d"),
-                ("hour", "%H"),
-                ("minute", "%M"),
-            ]:
-                partition_cols.append(unit)
-                target_df[unit] = pd.DatetimeIndex(target_df[timestamp_key]).format(
-                    date_format=fmt
-                )
-                if unit == time_partitioning_granularity:
-                    break
+        partition_cols = None  # single parquet file
+        if not target_path.endswith(".parquet") and not target_path.endswith(
+            ".pq"
+        ):  # directory
+            partition_cols = []
+            if timestamp_key and (
+                self.partitioned or self.time_partitioning_granularity
+            ):
+                target_df = df.copy(deep=False)
+                time_partitioning_granularity = self.time_partitioning_granularity
+                if not time_partitioning_granularity and self.partitioned:
+                    time_partitioning_granularity = (
+                        mlrun.utils.helpers.DEFAULT_TIME_PARTITIONING_GRANULARITY
+                    )
+                for unit, fmt in [
+                    ("year", "%Y"),
+                    ("month", "%m"),
+                    ("day", "%d"),
+                    ("hour", "%H"),
+                    ("minute", "%M"),
+                ]:
+                    partition_cols.append(unit)
+                    target_df[unit] = pd.DatetimeIndex(
+                        target_df[timestamp_key]
+                    ).format(date_format=fmt)
+                    if unit == time_partitioning_granularity:
+                        break
+            # Partitioning will be performed on timestamp_key and then on self.partition_cols
+            # (We might want to give the user control on this order as additional functionality)
+            partition_cols += self.partition_cols or []
         storage_options = self._get_store().get_storage_options()
         self._write_dataframe(
             target_df,
-            storage_options,
+            self.storage_options or storage_options,
             target_path,
             partition_cols=partition_cols,
             **kwargs,
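
The rewritten block only builds partition columns when writing to a directory. A standalone pandas sketch of that derivation (using strftime instead of DatetimeIndex.format, with an illustrative target directory):

    import pandas as pd

    df = pd.DataFrame(
        {
            "value": [1, 2],
            "ts": pd.to_datetime(["2023-08-01 10:00", "2023-09-02 11:30"]),
        }
    )

    partition_cols = []
    for unit, fmt in [("year", "%Y"), ("month", "%m"), ("day", "%d"), ("hour", "%H")]:
        partition_cols.append(unit)
        df[unit] = df["ts"].dt.strftime(fmt)
        if unit == "day":  # stop at the configured time_partitioning_granularity
            break

    # Writes a year=/month=/day= directory tree (requires pyarrow)
    df.to_parquet("events_dir", partition_cols=partition_cols)
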
@@ -690,7 +696,7 @@ class BaseStoreTarget(DataTargetBase):
         # options used in spark.read.load(**options)
         raise NotImplementedError()

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
         return df

     def get_dask_options(self):
@@ -924,6 +930,37 @@ class ParquetTarget(BaseStoreTarget):
             return self.path.endswith(".parquet") or self.path.endswith(".pq")
         return False

+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
+        # If partitioning by time, add the necessary columns
+        if (
+            timestamp_key
+            and isinstance(spark_options, dict)
+            and "partitionBy" in spark_options
+        ):
+            from pyspark.sql.functions import (
+                dayofmonth,
+                hour,
+                minute,
+                month,
+                second,
+                year,
+            )
+
+            time_unit_to_op = {
+                "year": year,
+                "month": month,
+                "day": dayofmonth,
+                "hour": hour,
+                "minute": minute,
+                "second": second,
+            }
+            timestamp_col = df[timestamp_key]
+            for partition in spark_options["partitionBy"]:
+                if partition not in df.columns and partition in time_unit_to_op:
+                    op = time_unit_to_op[partition]
+                    df = df.withColumn(partition, op(timestamp_col))
+        return df
+

 class CSVTarget(BaseStoreTarget):
     kind = TargetTypes.csv
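
For context, a standalone PySpark sketch (not mlrun code) of what the new ParquetTarget.prepare_spark_df enables: derive the time-unit columns from the timestamp and hand them to partitionBy (paths and names are illustrative):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import dayofmonth, month, to_timestamp, year

    spark = SparkSession.builder.master("local[1]").appName("partition-demo").getOrCreate()
    df = spark.createDataFrame([("a", "2023-08-01 12:00:00")], ["key", "ts"])
    df = df.withColumn("ts", to_timestamp("ts"))

    # Derive the partition columns from the timestamp, as prepare_spark_df does
    for name, op in [("year", year), ("month", month), ("day", dayofmonth)]:
        if name not in df.columns:
            df = df.withColumn(name, op(df["ts"]))

    # Write a year=/month=/day= directory tree
    df.write.mode("overwrite").partitionBy("year", "month", "day").parquet("/tmp/partition_demo")
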
@@ -973,7 +1010,7 @@ class CSVTarget(BaseStoreTarget):
             "header": "true",
         }

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs

         for col_name, col_type in df.dtypes:
@@ -1067,7 +1104,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             **self.attributes,
         )

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
@@ -1139,7 +1176,7 @@ class NoSqlTarget(NoSqlBaseTarget):
             spark_options["columnUpdate"] = True
         return spark_options

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col

         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
@@ -1232,7 +1269,7 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         endpoint, uri = self._get_server_endpoint()
         return endpoint

-    def prepare_spark_df(self, df, key_columns):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         from pyspark.sql.functions import col

         spark_udf_directory = os.path.dirname(os.path.abspath(__file__))
@@ -1580,16 +1617,6 @@ class SQLTarget(BaseStoreTarget):
         :param parse_dates : all the field to be parsed as timestamp.
         """

-        # Validate sqlalchemy (not installed by default):
-        try:
-            import sqlalchemy
-
-            self.sqlalchemy = sqlalchemy
-        except (ModuleNotFoundError, ImportError) as exc:
-            raise mlrun.errors.MLRunMissingDependencyError(
-                "Using 'SQLTarget' requires sqlalchemy package. Use pip install mlrun[sqlalchemy] to install it."
-            ) from exc
-
         create_according_to_data = False  # TODO: open for user
         if time_fields:
             warnings.warn(
@@ -1696,8 +1723,14 @@ class SQLTarget(BaseStoreTarget):
         time_column=None,
         **kwargs,
     ):
+        try:
+            import sqlalchemy
+
+        except (ModuleNotFoundError, ImportError) as exc:
+            self._raise_sqlalchemy_import_error(exc)
+
         db_path, table_name, _, _, _, _ = self._parse_url()
-        engine = self.sqlalchemy.create_engine(db_path)
+        engine = sqlalchemy.create_engine(db_path)
         parse_dates: Optional[List[str]] = self.attributes.get("parse_dates")
         with engine.connect() as conn:
             query, parse_dates = _generate_sql_query_with_time_filter(
@@ -1721,6 +1754,12 @@ class SQLTarget(BaseStoreTarget):
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
+        try:
+            import sqlalchemy
+
+        except (ModuleNotFoundError, ImportError) as exc:
+            self._raise_sqlalchemy_import_error(exc)
+
         self._create_sql_table()

         if hasattr(df, "rdd"):
@@ -1735,7 +1774,7 @@ class SQLTarget(BaseStoreTarget):
                 _,
             ) = self._parse_url()
             create_according_to_data = bool(create_according_to_data)
-            engine = self.sqlalchemy.create_engine(
+            engine = sqlalchemy.create_engine(
                 db_path,
             )
             connection = engine.connect()
@@ -1760,28 +1799,34 @@ class SQLTarget(BaseStoreTarget):
             primary_key,
             create_table,
         ) = self._parse_url()
+        try:
+            import sqlalchemy
+
+        except (ModuleNotFoundError, ImportError) as exc:
+            self._raise_sqlalchemy_import_error(exc)
+
         try:
             primary_key = ast.literal_eval(primary_key)
             primary_key_for_check = primary_key
         except Exception:
             primary_key_for_check = [primary_key]
-        engine = self.sqlalchemy.create_engine(db_path)
+        engine = sqlalchemy.create_engine(db_path)
         with engine.connect() as conn:
-            metadata = self.sqlalchemy.MetaData()
+            metadata = sqlalchemy.MetaData()
             table_exists = engine.dialect.has_table(conn, table_name)
             if not table_exists and not create_table:
                 raise ValueError(f"Table named {table_name} is not exist")

            elif not table_exists and create_table:
                 TYPE_TO_SQL_TYPE = {
-                    int: self.sqlalchemy.Integer,
-                    str: self.sqlalchemy.String(self.attributes.get("varchar_len")),
-                    datetime.datetime: self.sqlalchemy.dialects.mysql.DATETIME(fsp=6),
-                    pd.Timestamp: self.sqlalchemy.dialects.mysql.DATETIME(fsp=6),
-                    bool: self.sqlalchemy.Boolean,
-                    float: self.sqlalchemy.Float,
-                    datetime.timedelta: self.sqlalchemy.Interval,
-                    pd.Timedelta: self.sqlalchemy.Interval,
+                    int: sqlalchemy.Integer,
+                    str: sqlalchemy.String(self.attributes.get("varchar_len")),
+                    datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
+                    pd.Timestamp: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
+                    bool: sqlalchemy.Boolean,
+                    float: sqlalchemy.Float,
+                    datetime.timedelta: sqlalchemy.Interval,
+                    pd.Timedelta: sqlalchemy.Interval,
                 }
                 # creat new table with the given name
                 columns = []
@@ -1790,12 +1835,12 @@ class SQLTarget(BaseStoreTarget):
                 if col_type is None:
                     raise TypeError(f"{col_type} unsupported type")
                 columns.append(
-                    self.sqlalchemy.Column(
+                    sqlalchemy.Column(
                         col, col_type, primary_key=(col in primary_key_for_check)
                     )
                 )

-            self.sqlalchemy.Table(table_name, metadata, *columns)
+            sqlalchemy.Table(table_name, metadata, *columns)
             metadata.create_all(engine)
             if_exists = "append"
             self.path = (
@@ -1804,6 +1849,12 @@ class SQLTarget(BaseStoreTarget):
             )
             conn.close()

+    @staticmethod
+    def _raise_sqlalchemy_import_error(exc):
+        raise mlrun.errors.MLRunMissingDependencyError(
+            "Using 'SQLTarget' requires sqlalchemy package. Use pip install mlrun[sqlalchemy] to install it."
+        ) from exc
+

 kind_to_driver = {
     TargetTypes.parquet: ParquetTarget,
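
The eager constructor-time check is replaced by per-call imports that funnel into _raise_sqlalchemy_import_error. A generic sketch of that pattern (not mlrun code):

    def _load_sqlalchemy():
        # Defer the optional dependency to the call site and raise an actionable error
        try:
            import sqlalchemy
        except (ModuleNotFoundError, ImportError) as exc:
            raise RuntimeError(
                "SQLTarget requires sqlalchemy; install it with: pip install mlrun[sqlalchemy]"
            ) from exc
        return sqlalchemy
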
mlrun/db/httpdb.py CHANGED
@@ -18,6 +18,7 @@ import tempfile
 import time
 import traceback
 import typing
+import warnings
 from datetime import datetime, timedelta
 from os import path, remove
 from typing import Dict, List, Optional, Union
@@ -1411,6 +1412,8 @@ class HTTPRunDB(RunDBInterface):
         namespace=None,
         artifact_path=None,
         ops=None,
+        # TODO: deprecated, remove in 1.6.0
+        ttl=None,
         cleanup_ttl=None,
     ):
         """Submit a KFP pipeline for execution.
@@ -1423,9 +1426,18 @@ class HTTPRunDB(RunDBInterface):
         :param namespace: Kubernetes namespace to execute the pipeline in.
         :param artifact_path: A path to artifacts used by this pipeline.
         :param ops: Transformers to apply on all ops in the pipeline.
+        :param ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the workflow
+                    and all its resources are deleted) (deprecated, use cleanup_ttl instead)
         :param cleanup_ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the
                             workflow and all its resources are deleted)
         """
+        if ttl:
+            warnings.warn(
+                "'ttl' is deprecated, use 'cleanup_ttl' instead. "
+                "This will be removed in 1.6.0",
+                # TODO: Remove this in 1.6.0
+                FutureWarning,
+            )

         if isinstance(pipeline, str):
             pipe_file = pipeline
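
A hedged usage sketch for callers: pass cleanup_ttl instead of the deprecated ttl to avoid the new FutureWarning (project and workflow file names are illustrative; keyword arguments are used because the full signature is outside the hunk):

    import mlrun

    db = mlrun.get_run_db()
    run_id = db.submit_pipeline(
        project="my-project",
        pipeline="workflow.yaml",
        cleanup_ttl=3600,  # seconds to keep workflow resources after completion
    )
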
@@ -1433,7 +1445,7 @@ class HTTPRunDB(RunDBInterface):
             pipe_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=False).name
         conf = new_pipe_metadata(
             artifact_path=artifact_path,
-            cleanup_ttl=cleanup_ttl,
+            cleanup_ttl=cleanup_ttl or ttl,
             op_transformers=ops,
         )
         kfp.compiler.Compiler().compile(
@@ -1471,15 +1483,17 @@ class HTTPRunDB(RunDBInterface):
                 headers=headers,
             )
         except OSError as err:
-            logger.error(f"error cannot submit pipeline: {err_to_str(err)}")
-            raise OSError(f"error: cannot cannot submit pipeline, {err_to_str(err)}")
+            logger.error("Error: Cannot submit pipeline", err=err_to_str(err))
+            raise OSError(f"Error: Cannot submit pipeline, {err_to_str(err)}")

         if not resp.ok:
-            logger.error(f"bad resp!!\n{resp.text}")
-            raise ValueError(f"bad submit pipeline response, {resp.text}")
+            logger.error("Failed to submit pipeline", respones_text=resp.text)
+            raise ValueError(f"Failed to submit pipeline, {resp.text}")

         resp = resp.json()
-        logger.info(f"submitted pipeline {resp['name']} id={resp['id']}")
+        logger.info(
+            "Pipeline submitted successfully", pipeline_name=resp["name"], id=resp["id"]
+        )
         return resp["id"]

     def list_pipelines(
mlrun/feature_store/__init__.py CHANGED
@@ -20,6 +20,7 @@ __all__ = [
     "ingest",
     "preview",
     "deploy_ingestion_service",
+    "deploy_ingestion_service_v2",
     "delete_feature_set",
     "delete_feature_vector",
     "get_feature_set",
@@ -41,6 +42,7 @@ from .api import (
     delete_feature_set,
     delete_feature_vector,
     deploy_ingestion_service,
+    deploy_ingestion_service_v2,
     get_feature_set,
     get_feature_vector,
     get_offline_features,