mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (160)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +23 -111
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +169 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +36 -253
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +46 -42
  10. mlrun/artifacts/model.py +9 -141
  11. mlrun/artifacts/plots.py +14 -375
  12. mlrun/common/constants.py +65 -3
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
  15. mlrun/common/formatters/base.py +113 -0
  16. mlrun/common/formatters/function.py +46 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +10 -5
  21. mlrun/common/schemas/alert.py +92 -11
  22. mlrun/common/schemas/api_gateway.py +56 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +2 -0
  25. mlrun/common/schemas/client_spec.py +1 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/model_monitoring/__init__.py +15 -3
  29. mlrun/common/schemas/model_monitoring/constants.py +58 -7
  30. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  31. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  32. mlrun/common/schemas/pipeline.py +0 -9
  33. mlrun/common/schemas/project.py +5 -11
  34. mlrun/common/types.py +1 -0
  35. mlrun/config.py +30 -9
  36. mlrun/data_types/to_pandas.py +9 -9
  37. mlrun/datastore/base.py +41 -9
  38. mlrun/datastore/datastore.py +6 -2
  39. mlrun/datastore/datastore_profile.py +56 -4
  40. mlrun/datastore/inmem.py +2 -2
  41. mlrun/datastore/redis.py +2 -2
  42. mlrun/datastore/s3.py +5 -0
  43. mlrun/datastore/sources.py +147 -7
  44. mlrun/datastore/store_resources.py +7 -7
  45. mlrun/datastore/targets.py +110 -42
  46. mlrun/datastore/utils.py +42 -0
  47. mlrun/db/base.py +54 -10
  48. mlrun/db/httpdb.py +282 -79
  49. mlrun/db/nopdb.py +52 -10
  50. mlrun/errors.py +11 -0
  51. mlrun/execution.py +26 -9
  52. mlrun/feature_store/__init__.py +0 -2
  53. mlrun/feature_store/api.py +12 -47
  54. mlrun/feature_store/feature_set.py +9 -0
  55. mlrun/feature_store/feature_vector.py +8 -0
  56. mlrun/feature_store/ingestion.py +7 -6
  57. mlrun/feature_store/retrieval/base.py +9 -4
  58. mlrun/feature_store/retrieval/conversion.py +9 -9
  59. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  60. mlrun/feature_store/retrieval/job.py +9 -3
  61. mlrun/feature_store/retrieval/local_merger.py +2 -0
  62. mlrun/feature_store/retrieval/spark_merger.py +16 -0
  63. mlrun/frameworks/__init__.py +6 -0
  64. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  65. mlrun/frameworks/parallel_coordinates.py +2 -1
  66. mlrun/frameworks/tf_keras/__init__.py +4 -1
  67. mlrun/k8s_utils.py +10 -11
  68. mlrun/launcher/base.py +4 -3
  69. mlrun/launcher/client.py +5 -3
  70. mlrun/launcher/local.py +12 -2
  71. mlrun/launcher/remote.py +9 -2
  72. mlrun/lists.py +6 -2
  73. mlrun/model.py +47 -21
  74. mlrun/model_monitoring/__init__.py +1 -1
  75. mlrun/model_monitoring/api.py +42 -18
  76. mlrun/model_monitoring/application.py +5 -305
  77. mlrun/model_monitoring/applications/__init__.py +11 -0
  78. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  79. mlrun/model_monitoring/applications/base.py +280 -0
  80. mlrun/model_monitoring/applications/context.py +214 -0
  81. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  82. mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
  83. mlrun/model_monitoring/applications/results.py +99 -0
  84. mlrun/model_monitoring/controller.py +3 -1
  85. mlrun/model_monitoring/db/__init__.py +2 -0
  86. mlrun/model_monitoring/db/stores/__init__.py +0 -2
  87. mlrun/model_monitoring/db/stores/base/store.py +22 -37
  88. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
  89. mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
  90. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
  91. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
  92. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
  93. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
  94. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  95. mlrun/model_monitoring/db/tsdb/base.py +316 -0
  96. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  97. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  98. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  99. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  100. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +401 -0
  101. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  102. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  103. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +658 -0
  104. mlrun/model_monitoring/evidently_application.py +6 -118
  105. mlrun/model_monitoring/helpers.py +63 -1
  106. mlrun/model_monitoring/model_endpoint.py +3 -2
  107. mlrun/model_monitoring/stream_processing.py +57 -216
  108. mlrun/model_monitoring/writer.py +134 -124
  109. mlrun/package/__init__.py +13 -1
  110. mlrun/package/packagers/__init__.py +6 -1
  111. mlrun/package/utils/_formatter.py +2 -2
  112. mlrun/platforms/__init__.py +10 -9
  113. mlrun/platforms/iguazio.py +21 -202
  114. mlrun/projects/operations.py +24 -12
  115. mlrun/projects/pipelines.py +79 -102
  116. mlrun/projects/project.py +271 -103
  117. mlrun/render.py +15 -14
  118. mlrun/run.py +16 -46
  119. mlrun/runtimes/__init__.py +6 -3
  120. mlrun/runtimes/base.py +14 -7
  121. mlrun/runtimes/daskjob.py +1 -0
  122. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  123. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  124. mlrun/runtimes/funcdoc.py +0 -28
  125. mlrun/runtimes/kubejob.py +2 -1
  126. mlrun/runtimes/local.py +12 -3
  127. mlrun/runtimes/mpijob/__init__.py +0 -20
  128. mlrun/runtimes/mpijob/v1.py +1 -1
  129. mlrun/runtimes/nuclio/api_gateway.py +194 -84
  130. mlrun/runtimes/nuclio/application/application.py +170 -8
  131. mlrun/runtimes/nuclio/function.py +39 -49
  132. mlrun/runtimes/pod.py +16 -36
  133. mlrun/runtimes/remotesparkjob.py +9 -3
  134. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  135. mlrun/runtimes/utils.py +6 -45
  136. mlrun/serving/__init__.py +8 -1
  137. mlrun/serving/server.py +2 -1
  138. mlrun/serving/states.py +51 -8
  139. mlrun/serving/utils.py +19 -11
  140. mlrun/serving/v2_serving.py +5 -1
  141. mlrun/track/tracker.py +2 -1
  142. mlrun/utils/async_http.py +25 -5
  143. mlrun/utils/helpers.py +157 -83
  144. mlrun/utils/logger.py +39 -7
  145. mlrun/utils/notifications/notification/__init__.py +14 -9
  146. mlrun/utils/notifications/notification/base.py +1 -1
  147. mlrun/utils/notifications/notification/slack.py +34 -7
  148. mlrun/utils/notifications/notification/webhook.py +1 -1
  149. mlrun/utils/notifications/notification_pusher.py +147 -16
  150. mlrun/utils/regex.py +9 -0
  151. mlrun/utils/v3io_clients.py +0 -1
  152. mlrun/utils/version/version.json +2 -2
  153. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/METADATA +14 -6
  154. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/RECORD +158 -138
  155. mlrun/kfpops.py +0 -865
  156. mlrun/platforms/other.py +0 -305
  157. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/LICENSE +0 -0
  158. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/WHEEL +0 -0
  159. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/entry_points.txt +0 -0
  160. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/top_level.txt +0 -0
mlrun/db/nopdb.py CHANGED
@@ -16,6 +16,9 @@
 import datetime
 from typing import Optional, Union
 
+import mlrun.alerts
+import mlrun.common.formatters
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.errors
 
@@ -79,7 +82,10 @@ class NopDB(RunDBInterface):
         uid: Optional[Union[str, list[str]]] = None,
         project: Optional[str] = None,
         labels: Optional[Union[str, list[str]]] = None,
-        state: Optional[str] = None,
+        state: Optional[
+            mlrun.common.runtimes.constants.RunStates
+        ] = None,  # Backward compatibility
+        states: Optional[list[mlrun.common.runtimes.constants.RunStates]] = None,
         sort: bool = True,
         last: int = 0,
         iter: bool = False,
@@ -128,7 +134,18 @@ class NopDB(RunDBInterface):
     ):
         pass
 
-    def del_artifact(self, key, tag="", project="", tree=None, uid=None):
+    def del_artifact(
+        self,
+        key,
+        tag="",
+        project="",
+        tree=None,
+        uid=None,
+        deletion_strategy: mlrun.common.schemas.artifact.ArtifactsDeletionStrategies = (
+            mlrun.common.schemas.artifact.ArtifactsDeletionStrategies.metadata_only
+        ),
+        secrets: dict = None,
+    ):
         pass
 
     def del_artifacts(self, name="", project="", tag="", labels=None):
@@ -196,7 +213,7 @@ class NopDB(RunDBInterface):
     def list_projects(
         self,
         owner: str = None,
-        format_: mlrun.common.schemas.ProjectsFormat = mlrun.common.schemas.ProjectsFormat.name_only,
+        format_: mlrun.common.formatters.ProjectFormat = mlrun.common.formatters.ProjectFormat.name_only,
         labels: list[str] = None,
         state: mlrun.common.schemas.ProjectState = None,
     ) -> mlrun.common.schemas.ProjectsOutput:
@@ -351,8 +368,8 @@ class NopDB(RunDBInterface):
         namespace: str = None,
         timeout: int = 30,
         format_: Union[
-            str, mlrun.common.schemas.PipelinesFormat
-        ] = mlrun.common.schemas.PipelinesFormat.summary,
+            str, mlrun.common.formatters.PipelineFormat
+        ] = mlrun.common.formatters.PipelineFormat.summary,
         project: str = None,
     ):
         pass
@@ -365,8 +382,8 @@ class NopDB(RunDBInterface):
         page_token: str = "",
         filter_: str = "",
         format_: Union[
-            str, mlrun.common.schemas.PipelinesFormat
-        ] = mlrun.common.schemas.PipelinesFormat.metadata_only,
+            str, mlrun.common.formatters.PipelineFormat
+        ] = mlrun.common.formatters.PipelineFormat.metadata_only,
         page_size: int = None,
     ) -> mlrun.common.schemas.PipelinesOutput:
         pass
@@ -508,8 +525,11 @@ class NopDB(RunDBInterface):
 
     def store_api_gateway(
         self,
-        project: str,
-        api_gateway: mlrun.runtimes.nuclio.APIGateway,
+        api_gateway: Union[
+            mlrun.common.schemas.APIGateway,
+            mlrun.runtimes.nuclio.api_gateway.APIGateway,
+        ],
+        project: str = None,
     ) -> mlrun.common.schemas.APIGateway:
         pass
 
@@ -658,6 +678,22 @@ class NopDB(RunDBInterface):
     ) -> None:
         pass
 
+    def disable_model_monitoring(
+        self,
+        project: str,
+        delete_resources: bool = True,
+        delete_stream_function: bool = False,
+        delete_histogram_data_drift_app: bool = True,
+        delete_user_applications: bool = False,
+        user_application_list: list[str] = None,
+    ) -> bool:
+        pass
+
+    def delete_model_monitoring_function(
+        self, project: str, functions: list[str]
+    ) -> bool:
+        pass
+
     def deploy_histogram_data_drift_app(
         self, project: str, image: str = "mlrun/mlrun"
     ) -> None:
@@ -671,7 +707,7 @@ class NopDB(RunDBInterface):
     def store_alert_config(
         self,
         alert_name: str,
-        alert_data: Union[dict, mlrun.common.schemas.AlertConfig],
+        alert_data: Union[dict, mlrun.alerts.alert.AlertConfig],
         project="",
     ):
         pass
@@ -687,3 +723,9 @@ class NopDB(RunDBInterface):
 
     def reset_alert_config(self, alert_name: str, project=""):
         pass
+
+    def get_alert_template(self, template_name: str):
+        pass
+
+    def list_alert_templates(self):
+        pass
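The `del_artifact` signature above now carries a deletion strategy; this no-op stub only mirrors the run-DB interface. A minimal sketch of how a caller might use it against a real deployment (where `mlrun.get_run_db()` resolves to `HTTPRunDB`); the project and artifact key are illustrative:

    import mlrun
    from mlrun.common.schemas.artifact import ArtifactsDeletionStrategies

    db = mlrun.get_run_db()  # HTTPRunDB on a real deployment; NopDB when no DB is configured
    # delete only the artifact record and keep the underlying data (illustrative names)
    db.del_artifact(
        key="my-model",
        project="my-project",
        deletion_strategy=ArtifactsDeletionStrategies.metadata_only,
    )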
mlrun/errors.py CHANGED
@@ -155,6 +155,10 @@ class MLRunNotFoundError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.NOT_FOUND.value
 
 
+class MLRunPaginationEndOfResultsError(MLRunNotFoundError):
+    pass
+
+
 class MLRunBadRequestError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.BAD_REQUEST.value
 
@@ -183,6 +187,10 @@ class MLRunInternalServerError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.INTERNAL_SERVER_ERROR.value
 
 
+class MLRunNotImplementedServerError(MLRunHTTPStatusError):
+    error_status_code = HTTPStatus.NOT_IMPLEMENTED.value
+
+
 class MLRunServiceUnavailableError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.SERVICE_UNAVAILABLE.value
 
@@ -234,4 +242,7 @@ STATUS_ERRORS = {
     HTTPStatus.PRECONDITION_FAILED.value: MLRunPreconditionFailedError,
     HTTPStatus.INTERNAL_SERVER_ERROR.value: MLRunInternalServerError,
     HTTPStatus.SERVICE_UNAVAILABLE.value: MLRunServiceUnavailableError,
+    HTTPStatus.NOT_IMPLEMENTED.value: MLRunNotImplementedServerError,
 }
+
+EXPECTED_ERRORS = (MLRunPaginationEndOfResultsError,)
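A minimal sketch of how the additions above behave, based only on the classes and mappings in this diff: HTTP 501 responses now resolve to the new server error through STATUS_ERRORS, and pagination exhaustion is a "not found" subtype that callers can treat as expected rather than as a failure:

    from http import HTTPStatus

    import mlrun.errors

    # 501 responses map to the new exception class
    assert (
        mlrun.errors.STATUS_ERRORS[HTTPStatus.NOT_IMPLEMENTED.value]
        is mlrun.errors.MLRunNotImplementedServerError
    )

    # end-of-pagination is an expected condition, not a failure
    try:
        raise mlrun.errors.MLRunPaginationEndOfResultsError("no more pages")
    except mlrun.errors.EXPECTED_ERRORS:
        pass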
mlrun/execution.py CHANGED
@@ -22,6 +22,7 @@ import yaml
 from dateutil import parser
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.artifacts import ModelArtifact
 from mlrun.datastore.store_resources import get_store_resource
 from mlrun.errors import MLRunInvalidArgumentError
@@ -110,6 +111,7 @@ class MLClientCtx:
 
         self._project_object = None
         self._allow_empty_resources = None
+        self._reset_on_run = None
 
     def __enter__(self):
         return self
@@ -129,7 +131,9 @@ class MLClientCtx:
     @property
     def tag(self):
         """Run tag (uid or workflow id if exists)"""
-        return self._labels.get("workflow") or self._uid
+        return (
+            self._labels.get(mlrun_constants.MLRunInternalLabels.workflow) or self._uid
+        )
 
     @property
     def state(self):
@@ -329,8 +333,10 @@ class MLClientCtx:
             "uri": uri,
             "owner": get_in(self._labels, "owner"),
         }
-        if "workflow" in self._labels:
-            resp["workflow"] = self._labels["workflow"]
+        if mlrun_constants.MLRunInternalLabels.workflow in self._labels:
+            resp[mlrun_constants.MLRunInternalLabels.workflow] = self._labels[
+                mlrun_constants.MLRunInternalLabels.workflow
+            ]
         return resp
 
     @classmethod
@@ -384,6 +390,7 @@ class MLClientCtx:
         self._state_thresholds = spec.get(
             "state_thresholds", self._state_thresholds
         )
+        self._reset_on_run = spec.get("reset_on_run", self._reset_on_run)
 
         self._init_dbs(rundb)
 
@@ -396,7 +403,7 @@ class MLClientCtx:
             self._set_input(k, v)
 
         if host and not is_api:
-            self.set_label("host", host)
+            self.set_label(mlrun_constants.MLRunInternalLabels.host, host)
 
         start = get_in(attrs, "status.start_time")
         if start:
@@ -990,10 +997,15 @@ class MLClientCtx:
         # If it's a OpenMPI job, get the global rank and compare to the logging rank (worker) set in MLRun's
        # configuration:
         labels = self.labels
-        if "host" in labels and labels.get("kind", "job") == "mpijob":
+        if (
+            mlrun_constants.MLRunInternalLabels.host in labels
+            and labels.get(mlrun_constants.MLRunInternalLabels.kind, "job") == "mpijob"
+        ):
             # The host (pod name) of each worker is created by k8s, and by default it uses the rank number as the id in
             # the following template: ...-worker-<rank>
-            rank = int(labels["host"].rsplit("-", 1)[1])
+            rank = int(
+                labels[mlrun_constants.MLRunInternalLabels.host].rsplit("-", 1)[1]
+            )
             return rank == mlrun.mlconf.packagers.logging_worker
 
         # Single worker is always the logging worker:
@@ -1029,9 +1041,14 @@ class MLClientCtx:
             "status.last_update": to_date_str(self._last_update),
         }
 
-        # completion of runs is not decided by the execution as there may be
-        # multiple executions for a single run (e.g. mpi)
-        if self._state != "completed":
+        # Completion of runs is decided by the API runs monitoring as there may be
+        # multiple executions for a single run (e.g. mpi).
+        # For kinds that are not monitored by the API (local) we allow changing the state.
+        run_kind = self.labels.get(mlrun_constants.MLRunInternalLabels.kind, "")
+        if (
+            mlrun.runtimes.RuntimeKinds.is_local_runtime(run_kind)
+            or self._state != "completed"
+        ):
             struct["status.state"] = self._state
 
         if self.is_logging_worker():
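The `is_logging_worker` change keeps the same rank-from-pod-name logic, only reading the host through the shared label constant. A standalone sketch of that parsing, with an illustrative pod name:

    # OpenMPI worker pods are named "...-worker-<rank>", so the global rank is the final suffix
    host = "train-mpijob-worker-3"  # illustrative value of the "host" label
    rank = int(host.rsplit("-", 1)[1])
    assert rank == 3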
mlrun/feature_store/__init__.py CHANGED
@@ -19,7 +19,6 @@ __all__ = [
     "get_online_feature_service",
     "ingest",
     "preview",
-    "deploy_ingestion_service",
     "deploy_ingestion_service_v2",
     "delete_feature_set",
     "delete_feature_vector",
@@ -41,7 +40,6 @@ from ..features import Entity, Feature
 from .api import (
     delete_feature_set,
     delete_feature_vector,
-    deploy_ingestion_service,
     deploy_ingestion_service_v2,
     get_feature_set,
     get_feature_vector,
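With `deploy_ingestion_service` removed from the public API (its body is dropped from api.py below), callers move to the v2 entry point. A hedged sketch assuming an existing feature set and an HTTP source; per the `_deploy_ingestion_service_v2` return shown below, the call yields the deployment endpoint and the deployed function object (URI is illustrative):

    import mlrun.feature_store as fstore
    from mlrun.datastore.sources import HTTPSource

    fset = fstore.get_feature_set("store://feature-sets/my-project/sales")  # illustrative URI
    endpoint, function = fstore.deploy_ingestion_service_v2(fset, source=HTTPSource())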
mlrun/feature_store/api.py CHANGED
@@ -113,6 +113,7 @@ def get_offline_features(
     order_by: Union[str, list[str]] = None,
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters: list = None,
 ):
     """retrieve offline feature vector results
 
@@ -175,6 +176,13 @@ def get_offline_features(
                                        By default, the filter executes on the timestamp_key of each feature set.
                                        Note: the time filtering is performed on each feature set before the
                                        merge process using start_time and end_time params.
+    :param additional_filters: List of additional_filter conditions as tuples.
+                               Each tuple should be in the format (column_name, operator, value).
+                               Supported operators: "=", ">=", "<=", ">", "<".
+                               Example: [("Product", "=", "Computer")]
+                               For all supported filters, please see:
+                               https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
+
 
     """
     return _get_offline_features(
@@ -194,6 +202,7 @@ def get_offline_features(
         order_by,
         spark_service,
         timestamp_for_filtering,
+        additional_filters,
     )
 
 
@@ -214,6 +223,7 @@ def _get_offline_features(
     order_by: Union[str, list[str]] = None,
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters=None,
 ) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(
@@ -252,6 +262,7 @@ def _get_offline_features(
             start_time=start_time,
             end_time=end_time,
             timestamp_for_filtering=timestamp_for_filtering,
+            additional_filters=additional_filters,
         )
 
     merger = merger_engine(feature_vector, **(engine_args or {}))
@@ -267,6 +278,7 @@ def _get_offline_features(
         update_stats=update_stats,
         query=query,
         order_by=order_by,
+        additional_filters=additional_filters,
     )
 
 
@@ -1005,53 +1017,6 @@ def _deploy_ingestion_service_v2(
     return function.deploy(), function
 
 
-@deprecated(
-    version="1.5.0",
-    reason="'deploy_ingestion_service' will be removed in 1.7.0, use 'deploy_ingestion_service_v2' instead",
-    category=FutureWarning,
-)
-def deploy_ingestion_service(
-    featureset: Union[FeatureSet, str],
-    source: DataSource = None,
-    targets: list[DataTargetBase] = None,
-    name: str = None,
-    run_config: RunConfig = None,
-    verbose=False,
-) -> str:
-    """Start real-time ingestion service using nuclio function
-
-    Deploy a real-time function implementing feature ingestion pipeline
-    the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
-
-    the `run_config` parameter allow specifying the function and job configuration,
-    see: :py:class:`~mlrun.feature_store.RunConfig`
-
-    example::
-
-        source = HTTPSource()
-        func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
-        config = RunConfig(function=func)
-        my_set.deploy_ingestion_service(source, run_config=config)
-
-    :param featureset:   feature set object or uri
-    :param source:       data source object describing the online or offline source
-    :param targets:      list of data target objects
-    :param name:         name for the job/function
-    :param run_config:   service runtime configuration (function object/uri, resources, etc..)
-    :param verbose:      verbose log
-
-    :return: URL to access the deployed ingestion service
-    """
-    endpoint, _ = featureset.deploy_ingestion_service(
-        source=source,
-        targets=targets,
-        name=name,
-        run_config=run_config,
-        verbose=verbose,
-    )
-    return endpoint
-
-
 def _ingest_with_spark(
     spark=None,
     featureset: Union[FeatureSet, str] = None,
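The new `additional_filters` argument threads from `get_offline_features` down to the merger engines. A minimal usage sketch; the feature-vector URI is illustrative, and against a real project the call returns an `OfflineVectorResponse`:

    import mlrun.feature_store as fstore

    resp = fstore.get_offline_features(
        "store://feature-vectors/my-project/sales-vector",  # illustrative URI
        additional_filters=[("Product", "=", "Computer")],  # (column, operator, value) tuples
    )
    df = resp.to_dataframe()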
mlrun/feature_store/feature_set.py CHANGED
@@ -917,6 +917,7 @@ class FeatureSet(ModelObj):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return featureset (offline) data as dataframe
@@ -928,6 +929,12 @@ class FeatureSet(ModelObj):
         :param end_time:    filter by end time
         :param time_column: specify the time column name in the file
         :param kwargs:      additional reader (csv, parquet, ..) args
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         :return: DataFrame
         """
         entities = list(self.spec.entities.keys())
@@ -946,6 +953,7 @@ class FeatureSet(ModelObj):
                 start_time=start_time,
                 end_time=end_time,
                 time_field=time_column,
+                additional_filters=additional_filters,
                 **kwargs,
             )
             # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -965,6 +973,7 @@ class FeatureSet(ModelObj):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return result
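`FeatureSet.to_dataframe` accepts the same filter tuples directly; a short sketch with illustrative names, assuming the feature set already exists:

    import mlrun.feature_store as fstore

    fset = fstore.get_feature_set("store://feature-sets/my-project/sales")  # illustrative URI
    df = fset.to_dataframe(
        columns=["Product", "Price"],
        additional_filters=[("Price", ">=", 100)],  # pushed down to the offline reader
    )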
mlrun/feature_store/feature_vector.py CHANGED
@@ -741,6 +741,7 @@ class FeatureVector(ModelObj):
         order_by: Union[str, list[str]] = None,
         spark_service: str = None,
         timestamp_for_filtering: Union[str, dict[str, str]] = None,
+        additional_filters: list = None,
     ):
         """retrieve offline feature vector results
 
@@ -797,6 +798,12 @@ class FeatureVector(ModelObj):
                                            By default, the filter executes on the timestamp_key of each feature set.
                                            Note: the time filtering is performed on each feature set before the
                                            merge process using start_time and end_time params.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
 
         """
 
@@ -817,6 +824,7 @@ class FeatureVector(ModelObj):
             order_by,
             spark_service,
             timestamp_for_filtering,
+            additional_filters,
         )
 
     def get_online_feature_service(
mlrun/feature_store/ingestion.py CHANGED
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
         out_path=featureset.spec.output_path,
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-ingest").set_label(
-        "feature-set", featureset.uri
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
     if run_config.owner:
-        task.set_label("owner", run_config.owner).set_label(
-            "v3io_user", run_config.owner
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
     # set run UID and save in the feature set status (linking the features et to the job)
     task.metadata.uid = uuid.uuid4().hex
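The label keys that were previously raw strings now come from the shared constants module. A tiny sketch of the same pattern outside the ingestion job (the task name is illustrative):

    import mlrun
    import mlrun.common.constants as mlrun_constants

    task = mlrun.new_task("ingest-sales")  # illustrative task
    task.set_label(mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest")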
mlrun/feature_store/retrieval/base.py CHANGED
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target
 
@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()
 
@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
                 feature_sets.append(None)
                 join_types.append(None)
 
-        filtered = False
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-                filtered = True
+                timestamp_filtered = True
 
             df = self._get_engine_df(
                 feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )
 
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                 new_columns.append((column, alias))
             self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not filtered and (start_time or end_time):
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args
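As the docstrings above note, the filter tuples follow the pyarrow.parquet filter convention. This standalone sketch (illustrative file path, independent of MLRun) shows what such a filter does at read time:

    import pandas as pd
    import pyarrow.parquet as pq

    pd.DataFrame({"Product": ["Computer", "Desk"], "Price": [1200, 300]}).to_parquet(
        "/tmp/sales.parquet"  # illustrative path
    )
    table = pq.read_table("/tmp/sales.parquet", filters=[("Product", "=", "Computer")])
    print(table.to_pandas())  # only the "Computer" row survives the filter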
mlrun/feature_store/retrieval/conversion.py CHANGED
@@ -79,10 +79,10 @@ class PandasConversionMixin:
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n %s\n"
+                    f"failed by the reason below:\n {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true." % str(e)
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False
@@ -92,7 +92,7 @@ class PandasConversionMixin:
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                     "reached the error below and will not continue because automatic fallback "
                     "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                    "false.\n %s" % str(e)
+                    f"false.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -158,7 +158,7 @@ class PandasConversionMixin:
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n %s" % str(e)
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -168,10 +168,10 @@ class PandasConversionMixin:
         column_counter = Counter(self.columns)
 
         dtype = [None] * len(self.schema)
-        for fieldIdx, field in enumerate(self.schema):
+        for field_idx, field in enumerate(self.schema):
             # For duplicate column name, we use `iloc` to access it.
             if column_counter[field.name] > 1:
-                pandas_col = pdf.iloc[:, fieldIdx]
+                pandas_col = pdf.iloc[:, field_idx]
             else:
                 pandas_col = pdf[field.name]
 
@@ -187,12 +187,12 @@ class PandasConversionMixin:
                 and field.nullable
                 and pandas_col.isnull().any()
             ):
-                dtype[fieldIdx] = pandas_type
+                dtype[field_idx] = pandas_type
             # Ensure we fall back to nullable numpy types, even when whole column is null:
             if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-                dtype[fieldIdx] = np.float64
+                dtype[field_idx] = np.float64
             if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                dtype[fieldIdx] = object
+                dtype[field_idx] = object
 
         df = pd.DataFrame()
         for index, t in enumerate(dtype):
mlrun/feature_store/retrieval/dask_merger.py CHANGED
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd
 
@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )
 
         return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py CHANGED
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
mlrun/feature_store/retrieval/local_merger.py CHANGED
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)