mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (291)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_vector.py
@@ -17,7 +17,7 @@ import typing
 from copy import copy
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List, Union
+from typing import Union
 
 import numpy as np
 import pandas as pd
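Note: the typing change above is the first of many in this diff — `typing.List`/`typing.Dict` annotations are replaced by the builtin generics of PEP 585 throughout, which requires Python 3.9+. A minimal sketch of the equivalence (the function and its names are illustrative, not from mlrun):

    def select_columns(columns: list[str], renames: dict[str, str]) -> list[str]:
        # builtin list/dict are subscriptable in annotations on Python >= 3.9,
        # so the typing.List/typing.Dict imports can be dropped
        return [renames.get(col, col) for col in columns]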
@@ -69,18 +69,16 @@ class FeatureVectorSpec(ModelObj):
         self._entity_fields: ObjectList = None
         self._entity_source: DataSource = None
         self._function: FunctionReference = None
-        self._relations: typing.Dict[str, ObjectDict] = None
+        self._relations: dict[str, ObjectDict] = None
         self._join_graph: JoinGraph = None
 
         self.description = description
-        self.features: List[str] = features or []
+        self.features: list[str] = features or []
         self.entity_source = entity_source
         self.entity_fields = entity_fields or []
         self.graph = graph
         self.join_graph = join_graph
-        self.relations: typing.Dict[str, typing.Dict[str, Union[Entity, str]]] = (
-            relations or {}
-        )
+        self.relations: dict[str, dict[str, Union[Entity, str]]] = relations or {}
         self.timestamp_field = timestamp_field
         self.label_feature = label_feature
         self.with_indexes = with_indexes
@@ -97,12 +95,12 @@ class FeatureVectorSpec(ModelObj):
         self._entity_source = self._verify_dict(source, "entity_source", DataSource)
 
     @property
-    def entity_fields(self) -> List[Feature]:
+    def entity_fields(self) -> list[Feature]:
         """the schema/metadata for the entity source fields"""
         return self._entity_fields
 
     @entity_fields.setter
-    def entity_fields(self, entity_fields: List[Feature]):
+    def entity_fields(self, entity_fields: list[Feature]):
         self._entity_fields = ObjectList.from_list(Feature, entity_fields)
 
     @property
@@ -125,14 +123,12 @@ class FeatureVectorSpec(ModelObj):
         self._function = self._verify_dict(function, "function", FunctionReference)
 
     @property
-    def relations(self) -> typing.Dict[str, ObjectDict]:
+    def relations(self) -> dict[str, ObjectDict]:
         """feature set relations dict"""
         return self._relations
 
     @relations.setter
-    def relations(
-        self, relations: typing.Dict[str, typing.Dict[str, Union[Entity, str]]]
-    ):
+    def relations(self, relations: dict[str, dict[str, Union[Entity, str]]]):
         temp_relations = {}
         for fs_name, relation in relations.items():
             for col, ent in relation.items():
@@ -179,29 +175,29 @@ class FeatureVectorStatus(ModelObj):
         self.stats = stats or {}
         self.index_keys = index_keys
         self.preview = preview or []
-        self.features: List[Feature] = features or []
+        self.features: list[Feature] = features or []
         self.run_uri = run_uri
         self.timestamp_key = timestamp_key
 
     @property
-    def targets(self) -> List[DataTarget]:
+    def targets(self) -> list[DataTarget]:
         """list of material storage targets + their status/path"""
         return self._targets
 
     @targets.setter
-    def targets(self, targets: List[DataTarget]):
+    def targets(self, targets: list[DataTarget]):
         self._targets = ObjectList.from_list(DataTarget, targets)
 
     def update_target(self, target: DataTarget):
         self._targets.update(target)
 
     @property
-    def features(self) -> List[Feature]:
+    def features(self) -> list[Feature]:
         """list of features (result of joining features from the source feature sets)"""
         return self._features
 
     @features.setter
-    def features(self, features: List[Feature]):
+    def features(self, features: list[Feature]):
         self._features = ObjectList.from_list(Feature, features)
 
 
@@ -378,7 +374,7 @@ class _JoinStep(ModelObj):
         name: str = None,
         left_step_name: str = None,
         right_step_name: str = None,
-        left_feature_set_names: Union[str, List[str]] = None,
+        left_feature_set_names: Union[str, list[str]] = None,
         right_feature_set_name: str = None,
         join_type: str = "inner",
         asof_join: bool = False,
@@ -388,7 +384,8 @@
         self.right_step_name = right_step_name
         self.left_feature_set_names = (
             left_feature_set_names
-            if isinstance(left_feature_set_names, list)
+            if left_feature_set_names is None
+            or isinstance(left_feature_set_names, list)
            else [left_feature_set_names]
        )
        self.right_feature_set_name = right_feature_set_name
@@ -402,7 +399,7 @@
         self,
         feature_set_objects: ObjectList,
         vector,
-        entity_rows_keys: List[str] = None,
+        entity_rows_keys: list[str] = None,
     ):
         if feature_set_objects[self.right_feature_set_name].is_connectable_to_df(
             entity_rows_keys
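Note: the `_JoinStep.__init__` hunk above is a behavior fix, not just cleanup: with only the `isinstance(..., list)` test, a `None` value fell through to the `else` branch and was wrapped as `[None]`. A standalone sketch of the corrected normalization pattern (names assumed):

    def normalize_names(names):
        # keep None as None, pass lists through, wrap a single string
        return names if names is None or isinstance(names, list) else [names]

    assert normalize_names(None) is None
    assert normalize_names("stocks") == ["stocks"]
    assert normalize_names(["stocks", "quotes"]) == ["stocks", "quotes"]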
@@ -482,21 +479,22 @@ class FeatureVector(ModelObj):
         description=None,
         with_indexes=None,
         join_graph: JoinGraph = None,
-        relations: typing.Dict[str, typing.Dict[str, Union[Entity, str]]] = None,
+        relations: dict[str, dict[str, Union[Entity, str]]] = None,
     ):
         """Feature vector, specify selected features, their metadata and material views
 
         example::
 
             import mlrun.feature_store as fstore
+
             features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
             vector = fstore.FeatureVector("my-vec", features)
 
             # get the vector as a dataframe
-            df = fstore.get_offline_features(vector).to_dataframe()
+            df = vector.get_offline_features().to_dataframe()
 
             # return an online/real-time feature service
-            svc = fstore.get_online_feature_service(vector, impute_policy={"*": "$mean"})
+            svc = vector.get_online_feature_service(impute_policy={"*": "$mean"})
             resp = svc.get([{"stock": "GOOG"}])
 
         :param name: List of names of targets to delete (default: delete all ingested targets)
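Note: the docstring edits above track the 1.7 move from module-level feature-store helpers to methods on the vector object itself. Roughly, assuming the feature sets behind the vector already exist:

    import mlrun.feature_store as fstore

    vector = fstore.FeatureVector("my-vec", ["quotes.bid", "stocks.*"])

    # was: fstore.get_offline_features(vector)
    df = vector.get_offline_features().to_dataframe()

    # was: fstore.get_online_feature_service(vector, impute_policy={"*": "$mean"})
    svc = vector.get_online_feature_service(impute_policy={"*": "$mean"})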
@@ -732,7 +730,7 @@ class FeatureVector(ModelObj):
         entity_timestamp_column: str = None,
         target: DataTargetBase = None,
         run_config: RunConfig = None,
-        drop_columns: List[str] = None,
+        drop_columns: list[str] = None,
         start_time: Union[str, datetime] = None,
         end_time: Union[str, datetime] = None,
         with_indexes: bool = False,
@@ -740,9 +738,9 @@
         engine: str = None,
         engine_args: dict = None,
         query: str = None,
-        order_by: Union[str, List[str]] = None,
+        order_by: Union[str, list[str]] = None,
         spark_service: str = None,
-        timestamp_for_filtering: Union[str, Dict[str, str]] = None,
+        timestamp_for_filtering: Union[str, dict[str, str]] = None,
     ):
         """retrieve offline feature vector results
 
@@ -827,7 +825,7 @@
         fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
         impute_policy: dict = None,
         update_stats: bool = False,
-        entity_keys: List[str] = None,
+        entity_keys: list[str] = None,
     ):
         """initialize and return online feature vector service api,
         returns :py:class:`~mlrun.feature_store.OnlineVectorService`
@@ -855,7 +853,7 @@
 
         Example::
 
-            svc = vector_uri.get_online_feature_service(entity_keys=['ticker'])
+            svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
             try:
                 resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
                 print(resp)
@@ -910,7 +908,7 @@ class OnlineVectorService:
         graph,
         index_columns,
         impute_policy: dict = None,
-        requested_columns: List[str] = None,
+        requested_columns: list[str] = None,
     ):
         self.vector = vector
         self.impute_policy = impute_policy or {}
@@ -966,7 +964,7 @@ class OnlineVectorService:
         """vector merger function status (ready, running, error)"""
         return "ready"
 
-    def get(self, entity_rows: List[Union[dict, list]], as_list=False):
+    def get(self, entity_rows: list[Union[dict, list]], as_list=False):
         """get feature vector given the provided entity inputs
 
         take a list of input vectors/rows and return a list of enriched feature vectors
mlrun/feature_store/ingestion.py
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
         out_path=featureset.spec.output_path,
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-ingest").set_label(
-        "feature-set", featureset.uri
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
     if run_config.owner:
-        task.set_label("owner", run_config.owner).set_label(
-            "v3io_user", run_config.owner
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
     # set run UID and save in the feature set status (linking the features et to the job)
     task.metadata.uid = uuid.uuid4().hex
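Note: label keys such as `job-type`, `owner`, and `v3io_user` are now read from `mlrun.common.constants.MLRunInternalLabels` (part of the new `mlrun/common/constants.py` listed above) instead of being string literals at each call site. A sketch of what that centralization presumably looks like — the attribute names appear in the hunks, but the values are an assumption (a behavior-preserving change would keep the old literals):

    class MLRunInternalLabels:
        # assumed values, mirroring the literals they replace
        job_type = "job-type"
        owner = "owner"
        v3io_user = "v3io_user"
        feature_vector = "feature-vector"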
mlrun/feature_store/retrieval/base.py
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target
 
@@ -134,6 +135,7 @@
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()
 
@@ -212,7 +215,7 @@
             feature_sets.append(None)
             join_types.append(None)
 
-        filtered = False
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-                filtered = True
+                timestamp_filtered = True
 
             df = self._get_engine_df(
                 feature_set,
@@ -259,6 +262,7 @@
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )
 
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@
                 new_columns.append((column, alias))
             self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not filtered and (start_time or end_time):
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -540,8 +544,8 @@
         self,
         name: str,
         order: int,
-        left_keys: typing.List[str] = None,
-        right_keys: typing.List[str] = None,
+        left_keys: list[str] = None,
+        right_keys: list[str] = None,
     ):
         self.name = name
         self.left_keys = left_keys if left_keys is not None else []
@@ -750,11 +754,12 @@
     def _get_engine_df(
         self,
         feature_set: FeatureSet,
-        feature_set_name: typing.List[str],
-        column_names: typing.List[str] = None,
+        feature_set_name: list[str],
+        column_names: list[str] = None,
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args
@@ -773,8 +778,8 @@
     def _rename_columns_and_select(
         self,
         df,
-        rename_col_dict: typing.Dict[str, str],
-        columns: typing.List[str] = None,
+        rename_col_dict: dict[str, str],
+        columns: list[str] = None,
     ):
         """
         rename the columns of the df according to rename_col_dict, and select only `columns` if it is not none
@@ -801,7 +806,7 @@
         """
         raise NotImplementedError
 
-    def _order_by(self, order_by_active: typing.List[str]):
+    def _order_by(self, order_by_active: list[str]):
         """
         Order by `order_by_active` along all axis.
 
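Note: `additional_filters` is threaded through every layer of `BaseMerger` here — from `start` down to each engine's `_get_engine_df`, and in the engine hunks below into `feature_set.to_dataframe` — so row filters can be pushed into the source read. A hedged usage sketch, assuming the parameter is exposed on `get_offline_features` and accepts parquet-style `(column, operator, value)` tuples:

    # filter rows at read time, in addition to the start/end time window
    df = vector.get_offline_features(
        additional_filters=[("bid", ">", 100)],  # tuple format is an assumption
    ).to_dataframe()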
mlrun/data_types/to_pandas.py
@@ -19,7 +19,7 @@ from collections import Counter
 # np.bool -> bool and np.object -> object fix backported from pyspark v3.3.3.
 
 
-class PandasConversionMixin(object):
+class PandasConversionMixin:
     """
     Min-in for the conversion from Spark to pandas. Currently, only :class:`DataFrame`
     can use this class.
@@ -79,10 +79,10 @@ class PandasConversionMixin(object):
                     msg = (
                         "toPandas attempted Arrow optimization because "
                         "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                        "failed by the reason below:\n %s\n"
+                        f"failed by the reason below:\n {e}\n"
                         "Attempting non-optimization as "
                         "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                        "true." % str(e)
+                        "true."
                     )
                     warnings.warn(msg)
                     use_arrow = False
@@ -92,7 +92,7 @@
                         "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                         "reached the error below and will not continue because automatic fallback "
                         "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                        "false.\n %s" % str(e)
+                        f"false.\n {e}"
                     )
                     warnings.warn(msg)
                     raise
@@ -108,9 +108,7 @@
                 )
 
                 # Rename columns to avoid duplicated column names.
-                tmp_column_names = [
-                    "col_{}".format(i) for i in range(len(self.columns))
-                ]
+                tmp_column_names = [f"col_{i}" for i in range(len(self.columns))]
                 self_destruct = self.sql_ctx._conf.arrowPySparkSelfDestructEnabled()
                 batches = self.toDF(*tmp_column_names)._collect_as_arrow(
                     split_batches=self_destruct
@@ -160,7 +158,7 @@
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n %s" % str(e)
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -170,10 +168,10 @@
         column_counter = Counter(self.columns)
 
         dtype = [None] * len(self.schema)
-        for fieldIdx, field in enumerate(self.schema):
+        for field_idx, field in enumerate(self.schema):
             # For duplicate column name, we use `iloc` to access it.
             if column_counter[field.name] > 1:
-                pandas_col = pdf.iloc[:, fieldIdx]
+                pandas_col = pdf.iloc[:, field_idx]
             else:
                 pandas_col = pdf[field.name]
 
@@ -189,12 +187,12 @@
                 and field.nullable
                 and pandas_col.isnull().any()
             ):
-                dtype[fieldIdx] = pandas_type
+                dtype[field_idx] = pandas_type
             # Ensure we fall back to nullable numpy types, even when whole column is null:
             if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-                dtype[fieldIdx] = np.float64
+                dtype[field_idx] = np.float64
             if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                dtype[fieldIdx] = object
+                dtype[field_idx] = object
 
         df = pd.DataFrame()
         for index, t in enumerate(dtype):
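Note: the to_pandas.py hunks are mechanical modernizations of code vendored from pyspark: `%`-formatting becomes f-strings and `camelCase` loop variables become `snake_case`, with no behavioral change. The f-string form is exactly equivalent because interpolation calls `str()` implicitly:

    e = ValueError("boom")
    assert "failed by the reason below:\n %s" % str(e) == f"failed by the reason below:\n {e}"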
mlrun/feature_store/retrieval/dask_merger.py
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd
 
@@ -155,6 +156,7 @@
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )
 
         return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
mlrun/feature_store/retrieval/local_merger.py
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)
mlrun/feature_store/retrieval/spark_merger.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
 import pandas as pd
 import semver
 
@@ -24,6 +25,32 @@ from .base import BaseMerger
 from .conversion import PandasConversionMixin
 
 
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+        df = PandasConversionMixin.toPandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return PandasConversionMixin.toPandas(spark_df)
+
+
 class SparkFeatureMerger(BaseMerger):
     engine = "spark"
     support_offline = True
@@ -166,29 +193,7 @@
     def get_df(self, to_pandas=True):
         if to_pandas:
             if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
             self._pandas_df = df
             self._set_indexes(self._pandas_df)
         return self._pandas_df
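Note: the timestamp workaround itself is unchanged; it is only hoisted out of `get_df` into the module-level `spark_df_to_pandas`, which makes it reusable and testable in isolation. A usage sketch, assuming an active `SparkSession` named `spark`:

    from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas

    spark_df = spark.createDataFrame([(1, "GOOG")], ["id", "ticker"])
    pdf = spark_df_to_pandas(spark_df)  # returns a plain pandas.DataFrame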
@@ -221,7 +226,12 @@
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         source_kwargs = {}
         if feature_set.spec.passthrough:
             if not feature_set.spec.source:
@@ -243,13 +253,13 @@
         # handling case where there are multiple feature sets and user creates vector where
         # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
         source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
-
         source = source_driver(
             name=self.vector.metadata.name,
             path=source_path,
             time_field=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **source_kwargs,
         )
 