mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_set.py CHANGED
@@ -337,7 +337,10 @@ class FeatureSet(ModelObj):
     example::
 
         import mlrun.feature_store as fstore
-        ticks = fstore.FeatureSet("ticks", entities=["stock"], timestamp_key="timestamp")
+
+        ticks = fstore.FeatureSet(
+            "ticks", entities=["stock"], timestamp_key="timestamp"
+        )
         ticks.ingest(df)
 
     :param name: name of the feature set
@@ -625,12 +628,12 @@ class FeatureSet(ModelObj):
 
            import mlrun.feature_store as fstore
 
-            ticks = fstore.FeatureSet("ticks",
-                                      entities=["stock"],
-                                      timestamp_key="timestamp")
-            ticks.add_entity("country",
-                             mlrun.data_types.ValueType.STRING,
-                             description="stock country")
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_entity(
+                "country", mlrun.data_types.ValueType.STRING, description="stock country"
+            )
            ticks.add_entity("year", mlrun.data_types.ValueType.INT16)
            ticks.save()
 
@@ -650,13 +653,23 @@ class FeatureSet(ModelObj):
            import mlrun.feature_store as fstore
            from mlrun.features import Feature
 
-            ticks = fstore.FeatureSet("ticks",
-                                      entities=["stock"],
-                                      timestamp_key="timestamp")
-            ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.STRING,
-                                      description="client consistency"),"ABC01")
-            ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.FLOAT,
-                                      description="client volatility"),"SAB")
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.STRING,
+                    description="client consistency",
+                ),
+                "ABC01",
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.FLOAT,
+                    description="client volatility",
+                ),
+                "SAB",
+            )
            ticks.save()
 
        :param feature: setting of Feature
@@ -860,15 +873,18 @@ class FeatureSet(ModelObj):
        example::
 
            import mlrun.feature_store as fstore
+
            ...
-            ticks = fstore.FeatureSet("ticks",
-                                      entities=["stock"],
-                                      timestamp_key="timestamp")
-            ticks.add_aggregation(name='priceN',
-                                  column='price',
-                                  operations=['avg'],
-                                  windows=['1d'],
-                                  period='1h')
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_aggregation(
+                name="priceN",
+                column="price",
+                operations=["avg"],
+                windows=["1d"],
+                period="1h",
+            )
            ticks.plot(rankdir="LR", with_targets=True)
 
        :param filename: target filepath for the graph image (None for the notebook)
@@ -901,6 +917,7 @@ class FeatureSet(ModelObj):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
        **kwargs,
    ):
        """return featureset (offline) data as dataframe
@@ -912,6 +929,12 @@ class FeatureSet(ModelObj):
        :param end_time: filter by end time
        :param time_column: specify the time column name in the file
        :param kwargs: additional reader (csv, parquet, ..) args
+        :param additional_filters: List of additional_filter conditions as tuples.
+            Each tuple should be in the format (column_name, operator, value).
+            Supported operators: "=", ">=", "<=", ">", "<".
+            Example: [("Product", "=", "Computer")]
+            For all supported filters, please see:
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
        :return: DataFrame
        """
        entities = list(self.spec.entities.keys())
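
A minimal usage sketch of the new additional_filters argument on FeatureSet.to_dataframe (the feature set URI and column names below are hypothetical; the filter tuples follow the format documented above):

    import mlrun.feature_store as fstore

    # assumes a saved feature set with a parquet offline target and a "Product" column
    sales_set = fstore.get_feature_set("store-sales/sales")
    df = sales_set.to_dataframe(
        columns=["Product", "price"],
        additional_filters=[("Product", "=", "Computer")],
    )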
@@ -930,6 +953,7 @@ class FeatureSet(ModelObj):
                start_time=start_time,
                end_time=end_time,
                time_field=time_column,
+                additional_filters=additional_filters,
                **kwargs,
            )
        # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -949,6 +973,7 @@ class FeatureSet(ModelObj):
            start_time=start_time,
            end_time=end_time,
            time_column=time_column,
+            additional_filters=additional_filters,
            **kwargs,
        )
        return result
@@ -1005,7 +1030,7 @@ class FeatureSet(ModelObj):
            df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())
 
            # for running as remote job
-            config = RunConfig(image='mlrun/mlrun')
+            config = RunConfig(image="mlrun/mlrun")
            df = ingest(stocks_set, stocks, run_config=config)
 
            # specify source and targets
mlrun/feature_store/feature_vector.py CHANGED
@@ -486,6 +486,7 @@ class FeatureVector(ModelObj):
    example::
 
        import mlrun.feature_store as fstore
+
        features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
        vector = fstore.FeatureVector("my-vec", features)
 
@@ -740,6 +741,7 @@ class FeatureVector(ModelObj):
        order_by: Union[str, list[str]] = None,
        spark_service: str = None,
        timestamp_for_filtering: Union[str, dict[str, str]] = None,
+        additional_filters: list = None,
    ):
        """retrieve offline feature vector results
 
@@ -796,6 +798,12 @@ class FeatureVector(ModelObj):
            By default, the filter executes on the timestamp_key of each feature set.
            Note: the time filtering is performed on each feature set before the
            merge process using start_time and end_time params.
+        :param additional_filters: List of additional_filter conditions as tuples.
+            Each tuple should be in the format (column_name, operator, value).
+            Supported operators: "=", ">=", "<=", ">", "<".
+            Example: [("Product", "=", "Computer")]
+            For all supported filters, please see:
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
 
        """
 
@@ -816,6 +824,7 @@ class FeatureVector(ModelObj):
            order_by,
            spark_service,
            timestamp_for_filtering,
+            additional_filters,
        )
 
    def get_online_feature_service(
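
A corresponding sketch for feature vectors, where additional_filters is documented and forwarded to the offline merger as shown above (the vector URI and filter values are hypothetical):

    import mlrun.feature_store as fstore

    vector = fstore.get_feature_vector("store-sales/my-vec")
    resp = vector.get_offline_features(
        additional_filters=[("Product", "=", "Computer")],
    )
    df = resp.to_dataframe()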
@@ -852,7 +861,7 @@ class FeatureVector(ModelObj):
 
        Example::
 
-            svc = vector_uri.get_online_feature_service(entity_keys=['ticker'])
+            svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
            try:
                resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
                print(resp)
@@ -1077,7 +1086,9 @@ class OfflineVectorResponse:
    def to_dataframe(self, to_pandas=True):
        """return result as dataframe"""
        if self.status != "completed":
-            raise mlrun.errors.MLRunTaskNotReady("feature vector dataset is not ready")
+            raise mlrun.errors.MLRunTaskNotReadyError(
+                "feature vector dataset is not ready"
+            )
        return self._merger.get_df(to_pandas=to_pandas)
 
    def to_parquet(self, target_path, **kw):
mlrun/feature_store/ingestion.py CHANGED
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
        out_path=featureset.spec.output_path,
    )
    task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-ingest").set_label(
-        "feature-set", featureset.uri
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
    if run_config.owner:
-        task.set_label("owner", run_config.owner).set_label(
-            "v3io_user", run_config.owner
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
    # set run UID and save in the feature set status (linking the features et to the job)
    task.metadata.uid = uuid.uuid4().hex
mlrun/feature_store/retrieval/base.py CHANGED
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
        update_stats=None,
        query=None,
        order_by=None,
+        additional_filters=None,
    ):
        self._target = target
 
@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
            timestamp_for_filtering=timestamp_for_filtering,
            query=query,
            order_by=order_by,
+            additional_filters=additional_filters,
        )
 
    def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
        timestamp_for_filtering=None,
        query=None,
        order_by=None,
+        additional_filters=None,
    ):
        self._create_engine_env()
 
@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
                feature_sets.append(None)
                join_types.append(None)
 
-        filtered = False
+        timestamp_filtered = False
        for step in join_graph.steps:
            name = step.right_feature_set_name
            feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
            if self._drop_indexes:
                self._append_drop_column(time_column)
            if (start_time or end_time) and time_column:
-                filtered = True
+                timestamp_filtered = True
 
            df = self._get_engine_df(
                feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                start_time if time_column else None,
                end_time if time_column else None,
                time_column,
+                additional_filters,
            )
 
            fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                new_columns.append((column, alias))
            self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not filtered and (start_time or end_time):
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
            raise mlrun.errors.MLRunRuntimeError(
                "start_time and end_time can only be provided in conjunction with "
                "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
        start_time: typing.Union[str, datetime] = None,
        end_time: typing.Union[str, datetime] = None,
        time_column: typing.Optional[str] = None,
+        additional_filters=None,
    ):
        """
        Return the feature_set data frame according to the args
mlrun/feature_store/retrieval/dask_merger.py CHANGED
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
    ):
        import dask.dataframe as dd
 
@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
            end_time=end_time,
            time_column=time_column,
            index=False,
+            additional_filters=additional_filters,
        )
 
        return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py CHANGED
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
    start_time=None,
    end_time=None,
    timestamp_for_filtering=None,
+    additional_filters=None,
 ):
    name = vector.metadata.name
    if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@
            "end_time": end_time,
            "timestamp_for_filtering": timestamp_for_filtering,
            "engine_args": engine_args,
+            "additional_filters": additional_filters,
        },
        inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
    )
    task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
    task.metadata.uid = uuid.uuid4().hex
    vector.status.run_uri = task.metadata.uid
    vector.save()
@@ -151,7 +156,9 @@ class RemoteVectorResponse:
 
    def _is_ready(self):
        if self.status != "completed":
-            raise mlrun.errors.MLRunTaskNotReady("feature vector dataset is not ready")
+            raise mlrun.errors.MLRunTaskNotReadyError(
+                "feature vector dataset is not ready"
+            )
        self.vector.reload()
 
    def to_dataframe(self, columns=None, df_module=None, **kwargs):
@@ -176,6 +183,7 @@ class RemoteVectorResponse:
        file_format = kwargs.get("format")
        if not file_format:
            file_format = self.run.status.results["target"]["kind"]
+
        df = mlrun.get_dataitem(self.target_uri).as_df(
            columns=columns, df_module=df_module, format=file_format, **kwargs
        )
@@ -196,7 +204,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
    vector = context.get_store_resource(vector_uri)
    store_target = get_target_driver(target, vector)
    if entity_rows:
@@ -206,7 +215,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
    merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
    merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                 query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
    target = vector.status.targets[store_target.name].to_dict()
    context.log_result('feature_vector', vector.uri)
mlrun/feature_store/retrieval/local_merger.py CHANGED
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
    ):
        df = feature_set.to_dataframe(
            columns=column_names,
            start_time=start_time,
            end_time=end_time,
            time_column=time_column,
+            additional_filters=additional_filters,
        )
        if df.index.names[0]:
            df.reset_index(inplace=True)
mlrun/feature_store/retrieval/spark_merger.py CHANGED
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pandas as pd
-import semver
+
 
 import mlrun
+from mlrun.data_types.to_pandas import spark_df_to_pandas
+from mlrun.datastore.sources import ParquetSource
 from mlrun.datastore.targets import get_offline_target
+from mlrun.runtimes import RemoteSparkRuntime
+from mlrun.runtimes.sparkjob import Spark3Runtime
+from mlrun.utils.helpers import additional_filters_warning
 
-from ...runtimes import RemoteSparkRuntime
-from ...runtimes.sparkjob import Spark3Runtime
 from .base import BaseMerger
-from .conversion import PandasConversionMixin
 
 
 class SparkFeatureMerger(BaseMerger):
@@ -166,29 +167,7 @@ class SparkFeatureMerger(BaseMerger):
    def get_df(self, to_pandas=True):
        if to_pandas:
            if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
                self._pandas_df = df
                self._set_indexes(self._pandas_df)
            return self._pandas_df
@@ -209,9 +188,13 @@ class SparkFeatureMerger(BaseMerger):
 
        if self.spark is None:
            # create spark context
-            self.spark = SparkSession.builder.appName(
-                f"vector-merger-{self.vector.metadata.name}"
-            ).getOrCreate()
+            self.spark = (
+                SparkSession.builder.appName(
+                    f"vector-merger-{self.vector.metadata.name}"
+                )
+                .config("spark.driver.memory", "2g")
+                .getOrCreate()
+            )
 
    def _get_engine_df(
        self,
@@ -221,6 +204,7 @@ class SparkFeatureMerger(BaseMerger):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
    ):
        source_kwargs = {}
        if feature_set.spec.passthrough:
@@ -231,6 +215,7 @@ class SparkFeatureMerger(BaseMerger):
            source_kind = feature_set.spec.source.kind
            source_path = feature_set.spec.source.path
            source_kwargs.update(feature_set.spec.source.attributes)
+            source_kwargs.pop("additional_filters", None)
        else:
            target = get_offline_target(feature_set)
            if not target:
@@ -239,17 +224,24 @@ class SparkFeatureMerger(BaseMerger):
                )
            source_kind = target.kind
            source_path = target.get_target_path()
-
+            source_kwargs = target.source_spark_attributes
        # handling case where there are multiple feature sets and user creates vector where
        # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
        source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
 
+        if source_driver != ParquetSource:
+            additional_filters_warning(additional_filters, source_driver)
+            additional_filters = None
+        additional_filters_dict = (
+            {"additional_filters": additional_filters} if additional_filters else {}
+        )
        source = source_driver(
            name=self.vector.metadata.name,
            path=source_path,
            time_field=time_column,
            start_time=start_time,
            end_time=end_time,
+            **additional_filters_dict,
            **source_kwargs,
        )
 
mlrun/feature_store/steps.py CHANGED
@@ -162,13 +162,19 @@ class MapValues(StepToDict, MLRunStep):
        example::
 
            # replace the value "U" with '0' in the age column
-            graph.to(MapValues(mapping={'age': {'U': '0'}}, with_original_features=True))
+            graph.to(MapValues(mapping={"age": {"U": "0"}}, with_original_features=True))
 
            # replace integers, example
-            graph.to(MapValues(mapping={'not': {0: 1, 1: 0}}))
+            graph.to(MapValues(mapping={"not": {0: 1, 1: 0}}))
 
            # replace by range, use -inf and inf for extended range
-            graph.to(MapValues(mapping={'numbers': {'ranges': {'negative': [-inf, 0], 'positive': [0, inf]}}}))
+            graph.to(
+                MapValues(
+                    mapping={
+                        "numbers": {"ranges": {"negative": [-inf, 0], "positive": [0, inf]}}
+                    }
+                )
+            )
 
        :param mapping: a dict with entry per column and the associated old/new values map
        :param with_original_features: set to True to keep the original features
@@ -424,8 +430,10 @@ class OneHotEncoder(StepToDict, MLRunStep):
 
        example::
 
-            mapping = {'category': ['food', 'health', 'transportation'],
-                       'gender': ['male', 'female']}
+            mapping = {
+                "category": ["food", "health", "transportation"],
+                "gender": ["male", "female"],
+            }
            graph.to(OneHotEncoder(mapping=one_hot_encoder_mapping))
 
        :param mapping: a dict of per column categories (to map to binary fields)
@@ -542,10 +550,12 @@ class DateExtractor(StepToDict, MLRunStep):
 
            # (taken from the fraud-detection end-to-end feature store demo)
            # Define the Transactions FeatureSet
-            transaction_set = fstore.FeatureSet("transactions",
-                                                entities=[fstore.Entity("source")],
-                                                timestamp_key='timestamp',
-                                                description="transactions feature set")
+            transaction_set = fstore.FeatureSet(
+                "transactions",
+                entities=[fstore.Entity("source")],
+                timestamp_key="timestamp",
+                description="transactions feature set",
+            )
 
            # Get FeatureSet computation graph
            transaction_graph = transaction_set.graph
@@ -553,11 +563,11 @@ class DateExtractor(StepToDict, MLRunStep):
            # Add the custom `DateExtractor` step
            # to the computation graph
            transaction_graph.to(
-                class_name='DateExtractor',
-                name='Extract Dates',
-                parts = ['hour', 'day_of_week'],
-                timestamp_col = 'timestamp',
-                )
+                class_name="DateExtractor",
+                name="Extract Dates",
+                parts=["hour", "day_of_week"],
+                timestamp_col="timestamp",
+            )
 
        :param parts: list of pandas style date-time parts you want to extract.
        :param timestamp_col: The name of the column containing the timestamps to extract from,
@@ -694,11 +704,12 @@ class DropFeatures(StepToDict, MLRunStep):
 
        example::
 
-            feature_set = fstore.FeatureSet("fs-new",
-                                            entities=[fstore.Entity("id")],
-                                            description="feature set",
-                                            engine="pandas",
-                                            )
+            feature_set = fstore.FeatureSet(
+                "fs-new",
+                entities=[fstore.Entity("id")],
+                description="feature set",
+                engine="pandas",
+            )
            # Pre-processing graph steps
            feature_set.graph.to(DropFeatures(features=["age"]))
            df_pandas = feature_set.ingest(data)
@@ -732,3 +743,11 @@ class DropFeatures(StepToDict, MLRunStep):
            raise mlrun.errors.MLRunInvalidArgumentError(
                f"DropFeatures can only drop features, not entities: {dropped_entities}"
            )
+        if feature_set.spec.label_column in features:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"DropFeatures can not drop label_column: {feature_set.spec.label_column}"
+            )
+        if feature_set.spec.timestamp_key in features:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"DropFeatures can not drop timestamp_key: {feature_set.spec.timestamp_key}"
+            )
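
With the checks added above, DropFeatures now rejects the feature set's label_column and timestamp_key in addition to entities. A sketch of the expected behavior, assuming the error surfaces during ingestion (the feature set name and data here are hypothetical):

    import pandas as pd

    import mlrun
    import mlrun.feature_store as fstore
    from mlrun.feature_store.steps import DropFeatures

    df = pd.DataFrame(
        {
            "id": [1, 2],
            "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
            "age": [30, 40],
        }
    )
    fset = fstore.FeatureSet(
        "drop-demo",
        entities=[fstore.Entity("id")],
        timestamp_key="timestamp",
        engine="pandas",
    )
    # dropping the timestamp_key is now rejected
    fset.graph.to(DropFeatures(features=["timestamp"]))
    try:
        fset.ingest(df)
    except mlrun.errors.MLRunInvalidArgumentError as err:
        print(f"rejected as expected: {err}")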
mlrun/features.py CHANGED
@@ -100,7 +100,8 @@ class Feature(ModelObj):
        :param name: name of the feature
        :param validator: feature validation policy
        :param default: default value
-        :param labels: a set of key/value labels (tags)
+        :param labels: a set of key/value labels (tags). Labels can be used to filter featues, for example,
+            in the UI Feature store page.
        """
        self.name = name or ""
        if isinstance(value_type, ValueType):
@@ -238,10 +239,7 @@ class Validator(ModelObj):
            from mlrun.features import Validator
 
            # Add validator to the feature 'bid' with check type
-            quotes_set["bid"].validator = Validator(
-                check_type=True,
-                severity="info"
-            )
+            quotes_set["bid"].validator = Validator(check_type=True, severity="info")
 
        :param check_type: check feature type e.g. True, False
        :param severity: severity name e.g. info, warning, etc.
@@ -280,10 +278,7 @@ class MinMaxValidator(Validator):
 
            # Add validator to the feature 'bid', where valid
            # minimal value is 52
-            quotes_set["bid"].validator = MinMaxValidator(
-                min=52,
-                severity="info"
-            )
+            quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")
 
        :param check_type: check feature type e.g. True, False
        :param severity: severity name e.g. info, warning, etc.
@@ -344,9 +339,7 @@ class MinMaxLenValidator(Validator):
            # Add length validator to the feature 'ticker', where valid
            # minimal length is 1 and maximal length is 10
            quotes_set["ticker"].validator = MinMaxLenValidator(
-                min=1,
-                max=10,
-                severity="info"
+                min=1, max=10, severity="info"
            )
 
        :param check_type: check feature type e.g. True, False
@@ -408,8 +401,7 @@ class RegexValidator(Validator):
            # expression '(\b[A-Za-z]{1}[0-9]{7}\b)' where valid values are
            # e.g. A1234567, z9874563, etc.
            quotes_set["name"].validator = RegexValidator(
-                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
-                severity="info"
+                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)", severity="info"
            )
 
        :param check_type: check feature type e.g. True, False