mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (200)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +25 -111
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +38 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +41 -47
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +68 -0
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
  15. mlrun/common/formatters/base.py +78 -0
  16. mlrun/common/formatters/function.py +41 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +25 -4
  21. mlrun/common/schemas/alert.py +203 -0
  22. mlrun/common/schemas/api_gateway.py +148 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +8 -2
  25. mlrun/common/schemas/client_spec.py +2 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/hub.py +7 -9
  29. mlrun/common/schemas/model_monitoring/__init__.py +19 -3
  30. mlrun/common/schemas/model_monitoring/constants.py +96 -26
  31. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  32. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  33. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  34. mlrun/common/schemas/pipeline.py +0 -9
  35. mlrun/common/schemas/project.py +22 -21
  36. mlrun/common/types.py +7 -1
  37. mlrun/config.py +87 -19
  38. mlrun/data_types/data_types.py +4 -0
  39. mlrun/data_types/to_pandas.py +9 -9
  40. mlrun/datastore/__init__.py +5 -8
  41. mlrun/datastore/alibaba_oss.py +130 -0
  42. mlrun/datastore/azure_blob.py +4 -5
  43. mlrun/datastore/base.py +69 -30
  44. mlrun/datastore/datastore.py +10 -2
  45. mlrun/datastore/datastore_profile.py +90 -6
  46. mlrun/datastore/google_cloud_storage.py +1 -1
  47. mlrun/datastore/hdfs.py +5 -0
  48. mlrun/datastore/inmem.py +2 -2
  49. mlrun/datastore/redis.py +2 -2
  50. mlrun/datastore/s3.py +5 -0
  51. mlrun/datastore/snowflake_utils.py +43 -0
  52. mlrun/datastore/sources.py +172 -44
  53. mlrun/datastore/store_resources.py +7 -7
  54. mlrun/datastore/targets.py +285 -41
  55. mlrun/datastore/utils.py +68 -5
  56. mlrun/datastore/v3io.py +27 -50
  57. mlrun/db/auth_utils.py +152 -0
  58. mlrun/db/base.py +149 -14
  59. mlrun/db/factory.py +1 -1
  60. mlrun/db/httpdb.py +608 -178
  61. mlrun/db/nopdb.py +191 -7
  62. mlrun/errors.py +11 -0
  63. mlrun/execution.py +37 -20
  64. mlrun/feature_store/__init__.py +0 -2
  65. mlrun/feature_store/api.py +21 -52
  66. mlrun/feature_store/feature_set.py +48 -23
  67. mlrun/feature_store/feature_vector.py +2 -1
  68. mlrun/feature_store/ingestion.py +7 -6
  69. mlrun/feature_store/retrieval/base.py +9 -4
  70. mlrun/feature_store/retrieval/conversion.py +9 -9
  71. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  72. mlrun/feature_store/retrieval/job.py +9 -3
  73. mlrun/feature_store/retrieval/local_merger.py +2 -0
  74. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  75. mlrun/feature_store/steps.py +30 -19
  76. mlrun/features.py +4 -13
  77. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  78. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  79. mlrun/frameworks/lgbm/__init__.py +1 -1
  80. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  81. mlrun/frameworks/lgbm/model_handler.py +1 -1
  82. mlrun/frameworks/parallel_coordinates.py +2 -1
  83. mlrun/frameworks/pytorch/__init__.py +2 -2
  84. mlrun/frameworks/sklearn/__init__.py +1 -1
  85. mlrun/frameworks/tf_keras/__init__.py +5 -2
  86. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  87. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  88. mlrun/frameworks/xgboost/__init__.py +1 -1
  89. mlrun/k8s_utils.py +10 -11
  90. mlrun/launcher/__init__.py +1 -1
  91. mlrun/launcher/base.py +6 -5
  92. mlrun/launcher/client.py +8 -6
  93. mlrun/launcher/factory.py +1 -1
  94. mlrun/launcher/local.py +9 -3
  95. mlrun/launcher/remote.py +9 -3
  96. mlrun/lists.py +6 -2
  97. mlrun/model.py +58 -19
  98. mlrun/model_monitoring/__init__.py +1 -1
  99. mlrun/model_monitoring/api.py +127 -301
  100. mlrun/model_monitoring/application.py +5 -296
  101. mlrun/model_monitoring/applications/__init__.py +11 -0
  102. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  103. mlrun/model_monitoring/applications/base.py +282 -0
  104. mlrun/model_monitoring/applications/context.py +214 -0
  105. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  106. mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
  107. mlrun/model_monitoring/applications/results.py +99 -0
  108. mlrun/model_monitoring/controller.py +30 -36
  109. mlrun/model_monitoring/db/__init__.py +18 -0
  110. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  111. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  112. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
  113. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  114. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  115. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  116. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  117. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  118. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  119. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  120. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
  121. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  122. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  123. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  124. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  125. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  126. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  127. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  128. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  129. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  130. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  131. mlrun/model_monitoring/evidently_application.py +6 -118
  132. mlrun/model_monitoring/features_drift_table.py +34 -22
  133. mlrun/model_monitoring/helpers.py +100 -7
  134. mlrun/model_monitoring/model_endpoint.py +3 -2
  135. mlrun/model_monitoring/stream_processing.py +93 -228
  136. mlrun/model_monitoring/tracking_policy.py +7 -1
  137. mlrun/model_monitoring/writer.py +152 -124
  138. mlrun/package/packagers_manager.py +1 -0
  139. mlrun/package/utils/_formatter.py +2 -2
  140. mlrun/platforms/__init__.py +11 -10
  141. mlrun/platforms/iguazio.py +21 -202
  142. mlrun/projects/operations.py +30 -16
  143. mlrun/projects/pipelines.py +92 -99
  144. mlrun/projects/project.py +757 -268
  145. mlrun/render.py +15 -14
  146. mlrun/run.py +160 -162
  147. mlrun/runtimes/__init__.py +55 -3
  148. mlrun/runtimes/base.py +33 -19
  149. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  150. mlrun/runtimes/funcdoc.py +0 -28
  151. mlrun/runtimes/kubejob.py +28 -122
  152. mlrun/runtimes/local.py +5 -2
  153. mlrun/runtimes/mpijob/__init__.py +0 -20
  154. mlrun/runtimes/mpijob/abstract.py +8 -8
  155. mlrun/runtimes/mpijob/v1.py +1 -1
  156. mlrun/runtimes/nuclio/__init__.py +1 -0
  157. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  158. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  159. mlrun/runtimes/nuclio/application/application.py +523 -0
  160. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  161. mlrun/runtimes/nuclio/function.py +98 -58
  162. mlrun/runtimes/nuclio/serving.py +36 -42
  163. mlrun/runtimes/pod.py +196 -45
  164. mlrun/runtimes/remotesparkjob.py +1 -1
  165. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  166. mlrun/runtimes/utils.py +6 -73
  167. mlrun/secrets.py +6 -2
  168. mlrun/serving/remote.py +2 -3
  169. mlrun/serving/routers.py +7 -4
  170. mlrun/serving/server.py +7 -8
  171. mlrun/serving/states.py +73 -43
  172. mlrun/serving/v2_serving.py +8 -7
  173. mlrun/track/tracker.py +2 -1
  174. mlrun/utils/async_http.py +25 -5
  175. mlrun/utils/helpers.py +141 -75
  176. mlrun/utils/http.py +1 -1
  177. mlrun/utils/logger.py +39 -7
  178. mlrun/utils/notifications/notification/__init__.py +14 -9
  179. mlrun/utils/notifications/notification/base.py +12 -0
  180. mlrun/utils/notifications/notification/console.py +2 -0
  181. mlrun/utils/notifications/notification/git.py +3 -1
  182. mlrun/utils/notifications/notification/ipython.py +2 -0
  183. mlrun/utils/notifications/notification/slack.py +101 -21
  184. mlrun/utils/notifications/notification/webhook.py +11 -1
  185. mlrun/utils/notifications/notification_pusher.py +147 -16
  186. mlrun/utils/retryer.py +3 -2
  187. mlrun/utils/v3io_clients.py +0 -1
  188. mlrun/utils/version/version.json +2 -2
  189. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
  190. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  191. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
  192. mlrun/kfpops.py +0 -868
  193. mlrun/model_monitoring/batch.py +0 -974
  194. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  195. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  196. mlrun/platforms/other.py +0 -305
  197. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  198. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  199. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  200. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_set.py CHANGED
@@ -337,7 +337,10 @@ class FeatureSet(ModelObj):
  example::

  import mlrun.feature_store as fstore
- ticks = fstore.FeatureSet("ticks", entities=["stock"], timestamp_key="timestamp")
+
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
  ticks.ingest(df)

  :param name: name of the feature set
@@ -625,12 +628,12 @@ class FeatureSet(ModelObj):

  import mlrun.feature_store as fstore

- ticks = fstore.FeatureSet("ticks",
- entities=["stock"],
- timestamp_key="timestamp")
- ticks.add_entity("country",
- mlrun.data_types.ValueType.STRING,
- description="stock country")
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
+ ticks.add_entity(
+ "country", mlrun.data_types.ValueType.STRING, description="stock country"
+ )
  ticks.add_entity("year", mlrun.data_types.ValueType.INT16)
  ticks.save()

@@ -650,13 +653,23 @@ class FeatureSet(ModelObj):
  import mlrun.feature_store as fstore
  from mlrun.features import Feature

- ticks = fstore.FeatureSet("ticks",
- entities=["stock"],
- timestamp_key="timestamp")
- ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.STRING,
- description="client consistency"),"ABC01")
- ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.FLOAT,
- description="client volatility"),"SAB")
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
+ ticks.add_feature(
+ Feature(
+ value_type=mlrun.data_types.ValueType.STRING,
+ description="client consistency",
+ ),
+ "ABC01",
+ )
+ ticks.add_feature(
+ Feature(
+ value_type=mlrun.data_types.ValueType.FLOAT,
+ description="client volatility",
+ ),
+ "SAB",
+ )
  ticks.save()

  :param feature: setting of Feature
@@ -860,15 +873,18 @@ class FeatureSet(ModelObj):
  example::

  import mlrun.feature_store as fstore
+
  ...
- ticks = fstore.FeatureSet("ticks",
- entities=["stock"],
- timestamp_key="timestamp")
- ticks.add_aggregation(name='priceN',
- column='price',
- operations=['avg'],
- windows=['1d'],
- period='1h')
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
+ ticks.add_aggregation(
+ name="priceN",
+ column="price",
+ operations=["avg"],
+ windows=["1d"],
+ period="1h",
+ )
  ticks.plot(rankdir="LR", with_targets=True)

  :param filename: target filepath for the graph image (None for the notebook)
@@ -901,6 +917,7 @@ class FeatureSet(ModelObj):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  **kwargs,
  ):
  """return featureset (offline) data as dataframe
@@ -912,6 +929,12 @@ class FeatureSet(ModelObj):
  :param end_time: filter by end time
  :param time_column: specify the time column name in the file
  :param kwargs: additional reader (csv, parquet, ..) args
+ :param additional_filters: List of additional_filter conditions as tuples.
+ Each tuple should be in the format (column_name, operator, value).
+ Supported operators: "=", ">=", "<=", ">", "<".
+ Example: [("Product", "=", "Computer")]
+ For all supported filters, please see:
+ https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
  :return: DataFrame
  """
  entities = list(self.spec.entities.keys())
@@ -930,6 +953,7 @@ class FeatureSet(ModelObj):
  start_time=start_time,
  end_time=end_time,
  time_field=time_column,
+ additional_filters=additional_filters,
  **kwargs,
  )
  # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -949,6 +973,7 @@ class FeatureSet(ModelObj):
  start_time=start_time,
  end_time=end_time,
  time_column=time_column,
+ additional_filters=additional_filters,
  **kwargs,
  )
  return result
@@ -1005,7 +1030,7 @@ class FeatureSet(ModelObj):
  df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())

  # for running as remote job
- config = RunConfig(image='mlrun/mlrun')
+ config = RunConfig(image="mlrun/mlrun")
  df = ingest(stocks_set, stocks, run_config=config)

  # specify source and targets
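The hunks above add an additional_filters argument to FeatureSet.to_dataframe(), with each filter expressed as a (column_name, operator, value) tuple. A minimal usage sketch under assumed data (the "sales" feature set, its columns, and the sample rows below are illustrative, not taken from this diff):

import pandas as pd

import mlrun.feature_store as fstore

# Hypothetical data used only to demonstrate the new argument.
df = pd.DataFrame(
    {
        "customer": ["a", "b"],
        "Product": ["Computer", "Phone"],
        "price": [1200.0, 800.0],
        "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
    }
)

sales_set = fstore.FeatureSet(
    "sales", entities=["customer"], timestamp_key="timestamp"
)
sales_set.ingest(df)

# Filters use the documented (column, operator, value) tuples with
# "=", ">=", "<=", ">", "<" as the supported operators.
computers = sales_set.to_dataframe(
    additional_filters=[("Product", "=", "Computer")]
)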
mlrun/feature_store/feature_vector.py CHANGED
@@ -486,6 +486,7 @@ class FeatureVector(ModelObj):
  example::

  import mlrun.feature_store as fstore
+
  features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
  vector = fstore.FeatureVector("my-vec", features)

@@ -852,7 +853,7 @@ class FeatureVector(ModelObj):

  Example::

- svc = vector_uri.get_online_feature_service(entity_keys=['ticker'])
+ svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
  try:
  resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
  print(resp)
mlrun/feature_store/ingestion.py CHANGED
@@ -17,6 +17,7 @@ import uuid
  import pandas as pd

  import mlrun
+ import mlrun.common.constants as mlrun_constants
  from mlrun.datastore.sources import get_source_from_dict, get_source_step
  from mlrun.datastore.targets import (
  add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
  out_path=featureset.spec.output_path,
  )
  task.spec.secret_sources = run_config.secret_sources
- task.set_label("job-type", "feature-ingest").set_label(
- "feature-set", featureset.uri
- )
+ task.set_label(
+ mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+ ).set_label("feature-set", featureset.uri)
  if run_config.owner:
- task.set_label("owner", run_config.owner).set_label(
- "v3io_user", run_config.owner
- )
+ task.set_label(
+ mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+ ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)

  # set run UID and save in the feature set status (linking the features et to the job)
  task.metadata.uid = uuid.uuid4().hex
mlrun/feature_store/retrieval/base.py CHANGED
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
  update_stats=None,
  query=None,
  order_by=None,
+ additional_filters=None,
  ):
  self._target = target

@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
  timestamp_for_filtering=timestamp_for_filtering,
  query=query,
  order_by=order_by,
+ additional_filters=additional_filters,
  )

  def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
  timestamp_for_filtering=None,
  query=None,
  order_by=None,
+ additional_filters=None,
  ):
  self._create_engine_env()

@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
  feature_sets.append(None)
  join_types.append(None)

- filtered = False
+ timestamp_filtered = False
  for step in join_graph.steps:
  name = step.right_feature_set_name
  feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
  if self._drop_indexes:
  self._append_drop_column(time_column)
  if (start_time or end_time) and time_column:
- filtered = True
+ timestamp_filtered = True

  df = self._get_engine_df(
  feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
  start_time if time_column else None,
  end_time if time_column else None,
  time_column,
+ additional_filters,
  )
  fs_entities_and_timestamp = list(feature_set.spec.entities.keys())

@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
  new_columns.append((column, alias))
  self._update_alias(dictionary={name: alias for name, alias in new_columns})

- # None of the feature sets was filtered as required
- if not filtered and (start_time or end_time):
+ # None of the feature sets was timestamp filtered as required
+ if not timestamp_filtered and (start_time or end_time):
  raise mlrun.errors.MLRunRuntimeError(
  "start_time and end_time can only be provided in conjunction with "
  "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
  start_time: typing.Union[str, datetime] = None,
  end_time: typing.Union[str, datetime] = None,
  time_column: typing.Optional[str] = None,
+ additional_filters=None,
  ):
  """
  Return the feature_set data frame according to the args
mlrun/data_types/to_pandas.py CHANGED
@@ -79,10 +79,10 @@ class PandasConversionMixin:
  msg = (
  "toPandas attempted Arrow optimization because "
  "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
- "failed by the reason below:\n %s\n"
+ f"failed by the reason below:\n {e}\n"
  "Attempting non-optimization as "
  "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
- "true." % str(e)
+ "true."
  )
  warnings.warn(msg)
  use_arrow = False
@@ -92,7 +92,7 @@ class PandasConversionMixin:
  "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
  "reached the error below and will not continue because automatic fallback "
  "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
- "false.\n %s" % str(e)
+ f"false.\n {e}"
  )
  warnings.warn(msg)
  raise
@@ -158,7 +158,7 @@ class PandasConversionMixin:
  "reached the error below and can not continue. Note that "
  "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
  "effect on failures in the middle of "
- "computation.\n %s" % str(e)
+ f"computation.\n {e}"
  )
  warnings.warn(msg)
  raise
@@ -168,10 +168,10 @@ class PandasConversionMixin:
  column_counter = Counter(self.columns)

  dtype = [None] * len(self.schema)
- for fieldIdx, field in enumerate(self.schema):
+ for field_idx, field in enumerate(self.schema):
  # For duplicate column name, we use `iloc` to access it.
  if column_counter[field.name] > 1:
- pandas_col = pdf.iloc[:, fieldIdx]
+ pandas_col = pdf.iloc[:, field_idx]
  else:
  pandas_col = pdf[field.name]

@@ -187,12 +187,12 @@ class PandasConversionMixin:
  and field.nullable
  and pandas_col.isnull().any()
  ):
- dtype[fieldIdx] = pandas_type
+ dtype[field_idx] = pandas_type
  # Ensure we fall back to nullable numpy types, even when whole column is null:
  if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
- dtype[fieldIdx] = np.float64
+ dtype[field_idx] = np.float64
  if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
- dtype[fieldIdx] = object
+ dtype[field_idx] = object

  df = pd.DataFrame()
  for index, t in enumerate(dtype):
mlrun/feature_store/retrieval/dask_merger.py CHANGED
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  ):
  import dask.dataframe as dd

@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
  end_time=end_time,
  time_column=time_column,
  index=False,
+ additional_filters=additional_filters,
  )

  return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py CHANGED
@@ -15,6 +15,7 @@
  import uuid

  import mlrun
+ import mlrun.common.constants as mlrun_constants
  from mlrun.config import config as mlconf
  from mlrun.model import DataTargetBase, new_task
  from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
  start_time=None,
  end_time=None,
  timestamp_for_filtering=None,
+ additional_filters=None,
  ):
  name = vector.metadata.name
  if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
  "end_time": end_time,
  "timestamp_for_filtering": timestamp_for_filtering,
  "engine_args": engine_args,
+ "additional_filters": additional_filters,
  },
  inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
  )
  task.spec.secret_sources = run_config.secret_sources
- task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+ task.set_label(
+ mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+ ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
  task.metadata.uid = uuid.uuid4().hex
  vector.status.run_uri = task.metadata.uid
  vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
  from mlrun.datastore.targets import get_target_driver
  def merge_handler(context, vector_uri, target, entity_rows=None,
  entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
- engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+ engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+ additional_filters=None):
  vector = context.get_store_resource(vector_uri)
  store_target = get_target_driver(target, vector)
  if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
  merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
  merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
- timestamp_for_filtering=timestamp_for_filtering)
+ timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)

  target = vector.status.targets[store_target.name].to_dict()
  context.log_result('feature_vector', vector.uri)
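run_merge_job and the generated merge_handler now forward additional_filters into the merger's start() call, so offline feature-vector retrieval can push the same tuple filters down to the source. A hedged sketch, assuming the vector-level get_offline_features() wrapper exposes the parameter the same way (the vector name and feature references are placeholders, building on the illustrative "sales" feature set shown earlier):

import mlrun.feature_store as fstore

# Hypothetical vector; features reference the illustrative "sales" feature set.
vector = fstore.FeatureVector("sales-vec", ["sales.price", "sales.Product"])
vector.save()

# Assumption: the public offline-retrieval entry point forwards
# additional_filters to run_merge_job / merger.start() as shown above.
resp = vector.get_offline_features(
    additional_filters=[("Product", "=", "Computer")],
)
filtered_df = resp.to_dataframe()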
mlrun/feature_store/retrieval/local_merger.py CHANGED
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  ):
  df = feature_set.to_dataframe(
  columns=column_names,
  start_time=start_time,
  end_time=end_time,
  time_column=time_column,
+ additional_filters=additional_filters,
  )
  if df.index.names[0]:
  df.reset_index(inplace=True)
mlrun/feature_store/retrieval/spark_merger.py CHANGED
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  #
+
  import pandas as pd
  import semver

@@ -24,6 +25,32 @@ from .base import BaseMerger
  from .conversion import PandasConversionMixin


+ def spark_df_to_pandas(spark_df):
+ # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+ # when we upgrade pyspark, we should check whether this workaround is still necessary
+ # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+ if semver.parse(pd.__version__)["major"] >= 2:
+ import pyspark.sql.functions as pyspark_functions
+
+ type_conversion_dict = {}
+ for field in spark_df.schema.fields:
+ if str(field.dataType) == "TimestampType":
+ spark_df = spark_df.withColumn(
+ field.name,
+ pyspark_functions.date_format(
+ pyspark_functions.to_timestamp(field.name),
+ "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+ ),
+ )
+ type_conversion_dict[field.name] = "datetime64[ns]"
+ df = PandasConversionMixin.toPandas(spark_df)
+ if type_conversion_dict:
+ df = df.astype(type_conversion_dict)
+ return df
+ else:
+ return PandasConversionMixin.toPandas(spark_df)
+
+
  class SparkFeatureMerger(BaseMerger):
  engine = "spark"
  support_offline = True
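The timestamp workaround that previously lived inside get_df() is now the module-level helper spark_df_to_pandas() added above; it reformats TimestampType columns before toPandas() when pandas 2.x is installed. A minimal sketch of calling it directly (the local Spark session and sample rows are illustrative):

import datetime

from pyspark.sql import SparkSession

from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").appName("demo").getOrCreate()
spark_df = spark.createDataFrame(
    [("GOOG", datetime.datetime(2024, 1, 1, 9, 30))],
    ["ticker", "timestamp"],
)

# With pandas >= 2 the helper renders timestamps as strings and casts them
# back to datetime64[ns]; otherwise it falls through to a plain toPandas().
pandas_df = spark_df_to_pandas(spark_df)
spark.stop()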
@@ -166,29 +193,7 @@ class SparkFeatureMerger(BaseMerger):
  def get_df(self, to_pandas=True):
  if to_pandas:
  if self._pandas_df is None:
- df = self._result_df
- # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
- # when we upgrade pyspark, we should check whether this workaround is still necessary
- # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
- if semver.parse(pd.__version__)["major"] >= 2:
- import pyspark.sql.functions as pyspark_functions
-
- type_conversion_dict = {}
- for field in df.schema.fields:
- if str(field.dataType) == "TimestampType":
- df = df.withColumn(
- field.name,
- pyspark_functions.date_format(
- pyspark_functions.to_timestamp(field.name),
- "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
- ),
- )
- type_conversion_dict[field.name] = "datetime64[ns]"
- df = PandasConversionMixin.toPandas(df)
- if type_conversion_dict:
- df = df.astype(type_conversion_dict)
- else:
- df = PandasConversionMixin.toPandas(df)
+ df = spark_df_to_pandas(self._result_df)
  self._pandas_df = df
  self._set_indexes(self._pandas_df)
  return self._pandas_df
@@ -221,7 +226,12 @@ class SparkFeatureMerger(BaseMerger):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  ):
+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
+
  source_kwargs = {}
  if feature_set.spec.passthrough:
  if not feature_set.spec.source:
@@ -243,13 +253,13 @@ class SparkFeatureMerger(BaseMerger):
  # handling case where there are multiple feature sets and user creates vector where
  # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
  source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
-
  source = source_driver(
  name=self.vector.metadata.name,
  path=source_path,
  time_field=time_column,
  start_time=start_time,
  end_time=end_time,
+ additional_filters=additional_filters,
  **source_kwargs,
  )

mlrun/feature_store/steps.py CHANGED
@@ -162,13 +162,19 @@ class MapValues(StepToDict, MLRunStep):
  example::

  # replace the value "U" with '0' in the age column
- graph.to(MapValues(mapping={'age': {'U': '0'}}, with_original_features=True))
+ graph.to(MapValues(mapping={"age": {"U": "0"}}, with_original_features=True))

  # replace integers, example
- graph.to(MapValues(mapping={'not': {0: 1, 1: 0}}))
+ graph.to(MapValues(mapping={"not": {0: 1, 1: 0}}))

  # replace by range, use -inf and inf for extended range
- graph.to(MapValues(mapping={'numbers': {'ranges': {'negative': [-inf, 0], 'positive': [0, inf]}}}))
+ graph.to(
+ MapValues(
+ mapping={
+ "numbers": {"ranges": {"negative": [-inf, 0], "positive": [0, inf]}}
+ }
+ )
+ )

  :param mapping: a dict with entry per column and the associated old/new values map
  :param with_original_features: set to True to keep the original features
@@ -424,8 +430,10 @@ class OneHotEncoder(StepToDict, MLRunStep):

  example::

- mapping = {'category': ['food', 'health', 'transportation'],
- 'gender': ['male', 'female']}
+ mapping = {
+ "category": ["food", "health", "transportation"],
+ "gender": ["male", "female"],
+ }
  graph.to(OneHotEncoder(mapping=one_hot_encoder_mapping))

  :param mapping: a dict of per column categories (to map to binary fields)
@@ -542,10 +550,12 @@ class DateExtractor(StepToDict, MLRunStep):

  # (taken from the fraud-detection end-to-end feature store demo)
  # Define the Transactions FeatureSet
- transaction_set = fstore.FeatureSet("transactions",
- entities=[fstore.Entity("source")],
- timestamp_key='timestamp',
- description="transactions feature set")
+ transaction_set = fstore.FeatureSet(
+ "transactions",
+ entities=[fstore.Entity("source")],
+ timestamp_key="timestamp",
+ description="transactions feature set",
+ )

  # Get FeatureSet computation graph
  transaction_graph = transaction_set.graph
@@ -553,11 +563,11 @@ class DateExtractor(StepToDict, MLRunStep):
  # Add the custom `DateExtractor` step
  # to the computation graph
  transaction_graph.to(
- class_name='DateExtractor',
- name='Extract Dates',
- parts = ['hour', 'day_of_week'],
- timestamp_col = 'timestamp',
- )
+ class_name="DateExtractor",
+ name="Extract Dates",
+ parts=["hour", "day_of_week"],
+ timestamp_col="timestamp",
+ )

  :param parts: list of pandas style date-time parts you want to extract.
  :param timestamp_col: The name of the column containing the timestamps to extract from,
@@ -694,11 +704,12 @@ class DropFeatures(StepToDict, MLRunStep):

  example::

- feature_set = fstore.FeatureSet("fs-new",
- entities=[fstore.Entity("id")],
- description="feature set",
- engine="pandas",
- )
+ feature_set = fstore.FeatureSet(
+ "fs-new",
+ entities=[fstore.Entity("id")],
+ description="feature set",
+ engine="pandas",
+ )
  # Pre-processing graph steps
  feature_set.graph.to(DropFeatures(features=["age"]))
  df_pandas = feature_set.ingest(data)
mlrun/features.py CHANGED
@@ -238,10 +238,7 @@ class Validator(ModelObj):
  from mlrun.features import Validator

  # Add validator to the feature 'bid' with check type
- quotes_set["bid"].validator = Validator(
- check_type=True,
- severity="info"
- )
+ quotes_set["bid"].validator = Validator(check_type=True, severity="info")

  :param check_type: check feature type e.g. True, False
  :param severity: severity name e.g. info, warning, etc.
@@ -280,10 +277,7 @@ class MinMaxValidator(Validator):

  # Add validator to the feature 'bid', where valid
  # minimal value is 52
- quotes_set["bid"].validator = MinMaxValidator(
- min=52,
- severity="info"
- )
+ quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

  :param check_type: check feature type e.g. True, False
  :param severity: severity name e.g. info, warning, etc.
@@ -344,9 +338,7 @@ class MinMaxLenValidator(Validator):
  # Add length validator to the feature 'ticker', where valid
  # minimal length is 1 and maximal length is 10
  quotes_set["ticker"].validator = MinMaxLenValidator(
- min=1,
- max=10,
- severity="info"
+ min=1, max=10, severity="info"
  )

  :param check_type: check feature type e.g. True, False
@@ -408,8 +400,7 @@ class RegexValidator(Validator):
  # expression '(\b[A-Za-z]{1}[0-9]{7}\b)' where valid values are
  # e.g. A1234567, z9874563, etc.
  quotes_set["name"].validator = RegexValidator(
- regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
- severity="info"
+ regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)", severity="info"
  )

  :param check_type: check feature type e.g. True, False
mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py CHANGED
@@ -547,9 +547,9 @@ class TensorboardLogger(Logger, Generic[DLTypes.WeightType]):
  "inputs",
  "parameters",
  ]:
- text += "\n * **{}**: {}".format(
- property_name.capitalize(),
- self._markdown_print(value=property_value, tabs=2),
+ text += (
+ f"\n * **{property_name.capitalize()}**: "
+ f"{self._markdown_print(value=property_value, tabs=2)}"
  )
  else:
  for property_name, property_value in self._extract_epoch_results().items():
@@ -614,13 +614,8 @@ class TensorboardLogger(Logger, Generic[DLTypes.WeightType]):
  :return: The generated link.
  """
  return (
- '<a href="{}/{}/{}/jobs/monitor/{}/overview" target="_blank">{}</a>'.format(
- config.resolve_ui_url(),
- config.ui.projects_prefix,
- context.project,
- context.uid,
- link_text,
- )
+ f'<a href="{config.resolve_ui_url()}/{config.ui.projects_prefix}/{context.project}'
+ f'/jobs/monitor/{context.uid}/overview" target="_blank">{link_text}</a>'
  )

  @staticmethod
@@ -653,13 +648,13 @@ class TensorboardLogger(Logger, Generic[DLTypes.WeightType]):
  if isinstance(value, list):
  if len(value) == 0:
  return ""
- text = "\n" + yaml.dump(value)
+ text = "\n" + yaml.safe_dump(value)
  text = " \n".join([" " * tabs + line for line in text.splitlines()])
  return text
  if isinstance(value, dict):
  if len(value) == 0:
  return ""
- text = yaml.dump(value)
+ text = yaml.safe_dump(value)
  text = " \n".join(
  [" " * tabs + "- " + line for line in text.splitlines()]
  )