mlrun 1.8.0rc43__py3-none-any.whl → 1.8.0rc44__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.

Potentially problematic release: this version of mlrun has been flagged as potentially problematic in the registry.

@@ -19,18 +19,18 @@ from datetime import datetime
 from enum import Enum
 from typing import Union
 
-import numpy as np
 import pandas as pd
 
 import mlrun
 
 from ..config import config as mlconf
 from ..datastore import get_store_uri
-from ..datastore.targets import get_offline_target
+from ..datastore.targets import BaseStoreTarget, get_offline_target
 from ..feature_store.common import (
     get_feature_set_by_uri,
     parse_feature_string,
     parse_project_name_from_feature_string,
+    verify_feature_vector_permissions,
 )
 from ..feature_store.feature_set import FeatureSet
 from ..features import Entity, Feature
@@ -47,6 +47,22 @@ from ..runtimes.function_reference import FunctionReference
 from ..serving.states import RootFlowStep
 from ..utils import StorePrefix
 from .common import RunConfig
+from .feature_vector_utils import JoinGraph, OnlineVectorService
+from .retrieval import get_merger, run_merge_job
+
+
+def _features_to_vector_and_check_permissions(features: "FeatureVector", update_stats):
+    vector = features
+    if not vector.metadata.name:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "feature vector name must be specified"
+        )
+    verify_feature_vector_permissions(
+        vector, mlrun.common.schemas.AuthorizationAction.update
+    )
+
+    vector.save()
+    return vector
 
 
 class FeatureVectorSpec(ModelObj):
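
The helper added in this hunk consolidates the validation that the class methods below now call directly: it requires a vector name, verifies 'update' authorization, and saves the vector. A minimal sketch of the path that exercises it (the vector name and feature references are hypothetical; assumes a configured MLRun project):

    import mlrun.feature_store as fstore

    vector = fstore.FeatureVector("my-vector", ["stocks.*"])  # hypothetical vector
    # update_stats=True (now the default, see the hunks below) routes through
    # _features_to_vector_and_check_permissions: name check, 'update' permission, save
    resp = vector.get_offline_features(update_stats=True)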
@@ -201,254 +217,6 @@ class FeatureVectorStatus(ModelObj):
         self._features = ObjectList.from_list(Feature, features)
 
 
-class JoinGraph(ModelObj):
-    """
-    explain here about the class
-    """
-
-    default_graph_name = "$__join_graph_fv__$"
-    first_join_type = "first"
-    _dict_fields = ["name", "first_feature_set", "steps"]
-
-    def __init__(
-        self,
-        name: typing.Optional[str] = None,
-        first_feature_set: Union[str, FeatureSet] = None,
-    ):
-        """
-        JoinGraph is a class that represents a graph of data joins between feature sets. It allows users to define
-        data joins step by step, specifying the join type for each step. The graph can be used to build a sequence of
-        joins that will be executed in order, allowing the creation of complex join operations between feature sets.
-
-
-        Example:
-            # Create a new JoinGraph and add steps for joining feature sets.
-            join_graph = JoinGraph(name="my_join_graph", first_feature_set="featureset1")
-            join_graph.inner("featureset2")
-            join_graph.left("featureset3", asof_join=True)
-
-
-        :param name:              (str, optional) The name of the join graph. If not provided,
-                                  a default name will be used.
-        :param first_feature_set: (str or FeatureSet, optional) The first feature set to join. It can be
-                                  specified either as a string representing the name of the feature set or as a
-                                  FeatureSet object.
-        """
-        self.name = name or self.default_graph_name
-        self._steps: ObjectList = None
-        self._feature_sets = None
-        if first_feature_set:
-            self._start(first_feature_set)
-
-    def inner(self, other_operand: typing.Union[str, FeatureSet]):
-        """
-        Specifies an inner join with the given feature set
-
-        :param other_operand: (str or FeatureSet) The name of the feature set or a FeatureSet object to join with.
-
-        :return: JoinGraph: The updated JoinGraph object with the specified inner join.
-        """
-        return self._join_operands(other_operand, "inner")
-
-    def outer(self, other_operand: typing.Union[str, FeatureSet]):
-        """
-        Specifies an outer join with the given feature set
-
-        :param other_operand: (str or FeatureSet) The name of the feature set or a FeatureSet object to join with.
-        :return: JoinGraph: The updated JoinGraph object with the specified outer join.
-        """
-        return self._join_operands(other_operand, "outer")
-
-    def left(self, other_operand: typing.Union[str, FeatureSet], asof_join):
-        """
-        Specifies a left join with the given feature set
-
-        :param other_operand: (str or FeatureSet) The name of the feature set or a FeatureSet object to join with.
-        :param asof_join:     (bool) A flag indicating whether to perform an as-of join.
-
-        :return: JoinGraph: The updated JoinGraph object with the specified left join.
-        """
-        return self._join_operands(other_operand, "left", asof_join=asof_join)
-
-    def right(self, other_operand: typing.Union[str, FeatureSet]):
-        """
-        Specifies a right join with the given feature set
-
-        :param other_operand: (str or FeatureSet) The name of the feature set or a FeatureSet object to join with.
-
-        :return: JoinGraph: The updated JoinGraph object with the specified right join.
-        """
-        return self._join_operands(other_operand, "right")
-
-    def _join_operands(
-        self,
-        other_operand: typing.Union[str, FeatureSet],
-        join_type: str,
-        asof_join: bool = False,
-    ):
-        if isinstance(other_operand, FeatureSet):
-            other_operand = other_operand.metadata.name
-
-        first_key_num = len(self._steps.keys()) if self._steps else 0
-        left_last_step_name, left_all_feature_sets = (
-            self.last_step_name,
-            self.all_feature_sets_names,
-        )
-        is_first_fs = (
-            join_type == JoinGraph.first_join_type or left_all_feature_sets == self.name
-        )
-        # create_new_step
-        new_step = _JoinStep(
-            f"step_{first_key_num}",
-            left_last_step_name if not is_first_fs else "",
-            other_operand,
-            left_all_feature_sets if not is_first_fs else [],
-            other_operand,
-            join_type,
-            asof_join,
-        )
-
-        if self.steps is not None:
-            self.steps.update(new_step)
-        else:
-            self.steps = [new_step]
-        return self
-
-    def _start(self, other_operand: typing.Union[str, FeatureSet]):
-        return self._join_operands(other_operand, JoinGraph.first_join_type)
-
-    def _init_all_join_keys(
-        self,
-        feature_set_objects,
-        vector,
-        entity_rows_keys: typing.Optional[list[str]] = None,
-    ):
-        for step in self.steps:
-            step.init_join_keys(feature_set_objects, vector, entity_rows_keys)
-
-    @property
-    def all_feature_sets_names(self):
-        """
-        Returns a list of all feature set names included in the join graph.
-
-        :return: List[str]: A list of feature set names.
-        """
-        if self._steps:
-            return self._steps[-1].left_feature_set_names + [
-                self._steps[-1].right_feature_set_name
-            ]
-        else:
-            return self.name
-
-    @property
-    def last_step_name(self):
-        """
-        Returns the name of the last step in the join graph.
-
-        :return: str: The name of the last step.
-        """
-        if self._steps:
-            return self._steps[-1].name
-        else:
-            return self.name
-
-    @property
-    def steps(self):
-        """
-        Returns the list of join steps as ObjectList, which can be used to iterate over the steps
-        or access the properties of each step.
-        :return: ObjectList: The list of join steps.
-        """
-        return self._steps
-
-    @steps.setter
-    def steps(self, steps):
-        """
-        Setter for the steps property. It allows updating the join steps.
-
-        :param steps: (List[_JoinStep]) The list of join steps.
-        """
-        self._steps = ObjectList.from_list(child_class=_JoinStep, children=steps)
-
-
-class _JoinStep(ModelObj):
-    def __init__(
-        self,
-        name: typing.Optional[str] = None,
-        left_step_name: typing.Optional[str] = None,
-        right_step_name: typing.Optional[str] = None,
-        left_feature_set_names: typing.Optional[Union[str, list[str]]] = None,
-        right_feature_set_name: typing.Optional[str] = None,
-        join_type: str = "inner",
-        asof_join: bool = False,
-    ):
-        self.name = name
-        self.left_step_name = left_step_name
-        self.right_step_name = right_step_name
-        self.left_feature_set_names = (
-            left_feature_set_names
-            if left_feature_set_names is None
-            or isinstance(left_feature_set_names, list)
-            else [left_feature_set_names]
-        )
-        self.right_feature_set_name = right_feature_set_name
-        self.join_type = join_type
-        self.asof_join = asof_join
-
-        self.left_keys = []
-        self.right_keys = []
-
-    def init_join_keys(
-        self,
-        feature_set_objects: ObjectList,
-        vector,
-        entity_rows_keys: typing.Optional[list[str]] = None,
-    ):
-        if feature_set_objects[self.right_feature_set_name].is_connectable_to_df(
-            entity_rows_keys
-        ):
-            self.left_keys, self.right_keys = [
-                list(
-                    feature_set_objects[
-                        self.right_feature_set_name
-                    ].spec.entities.keys()
-                )
-            ] * 2
-
-        if (
-            self.join_type == JoinGraph.first_join_type
-            or not self.left_feature_set_names
-        ):
-            self.join_type = (
-                "inner"
-                if self.join_type == JoinGraph.first_join_type
-                else self.join_type
-            )
-            return
-
-        for left_fset in self.left_feature_set_names:
-            current_left_keys = feature_set_objects[left_fset].extract_relation_keys(
-                feature_set_objects[self.right_feature_set_name],
-                vector.get_feature_set_relations(feature_set_objects[left_fset]),
-            )
-            current_right_keys = list(
-                feature_set_objects[self.right_feature_set_name].spec.entities.keys()
-            )
-            for i in range(len(current_left_keys)):
-                if (
-                    current_left_keys[i] not in self.left_keys
-                    and current_right_keys[i] not in self.right_keys
-                ):
-                    self.left_keys.append(current_left_keys[i])
-                    self.right_keys.append(current_right_keys[i])
-
-        if not self.left_keys:
-            raise mlrun.errors.MLRunRuntimeError(
-                f"{self.name} can't be preform due to undefined relation between "
-                f"{self.left_feature_set_names} to {self.right_feature_set_name}"
-            )
-
-
 class FixedWindowType(Enum):
     CurrentOpenWindow = 1
     LastClosedWindow = 2
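
Note that JoinGraph and OnlineVectorService are not dropped from the package: per the import added in the second hunk (from .feature_vector_utils import JoinGraph, OnlineVectorService), their definitions now live in feature_vector_utils, and _JoinStep and OfflineVectorResponse presumably moved alongside them. Code following the class's own docstring example should keep working unchanged (feature set names are hypothetical):

    from mlrun.feature_store.feature_vector import JoinGraph  # still importable here via the re-export

    join_graph = JoinGraph(name="my_join_graph", first_feature_set="featureset1")
    join_graph.inner("featureset2")
    join_graph.left("featureset3", asof_join=True)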
@@ -737,7 +505,7 @@ class FeatureVector(ModelObj):
         start_time: typing.Optional[Union[str, datetime]] = None,
         end_time: typing.Optional[Union[str, datetime]] = None,
         with_indexes: bool = False,
-        update_stats: bool = False,
+        update_stats: bool = True,
         engine: typing.Optional[str] = None,
         engine_args: typing.Optional[dict] = None,
         query: typing.Optional[str] = None,
@@ -787,8 +555,9 @@ class FeatureVector(ModelObj):
                                        columns. This property can be specified also in the feature vector spec
                                        (feature_vector.spec.with_indexes)
                                        (default False)
-        :param update_stats:           update features statistics from the requested feature sets on the vector.
-                                       (default False).
+        :param update_stats:           When set to True (default), updates feature statistics from the requested
+                                       feature sets on the vector, which requires 'update' permissions. When set to
+                                       False, uses read-only operations that only require 'read' permissions.
         :param engine:                 processing engine kind ("local", "dask", or "spark")
         :param engine_args:            kwargs for the processing engine
         :param query:                  The query string used to filter rows on the output
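
Together with the signature change above, this flips the default behavior: a bare get_offline_features() call now refreshes and saves statistics, which requires 'update' permissions on the vector. A read-only caller would opt out explicitly; a rough sketch, continuing the hypothetical vector from earlier:

    # read-only retrieval: skips the stats refresh and only needs 'read' permission
    resp = vector.get_offline_features(update_stats=False)
    df = resp.to_dataframe()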
@@ -809,25 +578,69 @@ class FeatureVector(ModelObj):
             https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
 
         """
+        if entity_rows is None and entity_timestamp_column is not None:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "entity_timestamp_column param "
+                "can not be specified without entity_rows param"
+            )
+
+        if isinstance(target, BaseStoreTarget) and not target.support_pandas:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"get_offline_features does not support targets that do not support pandas engine."
+                f" Target kind: {target.kind}"
+            )
+
+        if update_stats:
+            feature_vector = _features_to_vector_and_check_permissions(
+                self, update_stats
+            )
+        else:
+            feature_vector = self
+            verify_feature_vector_permissions(
+                feature_vector, mlrun.common.schemas.AuthorizationAction.read
+            )
+
+        entity_timestamp_column = (
+            entity_timestamp_column or feature_vector.spec.timestamp_field
+        )
+
+        merger_engine = get_merger(engine)
+
+        if run_config and not run_config.local:
+            return run_merge_job(
+                feature_vector,
+                target,
+                merger_engine,
+                engine,
+                engine_args,
+                spark_service,
+                entity_rows,
+                entity_timestamp_column=entity_timestamp_column,
+                run_config=run_config,
+                drop_columns=drop_columns,
+                with_indexes=with_indexes,
+                query=query,
+                order_by=order_by,
+                start_time=start_time,
+                end_time=end_time,
+                timestamp_for_filtering=timestamp_for_filtering,
+                additional_filters=additional_filters,
+            )
 
-        return mlrun.feature_store.api._get_offline_features(
-            self,
+        merger = merger_engine(feature_vector, **(engine_args or {}))
+        return merger.start(
             entity_rows,
             entity_timestamp_column,
-            target,
-            run_config,
-            drop_columns,
-            start_time,
-            end_time,
-            with_indexes,
-            update_stats,
-            engine,
-            engine_args,
-            query,
-            order_by,
-            spark_service,
-            timestamp_for_filtering,
-            additional_filters,
+            target=target,
+            drop_columns=drop_columns,
+            start_time=start_time,
+            end_time=end_time,
+            timestamp_for_filtering=timestamp_for_filtering,
+            with_indexes=with_indexes,
+            update_stats=update_stats,
+            query=query,
+            order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def get_online_feature_service(
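
This hunk inlines the body of the former mlrun.feature_store.api._get_offline_features: validate arguments, check permissions, then either submit a remote merge job (run_config present and not local) or run the merger in-process. A sketch of the two paths (assumes RunConfig exposes the local flag the code checks; engine names come from the docstring above):

    # in-process merge with the local engine
    resp = vector.get_offline_features(engine="local")

    # remote merge job, e.g. on spark
    from mlrun.feature_store import RunConfig
    resp = vector.get_offline_features(run_config=RunConfig(local=False), engine="spark")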
@@ -837,7 +650,7 @@ class FeatureVector(ModelObj):
         impute_policy: typing.Optional[dict] = None,
         update_stats: bool = False,
         entity_keys: typing.Optional[list[str]] = None,
-    ):
+    ) -> OnlineVectorService:
         """initialize and return online feature vector service api,
         returns :py:class:`~mlrun.feature_store.OnlineVectorService`
 
@@ -900,204 +713,14 @@ class FeatureVector(ModelObj):
         :return: Initialize the `OnlineVectorService`.
                  Will be used in subclasses where `support_online=True`.
         """
-        return mlrun.feature_store.api._get_online_feature_service(
-            self,
-            run_config,
-            fixed_window_type,
-            impute_policy,
-            update_stats,
-            entity_keys,
-        )
+        feature_vector = _features_to_vector_and_check_permissions(self, True)
 
+        engine_args = {"impute_policy": impute_policy}
+        merger_engine = get_merger("storey")
+        # todo: support remote service (using remote nuclio/mlrun function if run_config)
 
-class OnlineVectorService:
-    """get_online_feature_service response object"""
-
-    def __init__(
-        self,
-        vector,
-        graph,
-        index_columns,
-        impute_policy: typing.Optional[dict] = None,
-        requested_columns: typing.Optional[list[str]] = None,
-    ):
-        self.vector = vector
-        self.impute_policy = impute_policy or {}
-
-        self._controller = graph.controller
-        self._index_columns = index_columns
-        self._impute_values = {}
-        self._requested_columns = requested_columns
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def initialize(self):
-        """internal, init the feature service and prep the imputing logic"""
-        if not self.impute_policy:
-            return
-
-        impute_policy = copy(self.impute_policy)
-        vector = self.vector
-        feature_stats = vector.get_stats_table()
-        self._impute_values = {}
-
-        feature_keys = list(vector.status.features.keys())
-        if vector.status.label_column in feature_keys:
-            feature_keys.remove(vector.status.label_column)
-
-        if "*" in impute_policy:
-            value = impute_policy["*"]
-            del impute_policy["*"]
-
-            for name in feature_keys:
-                if name not in impute_policy:
-                    if isinstance(value, str) and value.startswith("$"):
-                        self._impute_values[name] = feature_stats.loc[name, value[1:]]
-                    else:
-                        self._impute_values[name] = value
-
-        for name, value in impute_policy.items():
-            if name not in feature_keys:
-                raise mlrun.errors.MLRunInvalidArgumentError(
-                    f"feature {name} in impute_policy but not in feature vector"
-                )
-            if isinstance(value, str) and value.startswith("$"):
-                self._impute_values[name] = feature_stats.loc[name, value[1:]]
-            else:
-                self._impute_values[name] = value
-
-    @property
-    def status(self):
-        """vector merger function status (ready, running, error)"""
-        return "ready"
-
-    def get(self, entity_rows: list[Union[dict, list]], as_list=False):
-        """get feature vector given the provided entity inputs
-
-        take a list of input vectors/rows and return a list of enriched feature vectors
-        each input and/or output vector can be a list of values or a dictionary of field names and values,
-        to return the vector as a list of values set the `as_list` to True.
-
-        if the input is a list of list (vs a list of dict), the values in the list will correspond to the
-        index/entity values, i.e. [["GOOG"], ["MSFT"]] means "GOOG" and "MSFT" are the index/entity fields.
-
-        example::
+        merger = merger_engine(feature_vector, **engine_args)
 
-            # accept list of dict, return list of dict
-            svc = fstore.get_online_feature_service(vector)
-            resp = svc.get([{"name": "joe"}, {"name": "mike"}])
-
-            # accept list of list, return list of list
-            svc = fstore.get_online_feature_service(vector, as_list=True)
-            resp = svc.get([["joe"], ["mike"]])
-
-        :param entity_rows: list of list/dict with input entity data/rows
-        :param as_list:     return a list of list (list input is required by many ML frameworks)
-        """
-        results = []
-        futures = []
-        if isinstance(entity_rows, dict):
-            entity_rows = [entity_rows]
-
-        # validate we have valid input struct
-        if (
-            not entity_rows
-            or not isinstance(entity_rows, list)
-            or not isinstance(entity_rows[0], (list, dict))
-        ):
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                f"input data is of type {type(entity_rows)}. must be a list of lists or list of dicts"
-            )
-
-        # if list of list, convert to dicts (with the index columns as the dict keys)
-        if isinstance(entity_rows[0], list):
-            if not self._index_columns or len(entity_rows[0]) != len(
-                self._index_columns
-            ):
-                raise mlrun.errors.MLRunInvalidArgumentError(
-                    "input list must be in the same size of the index_keys list"
-                )
-            index_range = range(len(self._index_columns))
-            entity_rows = [
-                {self._index_columns[i]: item[i] for i in index_range}
-                for item in entity_rows
-            ]
-
-        for row in entity_rows:
-            futures.append(self._controller.emit(row, return_awaitable_result=True))
-
-        for future in futures:
-            result = future.await_result()
-            data = result.body
-            if data:
-                actual_columns = data.keys()
-                if all([col in self._index_columns for col in actual_columns]):
-                    # didn't get any data from the graph
-                    results.append(None)
-                    continue
-                for column in self._requested_columns:
-                    if (
-                        column not in actual_columns
-                        and column != self.vector.status.label_column
-                    ):
-                        data[column] = None
-
-                if self._impute_values:
-                    for name in data.keys():
-                        v = data[name]
-                        if v is None or (
-                            isinstance(v, float) and (np.isinf(v) or np.isnan(v))
-                        ):
-                            data[name] = self._impute_values.get(name, v)
-                if not self.vector.spec.with_indexes:
-                    for name in self.vector.status.index_keys:
-                        data.pop(name, None)
-                if not any(data.values()):
-                    data = None
-
-            if as_list and data:
-                data = [
-                    data.get(key, None)
-                    for key in self._requested_columns
-                    if key != self.vector.status.label_column
-                ]
-            results.append(data)
-
-        return results
-
-    def close(self):
-        """terminate the async loop"""
-        self._controller.terminate()
-
-
-class OfflineVectorResponse:
-    """get_offline_features response object"""
-
-    def __init__(self, merger):
-        self._merger = merger
-        self.vector = merger.vector
-
-    @property
-    def status(self):
-        """vector prep job status (ready, running, error)"""
-        return self._merger.get_status()
-
-    def to_dataframe(self, to_pandas=True):
-        """return result as dataframe"""
-        if self.status != "completed":
-            raise mlrun.errors.MLRunTaskNotReadyError(
-                "feature vector dataset is not ready"
-            )
-        return self._merger.get_df(to_pandas=to_pandas)
-
-    def to_parquet(self, target_path, **kw):
-        """return results as parquet file"""
-        return self._merger.to_parquet(target_path, **kw)
-
-    def to_csv(self, target_path, **kw):
-        """return results as csv file"""
-        return self._merger.to_csv(target_path, **kw)
+        return merger.init_online_vector_service(
+            entity_keys, fixed_window_type, update_stats=True
+        )
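
The rewritten get_online_feature_service builds a storey merger directly and returns its OnlineVectorService, as the new return annotation states. Typical use, adapted from the docstring example removed above with the old class (the vector URI and entity values are hypothetical):

    import mlrun.feature_store as fstore

    vector = fstore.get_feature_vector("my-project/my-vector")  # hypothetical URI
    # the service supports the context-manager protocol (see __enter__/__exit__ above)
    with vector.get_online_feature_service(impute_policy={"*": "$mean"}) as svc:
        resp = svc.get([{"name": "joe"}, {"name": "mike"}])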