mlrun 1.3.2rc1__py3-none-any.whl → 1.3.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mlrun has been flagged as potentially problematic.

Files changed (93)
  1. mlrun/api/api/deps.py +14 -1
  2. mlrun/api/api/endpoints/frontend_spec.py +0 -2
  3. mlrun/api/api/endpoints/functions.py +15 -27
  4. mlrun/api/api/endpoints/grafana_proxy.py +435 -74
  5. mlrun/api/api/endpoints/healthz.py +5 -18
  6. mlrun/api/api/endpoints/model_endpoints.py +33 -37
  7. mlrun/api/api/utils.py +6 -13
  8. mlrun/api/crud/__init__.py +14 -16
  9. mlrun/api/crud/logs.py +5 -7
  10. mlrun/api/crud/model_monitoring/__init__.py +2 -2
  11. mlrun/api/crud/model_monitoring/model_endpoint_store.py +847 -0
  12. mlrun/api/crud/model_monitoring/model_endpoints.py +105 -328
  13. mlrun/api/crud/pipelines.py +2 -3
  14. mlrun/api/db/sqldb/models/models_mysql.py +52 -19
  15. mlrun/api/db/sqldb/models/models_sqlite.py +52 -19
  16. mlrun/api/db/sqldb/session.py +19 -26
  17. mlrun/api/schemas/__init__.py +2 -0
  18. mlrun/api/schemas/constants.py +0 -13
  19. mlrun/api/schemas/frontend_spec.py +0 -1
  20. mlrun/api/schemas/model_endpoints.py +38 -195
  21. mlrun/api/schemas/schedule.py +2 -2
  22. mlrun/api/utils/clients/log_collector.py +5 -0
  23. mlrun/builder.py +9 -41
  24. mlrun/config.py +1 -76
  25. mlrun/data_types/__init__.py +1 -6
  26. mlrun/data_types/data_types.py +1 -3
  27. mlrun/datastore/__init__.py +2 -9
  28. mlrun/datastore/sources.py +20 -25
  29. mlrun/datastore/store_resources.py +1 -1
  30. mlrun/datastore/targets.py +34 -67
  31. mlrun/datastore/utils.py +4 -26
  32. mlrun/db/base.py +2 -4
  33. mlrun/db/filedb.py +5 -13
  34. mlrun/db/httpdb.py +32 -64
  35. mlrun/db/sqldb.py +2 -4
  36. mlrun/errors.py +0 -5
  37. mlrun/execution.py +0 -2
  38. mlrun/feature_store/api.py +8 -24
  39. mlrun/feature_store/feature_set.py +6 -28
  40. mlrun/feature_store/feature_vector.py +0 -2
  41. mlrun/feature_store/ingestion.py +11 -8
  42. mlrun/feature_store/retrieval/base.py +43 -271
  43. mlrun/feature_store/retrieval/dask_merger.py +153 -55
  44. mlrun/feature_store/retrieval/job.py +3 -12
  45. mlrun/feature_store/retrieval/local_merger.py +130 -48
  46. mlrun/feature_store/retrieval/spark_merger.py +125 -126
  47. mlrun/features.py +2 -7
  48. mlrun/model_monitoring/constants.py +6 -48
  49. mlrun/model_monitoring/helpers.py +35 -118
  50. mlrun/model_monitoring/model_monitoring_batch.py +260 -293
  51. mlrun/model_monitoring/stream_processing_fs.py +253 -220
  52. mlrun/platforms/iguazio.py +0 -33
  53. mlrun/projects/project.py +72 -34
  54. mlrun/runtimes/base.py +0 -5
  55. mlrun/runtimes/daskjob.py +0 -2
  56. mlrun/runtimes/function.py +3 -29
  57. mlrun/runtimes/kubejob.py +15 -39
  58. mlrun/runtimes/local.py +45 -7
  59. mlrun/runtimes/mpijob/abstract.py +0 -2
  60. mlrun/runtimes/mpijob/v1.py +0 -2
  61. mlrun/runtimes/pod.py +0 -2
  62. mlrun/runtimes/remotesparkjob.py +0 -2
  63. mlrun/runtimes/serving.py +0 -6
  64. mlrun/runtimes/sparkjob/abstract.py +2 -39
  65. mlrun/runtimes/sparkjob/spark3job.py +0 -2
  66. mlrun/serving/__init__.py +1 -2
  67. mlrun/serving/routers.py +35 -35
  68. mlrun/serving/server.py +12 -22
  69. mlrun/serving/states.py +30 -162
  70. mlrun/serving/v2_serving.py +10 -13
  71. mlrun/utils/clones.py +1 -1
  72. mlrun/utils/model_monitoring.py +96 -122
  73. mlrun/utils/version/version.json +2 -2
  74. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/METADATA +27 -23
  75. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/RECORD +79 -92
  76. mlrun/api/crud/model_monitoring/grafana.py +0 -427
  77. mlrun/datastore/spark_udf.py +0 -40
  78. mlrun/model_monitoring/__init__.py +0 -44
  79. mlrun/model_monitoring/common.py +0 -112
  80. mlrun/model_monitoring/model_endpoint.py +0 -141
  81. mlrun/model_monitoring/stores/__init__.py +0 -106
  82. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -448
  83. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  84. mlrun/model_monitoring/stores/models/__init__.py +0 -23
  85. mlrun/model_monitoring/stores/models/base.py +0 -18
  86. mlrun/model_monitoring/stores/models/mysql.py +0 -100
  87. mlrun/model_monitoring/stores/models/sqlite.py +0 -98
  88. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -375
  89. mlrun/utils/db.py +0 -52
  90. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/LICENSE +0 -0
  91. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/WHEEL +0 -0
  92. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/entry_points.txt +0 -0
  93. {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/top_level.txt +0 -0
mlrun/feature_store/retrieval/base.py
@@ -13,16 +13,11 @@
  # limitations under the License.
  #
  import abc
- import typing
- from datetime import datetime

  import mlrun
  from mlrun.datastore.targets import CSVTarget, ParquetTarget
- from mlrun.feature_store.feature_set import FeatureSet
- from mlrun.feature_store.feature_vector import Feature

  from ...utils import logger
- from ..feature_vector import OfflineVectorResponse


  class BaseMerger(abc.ABC):
@@ -41,7 +36,6 @@ class BaseMerger(abc.ABC):
  self._drop_indexes = True
  self._target = None
  self._alias = dict()
- self._origin_alias = dict()

  def _append_drop_column(self, key):
  if key and key not in self._drop_columns:
@@ -77,7 +71,6 @@ class BaseMerger(abc.ABC):
  update_stats=None,
  query=None,
  join_type="inner",
- order_by=None,
  ):
  self._target = target
  self._join_type = join_type
@@ -117,11 +110,9 @@ class BaseMerger(abc.ABC):
  start_time=start_time,
  end_time=end_time,
  query=query,
- order_by=order_by,
  )

  def _write_to_target(self):
- self.vector.spec.with_indexes = not self._drop_indexes
  if self._target:
  is_persistent_vector = self.vector.metadata.name is not None
  if not self._target.path and not is_persistent_vector:
@@ -134,14 +125,6 @@ class BaseMerger(abc.ABC):
  target_status = self._target.update_resource_status("ready", size=size)
  logger.info(f"wrote target: {target_status}")
  self.vector.save()
- if self.vector.spec.with_indexes:
- self.vector.spec.entity_fields = [
- Feature(name=feature, value_type=self._result_df[feature].dtype)
- if self._result_df[feature].dtype.name != "object"
- else Feature(name=feature, value_type="str")
- for feature in self._index_columns
- ]
- self.vector.save()

  def _set_indexes(self, df):
  if self._index_columns and not self._drop_indexes:
@@ -151,15 +134,28 @@ class BaseMerger(abc.ABC):
  if index not in df.columns:
  index_columns_missing.append(index)
  if not index_columns_missing:
- df.set_index(self._index_columns, inplace=True)
+ if self.engine == "local" or self.engine == "spark":
+ df.set_index(self._index_columns, inplace=True)
+ elif self.engine == "dask":
+ if len(self._index_columns) == 1:
+ return df.set_index(self._index_columns[0])
+ elif len(self._index_columns) != 1:
+ return self._reset_index(self._result_df)
+ else:
+ logger.info(
+ "The entities will stay as columns because "
+ "Dask dataframe does not yet support multi-indexes"
+ )
+ return self._result_df
  else:
  logger.warn(
  f"Can't set index, not all index columns found: {index_columns_missing}. "
  f"It is possible that column was already indexed."
  )
- else:
- df.reset_index(drop=True, inplace=True)
+ else:
+ return df

+ @abc.abstractmethod
  def _generate_vector(
  self,
  entity_rows,
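The new dask branch in `_set_indexes` only calls `set_index()` with a single column and otherwise leaves the entities as ordinary columns, because dask dataframes have no multi-index support. A minimal sketch of that limitation; the column names are illustrative and not taken from this diff:

    import pandas as pd
    import dask.dataframe as dd

    # illustrative entity and feature columns, not from mlrun
    pdf = pd.DataFrame({"patient_id": [1, 2], "device_id": [10, 20], "hr": [61, 74]})
    ddf = dd.from_pandas(pdf, npartitions=1)

    single = ddf.set_index("patient_id")  # a single index column is supported
    try:
        ddf.set_index(["patient_id", "device_id"])  # a multi-column index is not
    except NotImplementedError as err:
        print(f"dask rejected the multi-column index: {err}")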
@@ -169,148 +165,8 @@ class BaseMerger(abc.ABC):
  start_time=None,
  end_time=None,
  query=None,
- order_by=None,
  ):
- self._create_engine_env()
-
- feature_sets = []
- dfs = []
- keys = (
- []
- ) # the struct of key is [[[],[]], ..] So that each record indicates which way the corresponding
- # featureset is connected to the previous one, and within each record the left keys are indicated in index 0
- # and the right keys in index 1, this keys will be the keys that will be used in this join
-
- fs_link_list = self._create_linked_relation_list(
- feature_set_objects, feature_set_fields
- )
-
- for node in fs_link_list:
- name = node.name
- feature_set = feature_set_objects[name]
- feature_sets.append(feature_set)
- columns = feature_set_fields[name]
- self._origin_alias.update({name: alias for name, alias in columns})
- column_names = [name for name, _ in columns]
-
- for column in node.data["save_cols"]:
- if column not in column_names:
- self._append_drop_column(column)
- column_names.append(column)
-
- df = self._get_engine_df(
- feature_set,
- name,
- column_names,
- start_time,
- end_time,
- entity_timestamp_column,
- )
-
- column_names += node.data["save_index"]
- node.data["save_cols"] += node.data["save_index"]
- if feature_set.spec.timestamp_key:
- entity_timestamp_column_list = [feature_set.spec.timestamp_key]
- column_names += entity_timestamp_column_list
- node.data["save_cols"] += entity_timestamp_column_list
- if not entity_timestamp_column:
- # if not entity_timestamp_column the firs `FeatureSet` will define it
- entity_timestamp_column = feature_set.spec.timestamp_key
-
- # rename columns to be unique for each feature set and select if needed
- rename_col_dict = {
- column: f"{column}_{name}"
- for column in column_names
- if column not in node.data["save_cols"]
- }
- fs_entities = list(feature_set.spec.entities.keys())
- df_temp = self._rename_columns_and_select(
- df, rename_col_dict, columns=list(set(column_names + fs_entities))
- )
-
- if df_temp is not None:
- df = df_temp
- del df_temp
-
- dfs.append(df)
- del df
-
- keys.append([node.data["left_keys"], node.data["right_keys"]])
-
- # update alias according to the unique column name
- new_columns = []
- if not self._drop_indexes:
- new_columns.extend([(ind, ind) for ind in fs_entities])
- for column, alias in columns:
- if column in rename_col_dict:
- new_columns.append((rename_col_dict[column], alias or column))
- else:
- new_columns.append((column, alias))
- self._update_alias(dictionary={name: alias for name, alias in new_columns})
-
- # convert pandas entity_rows to spark DF if needed
- if (
- entity_rows is not None
- and not hasattr(entity_rows, "rdd")
- and self.engine == "spark"
- ):
- entity_rows = self.spark.createDataFrame(entity_rows)
-
- # join the feature data frames
- self.merge(
- entity_df=entity_rows,
- entity_timestamp_column=entity_timestamp_column,
- featuresets=feature_sets,
- featureset_dfs=dfs,
- keys=keys,
- )
-
- all_columns = None
- if not self._drop_indexes and entity_timestamp_column:
- if entity_timestamp_column not in self._alias.values():
- self._update_alias(
- key=entity_timestamp_column, val=entity_timestamp_column
- )
- all_columns = list(self._alias.keys())
-
- df_temp = self._rename_columns_and_select(
- self._result_df, self._alias, columns=all_columns
- )
- if df_temp is not None:
- self._result_df = df_temp
- del df_temp
-
- df_temp = self._drop_columns_from_result()
- if df_temp is not None:
- self._result_df = df_temp
- del df_temp
-
- if self.vector.status.label_column:
- self._result_df = self._result_df.dropna(
- subset=[self.vector.status.label_column]
- )
- # filter joined data frame by the query param
- if query:
- self._filter(query)
-
- if order_by:
- if isinstance(order_by, str):
- order_by = [order_by]
- order_by_active = [
- order_col
- if order_col in self._result_df.columns
- else self._origin_alias.get(order_col, None)
- for order_col in order_by
- ]
- if None in order_by_active:
- raise mlrun.errors.MLRunInvalidArgumentError(
- f"Result dataframe contains {self._result_df.columns} "
- f"columns and can't order by {order_by}"
- )
- self._order_by(order_by_active)
-
- self._write_to_target()
- return OfflineVectorResponse(self)
+ raise NotImplementedError("_generate_vector() operation not supported in class")

  def _unpersist_df(self, df):
  pass
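The removed `_generate_vector` body above (now left to the engine-specific mergers) renamed every non-key column with a per-feature-set suffix so that columns from different feature sets cannot collide during the join, then mapped the requested aliases back afterwards. A small pandas sketch of that renaming idea, using a hypothetical feature-set name:

    import pandas as pd

    name = "heart_rate_set"  # hypothetical feature-set name
    df = pd.DataFrame({"patient_id": [1, 2], "hr": [61, 74]})

    save_cols = {"patient_id"}  # join keys keep their original names
    rename_col_dict = {c: f"{c}_{name}" for c in df.columns if c not in save_cols}
    df = df.rename(columns=rename_col_dict)
    print(list(df.columns))  # ['patient_id', 'hr_heart_rate_set']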
@@ -322,6 +178,7 @@ class BaseMerger(abc.ABC):
  featuresets: list,
  featureset_dfs: list,
  keys: list = None,
+ all_columns: list = None,
  ):
  """join the entities and feature set features into a result dataframe"""
  merged_df = entity_df
@@ -333,6 +190,10 @@ class BaseMerger(abc.ABC):
  else:
  # keys can be multiple keys on each side of the join
  keys = [[[], []]] * len(featureset_dfs)
+ if all_columns is not None:
+ all_columns.pop(0)
+ else:
+ all_columns = [[]] * len(featureset_dfs)
  entity_timestamp_column = (
  entity_timestamp_column or featureset.spec.timestamp_key
  )
@@ -342,7 +203,9 @@ class BaseMerger(abc.ABC):
  # and it can join only by the entities of the first `featureset`
  keys[0][0] = keys[0][1] = list(featuresets[0].spec.entities.keys())

- for featureset, featureset_df, lr_key in zip(featuresets, featureset_dfs, keys):
+ for featureset, featureset_df, lr_key, columns in zip(
+ featuresets, featureset_dfs, keys, all_columns
+ ):
  if featureset.spec.timestamp_key:
  merge_func = self._asof_join
  if self._join_type != "inner":
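With the new `all_columns` argument, merge() walks the key pairs and the per-feature-set column lists in lockstep with the dataframes. A hedged illustration of the shapes it expects; the key and column names are made up:

    featureset_dfs = ["df_heart_rate", "df_device"]  # placeholders for real dataframes
    keys = [
        [["patient_id"], ["patient_id"]],  # [left_keys, right_keys] for the first join
        [["device_id"], ["device_id"]],    # [left_keys, right_keys] for the second join
    ]
    all_columns = [
        ["hr", "hr_avg"],                  # columns kept from the first feature set
        ["battery", "temperature"],        # columns kept from the second feature set
    ]
    # the three lists must line up, one entry per feature-set dataframe
    assert len(keys) == len(all_columns) == len(featureset_dfs)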
@@ -360,6 +223,7 @@ class BaseMerger(abc.ABC):
  featureset_df,
  lr_key[0],
  lr_key[1],
+ columns,
  )

  # unpersist as required by the implementation (e.g. spark) and delete references
@@ -378,6 +242,7 @@ class BaseMerger(abc.ABC):
  featureset_df,
  left_keys: list,
  right_keys: list,
+ columns: list,
  ):
  raise NotImplementedError("_asof_join() operation not implemented in class")

@@ -390,6 +255,7 @@ class BaseMerger(abc.ABC):
  featureset_df,
  left_keys: list,
  right_keys: list,
+ columns: list,
  ):
  raise NotImplementedError("_join() operation not implemented in class")

@@ -401,7 +267,6 @@ class BaseMerger(abc.ABC):

  def get_df(self, to_pandas=True):
  """return the result as a dataframe (pandas by default)"""
- self._set_indexes(self._result_df)
  return self._result_df

  def to_parquet(self, target_path, **kw):
@@ -428,9 +293,6 @@ class BaseMerger(abc.ABC):
  def __eq__(self, other):
  return self.name == other.name

- def __copy__(self):
- return BaseMerger._Node(self.name, self.order, self.data.copy())
-
  class _LinkedList:
  def __init__(self, head=None):
  self.head = head
@@ -451,19 +313,6 @@ class BaseMerger(abc.ABC):
  yield node
  node = node.next

- def __copy__(self):
- ll = BaseMerger._LinkedList()
- prev_node = None
- for node in self:
- new_node = node.__copy__()
- if ll.head is None:
- ll.head = new_node
- else:
- prev_node.next = new_node
- prev_node = new_node
- ll.len = self.len
- return ll
-
  def add_first(self, node):
  node.next = self.head
  self.head = node
@@ -476,9 +325,7 @@ class BaseMerger(abc.ABC):
  for current_node in self:
  pass
  current_node.next = node
- while node:
- self.len += 1
- node = node.next
+ self.len += 1

  def add_after(self, target_node, new_node):
  new_node.next = target_node.next
@@ -499,9 +346,7 @@ class BaseMerger(abc.ABC):
  node = self.find_node(other_head.name)
  if node is None:
  return
- for col in other_head.data["save_cols"]:
- if col not in node.data["save_cols"]:
- node.data["save_cols"].append(col)
+ node.data["save_cols"] += other_head.data["save_cols"]
  for other_node in other_iter:
  if self.find_node(other_node.name) is None:
  while node is not None and other_node.order > node.order:
@@ -575,9 +420,10 @@ class BaseMerger(abc.ABC):
  )
  )

- if all(
- curr_col_relation_list
- ): # checking if feature_set have relation with feature_set_in
+ # checking if feature_set have relation with feature_set_in
+ relation_wise = all(curr_col_relation_list)
+
+ if relation_wise:
  # add to the link list feature set according to the defined relation
  linked_list_relation.add_last(
  BaseMerger._Node(
@@ -591,8 +437,8 @@ class BaseMerger(abc.ABC):
  order=name_in_order,
  )
  )
- linked_list_relation.head.data["save_cols"].extend(
- curr_col_relation_list
+ linked_list_relation.head.data["save_cols"].append(
+ *curr_col_relation_list
  )
  elif name_in_order > head_order and sorted(
  feature_set_in_entity_list_names
@@ -622,14 +468,14 @@ class BaseMerger(abc.ABC):
  relation_linked_lists.append(linked_relation)

  # concat all the link lists to one, for the merging process
- for i in range(len(relation_linked_lists)):
- return_relation = relation_linked_lists[i].__copy__()
- for relation_list in relation_linked_lists:
- return_relation.concat(relation_list)
- if return_relation.len == len(feature_set_objects):
- return return_relation
+ link_list_iter = iter(relation_linked_lists)
+ return_relation = next(link_list_iter)
+ for relation_list in link_list_iter:
+ return_relation.concat(relation_list)
+ if return_relation.len != len(feature_set_objects):
+ raise mlrun.errors.MLRunRuntimeError("Failed to merge")

- raise mlrun.errors.MLRunRuntimeError("Failed to merge")
+ return return_relation

  @classmethod
  def get_default_image(cls, kind):
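The rewritten concatenation above no longer copies each linked relation; it folds every relation into the first one and fails fast when the combined length does not cover all feature sets. A generic Python sketch of that pattern, with plain lists standing in for the _LinkedList objects and list.extend() standing in for concat():

    relation_linked_lists = [["fs_a"], ["fs_b"], ["fs_c"]]  # stand-ins for _LinkedList objects
    feature_set_objects = {"fs_a": None, "fs_b": None, "fs_c": None}  # illustrative only

    link_list_iter = iter(relation_linked_lists)
    return_relation = next(link_list_iter)  # the first relation becomes the accumulator
    for relation_list in link_list_iter:
        return_relation.extend(relation_list)
    if len(return_relation) != len(feature_set_objects):
        raise RuntimeError("Failed to merge")
    print(return_relation)  # ['fs_a', 'fs_b', 'fs_c']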
@@ -637,77 +483,3 @@ class BaseMerger(abc.ABC):

  def _reset_index(self, _result_df):
  raise NotImplementedError
-
- @abc.abstractmethod
- def _create_engine_env(self):
- """
- initialize engine env if needed
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def _get_engine_df(
- self,
- feature_set: FeatureSet,
- feature_set_name: typing.List[str],
- column_names: typing.List[str] = None,
- start_time: typing.Union[str, datetime] = None,
- end_time: typing.Union[str, datetime] = None,
- entity_timestamp_column: str = None,
- ):
- """
- Return the feature_set data frame according to the args
-
- :param feature_set: current feature_set to extract from the data frame
- :param feature_set_name: the name of the current feature_set
- :param column_names: list of columns to select (if not all)
- :param start_time: filter by start time
- :param end_time: filter by end time
- :param entity_timestamp_column: specify the time column name in the file
-
- :return: Data frame of the current engine
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def _rename_columns_and_select(
- self,
- df,
- rename_col_dict: typing.Dict[str, str],
- columns: typing.List[str] = None,
- ):
- """
- rename the columns of the df according to rename_col_dict, and select only `columns` if it is not none
-
- :param df: the data frame to change
- :param rename_col_dict: the renaming dictionary - {<current_column_name>: <new_column_name>, ...}
- :param columns: list of columns to select (if not all)
-
- :return: the data frame after the transformation or None if the transformation were preformed inplace
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def _drop_columns_from_result(self):
- """
- drop `self._drop_columns` from `self._result_df`
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def _filter(self, query: str):
- """
- filter `self._result_df` by `query`
-
- :param query: The query string used to filter rows
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def _order_by(self, order_by_active: typing.List[str]):
- """
- Order by `order_by_active` along all axis.
-
- :param order_by_active: list of names to sort by.
- """
- raise NotImplementedError