mlrun 1.8.0rc34__py3-none-any.whl → 1.8.0rc36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/artifacts/model.py +5 -1
- mlrun/common/schemas/artifact.py +9 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/config.py +2 -0
- mlrun/data_types/infer.py +10 -2
- mlrun/datastore/sources.py +28 -23
- mlrun/db/base.py +7 -2
- mlrun/db/httpdb.py +31 -36
- mlrun/db/nopdb.py +7 -2
- mlrun/feature_store/__init__.py +0 -10
- mlrun/feature_store/api.py +0 -380
- mlrun/model.py +13 -0
- mlrun/model_monitoring/api.py +1 -1
- mlrun/model_monitoring/applications/evidently/base.py +1 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +41 -24
- mlrun/projects/project.py +0 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -2
- mlrun/runtimes/nuclio/function.py +12 -0
- mlrun/runtimes/nuclio/serving.py +7 -0
- mlrun/utils/async_http.py +2 -1
- mlrun/utils/helpers.py +15 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc34.dist-info → mlrun-1.8.0rc36.dist-info}/METADATA +5 -5
- {mlrun-1.8.0rc34.dist-info → mlrun-1.8.0rc36.dist-info}/RECORD +28 -28
- {mlrun-1.8.0rc34.dist-info → mlrun-1.8.0rc36.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc34.dist-info → mlrun-1.8.0rc36.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc34.dist-info → mlrun-1.8.0rc36.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc34.dist-info → mlrun-1.8.0rc36.dist-info}/top_level.txt +0 -0
mlrun/feature_store/api.py
CHANGED
@@ -20,7 +20,6 @@ from datetime import datetime
 from typing import Any, Optional, Union

 import pandas as pd
-from deprecated import deprecated

 import mlrun
 import mlrun.errors
@@ -91,122 +90,6 @@ def _features_to_vector_and_check_permissions(features, update_stats):
     return vector


-@deprecated(
-    version="1.6.0",
-    reason="get_offline_features() will be removed in 1.8.0, please instead use "
-    "get_feature_vector('store://feature_vector_name').get_offline_features()",
-    category=FutureWarning,
-)
-def get_offline_features(
-    feature_vector: Union[str, FeatureVector],
-    entity_rows=None,
-    entity_timestamp_column: Optional[str] = None,
-    target: DataTargetBase = None,
-    run_config: RunConfig = None,
-    drop_columns: Optional[list[str]] = None,
-    start_time: Optional[Union[str, datetime]] = None,
-    end_time: Optional[Union[str, datetime]] = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-    engine: Optional[str] = None,
-    engine_args: Optional[dict] = None,
-    query: Optional[str] = None,
-    order_by: Optional[Union[str, list[str]]] = None,
-    spark_service: Optional[str] = None,
-    timestamp_for_filtering: Optional[Union[str, dict[str, str]]] = None,
-    additional_filters: Optional[list] = None,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment).
-    Another option to filter the data is by the `query` argument - can be seen in the example.
-    example::
-
-        features = [
-            "stock-quotes.bid",
-            "stock-quotes.asks_sum_5h",
-            "stock-quotes.ask as mycol",
-            "stocks.*",
-        ]
-        vector = FeatureVector(features=features)
-        resp = get_offline_features(
-            vector,
-            entity_rows=trades,
-            entity_timestamp_column="time",
-            query="ticker in ['GOOG'] and bid>100",
-        )
-        print(resp.to_dataframe())
-        print(vector.get_stats_table())
-        resp.to_parquet("./out.parquet")
-
-    :param feature_vector: feature vector uri or FeatureVector object. passing feature vector obj requires
-                            update permissions
-    :param entity_rows: dataframe with entity rows to join with
-    :param target: where to write the results to
-    :param drop_columns: list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe. can be specified
-                            only if param entity_rows was specified.
-    :param run_config: function and/or run configuration
-                       see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time: datetime, low limit of time needed to be filtered. Optional.
-    :param end_time: datetime, high limit of time needed to be filtered. Optional.
-    :param with_indexes: Return vector with/without the entities and the timestamp_key of the feature sets
-                         and with/without entity_timestamp_column and timestamp_for_filtering columns.
-                         This property can be specified also in the feature vector spec
-                         (feature_vector.spec.with_indexes)
-                         (default False)
-    :param update_stats: update features statistics from the requested feature sets on the vector.
-                         (default False).
-    :param engine: processing engine kind ("local", "dask", or "spark")
-    :param engine_args: kwargs for the processing engine
-    :param query: The query string used to filter rows on the output
-    :param spark_service: Name of the spark service to be used (when using a remote-spark runtime)
-    :param order_by: Name or list of names to order by. The name or the names in the list can be the
-                     feature name or the alias of the feature you pass in the feature list.
-    :param timestamp_for_filtering: name of the column to filter by, can be str for all the feature sets or a
-                     dictionary ({<feature set name>: <timestamp column name>, ...})
-                     that indicates the timestamp column name for each feature set. Optional.
-                     By default, the filter executes on the timestamp_key of each feature set.
-                     Note: the time filtering is performed on each feature set before the
-                     merge process using start_time and end_time params.
-    :param additional_filters: List of additional_filter conditions as tuples.
-                     Each tuple should be in the format (column_name, operator, value).
-                     Supported operators: "=", ">=", "<=", ">", "<".
-                     Example: [("Product", "=", "Computer")]
-                     For all supported filters, please see:
-                     https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
-
-
-    """
-    return _get_offline_features(
-        feature_vector,
-        entity_rows,
-        entity_timestamp_column,
-        target,
-        run_config,
-        drop_columns,
-        start_time,
-        end_time,
-        with_indexes,
-        update_stats,
-        engine,
-        engine_args,
-        query,
-        order_by,
-        spark_service,
-        timestamp_for_filtering,
-        additional_filters,
-    )
-
-
 def _get_offline_features(
     feature_vector: Union[str, FeatureVector],
     entity_rows=None,
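The deprecation notice removed above already names the replacement: `get_feature_vector('store://feature_vector_name').get_offline_features()`. A minimal migration sketch, assuming the `FeatureVector` method accepts the same keyword arguments as the removed module-level helper (the vector URI and query below are placeholders):

```python
import mlrun.feature_store as fstore

# Before: resp = fstore.get_offline_features(vector, query=...)
vector = fstore.get_feature_vector("store://feature-vectors/my-project/stocks-vector")
resp = vector.get_offline_features(query="ticker in ['GOOG'] and bid > 100")
df = resp.to_dataframe()
```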
@@ -288,93 +171,6 @@ def _get_offline_features(
     )


-@deprecated(
-    version="1.6.0",
-    reason="get_online_feature_service() will be removed in 1.8.0, please instead use "
-    "get_feature_vector('store://feature_vector_name').get_online_feature_service()",
-    category=FutureWarning,
-)
-def get_online_feature_service(
-    feature_vector: Union[str, FeatureVector],
-    run_config: RunConfig = None,
-    fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
-    impute_policy: Optional[dict] = None,
-    update_stats: bool = False,
-    entity_keys: Optional[list[str]] = None,
-):
-    """initialize and return online feature vector service api,
-    returns :py:class:`~mlrun.feature_store.OnlineVectorService`
-
-    :**usage**:
-        There are two ways to use the function:
-
-        1. As context manager
-
-            Example::
-
-                with get_online_feature_service(vector_uri) as svc:
-                    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
-                    print(resp)
-                    resp = svc.get([{"ticker": "AAPL"}], as_list=True)
-                    print(resp)
-
-            Example with imputing::
-
-                with get_online_feature_service(vector_uri, entity_keys=['id'],
-                                                impute_policy={"*": "$mean", "amount": 0)) as svc:
-                    resp = svc.get([{"id": "C123487"}])
-
-        2. as simple function, note that in that option you need to close the session.
-
-            Example::
-
-                svc = get_online_feature_service(vector_uri, entity_keys=["ticker"])
-                try:
-                    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
-                    print(resp)
-                    resp = svc.get([{"ticker": "AAPL"}], as_list=True)
-                    print(resp)
-
-                finally:
-                    svc.close()
-
-            Example with imputing::
-
-                svc = get_online_feature_service(vector_uri, entity_keys=['id'],
-                                                 impute_policy={"*": "$mean", "amount": 0))
-                try:
-                    resp = svc.get([{"id": "C123487"}])
-                except Exception as e:
-                    handling exception...
-                finally:
-                    svc.close()
-
-    :param feature_vector: feature vector uri or FeatureVector object. passing feature vector obj requires update
-                            permissions.
-    :param run_config: function and/or run configuration for remote jobs/services
-    :param impute_policy: a dict with `impute_policy` per feature, the dict key is the feature name and the dict
-                            value indicate which value will be used in case the feature is NaN/empty, the replaced
-                            value can be fixed number for constants or $mean, $max, $min, $std, $count
-                            for statistical
-                            values. "*" is used to specify the default for all features, example: `{"*": "$mean"}`
-    :param fixed_window_type: determines how to query the fixed window values which were previously inserted by ingest
-    :param update_stats: update features statistics from the requested feature sets on the vector.
-                            Default: False.
-    :param entity_keys: Entity list of the first feature_set in the vector.
-                        The indexes that are used to query the online service.
-    :return: Initialize the `OnlineVectorService`.
-             Will be used in subclasses where `support_online=True`.
-    """
-    return _get_online_feature_service(
-        feature_vector,
-        run_config,
-        fixed_window_type,
-        impute_policy,
-        update_stats,
-        entity_keys,
-    )
-
-
 def _get_online_feature_service(
     feature_vector: Union[str, FeatureVector],
     run_config: RunConfig = None,
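Likewise, `get_online_feature_service()` is removed in favor of the method on the feature vector. A sketch assuming the method keeps the `entity_keys` argument and the context-manager usage documented in the removed docstring (the vector URI is a placeholder):

```python
import mlrun.feature_store as fstore

vector = fstore.get_feature_vector("store://feature-vectors/my-project/stocks-vector")
# Before: with fstore.get_online_feature_service(vector_uri, ...) as svc: ...
with vector.get_online_feature_service(entity_keys=["ticker"]) as svc:
    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    print(resp)
```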
@@ -448,86 +244,6 @@ def _get_namespace(run_config: RunConfig) -> dict[str, Any]:
     return get_caller_globals()


-def ingest(
-    featureset: Union[FeatureSet, str] = None,
-    source=None,
-    targets: Optional[list[DataTargetBase]] = None,
-    namespace=None,
-    return_df: bool = True,
-    infer_options: InferOptions = InferOptions.default(),
-    run_config: RunConfig = None,
-    mlrun_context=None,
-    spark_context=None,
-    overwrite=None,
-) -> Optional[pd.DataFrame]:
-    """Read local DataFrame, file, URL, or source into the feature store
-    Ingest reads from the source, run the graph transformations, infers metadata and stats
-    and writes the results to the default of specified targets
-
-    when targets are not specified data is stored in the configured default targets
-    (will usually be NoSQL for real-time and Parquet for offline).
-
-    the `run_config` parameter allow specifying the function and job configuration,
-    see: :py:class:`~mlrun.feature_store.RunConfig`
-
-    example::
-
-        stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-        stocks = pd.read_csv("stocks.csv")
-        df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-        # for running as remote job
-        config = RunConfig(image="mlrun/mlrun")
-        df = ingest(stocks_set, stocks, run_config=config)
-
-        # specify source and targets
-        source = CSVSource("mycsv", path="measurements.csv")
-        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-        ingest(measurements, source, targets)
-
-    :param featureset: feature set object or featureset.uri. (uri must be of a feature set that is in the DB,
-                       call `.save()` if it's not)
-    :param source: source dataframe or other sources (e.g. parquet source see:
-                   :py:class:`~mlrun.datastore.ParquetSource` and other classes in mlrun.datastore with suffix
-                   Source)
-    :param targets: optional list of data target objects
-    :param namespace: namespace or module containing graph classes
-    :param return_df: indicate if to return a dataframe with the graph results
-    :param infer_options: schema (for discovery of entities, features in featureset), index, stats,
-                          histogram and preview infer options (:py:class:`~mlrun.feature_store.InferOptions`)
-    :param run_config: function and/or run configuration for remote jobs,
-                       see :py:class:`~mlrun.feature_store.RunConfig`
-    :param mlrun_context: mlrun context (when running as a job), for internal use !
-    :param spark_context: local spark session for spark ingestion, example for creating the spark context:
-                          `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-                          For remote spark ingestion, this should contain the remote spark service name
-    :param overwrite: delete the targets' data prior to ingestion
-                      (default: True for non scheduled ingest - deletes the targets that are about to be ingested.
-                      False for scheduled ingest - does not delete the target)
-    :return: if return_df is True, a dataframe will be returned based on the graph
-    """
-    if mlrun_context is None:
-        deprecated(
-            version="1.6.0",
-            reason="Calling 'ingest' with mlrun_context=None is deprecated and will be removed in 1.8.0,\
-            use 'FeatureSet.ingest()' instead",
-            category=FutureWarning,
-        )
-
-    return _ingest(
-        featureset,
-        source,
-        targets,
-        namespace,
-        return_df,
-        infer_options,
-        run_config,
-        mlrun_context,
-        spark_context,
-        overwrite,
-    )
-
-
 def _ingest(
     featureset: Union[FeatureSet, str] = None,
     source=None,
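The removed `ingest()` helper pointed users to `FeatureSet.ingest()`. A migration sketch, assuming the method accepts the same source/targets arguments as the removed helper (the CSV path is a placeholder):

```python
import pandas as pd
import mlrun.feature_store as fstore

stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
stocks = pd.read_csv("stocks.csv")  # placeholder local data
# Before: df = fstore.ingest(stocks_set, stocks)
df = stocks_set.ingest(stocks)
```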
@@ -776,53 +492,6 @@ def _ingest(
     return df


-@deprecated(
-    version="1.6.0",
-    reason="'preview' will be removed in 1.8.0, use 'FeatureSet.preview()' instead",
-    category=FutureWarning,
-)
-def preview(
-    featureset: FeatureSet,
-    source,
-    entity_columns: Optional[list] = None,
-    namespace=None,
-    options: InferOptions = None,
-    verbose: bool = False,
-    sample_size: Optional[int] = None,
-) -> pd.DataFrame:
-    """run the ingestion pipeline with local DataFrame/file data and infer features schema and stats
-
-    example::
-
-        quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])
-        quotes_set.add_aggregation("ask", ["sum", "max"], ["1h", "5h"], "10m")
-        quotes_set.add_aggregation("bid", ["min", "max"], ["1h"], "10m")
-        df = preview(
-            quotes_set,
-            quotes_df,
-            entity_columns=["ticker"],
-        )
-
-    :param featureset: feature set object or uri
-    :param source: source dataframe or csv/parquet file path
-    :param entity_columns: list of entity (index) column names
-    :param namespace: namespace or module containing graph classes
-    :param options: schema (for discovery of entities, features in featureset), index, stats,
-                    histogram and preview infer options (:py:class:`~mlrun.feature_store.InferOptions`)
-    :param verbose: verbose log
-    :param sample_size: num of rows to sample from the dataset (for large datasets)
-    """
-    return _preview(
-        featureset,
-        source,
-        entity_columns,
-        namespace,
-        options,
-        verbose,
-        sample_size,
-    )
-
-
 def _preview(
     featureset: FeatureSet,
     source,
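`preview()` is replaced by `FeatureSet.preview()`. A sketch that reuses the removed docstring's example, assuming the method mirrors the old arguments (the quotes CSV path is a placeholder):

```python
import pandas as pd
import mlrun.feature_store as fstore

quotes_df = pd.read_csv("quotes.csv")  # placeholder raw quotes
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
quotes_set.add_aggregation("ask", ["sum", "max"], ["1h", "5h"], "10m")
# Before: df = fstore.preview(quotes_set, quotes_df, entity_columns=["ticker"])
df = quotes_set.preview(quotes_df, entity_columns=["ticker"])
```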
@@ -912,55 +581,6 @@ def _run_ingestion_job(
     return run_ingestion_job(name, featureset, run_config, source.schedule)


-@deprecated(
-    version="1.6.0",
-    reason="'deploy_ingestion_service_v2' will be removed in 1.8.0, "
-    "use 'FeatureSet.deploy_ingestion_service()' instead",
-    category=FutureWarning,
-)
-def deploy_ingestion_service_v2(
-    featureset: Union[FeatureSet, str],
-    source: DataSource = None,
-    targets: Optional[list[DataTargetBase]] = None,
-    name: Optional[str] = None,
-    run_config: RunConfig = None,
-    verbose=False,
-) -> tuple[str, BaseRuntime]:
-    """Start real-time ingestion service using nuclio function
-
-    Deploy a real-time function implementing feature ingestion pipeline
-    the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
-
-    the `run_config` parameter allow specifying the function and job configuration,
-    see: :py:class:`~mlrun.feature_store.RunConfig`
-
-    example::
-
-        source = HTTPSource()
-        func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
-        config = RunConfig(function=func)
-        deploy_ingestion_service_v2(my_set, source, run_config=config)
-
-    :param featureset: feature set object or uri
-    :param source: data source object describing the online or offline source
-    :param targets: list of data target objects
-    :param name: name for the job/function
-    :param run_config: service runtime configuration (function object/uri, resources, etc..)
-    :param verbose: verbose log
-
-    :return: URL to access the deployed ingestion service, and the function that was deployed (which will
-             differ from the function passed in via the run_config parameter).
-    """
-    return _deploy_ingestion_service_v2(
-        featureset,
-        source,
-        targets,
-        name,
-        run_config,
-        verbose,
-    )
-
-
 def _deploy_ingestion_service_v2(
     featureset: Union[FeatureSet, str],
     source: DataSource = None,
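`deploy_ingestion_service_v2()` is superseded by `FeatureSet.deploy_ingestion_service()`. A sketch based on the removed docstring example, assuming the method accepts the same source/run_config arguments and returns the same (URL, function) pair; the feature set and the `HTTPSource` import path are assumptions for illustration:

```python
import mlrun
import mlrun.feature_store as fstore
from mlrun.datastore.sources import HTTPSource

my_set = fstore.FeatureSet("measurements", entities=[fstore.Entity("id")])
func = mlrun.code_to_function("ingest", kind="serving")
config = fstore.RunConfig(function=func)
# Before: fstore.deploy_ingestion_service_v2(my_set, source, run_config=config)
url, deployed_fn = my_set.deploy_ingestion_service(source=HTTPSource(), run_config=config)
```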
mlrun/model.py
CHANGED
@@ -430,6 +430,19 @@ class ObjectList:
         self._children[child_obj.name] = child_obj
         return child_obj

+    def move_to_end(self, child, last=True):
+        self._children.move_to_end(child, last)
+
+    def update_list(self, object_list: "ObjectList", push_at_start: bool = False):
+        if push_at_start:
+            self._children = OrderedDict(
+                list(object_list._children.items()) + list(self._children.items())
+            )
+        else:
+            self._children = OrderedDict(
+                list(self._children.items()) + list(object_list._children.items())
+            )
+

 class Credentials(ModelObj):
     generate_access_key = "$generate"
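The new `ObjectList.update_list()` merges another list's children into the current one, either before or after the existing entries. A standalone sketch of the same `OrderedDict` mechanics (plain values stand in for mlrun model objects):

```python
from collections import OrderedDict

existing = OrderedDict(a=1, b=2)
incoming = OrderedDict(c=3)

# push_at_start=True: incoming children are placed before the existing ones
front = OrderedDict(list(incoming.items()) + list(existing.items()))
# push_at_start=False (the default): incoming children are appended at the end
back = OrderedDict(list(existing.items()) + list(incoming.items()))

print(list(front))  # ['c', 'a', 'b']
print(list(back))   # ['a', 'b', 'c']
```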
mlrun/model_monitoring/api.py
CHANGED
@@ -107,7 +107,7 @@ def get_or_create_model_endpoint(
             sample_set_statistics=sample_set_statistics,
         )

-    except mlrun.errors.MLRunNotFoundError:
+    except (mlrun.errors.MLRunNotFoundError, mlrun.errors.MLRunInvalidArgumentError):
         # Create a new model endpoint with the provided details
         pass
     if not model_endpoint:
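The broadened `except` means an invalid-argument error during the lookup now also falls through to creating a new model endpoint, not only a not-found error. A schematic of the get-or-create pattern (the lookup/create helpers are hypothetical placeholders, not mlrun APIs):

```python
import mlrun.errors

model_endpoint = None
try:
    model_endpoint = lookup_model_endpoint()  # hypothetical lookup helper
except (mlrun.errors.MLRunNotFoundError, mlrun.errors.MLRunInvalidArgumentError):
    # Either exception now means: fall through and create a new endpoint below
    pass
if not model_endpoint:
    model_endpoint = create_model_endpoint()  # hypothetical creation helper
```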
mlrun/model_monitoring/applications/evidently/base.py
CHANGED
@@ -23,7 +23,7 @@ import mlrun.model_monitoring.applications.base as mm_base
 import mlrun.model_monitoring.applications.context as mm_context
 from mlrun.errors import MLRunIncompatibleVersionError

-SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.
+SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.6.0")


 def _check_evidently_version(*, cur: semver.Version, ref: semver.Version) -> None:
mlrun/model_monitoring/applications/histogram_data_drift.py
CHANGED
@@ -102,10 +102,10 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     Each metric is calculated over all the features individually and the mean is taken as the metric value.
     The average of Hellinger and total variance distance is taken as the result.

-    The application
+    The application can log two artifacts:

-    *
-    *
+    * JSON with the general drift value per feature, produced by default.
+    * Plotly table with the various metrics and histograms per feature (disabled by default due to performance issues).

     This application is deployed by default when calling:

@@ -114,12 +114,18 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         project.enable_model_monitoring()

     To avoid it, pass :code:`deploy_histogram_data_drift_app=False`.
+
+    If you want to change the application defaults, such as the classifier or which artifacts to produce, you
+    need to inherit from this class and deploy it as any other model monitoring application.
     """

     NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME

     _REQUIRED_METRICS = {HellingerDistance, TotalVarianceDistance}
-    _STATS_TYPES: tuple[StatsKind] = (
+    _STATS_TYPES: tuple[StatsKind, StatsKind] = (
+        StatsKind.CURRENT_STATS,
+        StatsKind.DRIFT_MEASURES,
+    )

     metrics: list[type[HistogramDistanceMetric]] = [
         HellingerDistance,
@@ -127,7 +133,12 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         TotalVarianceDistance,
     ]

-    def __init__(
+    def __init__(
+        self,
+        value_classifier: Optional[ValueClassifier] = None,
+        produce_json_artifact: bool = True,
+        produce_plotly_artifact: bool = False,
+    ) -> None:
         """
         :param value_classifier: Classifier object that adheres to the `ValueClassifier` protocol.
                                  If not provided, the default `DataDriftClassifier()` is used.
@@ -137,6 +148,9 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
             self.metrics
         ), "TVD and Hellinger distance are required for the general data drift result"

+        self._produce_json_artifact = produce_json_artifact
+        self._produce_plotly_artifact = produce_plotly_artifact
+
     def _compute_metrics_per_feature(
         self, monitoring_context: mm_context.MonitoringApplicationContext
     ) -> DataFrame:
@@ -295,40 +309,43 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
             cast(str, key): (self._value_classifier.value_to_status(value), value)
             for key, value in drift_per_feature_values.items()
         }
-        monitoring_context.logger.debug("
-
-
-
-
-
-            drift_results=drift_results,
-        )
+        monitoring_context.logger.debug("Producing plotly artifact")
+        artifact = mm_drift_table.FeaturesDriftTablePlot().produce(
+            sample_set_statistics=sample_set_statistics,
+            inputs_statistics=inputs_statistics,
+            metrics=metrics_per_feature.T.to_dict(),  # pyright: ignore[reportArgumentType]
+            drift_results=drift_results,
         )
+        monitoring_context.logger.debug("Logging plotly artifact")
+        monitoring_context.log_artifact(artifact)
         monitoring_context.logger.debug("Logged plotly artifact successfully")

     def _log_drift_artifacts(
         self,
         monitoring_context: mm_context.MonitoringApplicationContext,
         metrics_per_feature: DataFrame,
-        log_json_artifact: bool = True,
     ) -> None:
         """Log JSON and Plotly drift data per feature artifacts"""
+        if not self._produce_json_artifact and not self._produce_plotly_artifact:
+            return
+
         drift_per_feature_values = metrics_per_feature[
             [HellingerDistance.NAME, TotalVarianceDistance.NAME]
         ].mean(axis=1)

-        if
+        if self._produce_json_artifact:
             self._log_json_artifact(drift_per_feature_values, monitoring_context)

-        self.
-
-
-
-
-
-
-
-
+        if self._produce_plotly_artifact:
+            self._log_plotly_table_artifact(
+                sample_set_statistics=self._get_shared_features_sample_stats(
+                    monitoring_context
+                ),
+                inputs_statistics=monitoring_context.feature_stats,
+                metrics_per_feature=metrics_per_feature,
+                drift_per_feature_values=drift_per_feature_values,
+                monitoring_context=monitoring_context,
+            )

     def do_tracking(
         self, monitoring_context: mm_context.MonitoringApplicationContext
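Per the updated docstring, changing the application defaults (the classifier or which artifacts are produced) is done by subclassing and deploying the subclass like any other model monitoring application. A minimal sketch using the constructor keywords shown above:

```python
from mlrun.model_monitoring.applications.histogram_data_drift import (
    HistogramDataDriftApplication,
)


class MyDriftApp(HistogramDataDriftApplication):
    def __init__(self) -> None:
        # Also produce the (heavier) Plotly table artifact, not only the JSON one
        super().__init__(produce_json_artifact=True, produce_plotly_artifact=True)
```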
mlrun/projects/project.py
CHANGED
@@ -4678,7 +4678,6 @@ class MlrunProject(ModelObj):
         ] = None,  # Backward compatibility
         states: typing.Optional[list[mlrun.common.runtimes.constants.RunStates]] = None,
         sort: bool = True,
-        last: int = 0,
         iter: bool = False,
         start_time_from: Optional[datetime.datetime] = None,
         start_time_to: Optional[datetime.datetime] = None,
@@ -4751,7 +4750,6 @@
                 else states or None
             ),
             sort=sort,
-            last=last,
             iter=iter,
             start_time_from=start_time_from,
             start_time_to=start_time_to,
mlrun/runtimes/databricks_job/databricks_runtime.py
CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import typing
 from ast import FunctionDef, parse, unparse
 from base64 import b64decode
 from typing import Callable, Optional, Union
@@ -139,7 +139,7 @@ class DatabricksRuntime(kubejob.KubejobRuntime):
         )

     def _get_modified_user_code(self, original_handler: str, log_artifacts_code: str):
-        encoded_code = (
+        encoded_code: typing.Optional[str] = (
             self.spec.build.functionSourceCode if hasattr(self.spec, "build") else None
         )
         if not encoded_code:
mlrun/runtimes/nuclio/function.py
CHANGED
@@ -527,6 +527,17 @@ class RemoteRuntime(KubeResource):
         access_key = kwargs.pop("access_key", None)
         if not access_key:
             access_key = self._resolve_v3io_access_key()
+        engine = "sync"
+        explicit_ack_mode = kwargs.pop("explicit_ack_mode", None)
+        if (
+            self.spec
+            and hasattr(self.spec, "graph")
+            and self.spec.graph
+            and self.spec.graph.engine
+        ):
+            engine = self.spec.graph.engine
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
+            explicit_ack_mode = explicit_ack_mode or "explicitOnly"

         self.add_trigger(
             name,
@@ -540,6 +551,7 @@
                 extra_attributes=extra_attributes,
                 read_batch_size=256,
                 access_key=access_key,
+                explicit_ack_mode=explicit_ack_mode,
                 **kwargs,
             ),
         )
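The block added to `RemoteRuntime` derives the trigger's `explicit_ack_mode` from the serving-graph engine. A standalone sketch of that resolution logic as shown in the diff (not the mlrun API itself):

```python
from typing import Optional


def resolve_explicit_ack_mode(
    graph_engine: Optional[str],
    explicit_ack_enabled: bool,
    requested_mode: Optional[str] = None,
) -> Optional[str]:
    # The engine defaults to "sync" unless the serving graph specifies otherwise
    engine = graph_engine or "sync"
    if explicit_ack_enabled and engine == "async":
        # Only async graphs get the "explicitOnly" default
        return requested_mode or "explicitOnly"
    return requested_mode


assert resolve_explicit_ack_mode("async", True) == "explicitOnly"
assert resolve_explicit_ack_mode(None, True) is None
```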
mlrun/runtimes/nuclio/serving.py
CHANGED
@@ -376,6 +376,7 @@ class ServingRuntime(RemoteRuntime):
         creation_strategy: Optional[
             schemas.ModelEndpointCreationStrategy
         ] = schemas.ModelEndpointCreationStrategy.INPLACE,
+        outputs: Optional[list[str]] = None,
         **class_args,
     ):
         """add ml model and/or route to the function.
@@ -408,6 +409,9 @@
             * **archive**:
                 1. If model endpoints with the same name exist, preserve them.
                 2. Create a new model endpoint with the same name and set it to `latest`.
+        :param outputs: list of the model outputs (e.g. labels) ,if provided will override the outputs that been
+                        configured in the model artifact, please note that those outputs need to be equal to the
+                        model serving function outputs (length, and order)
         :param class_args: extra kwargs to pass to the model serving class __init__
                            (can be read in the model using .get_param(key) method)
         """
@@ -443,6 +447,8 @@
         if class_name and hasattr(class_name, "to_dict"):
             if model_path:
                 class_name.model_path = model_path
+            if outputs:
+                class_name.outputs = outputs
             key, state = params_to_step(
                 class_name,
                 key,
@@ -470,6 +476,7 @@
         else:
             class_args = deepcopy(class_args)
             class_args["model_path"] = model_path
+            class_args["outputs"] = outputs
             state = TaskStep(
                 class_name,
                 class_args,
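A sketch of the new `outputs` argument on `ServingRuntime.add_model`, which overrides the outputs configured in the model artifact; the function, model path, serving class, and output names below are placeholders, and the names must match the model's real outputs in length and order:

```python
import mlrun

serving_fn = mlrun.new_function("serving", kind="serving", image="mlrun/mlrun")
serving_fn.add_model(
    "my-model",
    model_path="store://models/my-project/my-model:latest",
    class_name="MyModelServingClass",  # placeholder serving class
    outputs=["label", "probability"],  # overrides the artifact-configured outputs
)
```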