mlrun 1.7.0rc13__py3-none-any.whl → 1.7.0rc15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__main__.py +0 -105
- mlrun/artifacts/__init__.py +1 -2
- mlrun/artifacts/base.py +8 -250
- mlrun/artifacts/dataset.py +1 -190
- mlrun/artifacts/manager.py +2 -41
- mlrun/artifacts/model.py +1 -140
- mlrun/artifacts/plots.py +1 -375
- mlrun/common/schemas/model_monitoring/__init__.py +4 -0
- mlrun/common/schemas/model_monitoring/constants.py +24 -3
- mlrun/common/schemas/model_monitoring/model_endpoints.py +13 -1
- mlrun/common/schemas/project.py +1 -0
- mlrun/config.py +14 -4
- mlrun/data_types/to_pandas.py +4 -4
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore_profile.py +50 -3
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/sources.py +43 -2
- mlrun/datastore/store_resources.py +2 -6
- mlrun/datastore/targets.py +125 -6
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +1 -1
- mlrun/db/httpdb.py +69 -33
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +4 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +2 -0
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +5 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
- mlrun/kfpops.py +5 -10
- mlrun/launcher/base.py +1 -1
- mlrun/launcher/client.py +1 -1
- mlrun/lists.py +2 -2
- mlrun/model.py +36 -9
- mlrun/model_monitoring/api.py +41 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +158 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +92 -77
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +1 -1
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +67 -4
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +1 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +2 -3
- mlrun/model_monitoring/writer.py +69 -39
- mlrun/platforms/iguazio.py +2 -2
- mlrun/projects/pipelines.py +24 -7
- mlrun/projects/project.py +130 -65
- mlrun/render.py +2 -10
- mlrun/run.py +1 -4
- mlrun/runtimes/__init__.py +3 -3
- mlrun/runtimes/base.py +3 -3
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +275 -153
- mlrun/runtimes/nuclio/function.py +1 -1
- mlrun/runtimes/pod.py +5 -5
- mlrun/runtimes/utils.py +1 -1
- mlrun/serving/states.py +53 -2
- mlrun/utils/helpers.py +27 -40
- mlrun/utils/notifications/notification/slack.py +31 -8
- mlrun/utils/notifications/notification_pusher.py +133 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/METADATA +2 -2
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/RECORD +84 -79
- mlrun/runtimes/mpijob/v1alpha1.py +0 -29
- /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/top_level.txt +0 -0
@@ -99,14 +99,17 @@ class FeatureSetFeatures(MonitoringStrEnum):
 
 class ApplicationEvent:
     APPLICATION_NAME = "application_name"
-    CURRENT_STATS = "current_stats"
-    FEATURE_STATS = "feature_stats"
-    SAMPLE_PARQUET_PATH = "sample_parquet_path"
     START_INFER_TIME = "start_infer_time"
     END_INFER_TIME = "end_infer_time"
     LAST_REQUEST = "last_request"
     ENDPOINT_ID = "endpoint_id"
     OUTPUT_STREAM_URI = "output_stream_uri"
+    MLRUN_CONTEXT = "mlrun_context"
+
+    # Deprecated fields - TODO : delete in 1.9.0 (V1 app deprecation)
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    CURRENT_STATS = "current_stats"
+    FEATURE_STATS = "feature_stats"
 
 
 class WriterEvent(MonitoringStrEnum):
@@ -114,6 +117,21 @@ class WriterEvent(MonitoringStrEnum):
     ENDPOINT_ID = "endpoint_id"
     START_INFER_TIME = "start_infer_time"
     END_INFER_TIME = "end_infer_time"
+    EVENT_KIND = "event_kind"  # metric or result
+    DATA = "data"
+
+
+class WriterEventKind(MonitoringStrEnum):
+    METRIC = "metric"
+    RESULT = "result"
+
+
+class MetricData(MonitoringStrEnum):
+    METRIC_NAME = "metric_name"
+    METRIC_VALUE = "metric_value"
+
+
+class ResultData(MonitoringStrEnum):
     RESULT_NAME = "result_name"
     RESULT_VALUE = "result_value"
     RESULT_KIND = "result_kind"
@@ -303,6 +321,9 @@ class ModelMonitoringAppLabel:
     KEY = "mlrun__type"
     VAL = "mlrun__model-monitoring-application"
 
+    def __str__(self) -> str:
+        return f"{self.KEY}={self.VAL}"
+
 
 class ControllerPolicy:
     BASE_PERIOD = "base_period"

mlrun/common/schemas/model_monitoring/model_endpoints.py CHANGED

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
 import enum
 import json
@@ -21,6 +20,7 @@ from pydantic import BaseModel, Field, validator
 from pydantic.main import Extra
 
 import mlrun.common.model_monitoring
+import mlrun.common.types
 
 from ..object import ObjectKind, ObjectSpec, ObjectStatus
 from .constants import (
@@ -292,6 +292,18 @@ class ModelEndpointList(BaseModel):
     endpoints: list[ModelEndpoint] = []
 
 
+class ModelEndpointMonitoringMetricType(mlrun.common.types.StrEnum):
+    RESULT = "result"
+
+
+class ModelEndpointMonitoringMetric(BaseModel):
+    project: str
+    app: str
+    type: ModelEndpointMonitoringMetricType
+    name: str
+    full_name: str
+
+
 def _mapping_attributes(
     base_model: BaseModel,
     flattened_dictionary: dict,
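As a quick orientation to the model-monitoring additions above, the sketch below shows how the new enums and the ModelEndpointMonitoringMetric schema might be used; all concrete values (names, IDs, the full_name convention) are placeholders, not taken from the diff, and the import paths are inferred from the file list.

    from mlrun.common.schemas.model_monitoring.constants import (
        ModelMonitoringAppLabel,
        ResultData,
        WriterEvent,
        WriterEventKind,
    )
    from mlrun.common.schemas.model_monitoring.model_endpoints import (
        ModelEndpointMonitoringMetric,
        ModelEndpointMonitoringMetricType,
    )

    # The label selector string now comes straight from the class
    print(str(ModelMonitoringAppLabel()))  # mlrun__type=mlrun__model-monitoring-application

    # Writer events are now tagged with EVENT_KIND and carry their payload under DATA;
    # a hypothetical "result" payload could look like:
    event = {
        WriterEvent.ENDPOINT_ID: "1234",  # placeholder endpoint ID
        WriterEvent.EVENT_KIND: WriterEventKind.RESULT,
        WriterEvent.DATA: {
            ResultData.RESULT_NAME: "data_drift",  # placeholder result name
            ResultData.RESULT_VALUE: 0.1,
        },
    }

    # The new pydantic model describing a per-endpoint monitoring metric
    metric = ModelEndpointMonitoringMetric(
        project="my-project",
        app="my-app",
        type=ModelEndpointMonitoringMetricType.RESULT,
        name="data_drift",
        full_name="my-project.my-app.result.data_drift",  # assumed naming convention
    )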
mlrun/common/schemas/project.py CHANGED

mlrun/config.py CHANGED

@@ -188,6 +188,7 @@ default_config = {
     "background_tasks": {
         # enabled / disabled
         "timeout_mode": "enabled",
+        "function_deletion_batch_size": 10,
         # timeout in seconds to wait for background task to be updated / finished by the worker responsible for the task
         "default_timeouts": {
             "operations": {
@@ -196,6 +197,7 @@ default_config = {
                 "run_abortion": "600",
                 "abort_grace_period": "10",
                 "delete_project": "900",
+                "delete_function": "900",
             },
             "runtimes": {"dask": "600"},
         },
@@ -359,7 +361,7 @@ default_config = {
         # is set to ClusterIP
         # ---------------------------------------------------------------------
         # Note: adding a mode requires special handling on
-        # - mlrun.runtimes.constants.NuclioIngressAddTemplatedIngressModes
+        # - mlrun.common.runtimes.constants.NuclioIngressAddTemplatedIngressModes
         # - mlrun.runtimes.nuclio.function.enrich_function_with_ingress
         "add_templated_ingress_host_mode": "never",
         "explicit_ack": "enabled",
@@ -552,7 +554,7 @@ default_config = {
             "nosql": "v3io:///projects/{project}/FeatureStore/{name}/nosql",
             # "authority" is optional and generalizes [userinfo "@"] host [":" port]
             "redisnosql": "redis://{authority}/projects/{project}/FeatureStore/{name}/nosql",
-            "dsnosql": "ds://{ds_profile_name}/projects/{project}/FeatureStore/{name}/
+            "dsnosql": "ds://{ds_profile_name}/projects/{project}/FeatureStore/{name}/{kind}",
         },
         "default_targets": "parquet,nosql",
         "default_job_image": "mlrun/mlrun",
@@ -690,7 +692,11 @@ default_config = {
         "grafana_url": "",
         "alerts": {
             # supported modes: "enabled", "disabled".
-            "mode": "
+            "mode": "enabled"
+        },
+        "auth_with_client_id": {
+            "enabled": False,
+            "request_timeout": 5,
         },
     }
 
@@ -1396,7 +1402,11 @@ def read_env(env=None, prefix=env_prefix):
         log_formatter = mlrun.utils.create_formatter_instance(
             mlrun.utils.FormatterKinds(log_formatter_name)
         )
-        mlrun.utils.logger.get_handler("default")
+        current_handler = mlrun.utils.logger.get_handler("default")
+        current_formatter_name = current_handler.formatter.__class__.__name__
+        desired_formatter_name = log_formatter.__class__.__name__
+        if current_formatter_name != desired_formatter_name:
+            current_handler.setFormatter(log_formatter)
 
     # The default function pod resource values are of type str; however, when reading from environment variable numbers,
     # it converts them to type int if contains only number, so we want to convert them to str.
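The new background-task defaults above are reachable through the mlrun.mlconf singleton once the package is imported; a minimal sketch, assuming the background_tasks block sits where the hunk context suggests (directly under default_config):

    import mlrun

    # Defaults added in this release (see the config.py hunks above)
    print(mlrun.mlconf.background_tasks.function_deletion_batch_size)  # 10
    print(mlrun.mlconf.background_tasks.default_timeouts.operations.delete_function)  # "900"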
mlrun/data_types/to_pandas.py CHANGED

@@ -65,10 +65,10 @@ def toPandas(spark_df):
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n
+                    f"failed by the reason below:\n {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true."
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False
@@ -78,7 +78,7 @@ def toPandas(spark_df):
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                 "reached the error below and will not continue because automatic fallback "
                 "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                "false.\n
+                f"false.\n {e}"
             )
             warnings.warn(msg)
             raise
@@ -144,7 +144,7 @@ def toPandas(spark_df):
                 "reached the error below and can not continue. Note that "
                 "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                 "effect on failures in the middle of "
-                "computation.\n
+                f"computation.\n {e}"
             )
             warnings.warn(msg)
             raise
mlrun/datastore/base.py CHANGED

@@ -179,11 +179,23 @@ class DataStore:
         return {}
 
     @staticmethod
-    def _parquet_reader(
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions
 
         def set_filters(
-            partitions_time_attributes,
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -193,20 +205,23 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters
 
         def reader(*args, **kwargs):
-            if start_time or end_time:
-
-
-
-
-
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -217,6 +232,7 @@ class DataStore:
                 ):
                     raise ex
 
+                # TODO: fix timezone issue (ML-6308)
                 if start_time.tzinfo:
                     start_time_inner = start_time.replace(tzinfo=None)
                     end_time_inner = end_time.replace(tzinfo=None)
@@ -228,6 +244,7 @@ class DataStore:
                     partitions_time_attributes,
                     start_time_inner,
                     end_time_inner,
+                    additional_filters,
                     kwargs,
                 )
             return df_module.read_parquet(*args, **kwargs)
@@ -246,6 +263,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -310,7 +328,13 @@ class DataStore:
                 kwargs["columns"] = columns
 
             reader = self._parquet_reader(
-                df_module,
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )
 
         elif file_url.endswith(".json") or format == "json":
@@ -539,6 +563,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -550,6 +575,12 @@ class DataItem:
         :param end_time:    filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -560,6 +591,7 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
            end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df
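The additional_filters argument added to DataStore.as_df and DataItem.as_df accepts pyarrow-style filter tuples, as documented in the new docstring above; a minimal usage sketch with placeholder URL and column names:

    from datetime import datetime

    import mlrun

    item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")  # placeholder URL
    df = item.as_df(
        time_column="timestamp",                            # placeholder column name
        start_time=datetime(2024, 1, 1),
        end_time=datetime(2024, 2, 1),
        additional_filters=[("Product", "=", "Computer")],  # tuple format from the docstring
    )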
mlrun/datastore/datastore_profile.py CHANGED

@@ -185,6 +185,17 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def secrets(self) -> dict:
         res = {}
@@ -203,7 +214,13 @@ class DatastoreProfileS3(DatastoreProfile):
         return res
 
     def url(self, subpath):
-
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"
 
 
 class DatastoreProfileRedis(DatastoreProfile):
@@ -272,6 +289,17 @@ class DatastoreProfileGCS(DatastoreProfile):
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
     gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     @pydantic.validator("gcp_credentials", pre=True, always=True)
     def convert_dict_to_json(cls, v):
@@ -280,10 +308,15 @@ class DatastoreProfileGCS(DatastoreProfile):
         return v
 
     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"
 
 
     def secrets(self) -> dict:
@@ -311,12 +344,26 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def url(self, subpath) -> str:
         if subpath.startswith("/"):
             # in azure the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.bucket:
+            return f"az://{self.bucket}/{subpath}"
+        else:
+            return f"az://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
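The optional bucket field added to the S3, GCS, and Azure profiles lets the profile itself carry the bucket, so ds:// sub-paths no longer have to; a sketch for S3 with placeholder names and credentials (register_temporary_client_datastore_profile is the existing client-side registration helper):

    from mlrun.datastore.datastore_profile import (
        DatastoreProfileS3,
        register_temporary_client_datastore_profile,
    )

    profile = DatastoreProfileS3(
        name="my-s3",         # placeholder profile name
        access_key_id="...",  # placeholder credentials
        secret_key="...",
        bucket="my-bucket",   # leaving this unset now emits a FutureWarning
    )
    register_temporary_client_datastore_profile(profile)

    # With a bucket on the profile, the sub-path alone is enough:
    print(profile.url("/path/to/data.parquet"))  # s3://my-bucket/path/to/data.parquet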
mlrun/datastore/hdfs.py CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from urllib.parse import urlparse
 
 import fsspec
 
@@ -49,3 +50,7 @@ class HdfsStore(DataStore):
     @property
     def spark_url(self):
         return f"hdfs://{self.host}:{self.port}"
+
+    def rm(self, url, recursive=False, maxdepth=None):
+        path = urlparse(url).path
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
mlrun/datastore/inmem.py CHANGED

@@ -80,8 +80,8 @@ class InMemoryStore(DataStore):
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store – don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)
mlrun/datastore/sources.py CHANGED

@@ -102,8 +102,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -245,7 +249,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -281,6 +289,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+                               Each tuple should be in the format (column_name, operator, value).
+                               Supported operators: "=", ">=", "<=", ">", "<".
+                               Example: [("Product", "=", "Computer")]
+                               For all supported filters, please see:
+                               https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
@@ -297,6 +311,7 @@ class ParquetSource(BaseSourceDriver):
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[tuple]] = None,
     ):
         super().__init__(
             name,
@@ -308,6 +323,7 @@ class ParquetSource(BaseSourceDriver):
             start_time,
             end_time,
         )
+        self.additional_filters = additional_filters
 
     @property
     def start_time(self):
@@ -341,6 +357,7 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
@@ -358,6 +375,7 @@ class ParquetSource(BaseSourceDriver):
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
@@ -380,6 +398,7 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
@@ -389,6 +408,7 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
@@ -519,10 +539,15 @@ class BigQuerySource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -562,7 +587,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -740,7 +764,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -935,6 +971,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -1075,9 +1112,13 @@ class SQLSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
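Among the batch sources above, only ParquetSource actually forwards additional_filters to the parquet reader; the other sources route the argument through additional_filters_warning. A feature-store style sketch with placeholder names and path:

    from datetime import datetime

    from mlrun.datastore.sources import ParquetSource

    source = ParquetSource(
        name="sales",                         # placeholder name
        path="s3://my-bucket/sales.parquet",  # placeholder path
        time_field="timestamp",
        start_time=datetime(2024, 1, 1),
        end_time=datetime(2024, 2, 1),
        additional_filters=[("Product", "=", "Computer")],
    )
    df = source.to_dataframe()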
mlrun/datastore/store_resources.py CHANGED

@@ -17,7 +17,7 @@
 import mlrun
 import mlrun.artifacts
 from mlrun.config import config
-from mlrun.utils.helpers import
+from mlrun.utils.helpers import parse_artifact_uri
 
 from ..common.helpers import parse_versioned_object_uri
 from ..platforms.iguazio import parse_path
@@ -167,11 +167,7 @@ def get_store_resource(
     )
     if resource.get("kind", "") == "link":
         # todo: support other link types (not just iter, move this to the db/api layer
-        link_iteration = (
-            resource.get("link_iteration", 0)
-            if is_legacy_artifact(resource)
-            else resource["spec"].get("link_iteration", 0)
-        )
+        link_iteration = resource["spec"].get("link_iteration", 0)
 
         resource = db.read_artifact(
             key,
|