mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +25 -111
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +38 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +41 -47
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +68 -0
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +25 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +15 -5
- mlrun/common/schemas/auth.py +8 -2
- mlrun/common/schemas/client_spec.py +2 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +19 -3
- mlrun/common/schemas/model_monitoring/constants.py +96 -26
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +0 -9
- mlrun/common/schemas/project.py +22 -21
- mlrun/common/types.py +7 -1
- mlrun/config.py +87 -19
- mlrun/data_types/data_types.py +4 -0
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +4 -5
- mlrun/datastore/base.py +69 -30
- mlrun/datastore/datastore.py +10 -2
- mlrun/datastore/datastore_profile.py +90 -6
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +172 -44
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +285 -41
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +27 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +149 -14
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +608 -178
- mlrun/db/nopdb.py +191 -7
- mlrun/errors.py +11 -0
- mlrun/execution.py +37 -20
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +21 -52
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +2 -1
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +9 -9
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +30 -19
- mlrun/features.py +4 -13
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +10 -11
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +9 -3
- mlrun/launcher/remote.py +9 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +58 -19
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +127 -301
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +30 -36
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +100 -7
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +93 -228
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +30 -16
- mlrun/projects/pipelines.py +92 -99
- mlrun/projects/project.py +757 -268
- mlrun/render.py +15 -14
- mlrun/run.py +160 -162
- mlrun/runtimes/__init__.py +55 -3
- mlrun/runtimes/base.py +33 -19
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +98 -58
- mlrun/runtimes/nuclio/serving.py +36 -42
- mlrun/runtimes/pod.py +196 -45
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +6 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +7 -4
- mlrun/serving/server.py +7 -8
- mlrun/serving/states.py +73 -43
- mlrun/serving/v2_serving.py +8 -7
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +141 -75
- mlrun/utils/http.py +1 -1
- mlrun/utils/logger.py +39 -7
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification/base.py +12 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +147 -16
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/sources.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
@@ -28,6 +29,8 @@ from nuclio.config import split_path
 
 import mlrun
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
 
 from ..model import DataSource
@@ -101,8 +104,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -113,7 +120,11 @@ class BaseSourceDriver(DataSource):
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
@@ -169,7 +180,7 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, list[int], list[str]] = None,
@@ -204,11 +215,11 @@ class CSVSource(BaseSourceDriver):
         )
 
     def get_spark_options(self):
-        store, path,
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
-                "path":
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
@@ -240,7 +251,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -276,6 +291,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
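
The new additional_filters parameter is threaded through every batch source's to_dataframe signature in this release, but only ParquetSource actually applies the filters; the other sources route the argument through mlrun.utils.helpers.additional_filters_warning instead. A minimal usage sketch based on the docstring above (the project path and column name are illustrative, not taken from the diff):

    from mlrun.datastore.sources import ParquetSource

    # Filters are (column_name, operator, value) tuples; list-shaped filters
    # are also accepted and normalized back to tuples on load.
    source = ParquetSource(
        "trades",
        path="v3io:///projects/demo/trades.parquet",  # hypothetical path
        additional_filters=[("Product", "=", "Computer")],
    )
    df = source.to_dataframe()
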
@@ -286,13 +307,19 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -320,6 +347,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -336,16 +367,17 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
-        attributes = self.attributes
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
         store, path, url = mlrun.store_manager.get_or_create_store(self.path)
-
         return storey.ParquetSource(
             paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
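
Both the constructor and to_step normalize filters through transform_list_filters_to_tuple, added to mlrun/datastore/utils.py in this release (+68 -5 in the file list above). Its body is not shown in this diff; a plausible minimal sketch, assuming it only converts list-shaped filters (e.g. after a JSON round-trip through from_dict) back into the tuples pyarrow expects:

    def transform_list_filters_to_tuple(additional_filters):
        # Sketch only: filters serialized through dicts/JSON come back as
        # lists of lists, while pyarrow wants (column, op, value) tuples.
        if not additional_filters:
            return additional_filters
        return [tuple(f) for f in additional_filters]
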
@@ -353,11 +385,22 @@ class ParquetSource(BaseSourceDriver):
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-        store, path,
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -375,8 +418,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -384,9 +429,88 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
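
_build_spark_additional_filters folds each (column, op, value) tuple into a single pyspark Column predicate, AND-ing successive conditions, and special-cases None inside "in"/"not in" value lists by adding isNull/isnan clauses (the NaN check is skipped for timestamp and date columns, where isnan is not defined). A self-contained illustration of the same fold, reduced to equality and comparison for brevity (the data and column names are invented for the example):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("Computer", 900.0), ("Desk", 120.0)], ["Product", "Price"]
    )

    filters = [("Product", "=", "Computer"), ("Price", ">=", 500)]
    predicate = None
    for name, op, value in filters:
        # only "=" and ">=" handled here; the real helper maps all of
        # ==, =, >, <, >=, <=, != through the operator module
        clause = (col(name) == value) if op in ("=", "==") else (col(name) >= value)
        predicate = clause if predicate is None else predicate & clause
    df.filter(predicate).show()  # keeps only the Computer row
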
@@ -401,12 +525,17 @@ class BigQuerySource(BaseSourceDriver):
 
         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource(
-
-
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )
 
         # read a table
-        source = BigQuerySource(
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )
 
 
     :parameter name: source name
@@ -509,10 +638,15 @@ class BigQuerySource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -552,7 +686,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -673,32 +806,10 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )
 
-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-
-
-
-            "sfURL": self.attributes.get("url"),
-            "sfUser": self.attributes.get("user"),
-            "sfPassword": self._get_password(),
-            "sfDatabase": self.attributes.get("database"),
-            "sfSchema": self.attributes.get("schema"),
-            "sfWarehouse": self.attributes.get("warehouse"),
-            "application": "iguazio_platform",
-        }
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
 
 
 class CustomSource(BaseSourceDriver):
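
The inline Snowflake option assembly (including the _get_password lookup) moves into the new shared module mlrun/datastore/snowflake_utils.py (+43 in the file list above), so sources and targets can build their Spark options the same way. The helper's body is not part of this hunk; judging from the options the removed code produced, it presumably resembles this sketch (the "format" key is an assumption, matching what to_spark_df now pops from get_spark_options):

    import os

    def get_snowflake_spark_options(attributes):
        # Sketch only; the real helper lives in snowflake_utils.py and
        # keeps the credential checks that _get_password used to perform.
        return {
            "format": "net.snowflake.spark.snowflake",  # assumed
            "sfURL": attributes.get("url"),
            "sfUser": attributes.get("user"),
            "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
            "sfDatabase": attributes.get("database"),
            "sfSchema": attributes.get("schema"),
            "sfWarehouse": attributes.get("warehouse"),
            "application": "iguazio_platform",
        }
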
@@ -752,7 +863,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -794,7 +917,8 @@ class OnlineSource(BaseSourceDriver):
         explicit_ack = (
             is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
         )
-
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -853,12 +977,11 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)
 
     def add_nuclio_trigger(self, function):
-        store,
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
         if store.kind != "v3io":
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Only profiles that reference the v3io datastore can be used with StreamSource"
             )
-        path = "v3io:/" + path
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key")
         endpoint, stream_path = parse_path(url)
@@ -882,7 +1005,7 @@ class StreamSource(OnlineSource):
             kwargs["worker_allocation_mode"] = "static"
 
         function.add_v3io_stream_trigger(
-
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
@@ -947,6 +1070,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -1087,9 +1211,13 @@ class SQLSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
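
All of the sources above that cannot honor additional_filters (the base driver, CSV, BigQuery, SQL, and the in-memory DataFrameSource) funnel the argument through mlrun.utils.helpers.additional_filters_warning. Its implementation lives in mlrun/utils/helpers.py (+141 -75 in the file list above) rather than in this file; presumably it just surfaces that the filters will be ignored, along these lines:

    import warnings

    def additional_filters_warning(additional_filters, source_class):
        # Sketch only: warn rather than silently dropping the filters.
        if additional_filters:
            warnings.warn(
                f"additional_filters is not supported by "
                f"{source_class.__name__} and will be ignored"
            )
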
mlrun/datastore/store_resources.py
CHANGED

@@ -17,7 +17,7 @@
 import mlrun
 import mlrun.artifacts
 from mlrun.config import config
-from mlrun.utils.helpers import
+from mlrun.utils.helpers import parse_artifact_uri
 
 from ..common.helpers import parse_versioned_object_uri
 from ..platforms.iguazio import parse_path
@@ -146,7 +146,11 @@ def get_store_resource(
 
     db = db or mlrun.get_run_db(secrets=secrets)
     kind, uri = parse_store_uri(uri)
-    if kind
+    if not kind:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Cannot get store resource from invalid URI: {uri}"
+        )
+    elif kind == StorePrefix.FeatureSet:
         project, name, tag, uid = parse_versioned_object_uri(
             uri, project or config.default_project
         )
@@ -167,11 +171,7 @@ def get_store_resource(
         )
         if resource.get("kind", "") == "link":
             # todo: support other link types (not just iter, move this to the db/api layer
-            link_iteration = (
-                resource.get("link_iteration", 0)
-                if is_legacy_artifact(resource)
-                else resource["spec"].get("link_iteration", 0)
-            )
+            link_iteration = resource["spec"].get("link_iteration", 0)
 
             resource = db.read_artifact(
                 key,