mlrun 1.7.2__py3-none-any.whl → 1.8.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +14 -12
- mlrun/__main__.py +3 -3
- mlrun/alerts/alert.py +19 -12
- mlrun/artifacts/__init__.py +0 -2
- mlrun/artifacts/base.py +34 -11
- mlrun/artifacts/dataset.py +16 -16
- mlrun/artifacts/manager.py +13 -13
- mlrun/artifacts/model.py +66 -53
- mlrun/common/constants.py +6 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/common/formatters/model_endpoint.py +30 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/model_monitoring/__init__.py +0 -3
- mlrun/common/model_monitoring/helpers.py +1 -1
- mlrun/common/runtimes/constants.py +1 -2
- mlrun/common/schemas/__init__.py +4 -2
- mlrun/common/schemas/artifact.py +0 -6
- mlrun/common/schemas/common.py +50 -0
- mlrun/common/schemas/model_monitoring/__init__.py +8 -1
- mlrun/common/schemas/model_monitoring/constants.py +62 -12
- mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +149 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -5
- mlrun/common/schemas/partition.py +122 -0
- mlrun/config.py +43 -15
- mlrun/data_types/__init__.py +0 -2
- mlrun/data_types/data_types.py +0 -1
- mlrun/data_types/infer.py +3 -1
- mlrun/data_types/spark.py +4 -4
- mlrun/data_types/to_pandas.py +2 -11
- mlrun/datastore/__init__.py +0 -2
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +12 -4
- mlrun/datastore/datastore.py +9 -3
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +4 -1
- mlrun/datastore/sources.py +51 -49
- mlrun/datastore/store_resources.py +0 -2
- mlrun/datastore/targets.py +22 -23
- mlrun/datastore/utils.py +2 -2
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +126 -62
- mlrun/db/factory.py +3 -0
- mlrun/db/httpdb.py +767 -231
- mlrun/db/nopdb.py +126 -57
- mlrun/errors.py +2 -2
- mlrun/execution.py +55 -29
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -40
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +27 -24
- mlrun/feature_store/retrieval/base.py +14 -9
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/steps.py +2 -2
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +29 -27
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +4 -3
- mlrun/model.py +108 -44
- mlrun/model_monitoring/__init__.py +1 -2
- mlrun/model_monitoring/api.py +6 -6
- mlrun/model_monitoring/applications/_application_steps.py +13 -15
- mlrun/model_monitoring/applications/histogram_data_drift.py +41 -15
- mlrun/model_monitoring/applications/results.py +55 -3
- mlrun/model_monitoring/controller.py +185 -223
- mlrun/model_monitoring/db/_schedules.py +156 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/stores/__init__.py +1 -1
- mlrun/model_monitoring/db/stores/base/store.py +6 -65
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -25
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -97
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +2 -58
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -15
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +6 -257
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +9 -271
- mlrun/model_monitoring/db/tsdb/base.py +74 -22
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +66 -35
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +284 -51
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -17
- mlrun/model_monitoring/helpers.py +97 -1
- mlrun/model_monitoring/model_endpoint.py +4 -2
- mlrun/model_monitoring/stream_processing.py +2 -2
- mlrun/model_monitoring/tracking_policy.py +10 -3
- mlrun/model_monitoring/writer.py +47 -26
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +31 -14
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +3 -3
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/__init__.py +1 -6
- mlrun/projects/operations.py +27 -27
- mlrun/projects/pipelines.py +85 -215
- mlrun/projects/project.py +444 -158
- mlrun/run.py +9 -9
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +13 -10
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/generators.py +2 -1
- mlrun/runtimes/kubejob.py +4 -5
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -11
- mlrun/runtimes/nuclio/function.py +14 -14
- mlrun/runtimes/nuclio/serving.py +9 -9
- mlrun/runtimes/pod.py +74 -29
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +21 -11
- mlrun/runtimes/utils.py +6 -5
- mlrun/serving/merger.py +6 -4
- mlrun/serving/remote.py +18 -17
- mlrun/serving/routers.py +27 -27
- mlrun/serving/server.py +1 -1
- mlrun/serving/states.py +76 -71
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +4 -4
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/helpers.py +72 -28
- mlrun/utils/logger.py +104 -2
- mlrun/utils/notifications/notification/base.py +23 -4
- mlrun/utils/notifications/notification/console.py +1 -1
- mlrun/utils/notifications/notification/git.py +6 -6
- mlrun/utils/notifications/notification/ipython.py +5 -4
- mlrun/utils/notifications/notification/slack.py +1 -1
- mlrun/utils/notifications/notification/webhook.py +13 -17
- mlrun/utils/notifications/notification_pusher.py +23 -19
- mlrun/utils/regex.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/METADATA +187 -199
- mlrun-1.8.0rc1.dist-info/RECORD +356 -0
- {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/WHEEL +1 -1
- mlrun-1.7.2.dist-info/RECORD +0 -351
- {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/LICENSE +0 -0
- {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2.dist-info → mlrun-1.8.0rc1.dist-info}/top_level.txt +0 -0
mlrun/config.py
CHANGED
@@ -102,6 +102,9 @@ default_config = {
     "log_level": "INFO",
     # log formatter (options: human | human_extended | json)
     "log_formatter": "human",
+    # custom logger format, workes only with log_formatter: custom
+    # Note that your custom format must include those 4 fields - timestamp, level, message and more
+    "log_format_override": None,
     "submit_timeout": "180",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
     "runtimes_cleanup_interval": "300",
@@ -120,14 +123,6 @@ default_config = {
     "projects": {
         "summaries": {
             "cache_interval": "30",
-            "feature_gates": {
-                "artifacts": "enabled",
-                "schedules": "enabled",
-                "feature_sets": "enabled",
-                "models": "enabled",
-                "runs": "enabled",
-                "pipelines": "enabled",
-            },
         },
     },
 },
@@ -140,6 +135,9 @@ default_config = {
             "delete_crd_resources_timeout": "5 minutes",
         },
     },
+    "object_retentions": {
+        "alert_activation": 14 * 7,  # days
+    },
     # the grace period (in seconds) that will be given to runtime resources (after they're in terminal state)
     # before deleting them (4 hours)
     "runtime_resources_deletion_grace_period": "14400",
@@ -314,7 +312,7 @@ default_config = {
         },
         "request_timeout": 45,  # seconds
     },
-    # see server.api.utils.helpers.ensure_running_on_chief
+    # see server.py.services.api.utils.helpers.ensure_running_on_chief
     "ensure_function_running_on_chief_mode": "enabled",
 },
 "port": 8080,
@@ -794,7 +792,7 @@ default_config = {
     "grafana_url": "",
     "alerts": {
         # supported modes: "enabled", "disabled".
-        "mode": "
+        "mode": "enabled",
         # maximum number of alerts we allow to be configured.
         # user will get an error when exceeding this
         "max_allowed": 10000,
@@ -851,6 +849,22 @@ class Config:
         name = self.__class__.__name__
         return f"{name}({self._cfg!r})"

+    def __iter__(self):
+        if isinstance(self._cfg, Mapping):
+            return self._cfg.__iter__()
+
+    def items(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self._cfg.items())
+
+    def keys(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self.data.keys())
+
+    def values(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self.data.values())
+
     def update(self, cfg, skip_errors=False):
         for key, value in cfg.items():
             if hasattr(self, key):
@@ -1043,6 +1057,17 @@ class Config:
                 f"is not allowed for iguazio version: {igz_version} < 3.5.1"
             )

+    def validate_object_retentions(self):
+        for table_name, retention_days in self.object_retentions.items():
+            if retention_days < 7 and not os.getenv("PARTITION_INTERVAL"):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{table_name} partition interval must be greater than a week"
+                )
+            elif retention_days > 53 * 7:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{table_name} partition interval must be less than a year"
+                )
+
     def resolve_chief_api_url(self) -> str:
         if self.httpdb.clusterization.chief.url:
             return self.httpdb.clusterization.chief.url
@@ -1201,9 +1226,9 @@ class Config:

     def get_model_monitoring_file_target_path(
         self,
-        project: str
-        kind: str
-        target:
+        project: str,
+        kind: str,
+        target: typing.Literal["online", "offline"] = "online",
         artifact_path: typing.Optional[str] = None,
         function_name: typing.Optional[str] = None,
         **kwargs,
@@ -1381,9 +1406,12 @@ def _validate_config(config):
         pass

     config.verify_security_context_enrichment_mode_is_allowed()
+    config.validate_object_retentions()


-def _verify_gpu_requests_and_limits(
+def _verify_gpu_requests_and_limits(
+    requests_gpu: typing.Optional[str] = None, limits_gpu: typing.Optional[str] = None
+):
     # https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/
     if requests_gpu and not limits_gpu:
         raise mlrun.errors.MLRunConflictError(
@@ -1396,7 +1424,7 @@ def _verify_gpu_requests_and_limits(requests_gpu: str = None, limits_gpu: str =
     )


-def _convert_resources_to_str(config: dict = None):
+def _convert_resources_to_str(config: typing.Optional[dict] = None):
     resources_types = ["cpu", "memory", "gpu"]
     resource_requirements = ["requests", "limits"]
     if not config.get("default_function_pod_resources"):
mlrun/data_types/__init__.py
CHANGED
@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx

 from .data_types import (
     InferOptions,
mlrun/data_types/data_types.py
CHANGED
mlrun/data_types/infer.py
CHANGED
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from typing import Optional
+
 import numpy as np
 import packaging.version
 import pandas as pd
@@ -29,7 +31,7 @@ def infer_schema_from_df(
     df: pd.DataFrame,
     features,
     entities,
-    timestamp_key: str = None,
+    timestamp_key: Optional[str] = None,
     entity_columns=None,
     options: InferOptions = InferOptions.Null,
 ):
mlrun/data_types/spark.py
CHANGED
@@ -14,11 +14,12 @@
 #
 from datetime import datetime
 from os import environ
+from typing import Optional

 import numpy as np
 import pytz
 from pyspark.sql.functions import to_utc_timestamp
-from pyspark.sql.types import BooleanType, DoubleType
+from pyspark.sql.types import BooleanType, DoubleType, TimestampType

 from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
@@ -35,7 +36,7 @@ def infer_schema_from_df_spark(
     df,
     features,
     entities,
-    timestamp_key: str = None,
+    timestamp_key: Optional[str] = None,
     entity_columns=None,
     options: InferOptions = InferOptions.Null,
 ):
@@ -143,8 +144,7 @@ def get_df_stats_spark(df, options, num_bins=20, sample_size=None):
     timestamp_columns = set()
     boolean_columns = set()
     for field in df_after_type_casts.schema.fields:
-
-        is_timestamp = field.dataType.typeName().startswith("timestamp")
+        is_timestamp = isinstance(field.dataType, TimestampType)
     is_boolean = isinstance(field.dataType, BooleanType)
     if is_timestamp:
         df_after_type_casts = df_after_type_casts.withColumn(
mlrun/data_types/to_pandas.py
CHANGED
@@ -244,15 +244,6 @@ def _to_corrected_pandas_type(dt):


 def spark_df_to_pandas(spark_df):
-    import pyspark
-
-    if semver.parse(pyspark.__version__) >= semver.Version(3, 5, 0):
-
-        def to_pandas(spark_df_inner):
-            return spark_df_inner.toPandas()
-    else:
-        to_pandas = _to_pandas
-
     # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
     # when we upgrade pyspark, we should check whether this workaround is still necessary
     # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
@@ -271,9 +262,9 @@ def spark_df_to_pandas(spark_df):
             )
             type_conversion_dict[field.name] = "datetime64[ns]"

-        df =
+        df = _to_pandas(spark_df)
         if type_conversion_dict:
             df = df.astype(type_conversion_dict)
         return df
     else:
-        return
+        return _to_pandas(spark_df)
mlrun/datastore/__init__.py
CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
-
 __all__ = [
     "DataItem",
     "get_store_resource",
mlrun/datastore/alibaba_oss.py
CHANGED
@@ -15,6 +15,7 @@
 import time
 from datetime import datetime
 from pathlib import Path
+from typing import Optional
 from urllib.parse import urlparse

 import oss2
@@ -28,7 +29,9 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 class OSSStore(DataStore):
     using_bucket = True

-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)
         # will be used in case user asks to assume a role and work through fsspec

mlrun/datastore/azure_blob.py
CHANGED
@@ -14,6 +14,7 @@

 import time
 from pathlib import Path
+from typing import Optional
 from urllib.parse import urlparse

 from azure.storage.blob import BlobServiceClient
@@ -36,7 +37,9 @@ class AzureBlobStore(DataStore):
         1024 * 1024 * 8
     )  # for service_client property only, does not affect filesystem

-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._service_client = None
         self._storage_options = None
mlrun/datastore/base.py
CHANGED
@@ -48,7 +48,7 @@ class FileStats:
 class DataStore:
     using_bucket = False

-    def __init__(self, parent, name, kind, endpoint="", secrets: dict = None):
+    def __init__(self, parent, name, kind, endpoint="", secrets: Optional[dict] = None):
         self._parent = parent
         self.kind = kind
         self.name = name
@@ -500,12 +500,18 @@ class DataItem:
         """DataItem url e.g. /dir/path, s3://bucket/path"""
         return self._url

-    def get(
+    def get(
+        self,
+        size: Optional[int] = None,
+        offset: int = 0,
+        encoding: Optional[str] = None,
+    ) -> Union[bytes, str]:
         """read all or a byte range and return the content

         :param size:     number of bytes to get
         :param offset:   fetch from offset (in bytes)
         :param encoding: encoding (e.g. "utf-8") for converting bytes to str
+        :return:         the bytes/str content
         """
         body = self._store.get(self._path, size=size, offset=offset)
         if encoding and isinstance(body, bytes):
@@ -519,7 +525,7 @@ class DataItem:
         """
         self._store.download(self._path, target_path)

-    def put(self, data, append=False):
+    def put(self, data: Union[bytes, str], append: bool = False) -> None:
         """write/upload the data, append is only supported by some datastores

         :param data:   data (bytes/str) to write
@@ -687,7 +693,9 @@ def basic_auth_header(user, password):


 class HttpStore(DataStore):
-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)
         self._https_auth_token = None
         self._schema = schema
mlrun/datastore/datastore.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Optional
 from urllib.parse import urlparse

 from mergedeep import merge
@@ -178,12 +179,17 @@ class StoreManager:
         # which accepts a feature vector uri and generate the offline vector (parquet) for it if it doesnt exist
         if not target and not allow_empty_resources:
             raise mlrun.errors.MLRunInvalidArgumentError(
-                f"
+                f"Resource {url} does not have a valid/persistent offline target"
             )
         return resource, target or ""

     def object(
-        self,
+        self,
+        url,
+        key="",
+        project="",
+        allow_empty_resources=None,
+        secrets: Optional[dict] = None,
     ) -> DataItem:
         meta = artifact_url = None
         if is_store_uri(url):
@@ -205,7 +211,7 @@ class StoreManager:
         )

     def get_or_create_store(
-        self, url, secrets: dict = None, project_name=""
+        self, url, secrets: Optional[dict] = None, project_name=""
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path

mlrun/datastore/datastore_profile.py
CHANGED

@@ -489,7 +489,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
     )


-def datastore_profile_read(url, project_name="", secrets: dict = None):
+def datastore_profile_read(url, project_name="", secrets: typing.Optional[dict] = None):
     parsed_url = urlparse(url)
     if parsed_url.scheme.lower() != "ds":
         raise mlrun.errors.MLRunInvalidArgumentError(
mlrun/datastore/dbfs_store.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.

 import pathlib
+from typing import Optional

 from fsspec.implementations.dbfs import DatabricksFile, DatabricksFileSystem
 from fsspec.registry import get_filesystem_class
@@ -81,7 +82,9 @@ class DatabricksFileSystemDisableCache(DatabricksFileSystem):

 # dbfs objects will be represented with the following URL: dbfs://<path>
 class DBFSStore(DataStore):
-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)

     @property
mlrun/datastore/filestore.py
CHANGED
@@ -14,6 +14,7 @@
 import time
 from os import listdir, makedirs, path, stat
 from shutil import copyfile
+from typing import Optional

 import fsspec

@@ -23,7 +24,9 @@ from .base import DataStore, FileStats


 class FileStore(DataStore):
-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, "file", endpoint, secrets=secrets)

         self._item_path, self._real_path = None, None

mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -14,6 +14,7 @@
 import json
 import os
 from pathlib import Path
+from typing import Optional

 from fsspec.registry import get_filesystem_class
 from google.auth.credentials import Credentials
@@ -33,7 +34,9 @@ class GoogleCloudStorageStore(DataStore):
     workers = 8
     chunk_size = 32 * 1024 * 1024

-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._storage_client = None
         self._storage_options = None
mlrun/datastore/hdfs.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from typing import Optional
 from urllib.parse import urlparse

 import fsspec
@@ -20,7 +21,9 @@ from mlrun.datastore.base import DataStore


 class HdfsStore(DataStore):
-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)

         self.host = self._get_secret_or_env("HDFS_HOST")
mlrun/datastore/inmem.py
CHANGED
@@ -17,6 +17,7 @@ from io import BytesIO, StringIO
 import pandas as pd

 import mlrun
+import mlrun.utils.helpers

 from .base import DataStore, FileStats

@@ -35,7 +36,9 @@ class InMemoryStore(DataStore):

     def _get_item(self, key):
         if key not in self._items:
-            raise
+            raise mlrun.errors.MLRunNotFoundError(
+                f"item {key} not found in memory store"
+            )
         return self._items[key]

     def get(self, key, size=None, offset=0):
mlrun/datastore/redis.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Optional
 from urllib.parse import urlparse

 import redis
@@ -30,7 +31,9 @@ class RedisStore(DataStore):
     - key and value sizes are limited to 512MB
     """

-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
mlrun/datastore/s3.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.

 import time
+from typing import Optional

 import boto3
 from boto3.s3.transfer import TransferConfig
@@ -26,7 +27,9 @@ from .base import DataStore, FileStats, get_range, make_datastore_schema_sanitiz
 class S3Store(DataStore):
     using_bucket = True

-    def __init__(
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)
         # will be used in case user asks to assume a role and work through fsspec
         self._temp_credentials = None
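All of the datastore constructors above (OSS, Azure, DBFS, file, GCS, HDFS, in-memory, Redis, S3) now share the same black-wrapped signature with an explicit `secrets: Optional[dict] = None`. A toy subclass following that shape (the `EchoStore` class and its scheme are invented for illustration):

```python
from typing import Optional

from mlrun.datastore.base import DataStore


class EchoStore(DataStore):
    """Illustrative store following the 1.8.0rc1 constructor convention."""

    def __init__(
        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
    ):
        # note the (name, schema) order flip when delegating to the base class,
        # mirroring the built-in stores above
        super().__init__(parent, name, schema, endpoint, secrets)

    def get(self, key, size=None, offset=0):
        # trivial payload instead of a real backend read
        return f"echo:{key}".encode()
```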
|