mlrun 1.7.0rc14-py3-none-any.whl → 1.7.0rc15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mlrun/__main__.py +0 -105
- mlrun/artifacts/__init__.py +1 -2
- mlrun/artifacts/base.py +8 -250
- mlrun/artifacts/dataset.py +1 -190
- mlrun/artifacts/manager.py +2 -41
- mlrun/artifacts/model.py +1 -140
- mlrun/artifacts/plots.py +1 -375
- mlrun/common/schemas/model_monitoring/__init__.py +4 -0
- mlrun/common/schemas/model_monitoring/constants.py +24 -3
- mlrun/common/schemas/model_monitoring/model_endpoints.py +13 -1
- mlrun/config.py +3 -3
- mlrun/data_types/to_pandas.py +4 -4
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore_profile.py +50 -3
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/sources.py +43 -2
- mlrun/datastore/store_resources.py +2 -6
- mlrun/datastore/targets.py +106 -39
- mlrun/db/httpdb.py +4 -4
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +4 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +2 -0
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +5 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
- mlrun/kfpops.py +5 -10
- mlrun/launcher/base.py +1 -1
- mlrun/launcher/client.py +1 -1
- mlrun/lists.py +2 -2
- mlrun/model.py +18 -9
- mlrun/model_monitoring/api.py +41 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +158 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +92 -77
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +1 -1
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +67 -4
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +1 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +2 -3
- mlrun/model_monitoring/writer.py +69 -39
- mlrun/platforms/iguazio.py +2 -2
- mlrun/projects/project.py +18 -31
- mlrun/render.py +2 -10
- mlrun/run.py +1 -3
- mlrun/runtimes/__init__.py +3 -3
- mlrun/runtimes/base.py +3 -3
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/function.py +1 -1
- mlrun/runtimes/utils.py +1 -1
- mlrun/utils/helpers.py +27 -40
- mlrun/utils/notifications/notification/slack.py +4 -2
- mlrun/utils/notifications/notification_pusher.py +133 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/METADATA +2 -2
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/RECORD +75 -71
- mlrun/runtimes/mpijob/v1alpha1.py +0 -29
- /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/top_level.txt +0 -0
mlrun/__main__.py
CHANGED
@@ -22,9 +22,6 @@ from ast import literal_eval
 from base64 import b64decode, b64encode
 from os import environ, path, remove
 from pprint import pprint
-from subprocess import Popen
-from sys import executable
-from urllib.parse import urlparse
 
 import click
 import dotenv
@@ -827,108 +824,6 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):
     )
 
 
-@main.command(deprecated=True)
-@click.option("--port", "-p", help="port to listen on", type=int)
-@click.option("--dirpath", "-d", help="database directory (dirpath)")
-@click.option("--dsn", "-s", help="database dsn, e.g. sqlite:///db/mlrun.db")
-@click.option("--logs-path", "-l", help="logs directory path")
-@click.option("--data-volume", "-v", help="path prefix to the location of artifacts")
-@click.option("--verbose", is_flag=True, help="verbose log")
-@click.option("--background", "-b", is_flag=True, help="run in background process")
-@click.option("--artifact-path", "-a", help="default artifact path")
-@click.option(
-    "--update-env",
-    default="",
-    is_flag=False,
-    flag_value=mlrun.config.default_env_file,
-    help=f"update the specified mlrun .env file (if TEXT not provided defaults to {mlrun.config.default_env_file})",
-)
-def db(
-    port,
-    dirpath,
-    dsn,
-    logs_path,
-    data_volume,
-    verbose,
-    background,
-    artifact_path,
-    update_env,
-):
-    """Run HTTP api/database server"""
-    warnings.warn(
-        "The `mlrun db` command is deprecated in 1.5.0 and will be removed in 1.7.0, it is for internal use only.",
-        FutureWarning,
-    )
-    env = environ.copy()
-    # ignore client side .env file (so import mlrun in server will not try to connect to local/remote DB)
-    env["MLRUN_IGNORE_ENV_FILE"] = "true"
-    env["MLRUN_DBPATH"] = ""
-
-    if port is not None:
-        env["MLRUN_HTTPDB__PORT"] = str(port)
-    if dirpath is not None:
-        env["MLRUN_HTTPDB__DIRPATH"] = dirpath
-    if dsn is not None:
-        if dsn.startswith("sqlite://") and "check_same_thread=" not in dsn:
-            dsn += "?check_same_thread=false"
-        env["MLRUN_HTTPDB__DSN"] = dsn
-    if logs_path is not None:
-        env["MLRUN_HTTPDB__LOGS_PATH"] = logs_path
-    if data_volume is not None:
-        env["MLRUN_HTTPDB__DATA_VOLUME"] = data_volume
-    if verbose:
-        env["MLRUN_LOG_LEVEL"] = "DEBUG"
-    if artifact_path or "MLRUN_ARTIFACT_PATH" not in env:
-        if not artifact_path:
-            artifact_path = (
-                env.get("MLRUN_HTTPDB__DATA_VOLUME", "./artifacts").rstrip("/")
-                + "/{{project}}"
-            )
-        env["MLRUN_ARTIFACT_PATH"] = path.realpath(path.expanduser(artifact_path))
-
-    env["MLRUN_IS_API_SERVER"] = "true"
-
-    # create the DB dir if needed
-    dsn = dsn or mlconf.httpdb.dsn
-    if dsn and dsn.startswith("sqlite:///"):
-        parsed = urlparse(dsn)
-        p = pathlib.Path(parsed.path[1:]).parent
-        p.mkdir(parents=True, exist_ok=True)
-
-    cmd = [executable, "-m", "server.api.main"]
-    pid = None
-    if background:
-        print("Starting MLRun API service in the background...")
-        child = Popen(
-            cmd,
-            env=env,
-            stdout=open("mlrun-stdout.log", "w"),
-            stderr=open("mlrun-stderr.log", "w"),
-            start_new_session=True,
-        )
-        pid = child.pid
-        print(
-            f"Background pid: {pid}, logs written to mlrun-stdout.log and mlrun-stderr.log, use:\n"
-            f"`kill {pid}` (linux/mac) or `taskkill /pid {pid} /t /f` (windows), to kill the mlrun service process"
-        )
-    else:
-        child = Popen(cmd, env=env)
-        returncode = child.wait()
-        if returncode != 0:
-            raise SystemExit(returncode)
-    if update_env:
-        # update mlrun client env file with the API path, so client will use the new DB
-        # update and PID, allow killing the correct process in a config script
-        filename = path.expanduser(update_env)
-        dotenv.set_key(
-            filename, "MLRUN_DBPATH", f"http://localhost:{port or 8080}", quote_mode=""
-        )
-        dotenv.set_key(filename, "MLRUN_MOCK_NUCLIO_DEPLOYMENT", "auto", quote_mode="")
-        if pid:
-            dotenv.set_key(filename, "MLRUN_SERVICE_PID", str(pid), quote_mode="")
-        print(f"Updated configuration in {update_env} .env file")
-
-
 @main.command()
 def version():
     """get mlrun version"""
mlrun/artifacts/__init__.py
CHANGED
@@ -24,7 +24,6 @@ from .manager import (
     ArtifactProducer,
     artifact_types,
     dict_to_artifact,
-    legacy_artifact_types,
 )
 from .model import ModelArtifact, get_model, update_model
-from .plots import
+from .plots import PlotArtifact, PlotlyArtifact
mlrun/artifacts/base.py
CHANGED
@@ -20,7 +20,6 @@ import warnings
 import zipfile
 
 import yaml
-from deprecated import deprecated
 
 import mlrun
 import mlrun.artifacts
@@ -720,238 +719,6 @@ class LinkArtifact(Artifact):
         self._spec = self._verify_dict(spec, "spec", LinkArtifactSpec)
 
 
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyArtifact' will be removed in 1.7.0, use 'Artifact' instead",
-    category=FutureWarning,
-)
-class LegacyArtifact(ModelObj):
-    _dict_fields = [
-        "key",
-        "kind",
-        "iter",
-        "tree",
-        "src_path",
-        "target_path",
-        "hash",
-        "description",
-        "viewer",
-        "inline",
-        "format",
-        "size",
-        "db_key",
-        "extra_data",
-        "tag",
-    ]
-    kind = ""
-    _store_prefix = StorePrefix.Artifact
-
-    def __init__(
-        self,
-        key=None,
-        body=None,
-        viewer=None,
-        is_inline=False,
-        format=None,
-        size=None,
-        target_path=None,
-    ):
-        self.key = key
-        self.project = ""
-        self.db_key = None
-        self.size = size
-        self.iter = None
-        self.tree = None
-        self.updated = None
-        self.target_path = target_path
-        self.src_path = None
-        self._body = body
-        self.format = format
-        self.description = None
-        self.viewer = viewer
-        self.encoding = None
-        self.labels = {}
-        self.annotations = None
-        self.sources = []
-        self.producer = None
-        self.hash = None
-        self._inline = is_inline
-        self.license = ""
-        self.extra_data = {}
-        self.tag = None  # temp store of the tag
-
-    def before_log(self):
-        for key, item in self.extra_data.items():
-            if hasattr(item, "target_path"):
-                self.extra_data[key] = item.target_path
-
-    def is_inline(self):
-        return self._inline
-
-    @property
-    def is_dir(self):
-        """this is a directory"""
-        return False
-
-    @property
-    def inline(self):
-        """inline data (body)"""
-        if self._inline:
-            return self.get_body()
-        return None
-
-    @inline.setter
-    def inline(self, body):
-        self._body = body
-        if body:
-            self._inline = True
-
-    @property
-    def uri(self):
-        """return artifact uri (store://..)"""
-        return self.get_store_url()
-
-    def to_dataitem(self):
-        """return a DataItem object (if available) representing the artifact content"""
-        uri = self.get_store_url()
-        if uri:
-            return mlrun.get_dataitem(uri)
-
-    def get_body(self):
-        """get the artifact body when inline"""
-        return self._body
-
-    def get_target_path(self):
-        """get the absolute target path for the artifact"""
-        return self.target_path
-
-    def get_store_url(self, with_tag=True, project=None):
-        """get the artifact uri (store://..) with optional parameters"""
-        tag = self.tree if with_tag else None
-        uri = generate_artifact_uri(
-            project or self.project, self.db_key, tag, self.iter
-        )
-        return mlrun.datastore.get_store_uri(self._store_prefix, uri)
-
-    def base_dict(self):
-        """return short dict form of the artifact"""
-        return super().to_dict()
-
-    def to_dict(self, fields: list = None, exclude: list = None, strip: bool = False):
-        """return long dict form of the artifact"""
-        return super().to_dict(
-            self._dict_fields
-            + ["updated", "labels", "annotations", "producer", "sources", "project"],
-            strip=strip,
-        )
-
-    @classmethod
-    def from_dict(cls, struct=None, fields=None):
-        fields = fields or cls._dict_fields + [
-            "updated",
-            "labels",
-            "annotations",
-            "producer",
-            "sources",
-            "project",
-        ]
-        return super().from_dict(struct, fields=fields)
-
-    def upload(self):
-        """internal, upload to target store"""
-        src_path = self.src_path
-        body = self.get_body()
-        if body:
-            self._upload_body(body)
-        else:
-            if src_path and os.path.isfile(src_path):
-                self._upload_file(src_path)
-
-    def _upload_body(self, body, target=None):
-        if mlrun.mlconf.artifacts.calculate_hash:
-            self.hash = calculate_blob_hash(body)
-        self.size = len(body)
-        mlrun.datastore.store_manager.object(url=target or self.target_path).put(body)
-
-    def _upload_file(self, src, target=None):
-        if mlrun.mlconf.artifacts.calculate_hash:
-            self.hash = calculate_local_file_hash(src)
-        self.size = os.stat(src).st_size
-        mlrun.datastore.store_manager.object(url=target or self.target_path).upload(src)
-
-    def artifact_kind(self):
-        return self.kind
-
-    def generate_target_path(self, artifact_path, producer):
-        return generate_target_path(self, artifact_path, producer)
-
-
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyDirArtifact' will be removed in 1.7.0, use 'DirArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyDirArtifact(LegacyArtifact):
-    _dict_fields = [
-        "key",
-        "kind",
-        "iter",
-        "tree",
-        "src_path",
-        "target_path",
-        "description",
-        "db_key",
-    ]
-    kind = "dir"
-
-    @property
-    def is_dir(self):
-        return True
-
-    def upload(self):
-        if not self.src_path:
-            raise ValueError("local/source path not specified")
-
-        files = os.listdir(self.src_path)
-        for f in files:
-            file_path = os.path.join(self.src_path, f)
-            if not os.path.isfile(file_path):
-                raise ValueError(f"file {file_path} not found, cant upload")
-            target = os.path.join(self.target_path, f)
-            mlrun.datastore.store_manager.object(url=target).upload(file_path)
-
-
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyLinkArtifact' will be removed in 1.7.0, use 'LinkArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyLinkArtifact(LegacyArtifact):
-    _dict_fields = LegacyArtifact._dict_fields + [
-        "link_iteration",
-        "link_key",
-        "link_tree",
-    ]
-    kind = "link"
-
-    def __init__(
-        self,
-        key=None,
-        target_path="",
-        link_iteration=None,
-        link_key=None,
-        link_tree=None,
-    ):
-        super().__init__(key)
-        self.target_path = target_path
-        self.link_iteration = link_iteration
-        self.link_key = link_key
-        self.link_tree = link_tree
-
-
 def calculate_blob_hash(data):
     if isinstance(data, str):
         data = data.encode()
@@ -1057,25 +824,16 @@ def generate_target_path(item: Artifact, artifact_path, producer):
     return f"{artifact_path}{item.key}{suffix}"
 
 
+# TODO: left to support data migration from legacy artifacts to new artifacts. Remove in 1.8.0.
 def convert_legacy_artifact_to_new_format(
-    legacy_artifact: typing.Union[LegacyArtifact, dict],
+    legacy_artifact: dict,
 ) -> Artifact:
     """Converts a legacy artifact to a new format.
-
     :param legacy_artifact: The legacy artifact to convert.
     :return: The converted artifact.
     """
-    if isinstance(legacy_artifact, LegacyArtifact):
-        legacy_artifact_dict = legacy_artifact.to_dict()
-    elif isinstance(legacy_artifact, dict):
-        legacy_artifact_dict = legacy_artifact
-    else:
-        raise TypeError(
-            f"Unsupported type '{type(legacy_artifact)}' for legacy artifact"
-        )
-
-    artifact_key = legacy_artifact_dict.get("key", "")
-    artifact_tag = legacy_artifact_dict.get("tag", "")
+    artifact_key = legacy_artifact.get("key", "")
+    artifact_tag = legacy_artifact.get("tag", "")
     if artifact_tag:
         artifact_key = f"{artifact_key}:{artifact_tag}"
     # TODO: remove in 1.8.0
@@ -1086,12 +844,12 @@ def convert_legacy_artifact_to_new_format(
     )
 
     artifact = mlrun.artifacts.artifact_types.get(
-        legacy_artifact_dict.get("kind", "artifact"), mlrun.artifacts.Artifact
+        legacy_artifact.get("kind", "artifact"), mlrun.artifacts.Artifact
     )()
 
-    artifact.metadata = artifact.metadata.from_dict(legacy_artifact_dict)
-    artifact.spec = artifact.spec.from_dict(legacy_artifact_dict)
-    artifact.status = artifact.status.from_dict(legacy_artifact_dict)
+    artifact.metadata = artifact.metadata.from_dict(legacy_artifact)
+    artifact.spec = artifact.spec.from_dict(legacy_artifact)
+    artifact.status = artifact.status.from_dict(legacy_artifact)
 
     return artifact
 
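Note: the `LegacyArtifact`, `LegacyDirArtifact`, and `LegacyLinkArtifact` classes are deleted, and `convert_legacy_artifact_to_new_format()` (kept only for data migration, slated for removal in 1.8.0) now accepts a `dict` only; passing a `LegacyArtifact` instance, previously handled by the removed `isinstance` branch, is no longer possible. A minimal usage sketch with illustrative field values:

    from mlrun.artifacts.base import convert_legacy_artifact_to_new_format

    # a legacy (flat, pre-1.3.0 format) artifact dict
    legacy = {
        "key": "my-model",
        "kind": "artifact",
        "tag": "v1",
        "target_path": "s3://bucket/path/my-model",
    }
    artifact = convert_legacy_artifact_to_new_format(legacy)
    print(artifact.metadata.key, artifact.spec.target_path)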
mlrun/artifacts/dataset.py
CHANGED
@@ -18,7 +18,6 @@ from typing import Optional
 
 import numpy as np
 import pandas as pd
-from deprecated import deprecated
 from pandas.io.json import build_table_schema
 
 import mlrun
@@ -27,7 +26,7 @@ import mlrun.datastore
 import mlrun.utils.helpers
 from mlrun.config import config as mlconf
 
-from .base import Artifact, ArtifactSpec, LegacyArtifact, StorePrefix
+from .base import Artifact, ArtifactSpec, StorePrefix
 
 default_preview_rows_length = 20
 max_preview_columns = mlconf.artifacts.datasets.max_preview_columns
@@ -360,194 +359,6 @@ class DatasetArtifact(Artifact):
         self.status.stats = stats
 
 
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyTableArtifact' will be removed in 1.7.0, use 'TableArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyTableArtifact(LegacyArtifact):
-    _dict_fields = LegacyArtifact._dict_fields + ["schema", "header"]
-    kind = "table"
-
-    def __init__(
-        self,
-        key=None,
-        body=None,
-        df=None,
-        viewer=None,
-        visible=False,
-        inline=False,
-        format=None,
-        header=None,
-        schema=None,
-    ):
-        if key:
-            key_suffix = pathlib.Path(key).suffix
-            if not format and key_suffix:
-                format = key_suffix[1:]
-        super().__init__(key, body, viewer=viewer, is_inline=inline, format=format)
-
-        if df is not None:
-            self._is_df = True
-            self.header = df.reset_index(drop=True).columns.values.tolist()
-            self.format = "csv"  # todo other formats
-            # if visible and not key_suffix:
-            #     key += '.csv'
-            self._body = df
-        else:
-            self._is_df = False
-            self.header = header
-
-        self.schema = schema
-        if not viewer:
-            viewer = "table" if visible else None
-        self.viewer = viewer
-
-    def get_body(self):
-        if not self._is_df:
-            return self._body
-        csv_buffer = StringIO()
-        self._body.to_csv(
-            csv_buffer,
-            encoding="utf-8",
-            **mlrun.utils.line_terminator_kwargs(),
-        )
-        return csv_buffer.getvalue()
-
-
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyDatasetArtifact' will be removed in 1.7.0, use 'DatasetArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyDatasetArtifact(LegacyArtifact):
-    # List of all the supported saving formats of a DataFrame:
-    SUPPORTED_FORMATS = ["csv", "parquet", "pq", "tsdb", "kv"]
-
-    _dict_fields = LegacyArtifact._dict_fields + [
-        "schema",
-        "header",
-        "length",
-        "preview",
-        "stats",
-        "extra_data",
-        "column_metadata",
-    ]
-    kind = "dataset"
-
-    def __init__(
-        self,
-        key: str = None,
-        df=None,
-        preview: int = None,
-        format: str = "",  # TODO: should be changed to 'fmt'.
-        stats: bool = None,
-        target_path: str = None,
-        extra_data: dict = None,
-        column_metadata: dict = None,
-        ignore_preview_limits: bool = False,
-        **kwargs,
-    ):
-        format = (format or "").lower()
-        super().__init__(key, None, format=format, target_path=target_path)
-        if format and format not in self.SUPPORTED_FORMATS:
-            raise ValueError(
-                f"unsupported format {format} use one of {'|'.join(self.SUPPORTED_FORMATS)}"
-            )
-
-        if format == "pq":
-            format = "parquet"
-        self.format = format
-        self.stats = None
-        self.extra_data = extra_data or {}
-        self.column_metadata = column_metadata or {}
-
-        if df is not None:
-            if hasattr(df, "dask"):
-                # If df is a Dask DataFrame, and it's small in-memory, convert to Pandas
-                if (df.memory_usage(deep=True).sum().compute() / 1e9) < max_ddf_size:
-                    df = df.compute()
-            self.update_preview_fields_from_df(
-                self, df, stats, preview, ignore_preview_limits
-            )
-
-        self._df = df
-        self._kw = kwargs
-
-    def upload(self):
-        suffix = pathlib.Path(self.target_path).suffix
-        format = self.format
-        if not format:
-            if suffix and suffix in [".csv", ".parquet", ".pq"]:
-                format = "csv" if suffix == ".csv" else "parquet"
-            else:
-                format = "parquet"
-            if not suffix and not self.target_path.startswith("memory://"):
-                self.target_path = self.target_path + "." + format
-
-        self.size, self.hash = upload_dataframe(
-            self._df,
-            self.target_path,
-            format=format,
-            src_path=self.src_path,
-            **self._kw,
-        )
-
-    @property
-    def df(self) -> pd.DataFrame:
-        """
-        Get the dataset in this artifact.
-
-        :return: The dataset as a DataFrame.
-        """
-        return self._df
-
-    @staticmethod
-    def is_format_supported(fmt: str) -> bool:
-        """
-        Check whether the given dataset format is supported by the DatasetArtifact.
-
-        :param fmt: The format string to check.
-
-        :return: True if the format is supported and False if not.
-        """
-        return fmt in DatasetArtifact.SUPPORTED_FORMATS
-
-    @staticmethod
-    def update_preview_fields_from_df(
-        artifact, df, stats=None, preview_rows_length=None, ignore_preview_limits=False
-    ):
-        preview_rows_length = preview_rows_length or default_preview_rows_length
-        if hasattr(df, "dask"):
-            artifact.length = df.shape[0].compute()
-            preview_df = df.sample(frac=ddf_sample_pct).compute()
-        else:
-            artifact.length = df.shape[0]
-            preview_df = df
-
-        if artifact.length > preview_rows_length and not ignore_preview_limits:
-            preview_df = df.head(preview_rows_length)
-
-        preview_df = preview_df.reset_index()
-        if len(preview_df.columns) > max_preview_columns and not ignore_preview_limits:
-            preview_df = preview_df.iloc[:, :max_preview_columns]
-        artifact.header = preview_df.columns.values.tolist()
-        artifact.preview = preview_df.values.tolist()
-        # Table schema parsing doesn't require a column named "index"
-        # to align its output with previously generated header and preview data
-        if "index" in preview_df.columns:
-            preview_df.drop("index", axis=1, inplace=True)
-        artifact.schema = build_table_schema(preview_df)
-        if (
-            stats
-            or (artifact.length < max_csv and len(df.columns) < max_preview_columns)
-            or ignore_preview_limits
-        ):
-            artifact.stats = get_df_stats(df)
-
-
 def get_df_stats(df):
     if hasattr(df, "dask"):
         df = df.sample(frac=ddf_sample_pct).compute()
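Note: `LegacyTableArtifact` and `LegacyDatasetArtifact` are deleted; `DatasetArtifact` (the class shown in context above) is the replacement, and it is normally produced by logging a dataset rather than constructed directly. A minimal sketch of the non-legacy flow (method and parameter names per the current API; project name and values illustrative):

    import pandas as pd
    import mlrun

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    project = mlrun.get_or_create_project("demo", context="./")
    # logs the DataFrame and returns a DatasetArtifact with preview/stats fields
    dataset = project.log_dataset("my-dataset", df=df, format="parquet")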
|