mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc15__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (76)
  1. mlrun/__main__.py +0 -105
  2. mlrun/artifacts/__init__.py +1 -2
  3. mlrun/artifacts/base.py +8 -250
  4. mlrun/artifacts/dataset.py +1 -190
  5. mlrun/artifacts/manager.py +2 -41
  6. mlrun/artifacts/model.py +1 -140
  7. mlrun/artifacts/plots.py +1 -375
  8. mlrun/common/schemas/model_monitoring/__init__.py +4 -0
  9. mlrun/common/schemas/model_monitoring/constants.py +24 -3
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +13 -1
  11. mlrun/config.py +3 -3
  12. mlrun/data_types/to_pandas.py +4 -4
  13. mlrun/datastore/base.py +41 -9
  14. mlrun/datastore/datastore_profile.py +50 -3
  15. mlrun/datastore/inmem.py +2 -2
  16. mlrun/datastore/sources.py +43 -2
  17. mlrun/datastore/store_resources.py +2 -6
  18. mlrun/datastore/targets.py +106 -39
  19. mlrun/db/httpdb.py +4 -4
  20. mlrun/feature_store/__init__.py +0 -2
  21. mlrun/feature_store/api.py +12 -47
  22. mlrun/feature_store/feature_set.py +9 -0
  23. mlrun/feature_store/retrieval/base.py +9 -4
  24. mlrun/feature_store/retrieval/conversion.py +4 -4
  25. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  26. mlrun/feature_store/retrieval/job.py +2 -0
  27. mlrun/feature_store/retrieval/local_merger.py +2 -0
  28. mlrun/feature_store/retrieval/spark_merger.py +5 -0
  29. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
  30. mlrun/kfpops.py +5 -10
  31. mlrun/launcher/base.py +1 -1
  32. mlrun/launcher/client.py +1 -1
  33. mlrun/lists.py +2 -2
  34. mlrun/model.py +18 -9
  35. mlrun/model_monitoring/api.py +41 -18
  36. mlrun/model_monitoring/application.py +5 -305
  37. mlrun/model_monitoring/applications/__init__.py +11 -0
  38. mlrun/model_monitoring/applications/_application_steps.py +158 -0
  39. mlrun/model_monitoring/applications/base.py +282 -0
  40. mlrun/model_monitoring/applications/context.py +214 -0
  41. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  42. mlrun/model_monitoring/applications/histogram_data_drift.py +92 -77
  43. mlrun/model_monitoring/applications/results.py +99 -0
  44. mlrun/model_monitoring/controller.py +3 -1
  45. mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
  46. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +1 -1
  47. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +67 -4
  48. mlrun/model_monitoring/evidently_application.py +6 -118
  49. mlrun/model_monitoring/helpers.py +1 -1
  50. mlrun/model_monitoring/model_endpoint.py +3 -2
  51. mlrun/model_monitoring/stream_processing.py +2 -3
  52. mlrun/model_monitoring/writer.py +69 -39
  53. mlrun/platforms/iguazio.py +2 -2
  54. mlrun/projects/project.py +18 -31
  55. mlrun/render.py +2 -10
  56. mlrun/run.py +1 -3
  57. mlrun/runtimes/__init__.py +3 -3
  58. mlrun/runtimes/base.py +3 -3
  59. mlrun/runtimes/funcdoc.py +0 -28
  60. mlrun/runtimes/local.py +1 -1
  61. mlrun/runtimes/mpijob/__init__.py +0 -20
  62. mlrun/runtimes/mpijob/v1.py +1 -1
  63. mlrun/runtimes/nuclio/function.py +1 -1
  64. mlrun/runtimes/utils.py +1 -1
  65. mlrun/utils/helpers.py +27 -40
  66. mlrun/utils/notifications/notification/slack.py +4 -2
  67. mlrun/utils/notifications/notification_pusher.py +133 -14
  68. mlrun/utils/version/version.json +2 -2
  69. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/METADATA +2 -2
  70. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/RECORD +75 -71
  71. mlrun/runtimes/mpijob/v1alpha1.py +0 -29
  72. /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
  73. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/LICENSE +0 -0
  74. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/WHEEL +0 -0
  75. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/entry_points.txt +0 -0
  76. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc15.dist-info}/top_level.txt +0 -0
mlrun/__main__.py CHANGED
@@ -22,9 +22,6 @@ from ast import literal_eval
 from base64 import b64decode, b64encode
 from os import environ, path, remove
 from pprint import pprint
-from subprocess import Popen
-from sys import executable
-from urllib.parse import urlparse
 
 import click
 import dotenv
@@ -827,108 +824,6 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):
     )
 
 
-@main.command(deprecated=True)
-@click.option("--port", "-p", help="port to listen on", type=int)
-@click.option("--dirpath", "-d", help="database directory (dirpath)")
-@click.option("--dsn", "-s", help="database dsn, e.g. sqlite:///db/mlrun.db")
-@click.option("--logs-path", "-l", help="logs directory path")
-@click.option("--data-volume", "-v", help="path prefix to the location of artifacts")
-@click.option("--verbose", is_flag=True, help="verbose log")
-@click.option("--background", "-b", is_flag=True, help="run in background process")
-@click.option("--artifact-path", "-a", help="default artifact path")
-@click.option(
-    "--update-env",
-    default="",
-    is_flag=False,
-    flag_value=mlrun.config.default_env_file,
-    help=f"update the specified mlrun .env file (if TEXT not provided defaults to {mlrun.config.default_env_file})",
-)
-def db(
-    port,
-    dirpath,
-    dsn,
-    logs_path,
-    data_volume,
-    verbose,
-    background,
-    artifact_path,
-    update_env,
-):
-    """Run HTTP api/database server"""
-    warnings.warn(
-        "The `mlrun db` command is deprecated in 1.5.0 and will be removed in 1.7.0, it is for internal use only.",
-        FutureWarning,
-    )
-    env = environ.copy()
-    # ignore client side .env file (so import mlrun in server will not try to connect to local/remote DB)
-    env["MLRUN_IGNORE_ENV_FILE"] = "true"
-    env["MLRUN_DBPATH"] = ""
-
-    if port is not None:
-        env["MLRUN_HTTPDB__PORT"] = str(port)
-    if dirpath is not None:
-        env["MLRUN_HTTPDB__DIRPATH"] = dirpath
-    if dsn is not None:
-        if dsn.startswith("sqlite://") and "check_same_thread=" not in dsn:
-            dsn += "?check_same_thread=false"
-        env["MLRUN_HTTPDB__DSN"] = dsn
-    if logs_path is not None:
-        env["MLRUN_HTTPDB__LOGS_PATH"] = logs_path
-    if data_volume is not None:
-        env["MLRUN_HTTPDB__DATA_VOLUME"] = data_volume
-    if verbose:
-        env["MLRUN_LOG_LEVEL"] = "DEBUG"
-    if artifact_path or "MLRUN_ARTIFACT_PATH" not in env:
-        if not artifact_path:
-            artifact_path = (
-                env.get("MLRUN_HTTPDB__DATA_VOLUME", "./artifacts").rstrip("/")
-                + "/{{project}}"
-            )
-        env["MLRUN_ARTIFACT_PATH"] = path.realpath(path.expanduser(artifact_path))
-
-    env["MLRUN_IS_API_SERVER"] = "true"
-
-    # create the DB dir if needed
-    dsn = dsn or mlconf.httpdb.dsn
-    if dsn and dsn.startswith("sqlite:///"):
-        parsed = urlparse(dsn)
-        p = pathlib.Path(parsed.path[1:]).parent
-        p.mkdir(parents=True, exist_ok=True)
-
-    cmd = [executable, "-m", "server.api.main"]
-    pid = None
-    if background:
-        print("Starting MLRun API service in the background...")
-        child = Popen(
-            cmd,
-            env=env,
-            stdout=open("mlrun-stdout.log", "w"),
-            stderr=open("mlrun-stderr.log", "w"),
-            start_new_session=True,
-        )
-        pid = child.pid
-        print(
-            f"Background pid: {pid}, logs written to mlrun-stdout.log and mlrun-stderr.log, use:\n"
-            f"`kill {pid}` (linux/mac) or `taskkill /pid {pid} /t /f` (windows), to kill the mlrun service process"
-        )
-    else:
-        child = Popen(cmd, env=env)
-        returncode = child.wait()
-        if returncode != 0:
-            raise SystemExit(returncode)
-    if update_env:
-        # update mlrun client env file with the API path, so client will use the new DB
-        # update and PID, allow killing the correct process in a config script
-        filename = path.expanduser(update_env)
-        dotenv.set_key(
-            filename, "MLRUN_DBPATH", f"http://localhost:{port or 8080}", quote_mode=""
-        )
-        dotenv.set_key(filename, "MLRUN_MOCK_NUCLIO_DEPLOYMENT", "auto", quote_mode="")
-        if pid:
-            dotenv.set_key(filename, "MLRUN_SERVICE_PID", str(pid), quote_mode="")
-        print(f"Updated configuration in {update_env} .env file")
-
-
 @main.command()
 def version():
     """get mlrun version"""
mlrun/artifacts/__init__.py CHANGED
@@ -24,7 +24,6 @@ from .manager import (
     ArtifactProducer,
     artifact_types,
     dict_to_artifact,
-    legacy_artifact_types,
 )
 from .model import ModelArtifact, get_model, update_model
-from .plots import BokehArtifact, ChartArtifact, PlotArtifact, PlotlyArtifact
+from .plots import PlotArtifact, PlotlyArtifact
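
Upgrade note: the legacy plot classes are no longer exported, so imports of the removed names fail at import time. A quick before/after sketch based on the diff above:

# still available in 1.7.0rc15
from mlrun.artifacts import PlotArtifact, PlotlyArtifact

# removed in 1.7.0rc15 -- these now raise ImportError
# from mlrun.artifacts import BokehArtifact, ChartArtifact
# from mlrun.artifacts.manager import legacy_artifact_types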
mlrun/artifacts/base.py CHANGED
@@ -20,7 +20,6 @@ import warnings
 import zipfile
 
 import yaml
-from deprecated import deprecated
 
 import mlrun
 import mlrun.artifacts
@@ -720,238 +719,6 @@ class LinkArtifact(Artifact):
         self._spec = self._verify_dict(spec, "spec", LinkArtifactSpec)
 
 
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyArtifact' will be removed in 1.7.0, use 'Artifact' instead",
-    category=FutureWarning,
-)
-class LegacyArtifact(ModelObj):
-    _dict_fields = [
-        "key",
-        "kind",
-        "iter",
-        "tree",
-        "src_path",
-        "target_path",
-        "hash",
-        "description",
-        "viewer",
-        "inline",
-        "format",
-        "size",
-        "db_key",
-        "extra_data",
-        "tag",
-    ]
-    kind = ""
-    _store_prefix = StorePrefix.Artifact
-
-    def __init__(
-        self,
-        key=None,
-        body=None,
-        viewer=None,
-        is_inline=False,
-        format=None,
-        size=None,
-        target_path=None,
-    ):
-        self.key = key
-        self.project = ""
-        self.db_key = None
-        self.size = size
-        self.iter = None
-        self.tree = None
-        self.updated = None
-        self.target_path = target_path
-        self.src_path = None
-        self._body = body
-        self.format = format
-        self.description = None
-        self.viewer = viewer
-        self.encoding = None
-        self.labels = {}
-        self.annotations = None
-        self.sources = []
-        self.producer = None
-        self.hash = None
-        self._inline = is_inline
-        self.license = ""
-        self.extra_data = {}
-        self.tag = None  # temp store of the tag
-
-    def before_log(self):
-        for key, item in self.extra_data.items():
-            if hasattr(item, "target_path"):
-                self.extra_data[key] = item.target_path
-
-    def is_inline(self):
-        return self._inline
-
-    @property
-    def is_dir(self):
-        """this is a directory"""
-        return False
-
-    @property
-    def inline(self):
-        """inline data (body)"""
-        if self._inline:
-            return self.get_body()
-        return None
-
-    @inline.setter
-    def inline(self, body):
-        self._body = body
-        if body:
-            self._inline = True
-
-    @property
-    def uri(self):
-        """return artifact uri (store://..)"""
-        return self.get_store_url()
-
-    def to_dataitem(self):
-        """return a DataItem object (if available) representing the artifact content"""
-        uri = self.get_store_url()
-        if uri:
-            return mlrun.get_dataitem(uri)
-
-    def get_body(self):
-        """get the artifact body when inline"""
-        return self._body
-
-    def get_target_path(self):
-        """get the absolute target path for the artifact"""
-        return self.target_path
-
-    def get_store_url(self, with_tag=True, project=None):
-        """get the artifact uri (store://..) with optional parameters"""
-        tag = self.tree if with_tag else None
-        uri = generate_artifact_uri(
-            project or self.project, self.db_key, tag, self.iter
-        )
-        return mlrun.datastore.get_store_uri(self._store_prefix, uri)
-
-    def base_dict(self):
-        """return short dict form of the artifact"""
-        return super().to_dict()
-
-    def to_dict(self, fields: list = None, exclude: list = None, strip: bool = False):
-        """return long dict form of the artifact"""
-        return super().to_dict(
-            self._dict_fields
-            + ["updated", "labels", "annotations", "producer", "sources", "project"],
-            strip=strip,
-        )
-
-    @classmethod
-    def from_dict(cls, struct=None, fields=None):
-        fields = fields or cls._dict_fields + [
-            "updated",
-            "labels",
-            "annotations",
-            "producer",
-            "sources",
-            "project",
-        ]
-        return super().from_dict(struct, fields=fields)
-
-    def upload(self):
-        """internal, upload to target store"""
-        src_path = self.src_path
-        body = self.get_body()
-        if body:
-            self._upload_body(body)
-        else:
-            if src_path and os.path.isfile(src_path):
-                self._upload_file(src_path)
-
-    def _upload_body(self, body, target=None):
-        if mlrun.mlconf.artifacts.calculate_hash:
-            self.hash = calculate_blob_hash(body)
-        self.size = len(body)
-        mlrun.datastore.store_manager.object(url=target or self.target_path).put(body)
-
-    def _upload_file(self, src, target=None):
-        if mlrun.mlconf.artifacts.calculate_hash:
-            self.hash = calculate_local_file_hash(src)
-        self.size = os.stat(src).st_size
-        mlrun.datastore.store_manager.object(url=target or self.target_path).upload(src)
-
-    def artifact_kind(self):
-        return self.kind
-
-    def generate_target_path(self, artifact_path, producer):
-        return generate_target_path(self, artifact_path, producer)
-
-
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyDirArtifact' will be removed in 1.7.0, use 'DirArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyDirArtifact(LegacyArtifact):
-    _dict_fields = [
-        "key",
-        "kind",
-        "iter",
-        "tree",
-        "src_path",
-        "target_path",
-        "description",
-        "db_key",
-    ]
-    kind = "dir"
-
-    @property
-    def is_dir(self):
-        return True
-
-    def upload(self):
-        if not self.src_path:
-            raise ValueError("local/source path not specified")
-
-        files = os.listdir(self.src_path)
-        for f in files:
-            file_path = os.path.join(self.src_path, f)
-            if not os.path.isfile(file_path):
-                raise ValueError(f"file {file_path} not found, cant upload")
-            target = os.path.join(self.target_path, f)
-            mlrun.datastore.store_manager.object(url=target).upload(file_path)
-
-
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyLinkArtifact' will be removed in 1.7.0, use 'LinkArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyLinkArtifact(LegacyArtifact):
-    _dict_fields = LegacyArtifact._dict_fields + [
-        "link_iteration",
-        "link_key",
-        "link_tree",
-    ]
-    kind = "link"
-
-    def __init__(
-        self,
-        key=None,
-        target_path="",
-        link_iteration=None,
-        link_key=None,
-        link_tree=None,
-    ):
-        super().__init__(key)
-        self.target_path = target_path
-        self.link_iteration = link_iteration
-        self.link_key = link_key
-        self.link_tree = link_tree
-
-
 def calculate_blob_hash(data):
     if isinstance(data, str):
         data = data.encode()
@@ -1057,25 +824,16 @@ def generate_target_path(item: Artifact, artifact_path, producer):
     return f"{artifact_path}{item.key}{suffix}"
 
 
+# TODO: left to support data migration from legacy artifacts to new artifacts. Remove in 1.8.0.
 def convert_legacy_artifact_to_new_format(
-    legacy_artifact: typing.Union[LegacyArtifact, dict],
+    legacy_artifact: dict,
 ) -> Artifact:
     """Converts a legacy artifact to a new format.
-
     :param legacy_artifact: The legacy artifact to convert.
     :return: The converted artifact.
     """
-    if isinstance(legacy_artifact, LegacyArtifact):
-        legacy_artifact_dict = legacy_artifact.to_dict()
-    elif isinstance(legacy_artifact, dict):
-        legacy_artifact_dict = legacy_artifact
-    else:
-        raise TypeError(
-            f"Unsupported type '{type(legacy_artifact)}' for legacy artifact"
-        )
-
-    artifact_key = legacy_artifact_dict.get("key", "")
-    artifact_tag = legacy_artifact_dict.get("tag", "")
+    artifact_key = legacy_artifact.get("key", "")
+    artifact_tag = legacy_artifact.get("tag", "")
     if artifact_tag:
         artifact_key = f"{artifact_key}:{artifact_tag}"
     # TODO: remove in 1.8.0
@@ -1086,12 +844,12 @@ def convert_legacy_artifact_to_new_format(
     )
 
     artifact = mlrun.artifacts.artifact_types.get(
-        legacy_artifact_dict.get("kind", "artifact"), mlrun.artifacts.Artifact
+        legacy_artifact.get("kind", "artifact"), mlrun.artifacts.Artifact
     )()
 
-    artifact.metadata = artifact.metadata.from_dict(legacy_artifact_dict)
-    artifact.spec = artifact.spec.from_dict(legacy_artifact_dict)
-    artifact.status = artifact.status.from_dict(legacy_artifact_dict)
+    artifact.metadata = artifact.metadata.from_dict(legacy_artifact)
+    artifact.spec = artifact.spec.from_dict(legacy_artifact)
+    artifact.status = artifact.status.from_dict(legacy_artifact)
 
     return artifact
 
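With the LegacyArtifact hierarchy deleted, `convert_legacy_artifact_to_new_format` now accepts only a plain dict and is kept solely for data migration (slated for removal in 1.8.0). A minimal usage sketch; the dict below is a hypothetical legacy-format record, real ones carry more fields:

from mlrun.artifacts.base import convert_legacy_artifact_to_new_format

legacy = {"key": "my-model", "kind": "model", "tag": "v1"}  # hypothetical legacy dict
artifact = convert_legacy_artifact_to_new_format(legacy)
# the artifact class is resolved from the "kind" field via mlrun.artifacts.artifact_types
print(type(artifact).__name__)
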
mlrun/artifacts/dataset.py CHANGED
@@ -18,7 +18,6 @@ from typing import Optional
 
 import numpy as np
 import pandas as pd
-from deprecated import deprecated
 from pandas.io.json import build_table_schema
 
 import mlrun
@@ -27,7 +26,7 @@ import mlrun.datastore
 import mlrun.utils.helpers
 from mlrun.config import config as mlconf
 
-from .base import Artifact, ArtifactSpec, LegacyArtifact, StorePrefix
+from .base import Artifact, ArtifactSpec, StorePrefix
 
 default_preview_rows_length = 20
 max_preview_columns = mlconf.artifacts.datasets.max_preview_columns
@@ -360,194 +359,6 @@ class DatasetArtifact(Artifact):
         self.status.stats = stats
 
 
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyTableArtifact' will be removed in 1.7.0, use 'TableArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyTableArtifact(LegacyArtifact):
-    _dict_fields = LegacyArtifact._dict_fields + ["schema", "header"]
-    kind = "table"
-
-    def __init__(
-        self,
-        key=None,
-        body=None,
-        df=None,
-        viewer=None,
-        visible=False,
-        inline=False,
-        format=None,
-        header=None,
-        schema=None,
-    ):
-        if key:
-            key_suffix = pathlib.Path(key).suffix
-            if not format and key_suffix:
-                format = key_suffix[1:]
-        super().__init__(key, body, viewer=viewer, is_inline=inline, format=format)
-
-        if df is not None:
-            self._is_df = True
-            self.header = df.reset_index(drop=True).columns.values.tolist()
-            self.format = "csv"  # todo other formats
-            # if visible and not key_suffix:
-            #     key += '.csv'
-            self._body = df
-        else:
-            self._is_df = False
-            self.header = header
-
-        self.schema = schema
-        if not viewer:
-            viewer = "table" if visible else None
-        self.viewer = viewer
-
-    def get_body(self):
-        if not self._is_df:
-            return self._body
-        csv_buffer = StringIO()
-        self._body.to_csv(
-            csv_buffer,
-            encoding="utf-8",
-            **mlrun.utils.line_terminator_kwargs(),
-        )
-        return csv_buffer.getvalue()
-
-
-# TODO: remove in 1.7.0
-@deprecated(
-    version="1.3.0",
-    reason="'LegacyDatasetArtifact' will be removed in 1.7.0, use 'DatasetArtifact' instead",
-    category=FutureWarning,
-)
-class LegacyDatasetArtifact(LegacyArtifact):
-    # List of all the supported saving formats of a DataFrame:
-    SUPPORTED_FORMATS = ["csv", "parquet", "pq", "tsdb", "kv"]
-
-    _dict_fields = LegacyArtifact._dict_fields + [
-        "schema",
-        "header",
-        "length",
-        "preview",
-        "stats",
-        "extra_data",
-        "column_metadata",
-    ]
-    kind = "dataset"
-
-    def __init__(
-        self,
-        key: str = None,
-        df=None,
-        preview: int = None,
-        format: str = "",  # TODO: should be changed to 'fmt'.
-        stats: bool = None,
-        target_path: str = None,
-        extra_data: dict = None,
-        column_metadata: dict = None,
-        ignore_preview_limits: bool = False,
-        **kwargs,
-    ):
-        format = (format or "").lower()
-        super().__init__(key, None, format=format, target_path=target_path)
-        if format and format not in self.SUPPORTED_FORMATS:
-            raise ValueError(
-                f"unsupported format {format} use one of {'|'.join(self.SUPPORTED_FORMATS)}"
-            )
-
-        if format == "pq":
-            format = "parquet"
-        self.format = format
-        self.stats = None
-        self.extra_data = extra_data or {}
-        self.column_metadata = column_metadata or {}
-
-        if df is not None:
-            if hasattr(df, "dask"):
-                # If df is a Dask DataFrame, and it's small in-memory, convert to Pandas
-                if (df.memory_usage(deep=True).sum().compute() / 1e9) < max_ddf_size:
-                    df = df.compute()
-            self.update_preview_fields_from_df(
-                self, df, stats, preview, ignore_preview_limits
-            )
-
-        self._df = df
-        self._kw = kwargs
-
-    def upload(self):
-        suffix = pathlib.Path(self.target_path).suffix
-        format = self.format
-        if not format:
-            if suffix and suffix in [".csv", ".parquet", ".pq"]:
-                format = "csv" if suffix == ".csv" else "parquet"
-            else:
-                format = "parquet"
-        if not suffix and not self.target_path.startswith("memory://"):
-            self.target_path = self.target_path + "." + format
-
-        self.size, self.hash = upload_dataframe(
-            self._df,
-            self.target_path,
-            format=format,
-            src_path=self.src_path,
-            **self._kw,
-        )
-
-    @property
-    def df(self) -> pd.DataFrame:
-        """
-        Get the dataset in this artifact.
-
-        :return: The dataset as a DataFrame.
-        """
-        return self._df
-
-    @staticmethod
-    def is_format_supported(fmt: str) -> bool:
-        """
-        Check whether the given dataset format is supported by the DatasetArtifact.
-
-        :param fmt: The format string to check.
-
-        :return: True if the format is supported and False if not.
-        """
-        return fmt in DatasetArtifact.SUPPORTED_FORMATS
-
-    @staticmethod
-    def update_preview_fields_from_df(
-        artifact, df, stats=None, preview_rows_length=None, ignore_preview_limits=False
-    ):
-        preview_rows_length = preview_rows_length or default_preview_rows_length
-        if hasattr(df, "dask"):
-            artifact.length = df.shape[0].compute()
-            preview_df = df.sample(frac=ddf_sample_pct).compute()
-        else:
-            artifact.length = df.shape[0]
-            preview_df = df
-
-        if artifact.length > preview_rows_length and not ignore_preview_limits:
-            preview_df = df.head(preview_rows_length)
-
-        preview_df = preview_df.reset_index()
-        if len(preview_df.columns) > max_preview_columns and not ignore_preview_limits:
-            preview_df = preview_df.iloc[:, :max_preview_columns]
-        artifact.header = preview_df.columns.values.tolist()
-        artifact.preview = preview_df.values.tolist()
-        # Table schema parsing doesn't require a column named "index"
-        # to align its output with previously generated header and preview data
-        if "index" in preview_df.columns:
-            preview_df.drop("index", axis=1, inplace=True)
-        artifact.schema = build_table_schema(preview_df)
-        if (
-            stats
-            or (artifact.length < max_csv and len(df.columns) < max_preview_columns)
-            or ignore_preview_limits
-        ):
-            artifact.stats = get_df_stats(df)
-
-
 def get_df_stats(df):
     if hasattr(df, "dask"):
         df = df.sample(frac=ddf_sample_pct).compute()
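
LegacyTableArtifact and LegacyDatasetArtifact are likewise gone, leaving DatasetArtifact as the remaining path for tabular artifacts. A minimal sketch (the key and format are illustrative):

import pandas as pd
from mlrun.artifacts.dataset import DatasetArtifact

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
# preview/length/stats fields are derived from the DataFrame on construction
artifact = DatasetArtifact(key="my-dataset", df=df, format="csv")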