PyPI - mlrun - Versions diffs - 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl - Mend

mlrun 1.7.2rc4-py3-none-any.whl → 1.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. Click here for more details.

Files changed (275)

mlrun/__init__.py +26 -22
mlrun/__main__.py +15 -16
mlrun/alerts/alert.py +150 -15
mlrun/api/schemas/__init__.py +1 -9
mlrun/artifacts/__init__.py +2 -3
mlrun/artifacts/base.py +62 -19
mlrun/artifacts/dataset.py +17 -17
mlrun/artifacts/document.py +454 -0
mlrun/artifacts/manager.py +28 -18
mlrun/artifacts/model.py +91 -59
mlrun/artifacts/plots.py +2 -2
mlrun/common/constants.py +8 -0
mlrun/common/formatters/__init__.py +1 -0
mlrun/common/formatters/artifact.py +1 -1
mlrun/common/formatters/feature_set.py +2 -0
mlrun/common/formatters/function.py +1 -0
mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
mlrun/common/formatters/pipeline.py +1 -2
mlrun/common/formatters/project.py +9 -0
mlrun/common/model_monitoring/__init__.py +0 -5
mlrun/common/model_monitoring/helpers.py +12 -62
mlrun/common/runtimes/constants.py +25 -4
mlrun/common/schemas/__init__.py +9 -5
mlrun/common/schemas/alert.py +114 -19
mlrun/common/schemas/api_gateway.py +3 -3
mlrun/common/schemas/artifact.py +22 -9
mlrun/common/schemas/auth.py +8 -4
mlrun/common/schemas/background_task.py +7 -7
mlrun/common/schemas/client_spec.py +4 -4
mlrun/common/schemas/clusterization_spec.py +2 -2
mlrun/common/schemas/common.py +53 -3
mlrun/common/schemas/constants.py +15 -0
mlrun/common/schemas/datastore_profile.py +1 -1
mlrun/common/schemas/feature_store.py +9 -9
mlrun/common/schemas/frontend_spec.py +4 -4
mlrun/common/schemas/function.py +10 -10
mlrun/common/schemas/hub.py +1 -1
mlrun/common/schemas/k8s.py +3 -3
mlrun/common/schemas/memory_reports.py +3 -3
mlrun/common/schemas/model_monitoring/__init__.py +4 -8
mlrun/common/schemas/model_monitoring/constants.py +127 -46
mlrun/common/schemas/model_monitoring/grafana.py +18 -12
mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
mlrun/common/schemas/notification.py +24 -3
mlrun/common/schemas/object.py +1 -1
mlrun/common/schemas/pagination.py +4 -4
mlrun/common/schemas/partition.py +142 -0
mlrun/common/schemas/pipeline.py +3 -3
mlrun/common/schemas/project.py +26 -18
mlrun/common/schemas/runs.py +3 -3
mlrun/common/schemas/runtime_resource.py +5 -5
mlrun/common/schemas/schedule.py +1 -1
mlrun/common/schemas/secret.py +1 -1
mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
mlrun/common/schemas/tag.py +3 -3
mlrun/common/schemas/workflow.py +6 -5
mlrun/common/types.py +1 -0
mlrun/config.py +157 -89
mlrun/data_types/__init__.py +5 -3
mlrun/data_types/infer.py +13 -3
mlrun/data_types/spark.py +2 -1
mlrun/datastore/__init__.py +59 -18
mlrun/datastore/alibaba_oss.py +4 -1
mlrun/datastore/azure_blob.py +4 -1
mlrun/datastore/base.py +19 -24
mlrun/datastore/datastore.py +10 -4
mlrun/datastore/datastore_profile.py +178 -45
mlrun/datastore/dbfs_store.py +4 -1
mlrun/datastore/filestore.py +4 -1
mlrun/datastore/google_cloud_storage.py +4 -1
mlrun/datastore/hdfs.py +4 -1
mlrun/datastore/inmem.py +4 -1
mlrun/datastore/redis.py +4 -1
mlrun/datastore/s3.py +14 -3
mlrun/datastore/sources.py +89 -92
mlrun/datastore/store_resources.py +7 -4
mlrun/datastore/storeytargets.py +51 -16
mlrun/datastore/targets.py +38 -31
mlrun/datastore/utils.py +87 -4
mlrun/datastore/v3io.py +4 -1
mlrun/datastore/vectorstore.py +291 -0
mlrun/datastore/wasbfs/fs.py +13 -12
mlrun/db/base.py +286 -100
mlrun/db/httpdb.py +1562 -490
mlrun/db/nopdb.py +250 -83
mlrun/errors.py +6 -2
mlrun/execution.py +194 -50
mlrun/feature_store/__init__.py +2 -10
mlrun/feature_store/api.py +20 -458
mlrun/feature_store/common.py +9 -9
mlrun/feature_store/feature_set.py +20 -18
mlrun/feature_store/feature_vector.py +105 -479
mlrun/feature_store/feature_vector_utils.py +466 -0
mlrun/feature_store/retrieval/base.py +15 -11
mlrun/feature_store/retrieval/job.py +2 -1
mlrun/feature_store/retrieval/storey_merger.py +1 -1
mlrun/feature_store/steps.py +3 -3
mlrun/features.py +30 -13
mlrun/frameworks/__init__.py +1 -2
mlrun/frameworks/_common/__init__.py +1 -2
mlrun/frameworks/_common/artifacts_library.py +2 -2
mlrun/frameworks/_common/mlrun_interface.py +10 -6
mlrun/frameworks/_common/model_handler.py +31 -31
mlrun/frameworks/_common/producer.py +3 -1
mlrun/frameworks/_dl_common/__init__.py +1 -2
mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
mlrun/frameworks/_ml_common/__init__.py +1 -2
mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
mlrun/frameworks/_ml_common/model_handler.py +21 -21
mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
mlrun/frameworks/auto_mlrun/__init__.py +1 -2
mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
mlrun/frameworks/huggingface/__init__.py +1 -2
mlrun/frameworks/huggingface/model_server.py +9 -9
mlrun/frameworks/lgbm/__init__.py +47 -44
mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
mlrun/frameworks/lgbm/model_handler.py +15 -11
mlrun/frameworks/lgbm/model_server.py +11 -7
mlrun/frameworks/lgbm/utils.py +2 -2
mlrun/frameworks/onnx/__init__.py +1 -2
mlrun/frameworks/onnx/dataset.py +3 -3
mlrun/frameworks/onnx/mlrun_interface.py +2 -2
mlrun/frameworks/onnx/model_handler.py +7 -5
mlrun/frameworks/onnx/model_server.py +8 -6
mlrun/frameworks/parallel_coordinates.py +11 -11
mlrun/frameworks/pytorch/__init__.py +22 -23
mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
mlrun/frameworks/pytorch/model_handler.py +21 -17
mlrun/frameworks/pytorch/model_server.py +13 -9
mlrun/frameworks/sklearn/__init__.py +19 -18
mlrun/frameworks/sklearn/estimator.py +2 -2
mlrun/frameworks/sklearn/metric.py +3 -3
mlrun/frameworks/sklearn/metrics_library.py +8 -6
mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
mlrun/frameworks/sklearn/model_handler.py +4 -3
mlrun/frameworks/tf_keras/__init__.py +11 -12
mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
mlrun/frameworks/tf_keras/model_handler.py +17 -13
mlrun/frameworks/tf_keras/model_server.py +12 -8
mlrun/frameworks/xgboost/__init__.py +19 -18
mlrun/frameworks/xgboost/model_handler.py +13 -9
mlrun/k8s_utils.py +2 -5
mlrun/launcher/base.py +3 -4
mlrun/launcher/client.py +2 -2
mlrun/launcher/local.py +6 -2
mlrun/launcher/remote.py +1 -1
mlrun/lists.py +8 -4
mlrun/model.py +132 -46
mlrun/model_monitoring/__init__.py +3 -5
mlrun/model_monitoring/api.py +113 -98
mlrun/model_monitoring/applications/__init__.py +0 -5
mlrun/model_monitoring/applications/_application_steps.py +81 -50
mlrun/model_monitoring/applications/base.py +467 -14
mlrun/model_monitoring/applications/context.py +212 -134
mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
mlrun/model_monitoring/applications/evidently/base.py +146 -0
mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
mlrun/model_monitoring/applications/results.py +67 -15
mlrun/model_monitoring/controller.py +701 -315
mlrun/model_monitoring/db/__init__.py +0 -2
mlrun/model_monitoring/db/_schedules.py +242 -0
mlrun/model_monitoring/db/_stats.py +189 -0
mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
mlrun/model_monitoring/db/tsdb/base.py +243 -49
mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
mlrun/model_monitoring/helpers.py +356 -114
mlrun/model_monitoring/stream_processing.py +190 -345
mlrun/model_monitoring/tracking_policy.py +11 -4
mlrun/model_monitoring/writer.py +49 -90
mlrun/package/__init__.py +3 -6
mlrun/package/context_handler.py +2 -2
mlrun/package/packager.py +12 -9
mlrun/package/packagers/__init__.py +0 -2
mlrun/package/packagers/default_packager.py +14 -11
mlrun/package/packagers/numpy_packagers.py +16 -7
mlrun/package/packagers/pandas_packagers.py +18 -18
mlrun/package/packagers/python_standard_library_packagers.py +25 -11
mlrun/package/packagers_manager.py +35 -32
mlrun/package/utils/__init__.py +0 -3
mlrun/package/utils/_pickler.py +6 -6
mlrun/platforms/__init__.py +47 -16
mlrun/platforms/iguazio.py +4 -1
mlrun/projects/operations.py +30 -30
mlrun/projects/pipelines.py +116 -47
mlrun/projects/project.py +1292 -329
mlrun/render.py +5 -9
mlrun/run.py +57 -14
mlrun/runtimes/__init__.py +1 -3
mlrun/runtimes/base.py +30 -22
mlrun/runtimes/daskjob.py +9 -9
mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
mlrun/runtimes/function_reference.py +5 -2
mlrun/runtimes/generators.py +3 -2
mlrun/runtimes/kubejob.py +6 -7
mlrun/runtimes/mounts.py +574 -0
mlrun/runtimes/mpijob/__init__.py +0 -2
mlrun/runtimes/mpijob/abstract.py +7 -6
mlrun/runtimes/nuclio/api_gateway.py +7 -7
mlrun/runtimes/nuclio/application/application.py +11 -13
mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
mlrun/runtimes/nuclio/function.py +127 -70
mlrun/runtimes/nuclio/serving.py +105 -37
mlrun/runtimes/pod.py +159 -54
mlrun/runtimes/remotesparkjob.py +3 -2
mlrun/runtimes/sparkjob/__init__.py +0 -2
mlrun/runtimes/sparkjob/spark3job.py +22 -12
mlrun/runtimes/utils.py +7 -6
mlrun/secrets.py +2 -2
mlrun/serving/__init__.py +8 -0
mlrun/serving/merger.py +7 -5
mlrun/serving/remote.py +35 -22
mlrun/serving/routers.py +186 -240
mlrun/serving/server.py +41 -10
mlrun/serving/states.py +432 -118
mlrun/serving/utils.py +13 -2
mlrun/serving/v1_serving.py +3 -2
mlrun/serving/v2_serving.py +161 -203
mlrun/track/__init__.py +1 -1
mlrun/track/tracker.py +2 -2
mlrun/track/trackers/mlflow_tracker.py +6 -5
mlrun/utils/async_http.py +35 -22
mlrun/utils/clones.py +7 -4
mlrun/utils/helpers.py +511 -58
mlrun/utils/logger.py +119 -13
mlrun/utils/notifications/notification/__init__.py +22 -19
mlrun/utils/notifications/notification/base.py +39 -15
mlrun/utils/notifications/notification/console.py +6 -6
mlrun/utils/notifications/notification/git.py +11 -11
mlrun/utils/notifications/notification/ipython.py +10 -9
mlrun/utils/notifications/notification/mail.py +176 -0
mlrun/utils/notifications/notification/slack.py +16 -8
mlrun/utils/notifications/notification/webhook.py +24 -8
mlrun/utils/notifications/notification_pusher.py +191 -200
mlrun/utils/regex.py +12 -2
mlrun/utils/version/version.json +2 -2
{mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/METADATA +69 -54
mlrun-1.8.0.dist-info/RECORD +351 -0
{mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
mlrun/model_monitoring/applications/evidently_base.py +0 -137
mlrun/model_monitoring/db/stores/__init__.py +0 -136
mlrun/model_monitoring/db/stores/base/store.py +0 -213
mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
mlrun/model_monitoring/model_endpoint.py +0 -118
mlrun-1.7.2rc4.dist-info/RECORD +0 -351
{mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
{mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
{mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0

mlrun/artifacts/document.py ADDED Viewed

@@ -0,0 +1,454 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+import tempfile
+from collections.abc import Iterator
+from copy import deepcopy
+from importlib import import_module
+from typing import Optional, Union
+import mlrun
+import mlrun.artifacts
+from mlrun.artifacts import Artifact, ArtifactSpec
+from mlrun.model import ModelObj
+from ..utils import generate_artifact_uri
+from .base import ArtifactStatus
+class DocumentLoaderSpec(ModelObj):
+    """
+    A class to load a document from a file path using a specified loader class.
+    This class is responsible for loading documents from a given source path using a specified loader class.
+    The loader class is dynamically imported and instantiated with the provided arguments. The loaded documents
+    can be optionally uploaded as artifacts. Note that only loader classes that return single results
+    (e.g., TextLoader, UnstructuredHTMLLoader, WebBaseLoader(scalar)) are supported - loaders returning multiple
+    results like DirectoryLoader or WebBaseLoader(list) are not compatible.
+    Attributes:
+        loader_class_name (str): The name of the loader class to use for loading documents.
+        src_name (str): The name of the source attribute to pass to the loader class.
+        download_object (bool): Whether the file is downloaded before the loader is launched.
+        kwargs (Optional[dict]): Additional keyword arguments to pass to the loader class.
+    """
+    _dict_fields = ["loader_class_name", "src_name", "download_object", "kwargs"]
+    def __init__(
+        self,
+        loader_class_name: str = "langchain_community.document_loaders.TextLoader",
+        src_name: str = "file_path",
+        download_object: bool = True,
+        kwargs: Optional[dict] = None,
+    ):
+        """
+        Initialize the document loader.
+        Args:
+            loader_class_name (str): The name of the loader class to use.
+            src_name (str): The name of the loader keyword argument that receives the source path.
+            kwargs (Optional[dict]): Additional keyword arguments to pass to the loader class.
+            download_object (bool, optional): If True, the file will be downloaded before launching
+                the loader. If False, the loader accepts a link that should not be downloaded.
+                Defaults to True.
+        Example:
+            >>> # Create a loader specification for PDF documents
+            >>> loader_spec = DocumentLoaderSpec(
+            ...     loader_class_name="langchain_community.document_loaders.PyPDFLoader",
+            ...     src_name="file_path",
+            ...     kwargs={"extract_images": True},
+            ... )
+            >>> # Create a loader instance for a specific PDF file
+            >>> pdf_loader = loader_spec.make_loader("/path/to/document.pdf")
+            >>> # Load the documents
+            >>> documents = pdf_loader.load()
+        """
+        self.loader_class_name = loader_class_name
+        self.src_name = src_name
+        self.download_object = download_object
+        self.kwargs = kwargs
+    def make_loader(self, src_path):
+        module_name, class_name = self.loader_class_name.rsplit(".", 1)
+        module = import_module(module_name)
+        loader_class = getattr(module, class_name)
+        kwargs = deepcopy(self.kwargs or {})
+        kwargs[self.src_name] = src_path
+        loader = loader_class(**kwargs)
+        return loader
+class MLRunLoader:
+    """
+    A factory class for creating instances of a dynamically defined document loader.
+    Args:
+        artifact_key (str, optional): The key for the artifact to be logged.
+            The '%%' pattern in the key will be replaced by the source path
+            with any unsupported characters converted to '_'. Defaults to "%%".
+        source_path (str): The source path of the document to be loaded.
+        loader_spec (DocumentLoaderSpec): Specification for the document loader.
+        producer (Optional[Union[MlrunProject, str, MLClientCtx]], optional): The producer of the document.
+                                If not specified, will try to get the current MLRun context or project.
+                                Defaults to None.
+        upload (bool, optional): Flag indicating whether to upload the document. Defaults to False.
+        labels (Optional[Dict[str, str]], optional): Key-value labels to attach to the artifact. Defaults to None.
+        tag (str, optional): Version tag for the artifact. Defaults to "".
+    Returns:
+        DynamicDocumentLoader: An instance of a dynamically defined subclass of BaseLoader.
+    Example:
+        >>> # Create a document loader specification
+        >>> loader_spec = DocumentLoaderSpec(
+        ...     loader_class_name="langchain_community.document_loaders.TextLoader",
+        ...     src_name="file_path",
+        ... )
+        >>> # Create a basic loader for a single file
+        >>> loader = MLRunLoader(
+        ...     source_path="/path/to/document.txt",
+        ...     loader_spec=loader_spec,
+        ...     artifact_key="my_doc",
+        ...     producer=project,
+        ...     upload=True,
+        ... )
+        >>> documents = loader.load()
+        >>> # Create a loader with auto-generated keys
+        >>> loader = MLRunLoader(
+        ...     source_path="/path/to/document.txt",
+        ...     loader_spec=loader_spec,
+        ...     artifact_key="%%",  # %% will be replaced with encoded path
+        ...     producer=project,
+        ... )
+        >>> documents = loader.load()
+        >>> # Use with DirectoryLoader
+        >>> from langchain_community.document_loaders import DirectoryLoader
+        >>> dir_loader = DirectoryLoader(
+        ...     "/path/to/directory",
+        ...     glob="**/*.txt",
+        ...     loader_cls=MLRunLoader,
+        ...     loader_kwargs={
+        ...         "loader_spec": loader_spec,
+        ...         "artifact_key": "%%",
+        ...         "producer": project,
+        ...         "upload": True,
+        ...     },
+        ... )
+        >>> documents = dir_loader.load()
+    """
+    def __new__(
+        cls,
+        source_path: str,
+        loader_spec: "DocumentLoaderSpec",
+        artifact_key="%%",
+        producer: Optional[Union["MlrunProject", str, "MLClientCtx"]] = None,  # noqa: F821
+        upload: bool = False,
+        tag: str = "",
+        labels: Optional[dict[str, str]] = None,
+    ):
+        # Dynamically import BaseLoader
+        from langchain_community.document_loaders.base import BaseLoader
+        class DynamicDocumentLoader(BaseLoader):
+            def __init__(
+                self,
+                local_path,
+                loader_spec,
+                artifact_key,
+                producer,
+                upload,
+                tag,
+                labels,
+            ):
+                self.producer = producer
+                self.artifact_key = (
+                    MLRunLoader.artifact_key_instance(artifact_key, local_path)
+                    if "%%" in artifact_key
+                    else artifact_key
+                )
+                self.loader_spec = loader_spec
+                self.local_path = local_path
+                self.upload = upload
+                self.tag = tag
+                self.labels = labels
+                # Resolve the producer
+                if not self.producer:
+                    self.producer = mlrun.mlconf.default_project
+                if isinstance(self.producer, str):
+                    self.producer = mlrun.get_or_create_project(self.producer)
+            def lazy_load(self) -> Iterator["Document"]:  # noqa: F821
+                collections = None
+                try:
+                    artifact = self.producer.get_artifact(self.artifact_key, self.tag)
+                    collections = (
+                        artifact.status.collections if artifact else collections
+                    )
+                except mlrun.MLRunNotFoundError:
+                    pass
+                artifact = self.producer.log_document(
+                    key=self.artifact_key,
+                    document_loader_spec=self.loader_spec,
+                    local_path=self.local_path,
+                    upload=self.upload,
+                    labels=self.labels,
+                    tag=self.tag,
+                    collections=collections,
+                )
+                res = artifact.to_langchain_documents()
+                return res
+        # Return an instance of the dynamically defined subclass
+        instance = DynamicDocumentLoader(
+            artifact_key=artifact_key,
+            local_path=source_path,
+            loader_spec=loader_spec,
+            producer=producer,
+            upload=upload,
+            tag=tag,
+            labels=labels,
+        )
+        return instance
+    @staticmethod
+    def artifact_key_instance(artifact_key: str, src_path: str) -> str:
+        if "%%" in artifact_key:
+            resolved_path = DocumentArtifact.key_from_source(src_path)
+            artifact_key = artifact_key.replace("%%", resolved_path)
+        return artifact_key
+class DocumentArtifact(Artifact):
+    """
+    A specific artifact class inheriting from generic artifact, used to maintain Document metadata.
+    """
+    @staticmethod
+    def key_from_source(src_path: str) -> str:
+        """Convert a source path into a valid artifact key by replacing invalid characters with underscores.
+        Args:
+            src_path (str): The source path to be converted into a valid artifact key
+        Returns:
+            str: A modified version of the source path where all invalid characters are replaced
+                with underscores while preserving valid sequences in their original positions
+        Examples:
+            >>> DocumentArtifact.key_from_source("data/file-name(v1).txt")
+            "data_file-name_v1__txt"
+        """
+        pattern = mlrun.utils.regex.artifact_key[0]
+        # Convert anchored pattern (^...$) to non-anchored version for finditer
+        search_pattern = pattern.strip("^$")
+        result = []
+        current_pos = 0
+        # Find all valid sequences
+        for match in re.finditer(search_pattern, src_path):
+            # Add '_' values for characters between matches
+            for char in src_path[current_pos : match.start()]:
+                result.append("_")
+            # Add the valid sequence
+            result.append(match.group())
+            current_pos = match.end()
+        # Handle any remaining characters after the last match
+        for char in src_path[current_pos:]:
+            result.append("_")
+        resolved_path = "".join(result)
+        resolved_path = resolved_path.lstrip("_")
+        return resolved_path
+    class DocumentArtifactSpec(ArtifactSpec):
+        _dict_fields = ArtifactSpec._dict_fields + [
+            "document_loader",
+            "original_source",
+        ]
+        def __init__(
+            self,
+            *args,
+            document_loader: Optional[DocumentLoaderSpec] = None,
+            original_source: Optional[str] = None,
+            **kwargs,
+        ):
+            super().__init__(*args, **kwargs)
+            self.document_loader = document_loader
+            self.original_source = original_source
+    class DocumentArtifactStatus(ArtifactStatus):
+        _dict_fields = ArtifactStatus._dict_fields + ["collections"]
+        def __init__(
+            self,
+            *args,
+            collections: Optional[dict] = None,
+            **kwargs,
+        ):
+            super().__init__(*args, **kwargs)
+            self.collections = collections if collections is not None else {}
+    kind = "document"
+    METADATA_SOURCE_KEY = "source"
+    METADATA_ORIGINAL_SOURCE_KEY = "original_source"
+    METADATA_CHUNK_KEY = "mlrun_chunk"
+    METADATA_ARTIFACT_TARGET_PATH_KEY = "mlrun_target_path"
+    METADATA_ARTIFACT_TAG = "mlrun_tag"
+    METADATA_ARTIFACT_KEY = "mlrun_key"
+    METADATA_ARTIFACT_PROJECT = "mlrun_project"
+    def __init__(
+        self,
+        original_source: Optional[str] = None,
+        document_loader_spec: Optional[DocumentLoaderSpec] = None,
+        collections: Optional[dict] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.spec.document_loader = (
+            document_loader_spec.to_dict()
+            if document_loader_spec
+            else self.spec.document_loader
+        )
+        self.spec.original_source = original_source or self.spec.original_source
+        self.status = DocumentArtifact.DocumentArtifactStatus(collections=collections)
+    @property
+    def status(self) -> DocumentArtifactStatus:
+        return self._status
+    @status.setter
+    def status(self, status):
+        self._status = self._verify_dict(
+            status, "status", DocumentArtifact.DocumentArtifactStatus
+        )
+    @property
+    def spec(self) -> DocumentArtifactSpec:
+        return self._spec
+    @spec.setter
+    def spec(self, spec):
+        self._spec = self._verify_dict(
+            spec, "spec", DocumentArtifact.DocumentArtifactSpec
+        )
+    def get_source(self):
+        """Get the source URI for this artifact."""
+        return generate_artifact_uri(self.metadata.project, self.spec.db_key)
+    def to_langchain_documents(
+        self,
+        splitter: Optional["TextSplitter"] = None,  # noqa: F821
+    ) -> list["Document"]:  # noqa: F821
+        from langchain.schema import Document
+        """
+        Create LangChain documents from the artifact
+        Args:
+            splitter (Optional[TextSplitter]): A LangChain TextSplitter to split the document into chunks.
+        Returns:
+            list[Document]: A list of LangChain Document objects.
+        """
+        loader_spec = DocumentLoaderSpec.from_dict(self.spec.document_loader)
+        if loader_spec.download_object and self.get_target_path():
+            with tempfile.NamedTemporaryFile() as tmp_file:
+                mlrun.datastore.store_manager.object(
+                    url=self.get_target_path()
+                ).download(tmp_file.name)
+                loader = loader_spec.make_loader(tmp_file.name)
+                documents = loader.load()
+        elif self.spec.original_source:
+            loader = loader_spec.make_loader(self.spec.original_source)
+            documents = loader.load()
+        else:
+            raise ValueError(
+                "No src_path or target_path provided. Cannot load document."
+            )
+        results = []
+        idx = 0
+        for document in documents:
+            if splitter:
+                texts = splitter.split_text(document.page_content)
+            else:
+                texts = [document.page_content]
+            metadata = document.metadata
+            metadata[self.METADATA_ORIGINAL_SOURCE_KEY] = self.spec.original_source
+            metadata[self.METADATA_SOURCE_KEY] = self.get_source()
+            metadata[self.METADATA_ARTIFACT_TAG] = self.tag or "latest"
+            metadata[self.METADATA_ARTIFACT_KEY] = self.db_key
+            metadata[self.METADATA_ARTIFACT_PROJECT] = self.metadata.project
+            if self.get_target_path():
+                metadata[self.METADATA_ARTIFACT_TARGET_PATH_KEY] = (
+                    self.get_target_path()
+                )
+            for text in texts:
+                metadata[self.METADATA_CHUNK_KEY] = str(idx)
+                doc = Document(
+                    page_content=text,
+                    metadata=metadata.copy(),
+                )
+                results.append(doc)
+                idx = idx + 1
+        return results
+    def collection_add(self, collection_id: str) -> bool:
+        """
+        Add a collection ID to the artifact's collection list.
+        Adds the specified collection ID to the artifact's collection mapping if it
+        doesn't already exist.
+        This method only modifies the client-side artifact object and does not persist
+        the changes to the MLRun DB. To save the changes permanently, you must call
+        project.update_artifact() or context.update_artifact() after this method.
+        Args:
+            collection_id (str): The ID of the collection to add
+        Returns:
+            bool: True if the collection ID was added, False if it was already present
+        """
+        if collection_id not in self.status.collections:
+            self.status.collections[collection_id] = "1"
+            return True
+        return False
+    def collection_remove(self, collection_id: str) -> bool:
+        """
+        Remove a collection ID from the artifact's collection list.
+        Removes the specified collection ID from the artifact's local collection mapping.
+        This method only modifies the client-side artifact object and does not persist
+        the changes to the MLRun DB. To save the changes permanently, you must call
+        project.update_artifact() or context.update_artifact() after this method.
+        Args:
+            collection_id (str): The ID of the collection to remove
+        Returns:
+            bool: True if the collection ID was removed, False if it was not present
+        """
+        if collection_id in self.status.collections:
+            self.status.collections.pop(collection_id)
+            return True
+        return False

mlrun/artifacts/manager.py CHANGED Viewed

@@ -41,6 +41,7 @@ from .dataset import (
     DatasetArtifact,
     TableArtifact,
 )
+from .document import DocumentArtifact
 from .model import ModelArtifact
 from .plots import (
     PlotArtifact,
@@ -57,6 +58,7 @@ artifact_types = {
     "model": ModelArtifact,
     "dataset": DatasetArtifact,
     "plotly": PlotlyArtifact,
+    "document": DocumentArtifact,
 }
@@ -106,7 +108,7 @@ class ArtifactProducer:
 def dict_to_artifact(struct: dict) -> Artifact:
     kind = struct.get("kind", "")
-    # TODO: remove this in 1.8.0
+    # TODO: Remove once data migration v5 is obsolete
     if mlrun.utils.is_legacy_artifact(struct):
         return mlrun.artifacts.base.convert_legacy_artifact_to_new_format(struct)
@@ -124,7 +126,7 @@ class ArtifactManager:
         self.artifact_db = db
         self.input_artifacts = {}
-        self.artifacts = {}
+        self.artifact_uris = {}
     @staticmethod
     def ensure_artifact_source_file_exists(item, path, body):
@@ -156,14 +158,12 @@ class ArtifactManager:
     def artifact_list(self, full=False):
         artifacts = []
-        for artifact in self.artifacts.values():
-            if isinstance(artifact, dict):
-                artifacts.append(artifact)
+        for artifacts_uri in self.artifact_uris.values():
+            artifact: Artifact = mlrun.datastore.get_store_resource(artifacts_uri)
+            if full:
+                artifacts.append(artifact.to_dict())
             else:
-                if full:
-                    artifacts.append(artifact.to_dict())
-                else:
-                    artifacts.append(artifact.base_dict())
+                artifacts.append(artifact.base_dict())
         return artifacts
     def log_artifact(
@@ -246,6 +246,8 @@ class ArtifactManager:
                 # otherwise, we do not want to override it.
                 # this is mainly relevant for imported artifacts that have an explicit db_key value already set
                 db_key = item.db_key or key
+        if db_key != key:
+            validate_artifact_key_name(db_key, "artifact.db_key")
         item.db_key = db_key or ""
         item.viewer = viewer or item.viewer
         item.tree = producer.tag
@@ -304,7 +306,6 @@ class ArtifactManager:
         item.target_path = target_path
         item.before_log()
-        self.artifacts[key] = item
         if ((upload is None and item.kind != "dir") or upload) and not item.is_inline():
             # before uploading the item, we want to ensure that its tags are valid,
@@ -313,32 +314,38 @@ class ArtifactManager:
             item.upload(artifact_path=artifact_path)
         if db_key:
-            self._log_to_db(db_key, project, producer.inputs, item)
+            artifact_uid = self._log_to_db(db_key, project, producer.inputs, item)
+            if artifact_uid is not None:
+                item.uid = artifact_uid
+        # Generate the artifact URI after logging to the database and retrieving the artifact UID, if available.
+        self.artifact_uris[key] = item.uri
         size = str(item.size) or "?"
         db_str = "Y" if (self.artifact_db and db_key) else "N"
         logger.debug(
-            f"log artifact {key} at {item.target_path}, size: {size}, db: {db_str}"
+            f"Log artifact {key} at {item.target_path}, size: {size}, db: {db_str}"
         )
         return item
-    def update_artifact(self, producer, item):
-        self.artifacts[item.key] = item
+    def update_artifact(self, producer, item: Artifact):
+        self.artifact_uris[item.key] = item.uri
         self._log_to_db(item.db_key, producer.project, producer.inputs, item)
-    def _log_to_db(self, key, project, sources, item, tag=None):
+    def _log_to_db(self, key, project, sources, item, tag=None) -> typing.Optional[str]:
         """
         log artifact to db
         :param key: Identifying key of the artifact.
         :param project: Project that the artifact belongs to.
-        :param sources: List of artifact sources ( Mainly passed from the producer.items ).
+        :param sources: List of artifact sources ( Mainly passed from the `producer.items` ).
         :param item: The actual artifact to store.
         :param tag: The name of the Tag of the artifact.
+        :return: The logged artifact uid.
         """
         if self.artifact_db:
             item.updated = None
             if sources:
                 item.sources = [{"name": k, "path": str(v)} for k, v in sources.items()]
-            self.artifact_db.store_artifact(
+            artifact_item = self.artifact_db.store_artifact(
                 key,
                 item.to_dict(),
                 iter=item.iter,
@@ -346,6 +353,8 @@ class ArtifactManager:
                 project=project,
                 tree=item.tree,
             )
+            if artifact_item:
+                return artifact_item.get("metadata", {}).get("uid")
     def link_artifact(
         self,
@@ -387,13 +396,14 @@ class ArtifactManager:
         deletion_strategy: mlrun.common.schemas.artifact.ArtifactsDeletionStrategies = (
             mlrun.common.schemas.artifact.ArtifactsDeletionStrategies.metadata_only
         ),
-        secrets: dict = None,
+        secrets: typing.Optional[dict] = None,
     ):
         self.artifact_db.del_artifact(
             key=item.db_key,
             project=item.project,
             tag=item.tag,
             tree=item.tree,
+            iter=item.iter,
             deletion_strategy=deletion_strategy,
             secrets=secrets,
         )

mlrun 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl

Potentially problematic release.

mlrun 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl