mlrun 1.8.0rc2__py3-none-any.whl → 1.8.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mlrun/artifacts/__init__.py +1 -0
- mlrun/artifacts/document.py +313 -0
- mlrun/artifacts/manager.py +2 -0
- mlrun/common/schemas/__init__.py +1 -0
- mlrun/common/schemas/constants.py +15 -0
- mlrun/config.py +1 -1
- mlrun/datastore/datastore_profile.py +19 -0
- mlrun/datastore/vectorstore.py +186 -0
- mlrun/db/base.py +10 -0
- mlrun/db/httpdb.py +47 -7
- mlrun/db/nopdb.py +10 -0
- mlrun/execution.py +47 -1
- mlrun/platforms/__init__.py +44 -0
- mlrun/projects/project.py +214 -5
- mlrun/utils/clones.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc2.dist-info → mlrun-1.8.0rc4.dist-info}/METADATA +5 -5
- {mlrun-1.8.0rc2.dist-info → mlrun-1.8.0rc4.dist-info}/RECORD +22 -20
- {mlrun-1.8.0rc2.dist-info → mlrun-1.8.0rc4.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc2.dist-info → mlrun-1.8.0rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc2.dist-info → mlrun-1.8.0rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc2.dist-info → mlrun-1.8.0rc4.dist-info}/top_level.txt +0 -0
mlrun/artifacts/__init__.py
CHANGED
@@ -23,6 +23,7 @@ from .base import (
     get_artifact_meta,
 )
 from .dataset import DatasetArtifact, TableArtifact, update_dataset_meta
+from .document import DocumentArtifact, DocumentLoader, DocumentLoaderSpec
 from .manager import (
     ArtifactManager,
     ArtifactProducer,
mlrun/artifacts/document.py
ADDED (new file, 313 lines)

# Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import re
import tempfile
from collections.abc import Iterator
from copy import deepcopy
from importlib import import_module
from typing import Optional, Union

import mlrun
from mlrun.artifacts import Artifact, ArtifactSpec
from mlrun.model import ModelObj

from ..utils import generate_artifact_uri


class DocumentLoaderSpec(ModelObj):
    """
    A class to load a document from a file path using a specified loader class.

    This class is responsible for loading documents from a given source path using a specified loader class.
    The loader class is dynamically imported and instantiated with the provided arguments. The loaded documents
    can be optionally uploaded as artifacts.

    Attributes:
        loader_class_name (str): The name of the loader class to use for loading documents.
        src_name (str): The name of the source attribute to pass to the loader class.
        kwargs (Optional[dict]): Additional keyword arguments to pass to the loader class.

    Methods:
        make_loader(src_path): Creates an instance of the loader class with the specified source path.
    """

    _dict_fields = ["loader_class_name", "src_name", "kwargs"]

    def __init__(
        self,
        loader_class_name: str = "langchain_community.document_loaders.TextLoader",
        src_name: str = "file_path",
        kwargs: Optional[dict] = None,
    ):
        """
        Initialize the document loader.

        Args:
            loader_class_name (str): The name of the loader class to use.
            src_name (str): The source name for the document.
            kwargs (Optional[dict]): Additional keyword arguments to pass to the loader class.
        """
        self.loader_class_name = loader_class_name
        self.src_name = src_name
        self.kwargs = kwargs

    def make_loader(self, src_path):
        module_name, class_name = self.loader_class_name.rsplit(".", 1)
        module = import_module(module_name)
        loader_class = getattr(module, class_name)
        kwargs = deepcopy(self.kwargs or {})
        kwargs[self.src_name] = src_path
        loader = loader_class(**kwargs)
        return loader


class DocumentLoader:
    """
    A factory class for creating instances of a dynamically defined document loader.

    Args:
        artifact_key (str): The key for the artifact to be logged. It can include '%%', which will be replaced
            by a hex-encoded version of the source path.
        source_path (str): The source path of the document to be loaded.
        loader_spec (DocumentLoaderSpec): Specification for the document loader.
        producer (Optional[Union[MlrunProject, str, MLClientCtx]], optional): The producer of the document.
        upload (bool, optional): Flag indicating whether to upload the document.

    Returns:
        DynamicDocumentLoader: An instance of a dynamically defined subclass of BaseLoader.
    """

    def __new__(
        cls,
        source_path: str,
        loader_spec: "DocumentLoaderSpec",
        artifact_key="doc%%",
        producer: Optional[Union["MlrunProject", str, "MLClientCtx"]] = None,  # noqa: F821
        upload: bool = False,
    ):
        # Dynamically import BaseLoader
        from langchain_community.document_loaders.base import BaseLoader

        class DynamicDocumentLoader(BaseLoader):
            def __init__(
                self,
                source_path,
                loader_spec,
                artifact_key,
                producer,
                upload,
            ):
                self.producer = producer
                self.artifact_key = (
                    DocumentLoader.artifact_key_instance(artifact_key, source_path)
                    if "%%" in artifact_key
                    else artifact_key
                )
                self.loader_spec = loader_spec
                self.source_path = source_path
                self.upload = upload

                # Resolve the producer
                if not self.producer:
                    self.producer = mlrun.mlconf.default_project
                if isinstance(self.producer, str):
                    self.producer = mlrun.get_or_create_project(self.producer)

            def lazy_load(self) -> Iterator["Document"]:  # noqa: F821
                artifact = self.producer.log_document(
                    key=self.artifact_key,
                    document_loader=self.loader_spec,
                    src_path=self.source_path,
                    upload=self.upload,
                )
                yield artifact.to_langchain_documents()

        # Return an instance of the dynamically defined subclass
        instance = DynamicDocumentLoader(
            artifact_key=artifact_key,
            source_path=source_path,
            loader_spec=loader_spec,
            producer=producer,
            upload=upload,
        )
        return instance

    @staticmethod
    def artifact_key_instance(artifact_key: str, src_path: str) -> str:
        if "%%" in artifact_key:
            pattern = mlrun.utils.regex.artifact_key[0]
            # Convert anchored pattern (^...$) to non-anchored version for finditer
            search_pattern = pattern.strip("^$")
            result = []
            current_pos = 0

            # Find all valid sequences
            for match in re.finditer(search_pattern, src_path):
                # Add hex values for characters between matches
                for char in src_path[current_pos : match.start()]:
                    result.append(hex(ord(char))[2:].zfill(2))

                # Add the valid sequence
                result.append(match.group())
                current_pos = match.end()

            # Handle any remaining characters after the last match
            for char in src_path[current_pos:]:
                result.append(hex(ord(char))[2:].zfill(2))

            resolved_path = "".join(result)

            artifact_key = artifact_key.replace("%%", resolved_path)

        return artifact_key


class DocumentArtifact(Artifact):
    """
    A specific artifact class inheriting from the generic artifact, used to maintain document metadata.

    Methods:
        to_langchain_documents(splitter): Create LangChain documents from the artifact.
        collection_add(collection_id): Add a collection ID to the artifact.
        collection_remove(collection_id): Remove a collection ID from the artifact.
    """

    class DocumentArtifactSpec(ArtifactSpec):
        _dict_fields = ArtifactSpec._dict_fields + [
            "document_loader",
            "collections",
            "original_source",
        ]

        def __init__(
            self,
            *args,
            **kwargs,
        ):
            super().__init__(*args, **kwargs)
            self.document_loader = None
            self.collections = set()
            self.original_source = None

    kind = "document"

    METADATA_SOURCE_KEY = "source"
    METADATA_ORIGINAL_SOURCE_KEY = "original_source"
    METADATA_CHUNK_KEY = "mlrun_chunk"
    METADATA_ARTIFACT_URI_KEY = "mlrun_object_uri"
    METADATA_ARTIFACT_TARGET_PATH_KEY = "mlrun_target_path"

    def __init__(
        self,
        key=None,
        document_loader: DocumentLoaderSpec = DocumentLoaderSpec(),
        **kwargs,
    ):
        super().__init__(key, **kwargs)
        self.spec.document_loader = document_loader.to_str()
        if "src_path" in kwargs:
            self.spec.original_source = kwargs["src_path"]

    @property
    def spec(self) -> DocumentArtifactSpec:
        return self._spec

    @spec.setter
    def spec(self, spec):
        self._spec = self._verify_dict(
            spec, "spec", DocumentArtifact.DocumentArtifactSpec
        )
        # _verify_dict doesn't handle set, so we need to convert it back
        if isinstance(self._spec.collections, str):
            self._spec.collections = ast.literal_eval(self._spec.collections)

    @property
    def inputs(self):
        # To keep the interface consistent with project.update_artifact() when we update the artifact
        return None

    @property
    def source(self):
        return generate_artifact_uri(self.metadata.project, self.spec.db_key)

    def to_langchain_documents(
        self,
        splitter: Optional["TextSplitter"] = None,  # noqa: F821
    ) -> list["Document"]:  # noqa: F821
        """
        Create LangChain documents from the artifact.

        Args:
            splitter (Optional[TextSplitter]): A LangChain TextSplitter to split the document into chunks.

        Returns:
            list[Document]: A list of LangChain Document objects.
        """
        from langchain.schema import Document

        dictionary = ast.literal_eval(self.spec.document_loader)
        loader_spec = DocumentLoaderSpec.from_dict(dictionary)

        if self.get_target_path():
            with tempfile.NamedTemporaryFile() as tmp_file:
                mlrun.datastore.store_manager.object(
                    url=self.get_target_path()
                ).download(tmp_file.name)
                loader = loader_spec.make_loader(tmp_file.name)
                documents = loader.load()
        elif self.src_path:
            loader = loader_spec.make_loader(self.src_path)
            documents = loader.load()
        else:
            raise ValueError(
                "No src_path or target_path provided. Cannot load document."
            )

        results = []
        for document in documents:
            if splitter:
                texts = splitter.split_text(document.page_content)
            else:
                texts = [document.page_content]

            metadata = document.metadata

            metadata[self.METADATA_ORIGINAL_SOURCE_KEY] = self.src_path
            metadata[self.METADATA_SOURCE_KEY] = self.source
            metadata[self.METADATA_ARTIFACT_URI_KEY] = self.uri
            if self.get_target_path():
                metadata[self.METADATA_ARTIFACT_TARGET_PATH_KEY] = (
                    self.get_target_path()
                )

            for idx, text in enumerate(texts):
                metadata[self.METADATA_CHUNK_KEY] = str(idx)
                doc = Document(
                    page_content=text,
                    metadata=metadata,
                )
                results.append(doc)
        return results

    def collection_add(self, collection_id: str) -> None:
        self.spec.collections.add(collection_id)

    def collection_remove(self, collection_id: str) -> None:
        return self.spec.collections.discard(collection_id)
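A minimal usage sketch of the new loader (the project name and file path are hypothetical; assumes langchain_community is installed and the producer exposes the log_document() method used above):

import mlrun
from mlrun.artifacts import DocumentLoader, DocumentLoaderSpec

project = mlrun.get_or_create_project("doc-demo")  # hypothetical project
spec = DocumentLoaderSpec(
    loader_class_name="langchain_community.document_loaders.TextLoader",
    src_name="file_path",
)
# DocumentLoader.__new__ returns a DynamicDocumentLoader (a BaseLoader subclass)
loader = DocumentLoader("data/notes.txt", loader_spec=spec, producer=project)
for docs in loader.lazy_load():
    # each yielded item is the list returned by to_langchain_documents()
    print(len(docs))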
mlrun/artifacts/manager.py
CHANGED
@@ -41,6 +41,7 @@ from .dataset import (
     DatasetArtifact,
     TableArtifact,
 )
+from .document import DocumentArtifact
 from .model import ModelArtifact
 from .plots import (
     PlotArtifact,
@@ -57,6 +58,7 @@ artifact_types = {
     "model": ModelArtifact,
     "dataset": DatasetArtifact,
     "plotly": PlotlyArtifact,
+    "document": DocumentArtifact,
 }
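With the "document" kind registered, a document artifact round-trips through the artifact manager like any other kind; a brief sketch (key and path are hypothetical):

from mlrun.artifacts import DocumentArtifact

artifact = DocumentArtifact(key="my-doc", src_path="data/notes.txt")
assert artifact.kind == "document"  # the key under which manager.py registers the class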
mlrun/common/schemas/constants.py
CHANGED

@@ -133,6 +133,21 @@ class RunPartitionByField(mlrun.common.types.StrEnum):
     )
 
 
+class ArtifactPartitionByField(mlrun.common.types.StrEnum):
+    name = "name"  # supported for artifact objects
+    project_and_name = "project_and_name"  # supported for artifact objects
+
+    def to_partition_by_db_field(self, db_cls):
+        if self.value == ArtifactPartitionByField.name:
+            return db_cls.key
+        elif self.value == ArtifactPartitionByField.project_and_name:
+            return db_cls.project, db_cls.key
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Unknown group by field: {self.value}"
+            )
+
+
 class SortField(mlrun.common.types.StrEnum):
     created = "created"
     updated = "updated"
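A sketch of how the new enum resolves to database columns; ArtifactDBRecord is a hypothetical stand-in for the server-side model, which exposes `project` and `key` columns:

field = ArtifactPartitionByField.project_and_name
group_by = field.to_partition_by_db_field(ArtifactDBRecord)
# -> (ArtifactDBRecord.project, ArtifactDBRecord.key)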
mlrun/datastore/datastore_profile.py
CHANGED

@@ -81,6 +81,24 @@ class DatastoreProfileBasic(DatastoreProfile):
     private: typing.Optional[str] = None
 
 
+class VectorStoreProfile(DatastoreProfile):
+    type: str = pydantic.Field("vector")
+    _private_attributes = ("kwargs_private",)
+    vector_store_class: str
+    kwargs_public: typing.Optional[dict] = None
+    kwargs_private: typing.Optional[dict] = None
+
+    def attributes(self, kwargs=None):
+        attributes = {}
+        if self.kwargs_public:
+            attributes = merge(attributes, self.kwargs_public)
+        if self.kwargs_private:
+            attributes = merge(attributes, self.kwargs_private)
+        if kwargs:
+            attributes = merge(attributes, kwargs)
+        return attributes
+
+
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.v1.Field("kafka_target")
     _private_attributes = "kwargs_private"
@@ -476,6 +494,7 @@ class DatastoreProfile2Json(pydantic.v1.BaseModel):
         "gcs": DatastoreProfileGCS,
         "az": DatastoreProfileAzureBlob,
         "hdfs": DatastoreProfileHdfs,
+        "vector": VectorStoreProfile,
     }
     if datastore_type in ds_profile_factory:
         return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
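A sketch of building the new profile (names and kwargs are hypothetical); attributes() merges kwargs_public, then kwargs_private, then call-site kwargs, so later values win:

from mlrun.datastore.datastore_profile import VectorStoreProfile

profile = VectorStoreProfile(
    name="my-vectors",  # profile name, a field of the DatastoreProfile base
    vector_store_class="langchain_community.vectorstores.Chroma",
    kwargs_public={"persist_directory": "/tmp/chroma"},
    kwargs_private={"api_key": "secret"},  # kept out of the public profile form
)
attrs = profile.attributes({"collection_name": "docs"})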
mlrun/datastore/vectorstore.py
ADDED (new file, 186 lines)

# Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from importlib import import_module
from typing import Union

from mlrun.artifacts import DocumentArtifact


class VectorStoreCollection:
    """
    VectorStoreCollection is a class that manages a collection of vector stores, providing methods to add and delete
    documents and artifacts, and to interact with an MLRun context.

    Attributes:
        _collection_impl (object): The underlying collection implementation.
        _mlrun_context (Union[MlrunProject, MLClientCtx]): The MLRun context associated with the collection.
        collection_name (str): The name of the collection.
        id (str): The unique identifier of the collection, composed of the datastore profile and collection name.

    Methods:
        add_documents(documents: list["Document"], **kwargs):
            Adds a list of documents to the collection and updates the MLRun artifacts associated with the documents
            if an MLRun context is present.

        add_artifacts(artifacts: list[DocumentArtifact], splitter=None, **kwargs):
            Adds a list of DocumentArtifact objects to the collection, optionally using a splitter to convert
            artifacts to documents.

        remove_itself_from_artifact(artifact: DocumentArtifact):
            Removes the current object from the given artifact's collection and updates the artifact.

        delete_artifacts(artifacts: list[DocumentArtifact]):
            Deletes a list of DocumentArtifact objects from the collection and updates the MLRun context.
            Raises NotImplementedError if the delete operation is not supported for the collection implementation.
    """

    def __init__(
        self,
        vector_store_class: str,
        mlrun_context: Union["MlrunProject", "MLClientCtx"],  # noqa: F821
        datastore_profile: str,
        collection_name: str,
        **kwargs,
    ):
        # Import the vector store class dynamically
        module_name, class_name = vector_store_class.rsplit(".", 1)
        module = import_module(module_name)
        vector_store_class = getattr(module, class_name)

        signature = inspect.signature(vector_store_class)

        # Create the vector store instance
        if "collection_name" in signature.parameters.keys():
            vector_store = vector_store_class(collection_name=collection_name, **kwargs)
        else:
            vector_store = vector_store_class(**kwargs)

        self._collection_impl = vector_store
        self._mlrun_context = mlrun_context
        self.collection_name = collection_name
        self.id = datastore_profile + "/" + collection_name

    def __getattr__(self, name):
        # This method is called when an attribute is not found in the usual places
        # Forward the attribute access to _collection_impl
        return getattr(self._collection_impl, name)

    def __setattr__(self, name, value):
        if name in ["_collection_impl", "_mlrun_context"] or name in self.__dict__:
            # Use the base class method to avoid recursion
            super().__setattr__(name, value)
        else:
            # Forward the attribute setting to _collection_impl
            setattr(self._collection_impl, name, value)

    def add_documents(
        self,
        documents: list["Document"],  # noqa: F821
        **kwargs,
    ):
        """
        Add a list of documents to the collection.

        If the instance has an MLRun context, it will update the MLRun artifacts
        associated with the documents.

        Args:
            documents (list[Document]): A list of Document objects to be added.
            **kwargs: Additional keyword arguments to be passed to the underlying
                collection implementation.

        Returns:
            The result of the underlying collection implementation's add_documents method.
        """
        if self._mlrun_context:
            for document in documents:
                mlrun_uri = document.metadata.get(
                    DocumentArtifact.METADATA_ARTIFACT_URI_KEY
                )
                if mlrun_uri:
                    artifact = self._mlrun_context.get_store_resource(mlrun_uri)
                    artifact.collection_add(self.id)
                    self._mlrun_context.update_artifact(artifact)
        return self._collection_impl.add_documents(documents, **kwargs)

    def add_artifacts(self, artifacts: list[DocumentArtifact], splitter=None, **kwargs):
        """
        Add a list of DocumentArtifact objects to the collection.

        Args:
            artifacts (list[DocumentArtifact]): A list of DocumentArtifact objects to be added.
            splitter (optional): An optional splitter to be used when converting artifacts to documents.
            **kwargs: Additional keyword arguments to be passed to the collection's add_documents method.

        Returns:
            list: A list of IDs of the added documents.
        """
        all_ids = []
        for artifact in artifacts:
            documents = artifact.to_langchain_documents(splitter)
            artifact.collection_add(self.id)
            self._mlrun_context.update_artifact(artifact)
            ids = self._collection_impl.add_documents(documents, **kwargs)
            all_ids.extend(ids)
        return all_ids

    def remove_itself_from_artifact(self, artifact: DocumentArtifact):
        """
        Remove the current object from the given artifact's collection and update the artifact.

        Args:
            artifact (DocumentArtifact): The artifact from which the current object should be removed.
        """
        artifact.collection_remove(self.id)
        self._mlrun_context.update_artifact(artifact)

    def delete_artifacts(self, artifacts: list[DocumentArtifact]):
        """
        Delete a list of DocumentArtifact objects from the collection.

        This method removes the specified artifacts from the collection and updates the MLRun context.
        The deletion process varies depending on the type of the underlying collection implementation.

        Args:
            artifacts (list[DocumentArtifact]): A list of DocumentArtifact objects to be deleted.

        Raises:
            NotImplementedError: If the delete operation is not supported for the collection implementation.
        """
        store_class = self._collection_impl.__class__.__name__.lower()
        for artifact in artifacts:
            artifact.collection_remove(self.id)
            self._mlrun_context.update_artifact(artifact)
            if store_class == "milvus":
                expr = f"{DocumentArtifact.METADATA_SOURCE_KEY} == '{artifact.source}'"
                return self._collection_impl.delete(expr=expr)
            elif store_class == "chroma":
                where = {DocumentArtifact.METADATA_SOURCE_KEY: artifact.source}
                return self._collection_impl.delete(where=where)
            elif (
                hasattr(self._collection_impl, "delete")
                and "filter"
                in inspect.signature(self._collection_impl.delete).parameters
            ):
                filter = {
                    "metadata": {DocumentArtifact.METADATA_SOURCE_KEY: artifact.source}
                }
                return self._collection_impl.delete(filter=filter)
            else:
                raise NotImplementedError(
                    f"delete_artifacts() operation not supported for {store_class}"
                )
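A sketch of wiring a collection directly (the embedding object and names are hypothetical; in practice the collection is typically obtained through the project API extended in this release):

from mlrun.datastore.vectorstore import VectorStoreCollection

collection = VectorStoreCollection(
    vector_store_class="langchain_community.vectorstores.Chroma",
    mlrun_context=project,  # an MlrunProject or MLClientCtx
    datastore_profile="my-vectors",
    collection_name="docs",
    embedding_function=embeddings,  # hypothetical kwarg, forwarded to Chroma
)
ids = collection.add_artifacts([artifact])  # also records collection.id on the artifact
collection.delete_artifacts([artifact])  # Chroma path: delete(where={"source": artifact.source})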
mlrun/db/base.py
CHANGED
@@ -158,6 +158,16 @@ class RunDBInterface(ABC):
         tree: Optional[str] = None,
         format_: mlrun.common.formatters.ArtifactFormat = mlrun.common.formatters.ArtifactFormat.full,
         limit: Optional[int] = None,
+        partition_by: Optional[
+            Union[mlrun.common.schemas.ArtifactPartitionByField, str]
+        ] = None,
+        rows_per_partition: int = 1,
+        partition_sort_by: Optional[
+            Union[mlrun.common.schemas.SortField, str]
+        ] = mlrun.common.schemas.SortField.updated,
+        partition_order: Union[
+            mlrun.common.schemas.OrderType, str
+        ] = mlrun.common.schemas.OrderType.desc,
     ):
         pass
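A sketch of the extended listing call (client setup assumed), fetching only the most recently updated artifact per (project, name) partition:

import mlrun
import mlrun.common.schemas as schemas

db = mlrun.get_run_db()
artifacts = db.list_artifacts(
    project="doc-demo",
    partition_by=schemas.ArtifactPartitionByField.project_and_name,
    rows_per_partition=1,
    partition_sort_by=schemas.SortField.updated,
    partition_order=schemas.OrderType.desc,
)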