PyPI - morphik - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

morphik-0.1.4-py3-none-any.whl → morphik-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

morphik/__init__.py +2 -2
morphik/_internal.py +36 -27
morphik/async_.py +294 -127
morphik/models.py +79 -58
morphik/rules.py +28 -5
morphik/sync.py +352 -144
morphik/tests/README.md +1 -1
morphik/tests/example_usage.py +69 -69
morphik/tests/test_async.py +166 -82
morphik/tests/test_docs/sample1.txt +1 -1
morphik/tests/test_docs/sample2.txt +2 -2
morphik/tests/test_docs/sample3.txt +1 -1
morphik/tests/test_sync.py +162 -84
{morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/METADATA +4 -8
morphik-0.1.6.dist-info/RECORD +18 -0
morphik-0.1.4.dist-info/RECORD +0 -18
{morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/WHEEL +0 -0

morphik/sync.py CHANGED

@@ -2,27 +2,23 @@ import json
 import logging
 from io import BytesIO, IOBase
 from pathlib import Path
-from typing import Dict, Any, List, Optional, Union, BinaryIO
-from PIL import Image
-from PIL.Image import Image as PILImage
+from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
 import httpx
+from pydantic import BaseModel
+from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
 from .models import (
+    ChunkSource,
+    CompletionResponse,  # Prompt override models
     Document,
     DocumentResult,
-    CompletionResponse,
-    IngestTextRequest,
-    ChunkSource,
-    Graph,
     FolderInfo,
-    # Prompt override models
+    Graph,
     GraphPromptOverrides,
+    IngestTextRequest,
     QueryPromptOverrides,
 )
-from .rules import Rule
-from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
 logger = logging.getLogger(__name__)
@@ -71,16 +67,16 @@ class Folder:
     def name(self) -> str:
         """Returns the folder name."""
         return self._name
     @property
     def id(self) -> Optional[str]:
         """Returns the folder ID if available."""
         return self._id
     def get_info(self) -> Dict[str, Any]:
         """
         Get detailed information about this folder.
         Returns:
             Dict[str, Any]: Detailed folder information
         """
@@ -93,9 +89,8 @@ class Folder:
                     break
             if not self._id:
                 raise ValueError(f"Folder '{self._name}' not found")
         return self._client._request("GET", f"folders/{self._id}")
     def signin(self, end_user_id: str) -> "UserScope":
         """
@@ -168,9 +163,7 @@ class Folder:
             files = {"file": (filename, file_obj)}
             # Create form data
-            form_data = self._client._logic._prepare_ingest_file_form_data(
-                metadata, rules, self._name, None
-            )
+            form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
             # use_colpali should be a query parameter as defined in the API
             response = self._client._request(
@@ -219,9 +212,9 @@ class Folder:
             )
             response = self._client._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -231,9 +224,7 @@ class Folder:
                 for error in response["errors"]:
                     logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
-            docs = [
-                self._client._logic._parse_document_response(doc) for doc in response["documents"]
-            ]
+            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
             for doc in docs:
                 doc._client = self._client
             return docs
@@ -296,6 +287,7 @@ class Folder:
         k: int = 4,
         min_score: float = 0.0,
         use_colpali: bool = True,
+        additional_folders: Optional[List[str]] = None,
     ) -> List[FinalChunkResult]:
         """
         Retrieve relevant chunks within this folder.
@@ -306,17 +298,19 @@ class Folder:
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
             use_colpali: Whether to use ColPali-style embedding model
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[FinalChunkResult]: List of relevant chunks
         """
+        effective_folder = self._merge_folders(additional_folders)
         request = {
             "query": query,
             "filters": filters,
             "k": k,
             "min_score": min_score,
             "use_colpali": use_colpali,
-            "folder_name": self._name,  # Add folder name here
+            "folder_name": effective_folder,
         }
         response = self._client._request("POST", "retrieve/chunks", request)
@@ -329,6 +323,7 @@ class Folder:
         k: int = 4,
         min_score: float = 0.0,
         use_colpali: bool = True,
+        additional_folders: Optional[List[str]] = None,
     ) -> List[DocumentResult]:
         """
         Retrieve relevant documents within this folder.
@@ -339,17 +334,19 @@ class Folder:
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
             use_colpali: Whether to use ColPali-style embedding model
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[DocumentResult]: List of relevant documents
         """
+        effective_folder = self._merge_folders(additional_folders)
         request = {
             "query": query,
             "filters": filters,
             "k": k,
             "min_score": min_score,
             "use_colpali": use_colpali,
-            "folder_name": self._name,  # Add folder name here
+            "folder_name": effective_folder,
         }
         response = self._client._request("POST", "retrieve/docs", request)
@@ -368,6 +365,8 @@ class Folder:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        additional_folders: Optional[List[str]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context within this folder.
@@ -384,10 +383,13 @@ class Folder:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            additional_folders: Optional list of extra folders to include in the scope
+            schema: Optional schema for structured output
         Returns:
             CompletionResponse: Generated completion
         """
+        effective_folder = self._merge_folders(additional_folders)
         payload = self._client._logic._prepare_query_request(
             query,
             filters,
@@ -400,14 +402,31 @@ class Folder:
             hop_depth,
             include_paths,
             prompt_overrides,
-            self._name,
-            None,
+            effective_folder,
+            None,  # end_user_id not supported at this level
+            schema,
         )
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
         response = self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
     def list_documents(
-        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
+        self,
+        skip: int = 0,
+        limit: int = 100,
+        filters: Optional[Dict[str, Any]] = None,
+        additional_folders: Optional[List[str]] = None,
     ) -> List[Document]:
         """
         List accessible documents within this folder.
@@ -416,30 +435,34 @@ class Folder:
             skip: Number of documents to skip
             limit: Maximum number of documents to return
             filters: Optional filters
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[Document]: List of documents
         """
-        params, data = self._client._logic._prepare_list_documents_request(
-            skip, limit, filters, self._name, None
-        )
+        effective_folder = self._merge_folders(additional_folders)
+        params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, effective_folder, None)
         response = self._client._request("POST", "documents", data=data, params=params)
         docs = self._client._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self._client
         return docs
-    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
+    def batch_get_documents(
+        self, document_ids: List[str], additional_folders: Optional[List[str]] = None
+    ) -> List[Document]:
         """
         Retrieve multiple documents by their IDs in a single batch operation within this folder.
         Args:
             document_ids: List of document IDs to retrieve
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[Document]: List of document metadata for found documents
         """
-        request = {"document_ids": document_ids, "folder_name": self._name}
+        merged = self._merge_folders(additional_folders)
+        request = {"document_ids": document_ids, "folder_name": merged}
         response = self._client._request("POST", "batch/documents", data=request)
         docs = [self._client._logic._parse_document_response(doc) for doc in response]
@@ -448,13 +471,16 @@ class Folder:
         return docs
     def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
+        self,
+        sources: List[Union[ChunkSource, Dict[str, Any]]],
+        additional_folders: Optional[List[str]] = None,
     ) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
         Args:
             sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[FinalChunkResult]: List of chunk results
@@ -467,8 +493,8 @@ class Folder:
             else:
                 source_dicts.append(source.model_dump())
-        # Add folder_name to request
-        request = {"sources": source_dicts, "folder_name": self._name}
+        merged = self._merge_folders(additional_folders)
+        request = {"sources": source_dicts, "folder_name": merged}
         response = self._client._request("POST", "batch/chunks", data=request)
         return self._client._logic._parse_chunk_result_list_response(response)
@@ -505,7 +531,9 @@ class Folder:
         }
         response = self._client._request("POST", "graph/create", request)
-        return self._client._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def update_graph(
         self,
@@ -538,7 +566,9 @@ class Folder:
         }
         response = self._client._request("POST", f"graph/{name}/update", request)
-        return self._client._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
         """
@@ -550,18 +580,28 @@ class Folder:
         Returns:
             Dict[str, str]: Deletion status
         """
-        # Get the document by filename with folder scope
-        request = {"filename": filename, "folder_name": self._name}
         # First get the document ID
-        response = self._client._request(
-            "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
-        )
+        response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
         doc = self._client._logic._parse_document_response(response)
         # Then delete by ID
         return self._client.delete_document(doc.external_id)
+    # Helper --------------------------------------------------------------
+    def _merge_folders(self, additional_folders: Optional[List[str]] = None) -> Union[str, List[str]]:
+        """Return the effective folder scope.
+        If *additional_folders* is provided it will be combined with the folder's
+        own *self._name* and returned as a list (to preserve ordering and allow
+        duplicates to be removed server-side).  Otherwise just *self._name* is
+        returned so we keep backward-compatibility with the original API that
+        expected a single string.
+        """
+        if not additional_folders:
+            return self._name
+        # Pre-pend the scoped folder to the list provided by the caller.
+        return [self._name] + additional_folders
 class UserScope:
     """
@@ -677,7 +717,7 @@ class UserScope:
             # Add folder name if scoped to a folder
             if self._folder_name:
                 form_data["folder_name"] = self._folder_name
             # use_colpali should be a query parameter as defined in the API
             response = self._client._request(
                 "POST",
@@ -732,9 +772,7 @@ class UserScope:
             if rules:
                 if all(isinstance(r, list) for r in rules):
                     # List of lists - per-file rules
-                    converted_rules = [
-                        [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
-                    ]
+                    converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
                 else:
                     # Flat list - shared rules for all files
                     converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -754,9 +792,9 @@ class UserScope:
                 data["folder_name"] = self._folder_name
             response = self._client._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -766,9 +804,7 @@ class UserScope:
                 for error in response["errors"]:
                     logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
-            docs = [
-                self._client._logic._parse_document_response(doc) for doc in response["documents"]
-            ]
+            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
             for doc in docs:
                 doc._client = self._client
             return docs
@@ -831,6 +867,7 @@ class UserScope:
         k: int = 4,
         min_score: float = 0.0,
         use_colpali: bool = True,
+        additional_folders: Optional[List[str]] = None,
     ) -> List[FinalChunkResult]:
         """
         Retrieve relevant chunks as this end user.
@@ -841,10 +878,12 @@ class UserScope:
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
             use_colpali: Whether to use ColPali-style embedding model
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[FinalChunkResult]: List of relevant chunks
         """
+        effective_folder = self._merge_folders(additional_folders)
         request = {
             "query": query,
             "filters": filters,
@@ -852,6 +891,7 @@ class UserScope:
             "min_score": min_score,
             "use_colpali": use_colpali,
             "end_user_id": self._end_user_id,  # Add end user ID here
+            "folder_name": effective_folder,  # Add folder name if provided
         }
         # Add folder name if scoped to a folder
@@ -868,6 +908,7 @@ class UserScope:
         k: int = 4,
         min_score: float = 0.0,
         use_colpali: bool = True,
+        additional_folders: Optional[List[str]] = None,
     ) -> List[DocumentResult]:
         """
         Retrieve relevant documents as this end user.
@@ -878,10 +919,12 @@ class UserScope:
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
             use_colpali: Whether to use ColPali-style embedding model
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[DocumentResult]: List of relevant documents
         """
+        effective_folder = self._merge_folders(additional_folders)
         request = {
             "query": query,
             "filters": filters,
@@ -889,6 +932,7 @@ class UserScope:
             "min_score": min_score,
             "use_colpali": use_colpali,
             "end_user_id": self._end_user_id,  # Add end user ID here
+            "folder_name": effective_folder,  # Add folder name if provided
         }
         # Add folder name if scoped to a folder
@@ -911,6 +955,8 @@ class UserScope:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        additional_folders: Optional[List[str]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context as this end user.
@@ -927,10 +973,13 @@ class UserScope:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            additional_folders: Optional list of extra folders to include in the scope
+            schema: Optional schema for structured output
         Returns:
             CompletionResponse: Generated completion
         """
+        effective_folder = self._merge_folders(additional_folders)
         payload = self._client._logic._prepare_query_request(
             query,
             filters,
@@ -943,14 +992,31 @@ class UserScope:
             hop_depth,
             include_paths,
             prompt_overrides,
-            self._folder_name,
+            effective_folder,
             self._end_user_id,
+            schema,
         )
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
         response = self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
     def list_documents(
-        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
+        self,
+        skip: int = 0,
+        limit: int = 100,
+        filters: Optional[Dict[str, Any]] = None,
+        additional_folders: Optional[List[str]] = None,
     ) -> List[Document]:
         """
         List accessible documents for this end user.
@@ -959,6 +1025,7 @@ class UserScope:
             skip: Number of documents to skip
             limit: Maximum number of documents to return
             filters: Optional filters
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[Document]: List of documents
@@ -970,28 +1037,36 @@ class UserScope:
         if self._folder_name:
             params["folder_name"] = self._folder_name
-        response = self._client._request("POST", f"documents", data=filters or {}, params=params)
+        # Merge any additional folders into the request params
+        effective_folder = self._merge_folders(additional_folders)
+        if effective_folder:
+            params["folder_name"] = effective_folder
+        response = self._client._request("POST", "documents", data=filters or {}, params=params)
         docs = [self._client._logic._parse_document_response(doc) for doc in response]
         for doc in docs:
             doc._client = self._client
         return docs
-    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
+    def batch_get_documents(
+        self, document_ids: List[str], additional_folders: Optional[List[str]] = None
+    ) -> List[Document]:
         """
         Retrieve multiple documents by their IDs in a single batch operation for this end user.
         Args:
             document_ids: List of document IDs to retrieve
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[Document]: List of document metadata for found documents
         """
+        merged = self._merge_folders(additional_folders)
         request = {"document_ids": document_ids, "end_user_id": self._end_user_id}
-        # Add folder name if scoped to a folder
-        if self._folder_name:
-            request["folder_name"] = self._folder_name
+        if merged:
+            request["folder_name"] = merged
         response = self._client._request("POST", "batch/documents", data=request)
         docs = [self._client._logic._parse_document_response(doc) for doc in response]
@@ -1000,13 +1075,16 @@ class UserScope:
         return docs
     def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
+        self,
+        sources: List[Union[ChunkSource, Dict[str, Any]]],
+        additional_folders: Optional[List[str]] = None,
     ) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
         Args:
             sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
+            additional_folders: Optional list of extra folders to include in the scope
         Returns:
             List[FinalChunkResult]: List of chunk results
@@ -1019,12 +1097,11 @@ class UserScope:
             else:
                 source_dicts.append(source.model_dump())
-        # Add end_user_id and folder_name to request
+        merged = self._merge_folders(additional_folders)
         request = {"sources": source_dicts, "end_user_id": self._end_user_id}
-        # Add folder name if scoped to a folder
-        if self._folder_name:
-            request["folder_name"] = self._folder_name
+        if merged:
+            request["folder_name"] = merged
         response = self._client._request("POST", "batch/chunks", data=request)
         return self._client._logic._parse_chunk_result_list_response(response)
@@ -1065,7 +1142,9 @@ class UserScope:
             request["folder_name"] = self._folder_name
         response = self._client._request("POST", "graph/create", request)
-        return self._client._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def update_graph(
         self,
@@ -1102,7 +1181,9 @@ class UserScope:
             request["folder_name"] = self._folder_name
         response = self._client._request("POST", f"graph/{name}/update", request)
-        return self._client._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
         """
@@ -1128,6 +1209,22 @@ class UserScope:
         # Then delete by ID
         return self._client.delete_document(doc.external_id)
+    # Helper --------------------------------------------------------------
+    def _merge_folders(self, additional_folders: Optional[List[str]] = None) -> Union[str, List[str], None]:
+        """Return combined folder scope for user.
+        When this user scope is already tied to *self._folder_name* we combine it
+        with any *additional_folders* passed by the caller.  Otherwise just the
+        *additional_folders* (or None) is returned so that upstream logic is
+        unchanged.
+        """
+        base = self._folder_name
+        if additional_folders:
+            if base:
+                return [base] + additional_folders
+            return additional_folders
+        return base
 class Morphik:
     """
@@ -1173,12 +1270,12 @@ class Morphik:
             # Remove Content-Type if it exists - httpx will set the correct multipart boundary
             if "Content-Type" in headers:
                 del headers["Content-Type"]
             # For file uploads with form data, use form data (not json)
             request_data = {"files": files}
             if data:
                 request_data["data"] = data
             # Files are now properly handled
         else:
             # JSON for everything else
@@ -1192,8 +1289,13 @@ class Morphik:
             params=params,
             **request_data,
         )
-        response.raise_for_status()
-        return response.json()
+        try:
+            response.raise_for_status()
+            return response.json()
+        except httpx.HTTPStatusError as e:
+            # Print error response for debugging
+            print(f"Error response: {e.response.status_code} - {e.response.text}")
+            raise
     def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
         """Convert a rule to a dictionary format"""
@@ -1210,18 +1312,16 @@ class Morphik:
         Returns:
             Folder: A folder object ready for scoped operations
         """
-        payload = {
-            "name": name
-        }
+        payload = {"name": name}
         if description:
             payload["description"] = description
         response = self._request("POST", "folders", data=payload)
         folder_info = FolderInfo(**response)
         # Return a usable Folder object with the ID from the response
         return Folder(self, name, folder_id=folder_info.id)
     def get_folder_by_name(self, name: str) -> Folder:
         """
         Get a folder by name to scope operations.
@@ -1233,7 +1333,7 @@ class Morphik:
             Folder: A folder object for scoped operations
         """
         return Folder(self, name)
     def get_folder(self, folder_id: str) -> Folder:
         """
         Get a folder by ID.
@@ -1250,13 +1350,13 @@ class Morphik:
     def list_folders(self) -> List[Folder]:
         """
         List all folders the user has access to as Folder objects.
         Returns:
             List[Folder]: List of Folder objects ready for operations
         """
         folder_infos = self._request("GET", "folders")
         return [Folder(self, info["name"], info["id"]) for info in folder_infos]
     def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Add a document to a folder.
@@ -1270,7 +1370,7 @@ class Morphik:
         """
         response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
         return response
     def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Remove a document from a folder.
@@ -1314,7 +1414,8 @@ class Morphik:
             rules: Optional list of rules to apply during ingestion. Can be:
                   - MetadataExtractionRule: Extract metadata using a schema
                   - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images)
+            use_colpali: Whether to use ColPali-style embedding model to ingest the text
+                (slower, but significantly better retrieval accuracy for text and images)
         Returns:
             Document: Metadata of the ingested document
@@ -1367,7 +1468,8 @@ class Morphik:
             rules: Optional list of rules to apply during ingestion. Can be:
                   - MetadataExtractionRule: Extract metadata using a schema
                   - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the file (slower, but significantly better retrieval accuracy for images)
+            use_colpali: Whether to use ColPali-style embedding model to ingest the file
+                (slower, but significantly better retrieval accuracy for images)
         Returns:
             Document: Metadata of the ingested document
@@ -1450,14 +1552,12 @@ class Morphik:
         try:
             # Prepare form data
             # Prepare form data - use_colpali should be a query parameter, not form data
-            data = self._logic._prepare_ingest_files_form_data(
-                metadata, rules, use_colpali, parallel, None, None
-            )
+            data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
             response = self._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -1533,6 +1633,7 @@ class Morphik:
         k: int = 4,
         min_score: float = 0.0,
         use_colpali: bool = True,
+        folder_name: Optional[Union[str, List[str]]] = None,
     ) -> List[FinalChunkResult]:
         """
         Retrieve relevant chunks.
@@ -1542,7 +1643,8 @@ class Morphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks (only works for documents ingested with `use_colpali=True`)
+            use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[ChunkResult]
@@ -1555,7 +1657,7 @@ class Morphik:
             ```
         """
         payload = self._logic._prepare_retrieve_chunks_request(
-            query, filters, k, min_score, use_colpali, None, None
+            query, filters, k, min_score, use_colpali, folder_name, None
         )
         response = self._request("POST", "retrieve/chunks", data=payload)
         return self._logic._parse_chunk_result_list_response(response)
@@ -1567,6 +1669,7 @@ class Morphik:
         k: int = 4,
         min_score: float = 0.0,
         use_colpali: bool = True,
+        folder_name: Optional[Union[str, List[str]]] = None,
     ) -> List[DocumentResult]:
         """
         Retrieve relevant documents.
@@ -1576,7 +1679,8 @@ class Morphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve the documents (only works for documents ingested with `use_colpali=True`)
+            use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[DocumentResult]
@@ -1589,7 +1693,7 @@ class Morphik:
             ```
         """
         payload = self._logic._prepare_retrieve_docs_request(
-            query, filters, k, min_score, use_colpali, None, None
+            query, filters, k, min_score, use_colpali, folder_name, None
         )
         response = self._request("POST", "retrieve/docs", data=payload)
         return self._logic._parse_document_result_list_response(response)
@@ -1607,6 +1711,8 @@ class Morphik:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        folder_name: Optional[Union[str, List[str]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context.
@@ -1618,12 +1724,15 @@ class Morphik:
             min_score: Minimum similarity threshold (default: 0.0)
             max_tokens: Maximum tokens in completion
             temperature: Model temperature
-            use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
+            use_colpali: Whether to use ColPali-style embedding model to generate the completion
+                (only works for documents ingested with `use_colpali=True`)
             graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
                 Either a QueryPromptOverrides object or a dictionary with the same structure
+            folder_name: Optional folder name to further scope operations
+            schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
         Returns:
             CompletionResponse
@@ -1671,8 +1780,30 @@ class Morphik:
             if response.metadata and "graph" in response.metadata:
                 for path in response.metadata["graph"]["paths"]:
                     print(" -> ".join(path))
+            # Using structured output with a Pydantic model
+            from pydantic import BaseModel
+            class ResearchFindings(BaseModel):
+                main_finding: str
+                supporting_evidence: List[str]
+                limitations: List[str]
+            response = db.query(
+                "Summarize the key research findings from these documents",
+                schema=ResearchFindings
+            )
+            # Access structured output
+            if response.structured_output:
+                findings = response.structured_output
+                print(f"Main finding: {findings.main_finding}")
+                print("Supporting evidence:")
+                for evidence in findings.supporting_evidence:
+                    print(f"- {evidence}")
             ```
         """
+        # Directly forward the supplied folder_name (may be None, str, or List[str])
         payload = self._logic._prepare_query_request(
             query,
             filters,
@@ -1685,14 +1816,31 @@ class Morphik:
             hop_depth,
             include_paths,
             prompt_overrides,
-            None,
-            None,
+            folder_name,
+            None,  # end_user_id not supported at this level
+            schema,
         )
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
         response = self._request("POST", "query", data=payload)
         return self._logic._parse_completion_response(response)
     def list_documents(
-        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
+        self,
+        skip: int = 0,
+        limit: int = 100,
+        filters: Optional[Dict[str, Any]] = None,
+        folder_name: Optional[Union[str, List[str]]] = None,
     ) -> List[Document]:
         """
         List accessible documents.
@@ -1701,6 +1849,7 @@ class Morphik:
             skip: Number of documents to skip
             limit: Maximum number of documents to return
             filters: Optional filters
+            folder_name: Optional folder name (or list of names) to scope the request
         Returns:
             List[Document]: List of accessible documents
@@ -1714,7 +1863,7 @@ class Morphik:
             next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
             ```
         """
-        params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
+        params, data = self._logic._prepare_list_documents_request(skip, limit, filters, folder_name, None)
         response = self._request("POST", "documents", data=data, params=params)
         docs = self._logic._parse_document_list_response(response)
         for doc in docs:
@@ -1741,17 +1890,17 @@ class Morphik:
         doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
     def get_document_status(self, document_id: str) -> Dict[str, Any]:
         """
         Get the current processing status of a document.
         Args:
             document_id: ID of the document to check
         Returns:
             Dict[str, Any]: Status information including current status, potential errors, and other metadata
         Example:
             ```python
             status = db.get_document_status("doc_123")
@@ -1765,23 +1914,23 @@ class Morphik:
         """
         response = self._request("GET", f"documents/{document_id}/status")
         return response
     def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
         """
         Wait for a document's processing to complete.
         Args:
             document_id: ID of the document to wait for
             timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
             check_interval_seconds: Time between status checks (default: 2 seconds)
         Returns:
             Document: Updated document with the latest status
         Raises:
             TimeoutError: If processing doesn't complete within the timeout period
             ValueError: If processing fails with an error
         Example:
             ```python
             # Upload a file and wait for processing to complete
@@ -1796,20 +1945,21 @@ class Morphik:
             ```
         """
         import time
         start_time = time.time()
         while (time.time() - start_time) < timeout_seconds:
             status = self.get_document_status(document_id)
             if status["status"] == "completed":
                 # Get the full document now that it's complete
                 return self.get_document(document_id)
             elif status["status"] == "failed":
                 raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
             # Wait before checking again
             time.sleep(check_interval_seconds)
         raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
     def get_document_by_filename(self, filename: str) -> Document:
@@ -1963,9 +2113,7 @@ class Morphik:
                 form_data["use_colpali"] = str(use_colpali).lower()
             # Use the dedicated file update endpoint
-            response = self._request(
-                "POST", f"documents/{document_id}/update_file", data=form_data, files=files
-            )
+            response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
             doc = self._logic._parse_document_response(response)
             doc._client = self
@@ -2167,12 +2315,15 @@ class Morphik:
         return result
-    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
+    def batch_get_documents(
+        self, document_ids: List[str], folder_name: Optional[Union[str, List[str]]] = None
+    ) -> List[Document]:
         """
-        Retrieve multiple documents by their IDs in a single batch operation.
+        Retrieve multiple documents by their IDs.
         Args:
             document_ids: List of document IDs to retrieve
+            folder_name: Optional folder name (or list of names) to scope the request
         Returns:
             List[Document]: List of document metadata for found documents
@@ -2184,21 +2335,23 @@ class Morphik:
                 print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
             ```
         """
-        # API expects a dict with document_ids key, not a direct list
-        response = self._request("POST", "batch/documents", data={"document_ids": document_ids})
+        # Build request respecting folder scoping if provided
+        request = self._logic._prepare_batch_get_documents_request(document_ids, folder_name, None)
+        response = self._request("POST", "batch/documents", data=request)
         docs = self._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self
         return docs
     def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
+        self, sources: List[Union[ChunkSource, Dict[str, Any]]], folder_name: Optional[Union[str, List[str]]] = None
     ) -> List[FinalChunkResult]:
         """
-        Retrieve specific chunks by their document ID and chunk number in a single batch operation.
+        Retrieve specific chunks by their document ID and chunk number.
         Args:
             sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
+            folder_name: Optional folder name (or list of names) to scope the request
         Returns:
             List[FinalChunkResult]: List of chunk results
@@ -2223,15 +2376,8 @@ class Morphik:
                 print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
             ```
         """
-        # Convert to list of dictionaries if needed
-        source_dicts = []
-        for source in sources:
-            if isinstance(source, dict):
-                source_dicts.append(source)
-            else:
-                source_dicts.append(source.model_dump())
-        response = self._request("POST", "batch/chunks", data=source_dicts)
+        request = self._logic._prepare_batch_get_chunks_request(sources, folder_name, None)
+        response = self._request("POST", "batch/chunks", data=request)
         return self._logic._parse_chunk_result_list_response(response)
     def create_cache(
@@ -2249,8 +2395,10 @@ class Morphik:
             name: Name of the cache to create
             model: Name of the model to use (e.g. "llama2")
             gguf_file: Name of the GGUF file to use for the model
-            filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
-            docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.
+            filters: Optional metadata filters to determine which documents to include.
+                These filters will be applied in addition to any specific docs provided.
+            docs: Optional list of specific document IDs to include.
+                These docs will be included in addition to any documents matching the filters.
         Returns:
             Dict[str, Any]: Created cache configuration
@@ -2355,15 +2503,21 @@ class Morphik:
         if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
             prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
-        request = {
-            "name": name,
-            "filters": filters,
-            "documents": documents,
-            "prompt_overrides": prompt_overrides,
-        }
+        # Initialize request with required fields
+        request = {"name": name}
+        # Add optional fields only if they are not None
+        if filters is not None:
+            request["filters"] = filters
+        if documents is not None:
+            request["documents"] = documents
+        if prompt_overrides is not None:
+            request["prompt_overrides"] = prompt_overrides
         response = self._request("POST", "graph/create", request)
-        return self._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def get_graph(self, name: str) -> Graph:
         """
@@ -2383,7 +2537,9 @@ class Morphik:
             ```
         """
         response = self._request("GET", f"graph/{name}")
-        return self._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def list_graphs(self) -> List[Graph]:
         """
@@ -2401,7 +2557,10 @@ class Morphik:
             ```
         """
         response = self._request("GET", "graphs")
-        return self._logic._parse_graph_list_response(response)
+        graphs = self._logic._parse_graph_list_response(response)
+        for g in graphs:
+            g._client = self
+        return graphs
     def update_graph(
         self,
@@ -2465,7 +2624,9 @@ class Morphik:
         }
         response = self._request("POST", f"graph/{name}/update", request)
-        return self._logic._parse_graph_response(response)
+        graph = self._logic._parse_graph_response(response)
+        graph._client = self
+        return graph
     def delete_document(self, document_id: str) -> Dict[str, str]:
         """
@@ -2527,3 +2688,50 @@ class Morphik:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
+    def create_app(self, app_id: str, name: str, expiry_days: int = 30) -> Dict[str, str]:
+        """Create a new application in Morphik Cloud and obtain its auth URI.
+        Sends a POST request to the enterprise endpoint ``ee/create_app`` with the
+        supplied ``app_id``, ``name`` and ``expiry_days`` and returns the response as-is.
+        Args:
+            app_id: Identifier for the new application.
+            name: Human-readable application name (will be slugified by the server).
+            expiry_days: Token validity period in days (default: 30).
+        Returns:
+            Dict[str, str]: The endpoint's response, a dictionary of the form
+                ``{"uri": ..., "app_id": ...}``.
+        """
+        payload = {"app_id": app_id, "name": name, "expiry_days": expiry_days}
+        return self._request("POST", "ee/create_app", data=payload)
+    def wait_for_graph_completion(
+        self,
+        graph_name: str,
+        timeout_seconds: int = 300,
+        check_interval_seconds: int = 5,
+    ) -> Graph:
+        """Block until the specified graph finishes processing.
+        Repeatedly fetches the graph with ``get_graph`` and sleeps
+        ``check_interval_seconds`` between attempts until the graph reports
+        completion or failure, or until ``timeout_seconds`` have elapsed.
+        Args:
+            graph_name: Name of the graph to monitor.
+            timeout_seconds: Maximum seconds to wait.
+            check_interval_seconds: Seconds between status checks.
+        Returns:
+            Graph: The completed graph object.
+        Raises:
+            RuntimeError: If the graph reports failure; the message is ``graph.error``
+                when set, otherwise a generic failure message.
+            TimeoutError: If the graph is neither completed nor failed within
+                ``timeout_seconds``.
+        """
+        import time
+        # Measure elapsed time on the monotonic clock: wall-clock adjustments
+        # (NTP steps, manual changes) must not shorten or extend the timeout.
+        deadline = time.monotonic() + timeout_seconds
+        while time.monotonic() < deadline:
+            graph = self.get_graph(graph_name)
+            if graph.is_completed:
+                return graph
+            if graph.is_failed:
+                raise RuntimeError(graph.error or "Graph processing failed")
+            time.sleep(check_interval_seconds)
+        raise TimeoutError("Timed out waiting for graph completion")

morphik 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

morphik 0.1.4py3-none-any.whl → 0.1.6py3-none-any.whl