PyPI - morphik - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

morphik/__init__.py +1 -1
morphik/_internal.py +28 -19
morphik/async_.py +121 -110
morphik/models.py +36 -57
morphik/rules.py +28 -5
morphik/sync.py +156 -109
morphik/tests/README.md +1 -1
morphik/tests/example_usage.py +69 -69
morphik/tests/test_async.py +166 -82
morphik/tests/test_docs/sample1.txt +1 -1
morphik/tests/test_docs/sample2.txt +2 -2
morphik/tests/test_docs/sample3.txt +1 -1
morphik/tests/test_sync.py +162 -84
{morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
morphik-0.1.5.dist-info/RECORD +18 -0
morphik-0.1.4.dist-info/RECORD +0 -18
{morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0

morphik/sync.py CHANGED

@@ -2,27 +2,23 @@ import json
 import logging
 from io import BytesIO, IOBase
 from pathlib import Path
-from typing import Dict, Any, List, Optional, Union, BinaryIO
-from PIL import Image
-from PIL.Image import Image as PILImage
+from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
 import httpx
+from pydantic import BaseModel
+from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
 from .models import (
+    ChunkSource,
+    CompletionResponse,  # Prompt override models
     Document,
     DocumentResult,
-    CompletionResponse,
-    IngestTextRequest,
-    ChunkSource,
-    Graph,
     FolderInfo,
-    # Prompt override models
+    Graph,
     GraphPromptOverrides,
+    IngestTextRequest,
     QueryPromptOverrides,
 )
-from .rules import Rule
-from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
 logger = logging.getLogger(__name__)
@@ -71,16 +67,16 @@ class Folder:
     def name(self) -> str:
         """Returns the folder name."""
         return self._name
     @property
     def id(self) -> Optional[str]:
         """Returns the folder ID if available."""
         return self._id
     def get_info(self) -> Dict[str, Any]:
         """
         Get detailed information about this folder.
         Returns:
             Dict[str, Any]: Detailed folder information
         """
@@ -93,9 +89,8 @@ class Folder:
                     break
             if not self._id:
                 raise ValueError(f"Folder '{self._name}' not found")
         return self._client._request("GET", f"folders/{self._id}")
     def signin(self, end_user_id: str) -> "UserScope":
         """
@@ -168,9 +163,7 @@ class Folder:
             files = {"file": (filename, file_obj)}
             # Create form data
-            form_data = self._client._logic._prepare_ingest_file_form_data(
-                metadata, rules, self._name, None
-            )
+            form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
             # use_colpali should be a query parameter as defined in the API
             response = self._client._request(
@@ -219,9 +212,9 @@ class Folder:
             )
             response = self._client._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -231,9 +224,7 @@ class Folder:
                 for error in response["errors"]:
                     logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
-            docs = [
-                self._client._logic._parse_document_response(doc) for doc in response["documents"]
-            ]
+            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
             for doc in docs:
                 doc._client = self._client
             return docs
@@ -368,6 +359,7 @@ class Folder:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context within this folder.
@@ -384,6 +376,7 @@ class Folder:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            schema: Optional schema for structured output
         Returns:
             CompletionResponse: Generated completion
@@ -402,7 +395,20 @@ class Folder:
             prompt_overrides,
             self._name,
             None,
+            schema,
         )
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
         response = self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
@@ -420,9 +426,7 @@ class Folder:
         Returns:
             List[Document]: List of documents
         """
-        params, data = self._client._logic._prepare_list_documents_request(
-            skip, limit, filters, self._name, None
-        )
+        params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
         response = self._client._request("POST", "documents", data=data, params=params)
         docs = self._client._logic._parse_document_list_response(response)
         for doc in docs:
@@ -447,9 +451,7 @@ class Folder:
             doc._client = self._client
         return docs
-    def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
@@ -550,13 +552,8 @@ class Folder:
         Returns:
             Dict[str, str]: Deletion status
         """
-        # Get the document by filename with folder scope
-        request = {"filename": filename, "folder_name": self._name}
         # First get the document ID
-        response = self._client._request(
-            "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
-        )
+        response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
         doc = self._client._logic._parse_document_response(response)
         # Then delete by ID
@@ -677,7 +674,7 @@ class UserScope:
             # Add folder name if scoped to a folder
             if self._folder_name:
                 form_data["folder_name"] = self._folder_name
             # use_colpali should be a query parameter as defined in the API
             response = self._client._request(
                 "POST",
@@ -732,9 +729,7 @@ class UserScope:
             if rules:
                 if all(isinstance(r, list) for r in rules):
                     # List of lists - per-file rules
-                    converted_rules = [
-                        [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
-                    ]
+                    converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
                 else:
                     # Flat list - shared rules for all files
                     converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -754,9 +749,9 @@ class UserScope:
                 data["folder_name"] = self._folder_name
             response = self._client._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -766,9 +761,7 @@ class UserScope:
                 for error in response["errors"]:
                     logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
-            docs = [
-                self._client._logic._parse_document_response(doc) for doc in response["documents"]
-            ]
+            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
             for doc in docs:
                 doc._client = self._client
             return docs
@@ -911,6 +904,7 @@ class UserScope:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context as this end user.
@@ -927,6 +921,7 @@ class UserScope:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            schema: Optional schema for structured output
         Returns:
             CompletionResponse: Generated completion
@@ -945,7 +940,20 @@ class UserScope:
             prompt_overrides,
             self._folder_name,
             self._end_user_id,
+            schema,
         )
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
         response = self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
@@ -970,7 +978,7 @@ class UserScope:
         if self._folder_name:
             params["folder_name"] = self._folder_name
-        response = self._client._request("POST", f"documents", data=filters or {}, params=params)
+        response = self._client._request("POST", "documents", data=filters or {}, params=params)
         docs = [self._client._logic._parse_document_response(doc) for doc in response]
         for doc in docs:
@@ -999,9 +1007,7 @@ class UserScope:
             doc._client = self._client
         return docs
-    def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
@@ -1173,12 +1179,12 @@ class Morphik:
             # Remove Content-Type if it exists - httpx will set the correct multipart boundary
             if "Content-Type" in headers:
                 del headers["Content-Type"]
             # For file uploads with form data, use form data (not json)
             request_data = {"files": files}
             if data:
                 request_data["data"] = data
             # Files are now properly handled
         else:
             # JSON for everything else
@@ -1192,8 +1198,13 @@ class Morphik:
             params=params,
             **request_data,
         )
-        response.raise_for_status()
-        return response.json()
+        try:
+            response.raise_for_status()
+            return response.json()
+        except httpx.HTTPStatusError as e:
+            # Print error response for debugging
+            print(f"Error response: {e.response.status_code} - {e.response.text}")
+            raise
     def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
         """Convert a rule to a dictionary format"""
@@ -1210,18 +1221,16 @@ class Morphik:
         Returns:
             Folder: A folder object ready for scoped operations
         """
-        payload = {
-            "name": name
-        }
+        payload = {"name": name}
         if description:
             payload["description"] = description
         response = self._request("POST", "folders", data=payload)
         folder_info = FolderInfo(**response)
         # Return a usable Folder object with the ID from the response
         return Folder(self, name, folder_id=folder_info.id)
     def get_folder_by_name(self, name: str) -> Folder:
         """
         Get a folder by name to scope operations.
@@ -1233,7 +1242,7 @@ class Morphik:
             Folder: A folder object for scoped operations
         """
         return Folder(self, name)
     def get_folder(self, folder_id: str) -> Folder:
         """
         Get a folder by ID.
@@ -1250,13 +1259,13 @@ class Morphik:
     def list_folders(self) -> List[Folder]:
         """
         List all folders the user has access to as Folder objects.
         Returns:
             List[Folder]: List of Folder objects ready for operations
         """
         folder_infos = self._request("GET", "folders")
         return [Folder(self, info["name"], info["id"]) for info in folder_infos]
     def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Add a document to a folder.
@@ -1270,7 +1279,7 @@ class Morphik:
         """
         response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
         return response
     def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Remove a document from a folder.
@@ -1314,7 +1323,8 @@ class Morphik:
             rules: Optional list of rules to apply during ingestion. Can be:
                   - MetadataExtractionRule: Extract metadata using a schema
                   - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images)
+            use_colpali: Whether to use ColPali-style embedding model to ingest the text
+                (slower, but significantly better retrieval accuracy for text and images)
         Returns:
             Document: Metadata of the ingested document
@@ -1367,7 +1377,8 @@ class Morphik:
             rules: Optional list of rules to apply during ingestion. Can be:
                   - MetadataExtractionRule: Extract metadata using a schema
                   - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the file (slower, but significantly better retrieval accuracy for images)
+            use_colpali: Whether to use ColPali-style embedding model to ingest the file
+                (slower, but significantly better retrieval accuracy for images)
         Returns:
             Document: Metadata of the ingested document
@@ -1450,14 +1461,12 @@ class Morphik:
         try:
             # Prepare form data
             # Prepare form data - use_colpali should be a query parameter, not form data
-            data = self._logic._prepare_ingest_files_form_data(
-                metadata, rules, use_colpali, parallel, None, None
-            )
+            data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
             response = self._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -1542,7 +1551,8 @@ class Morphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks (only works for documents ingested with `use_colpali=True`)
+            use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[ChunkResult]
@@ -1554,9 +1564,7 @@ class Morphik:
             )
             ```
         """
-        payload = self._logic._prepare_retrieve_chunks_request(
-            query, filters, k, min_score, use_colpali, None, None
-        )
+        payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
         response = self._request("POST", "retrieve/chunks", data=payload)
         return self._logic._parse_chunk_result_list_response(response)
@@ -1576,7 +1584,8 @@ class Morphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve the documents (only works for documents ingested with `use_colpali=True`)
+            use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[DocumentResult]
@@ -1588,9 +1597,7 @@ class Morphik:
             )
             ```
         """
-        payload = self._logic._prepare_retrieve_docs_request(
-            query, filters, k, min_score, use_colpali, None, None
-        )
+        payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
         response = self._request("POST", "retrieve/docs", data=payload)
         return self._logic._parse_document_result_list_response(response)
@@ -1607,6 +1614,7 @@ class Morphik:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context.
@@ -1618,12 +1626,14 @@ class Morphik:
             min_score: Minimum similarity threshold (default: 0.0)
             max_tokens: Maximum tokens in completion
             temperature: Model temperature
-            use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
+            use_colpali: Whether to use ColPali-style embedding model to generate the completion
+                (only works for documents ingested with `use_colpali=True`)
             graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
                 Either a QueryPromptOverrides object or a dictionary with the same structure
+            schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
         Returns:
             CompletionResponse
@@ -1671,6 +1681,27 @@ class Morphik:
             if response.metadata and "graph" in response.metadata:
                 for path in response.metadata["graph"]["paths"]:
                     print(" -> ".join(path))
+            # Using structured output with a Pydantic model
+            from pydantic import BaseModel
+            class ResearchFindings(BaseModel):
+                main_finding: str
+                supporting_evidence: List[str]
+                limitations: List[str]
+            response = db.query(
+                "Summarize the key research findings from these documents",
+                schema=ResearchFindings
+            )
+            # Access structured output
+            if response.structured_output:
+                findings = response.structured_output
+                print(f"Main finding: {findings.main_finding}")
+                print("Supporting evidence:")
+                for evidence in findings.supporting_evidence:
+                    print(f"- {evidence}")
             ```
         """
         payload = self._logic._prepare_query_request(
@@ -1687,7 +1718,20 @@ class Morphik:
             prompt_overrides,
             None,
             None,
+            schema,
         )
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
         response = self._request("POST", "query", data=payload)
         return self._logic._parse_completion_response(response)
@@ -1741,17 +1785,17 @@ class Morphik:
         doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
     def get_document_status(self, document_id: str) -> Dict[str, Any]:
         """
         Get the current processing status of a document.
         Args:
             document_id: ID of the document to check
         Returns:
             Dict[str, Any]: Status information including current status, potential errors, and other metadata
         Example:
             ```python
             status = db.get_document_status("doc_123")
@@ -1765,23 +1809,23 @@ class Morphik:
         """
         response = self._request("GET", f"documents/{document_id}/status")
         return response
     def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
         """
         Wait for a document's processing to complete.
         Args:
             document_id: ID of the document to wait for
             timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
             check_interval_seconds: Time between status checks (default: 2 seconds)
         Returns:
             Document: Updated document with the latest status
         Raises:
             TimeoutError: If processing doesn't complete within the timeout period
             ValueError: If processing fails with an error
         Example:
             ```python
             # Upload a file and wait for processing to complete
@@ -1796,20 +1840,21 @@ class Morphik:
             ```
         """
         import time
         start_time = time.time()
         while (time.time() - start_time) < timeout_seconds:
             status = self.get_document_status(document_id)
             if status["status"] == "completed":
                 # Get the full document now that it's complete
                 return self.get_document(document_id)
             elif status["status"] == "failed":
                 raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
             # Wait before checking again
             time.sleep(check_interval_seconds)
         raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
     def get_document_by_filename(self, filename: str) -> Document:
@@ -1963,9 +2008,7 @@ class Morphik:
                 form_data["use_colpali"] = str(use_colpali).lower()
             # Use the dedicated file update endpoint
-            response = self._request(
-                "POST", f"documents/{document_id}/update_file", data=form_data, files=files
-            )
+            response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
             doc = self._logic._parse_document_response(response)
             doc._client = self
@@ -2191,9 +2234,7 @@ class Morphik:
             doc._client = self
         return docs
-    def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation.
@@ -2249,8 +2290,10 @@ class Morphik:
             name: Name of the cache to create
             model: Name of the model to use (e.g. "llama2")
             gguf_file: Name of the GGUF file to use for the model
-            filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
-            docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.
+            filters: Optional metadata filters to determine which documents to include.
+                These filters will be applied in addition to any specific docs provided.
+            docs: Optional list of specific document IDs to include.
+                These docs will be included in addition to any documents matching the filters.
         Returns:
             Dict[str, Any]: Created cache configuration
@@ -2355,12 +2398,16 @@ class Morphik:
         if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
             prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
-        request = {
-            "name": name,
-            "filters": filters,
-            "documents": documents,
-            "prompt_overrides": prompt_overrides,
-        }
+        # Initialize request with required fields
+        request = {"name": name}
+        # Add optional fields only if they are not None
+        if filters is not None:
+            request["filters"] = filters
+        if documents is not None:
+            request["documents"] = documents
+        if prompt_overrides is not None:
+            request["prompt_overrides"] = prompt_overrides
         response = self._request("POST", "graph/create", request)
         return self._logic._parse_graph_response(response)

morphik/tests/README.md CHANGED

@@ -38,4 +38,4 @@ python example_usage.py --async
 ## Environment Variables
 - `MORPHIK_TEST_URL` - The URL of the Morphik server to use for tests (default: http://localhost:8000)
-- `SKIP_LIVE_TESTS` - Set to "1" to skip tests that require a running server
+- `SKIP_LIVE_TESTS` - Set to "1" to skip tests that require a running server

morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl