PyPI - morphik - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

morphik/__init__.py +1 -1
morphik/_internal.py +28 -19
morphik/async_.py +121 -110
morphik/models.py +36 -57
morphik/rules.py +28 -5
morphik/sync.py +156 -109
morphik/tests/README.md +1 -1
morphik/tests/example_usage.py +69 -69
morphik/tests/test_async.py +166 -82
morphik/tests/test_docs/sample1.txt +1 -1
morphik/tests/test_docs/sample2.txt +2 -2
morphik/tests/test_docs/sample3.txt +1 -1
morphik/tests/test_sync.py +162 -84
{morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
morphik-0.1.5.dist-info/RECORD +18 -0
morphik-0.1.4.dist-info/RECORD +0 -18
{morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0

morphik/models.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
-from pathlib import Path
 from datetime import datetime
+from pathlib import Path
+from typing import Any, BinaryIO, Dict, List, Literal, Optional, Union
 from pydantic import BaseModel, Field, field_validator, model_validator
@@ -11,20 +12,14 @@ class Document(BaseModel):
     content_type: str = Field(..., description="Content type of the document")
     filename: Optional[str] = Field(None, description="Original filename if available")
     metadata: Dict[str, Any] = Field(default_factory=dict, description="User-defined metadata")
-    storage_info: Dict[str, str] = Field(
-        default_factory=dict, description="Storage-related information"
-    )
-    system_metadata: Dict[str, Any] = Field(
-        default_factory=dict, description="System-managed metadata"
-    )
-    access_control: Dict[str, Any] = Field(
-        default_factory=dict, description="Access control information"
-    )
+    storage_info: Dict[str, str] = Field(default_factory=dict, description="Storage-related information")
+    system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
+    access_control: Dict[str, Any] = Field(default_factory=dict, description="Access control information")
     chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks")
     # Client reference for update methods
     _client = None
     @property
     def status(self) -> Dict[str, Any]:
         """Get the latest processing status of the document from the API.
@@ -37,38 +32,38 @@ class Document(BaseModel):
                 "Document instance not connected to a client. Use a document returned from a Morphik client method."
             )
         return self._client.get_document_status(self.external_id)
     @property
     def is_processing(self) -> bool:
         """Check if the document is still being processed."""
         return self.status.get("status") == "processing"
     @property
     def is_ingested(self) -> bool:
         """Check if the document has completed processing."""
         return self.status.get("status") == "completed"
     @property
     def is_failed(self) -> bool:
         """Check if document processing has failed."""
         return self.status.get("status") == "failed"
     @property
     def error(self) -> Optional[str]:
         """Get the error message if processing failed."""
         status_info = self.status
         return status_info.get("error") if status_info.get("status") == "failed" else None
     def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
         """Wait for document processing to complete.
         Args:
             timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
             check_interval_seconds: Time between status checks (default: 2 seconds)
         Returns:
             Document: Updated document with the latest status
         Raises:
             TimeoutError: If processing doesn't complete within the timeout period
             ValueError: If processing fails with an error
@@ -173,9 +168,7 @@ class Document(BaseModel):
                 "Document instance not connected to a client. Use a document returned from a Morphik client method."
             )
-        return self._client.update_document_metadata(
-            document_id=self.external_id, metadata=metadata
-        )
+        return self._client.update_document_metadata(document_id=self.external_id, metadata=metadata)
 class ChunkResult(BaseModel):
@@ -227,12 +220,13 @@ class ChunkSource(BaseModel):
 class CompletionResponse(BaseModel):
     """Completion response model"""
-    completion: str
-    usage: Dict[str, int]
-    sources: List[ChunkSource] = Field(
-        default_factory=list, description="Sources of chunks used in the completion"
+    completion: Optional[Union[str, Dict[str, Any], None]] = Field(
+        None, description="Generated text completion or structured output"
     )
+    usage: Dict[str, int]
+    sources: List[ChunkSource] = Field(default_factory=list, description="Sources of chunks used in the completion")
     metadata: Optional[Dict[str, Any]] = None
+    finish_reason: Optional[str] = Field(None, description="Reason the generation finished (e.g., 'stop', 'length')")
 class IngestTextRequest(BaseModel):
@@ -253,9 +247,7 @@ class Entity(BaseModel):
     type: str = Field(..., description="Entity type")
     properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties")
     document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
-    chunk_sources: Dict[str, List[int]] = Field(
-        default_factory=dict, description="Source chunk numbers by document ID"
-    )
+    chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
     def __hash__(self):
         return hash(self.id)
@@ -274,9 +266,7 @@ class Relationship(BaseModel):
     target_id: str = Field(..., description="Target entity ID")
     type: str = Field(..., description="Relationship type")
     document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
-    chunk_sources: Dict[str, List[int]] = Field(
-        default_factory=dict, description="Source chunk numbers by document ID"
-    )
+    chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
     def __hash__(self):
         return hash(self.id)
@@ -293,20 +283,14 @@ class Graph(BaseModel):
     id: str = Field(..., description="Unique graph identifier")
     name: str = Field(..., description="Graph name")
     entities: List[Entity] = Field(default_factory=list, description="Entities in the graph")
-    relationships: List[Relationship] = Field(
-        default_factory=list, description="Relationships in the graph"
-    )
+    relationships: List[Relationship] = Field(default_factory=list, description="Relationships in the graph")
     metadata: Dict[str, Any] = Field(default_factory=dict, description="Graph metadata")
     document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
-    filters: Optional[Dict[str, Any]] = Field(
-        None, description="Document filters used to create the graph"
-    )
+    filters: Optional[Dict[str, Any]] = Field(None, description="Document filters used to create the graph")
     created_at: datetime = Field(..., description="Creation timestamp")
     updated_at: datetime = Field(..., description="Last update timestamp")
     owner: Dict[str, str] = Field(default_factory=dict, description="Graph owner information")
-    access_control: Dict[str, List[str]] = Field(
-        default_factory=dict, description="Access control information"
-    )
+    access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
 class EntityExtractionExample(BaseModel):
@@ -318,9 +302,7 @@ class EntityExtractionExample(BaseModel):
     """
     label: str = Field(..., description="The entity label (e.g., 'John Doe', 'Apple Inc.')")
-    type: str = Field(
-        ..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')"
-    )
+    type: str = Field(..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')")
     properties: Optional[Dict[str, Any]] = Field(
         default_factory=dict,
         description="Optional properties of the entity (e.g., {'role': 'CEO', 'age': 42})",
@@ -337,9 +319,7 @@ class EntityResolutionExample(BaseModel):
     """
     canonical: str = Field(..., description="The canonical (standard/preferred) form of the entity")
-    variants: List[str] = Field(
-        ..., description="List of variant forms that should resolve to the canonical form"
-    )
+    variants: List[str] = Field(..., description="List of variant forms that should resolve to the canonical form")
 class EntityExtractionPromptOverride(BaseModel):
@@ -425,11 +405,13 @@ class GraphPromptOverrides(BaseModel):
     entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
         None,
-        description="Overrides for entity extraction prompts - controls how entities are identified in text during graph operations",
+        description="Overrides for entity extraction prompts - controls how entities are identified in text "
+        "during graph operations",
     )
     entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
         None,
-        description="Overrides for entity resolution prompts - controls how variant forms are grouped during graph operations",
+        description="Overrides for entity resolution prompts - controls how variant forms are grouped "
+        "during graph operations",
     )
     @model_validator(mode="after")
@@ -455,7 +437,8 @@ class QueryPromptOverrides(BaseModel):
     entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
         None,
-        description="Overrides for entity extraction prompts - controls how entities are identified in text during queries",
+        description="Overrides for entity extraction prompts - controls how entities are identified in text "
+        "during queries",
     )
     entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
         None,
@@ -475,9 +458,5 @@ class FolderInfo(BaseModel):
     description: Optional[str] = Field(None, description="Folder description")
     owner: Dict[str, str] = Field(..., description="Owner information")
     document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
-    system_metadata: Dict[str, Any] = Field(
-        default_factory=dict, description="System-managed metadata"
-    )
-    access_control: Dict[str, List[str]] = Field(
-        default_factory=dict, description="Access control information"
-    )
+    system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
+    access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")

morphik/rules.py CHANGED Viewed

@@ -1,5 +1,6 @@
-from typing import Dict, Any, Type, Union
 from abc import ABC, abstractmethod
+from typing import Any, Dict, Literal, Type, Union
 from pydantic import BaseModel
@@ -15,8 +16,22 @@ class Rule(ABC):
 class MetadataExtractionRule(Rule):
     """Server-side rule for extracting metadata using a schema"""
-    def __init__(self, schema: Union[Type[BaseModel], Dict[str, Any]]):
+    def __init__(
+        self,
+        schema: Union[Type[BaseModel], Dict[str, Any]],
+        stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
+        use_images: bool = False,
+    ):
+        """
+        Args:
+            schema: Pydantic model or dict schema defining metadata fields to extract
+            stage: When to apply the rule - either "post_parsing" (full document text) or
+                  "post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
+            use_images: Whether to process image chunks instead of text chunks. Defaults to False.
+        """
         self.schema = schema
+        self.stage = stage
+        self.use_images = use_images
     def to_dict(self) -> Dict[str, Any]:
         if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
@@ -26,22 +41,30 @@ class MetadataExtractionRule(Rule):
             # Assume it's already a dict schema
             schema_dict = self.schema
-        return {"type": "metadata_extraction", "schema": schema_dict}
+        return {
+            "type": "metadata_extraction",
+            "schema": schema_dict,
+            "stage": self.stage,
+            "use_images": self.use_images,
+        }
 class NaturalLanguageRule(Rule):
     """Server-side rule for transforming content using natural language"""
-    def __init__(self, prompt: str):
+    def __init__(self, prompt: str, stage: Literal["post_parsing", "post_chunking"] = "post_parsing"):
         """
         Args:
             prompt: Instruction for how to transform the content
                    e.g. "Remove any personal information" or "Convert to bullet points"
+            stage: When to apply the rule - either "post_parsing" (full document text) or
+                  "post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
         """
         self.prompt = prompt
+        self.stage = stage
     def to_dict(self) -> Dict[str, Any]:
-        return {"type": "natural_language", "prompt": self.prompt}
+        return {"type": "natural_language", "prompt": self.prompt, "stage": self.stage}
 __all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]

morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl