morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +28 -19
- morphik/async_.py +121 -110
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +156 -109
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/tests/test_async.py
CHANGED
@@ -1,11 +1,12 @@
-import os
-import pytest
 import asyncio
+import os
 import uuid
 from pathlib import Path
 
-
-from
+import pytest
+from pydantic import BaseModel, Field
+
+from morphik.async_ import AsyncMorphik
 
 # Set to your local Morphik server - use localhost by default
 # Default client connects to localhost:8000 automatically
@@ -13,71 +14,74 @@ from morphik.models import Document, CompletionResponse
 # Skip these tests if the SKIP_LIVE_TESTS environment variable is set
 pytestmark = pytest.mark.skipif(
     os.environ.get("SKIP_LIVE_TESTS") == "1",
-    reason="Skip tests that require a running Morphik server"
+    reason="Skip tests that require a running Morphik server",
 )
 
 # Get the test files directory
 TEST_DOCS_DIR = Path(__file__).parent / "test_docs"
 
 
+class StructuredOutputSchema(BaseModel):
+    summary: str = Field(..., description="A short summary of the input text")
+    key_points: list[str] = Field(..., description="A list of key points from the text")
+
+
 class TestAsyncMorphik:
     """
     Tests for the asynchronous Morphik SDK client with a live server.
-
+
     To run these tests, start a local Morphik server and then run:
-
+    pytest morphik/tests/test_async.py -v
     """
-
+
     @pytest.fixture
     async def db(self):
         """Create an AsyncMorphik client for testing"""
-
+        # Connects to localhost:8000 by default, increase timeout
+        client = AsyncMorphik(timeout=120)
         yield client
         await client.close()
-
+
     @pytest.mark.asyncio
     async def test_ingest_text(self, db):
         """Test ingesting a text document"""
         # Generate a unique filename to avoid conflicts
         filename = f"test_{uuid.uuid4().hex[:8]}.txt"
-
+
         # Test basic text ingestion
         doc = await db.ingest_text(
             content="This is a test document for the Morphik SDK.",
             filename=filename,
-            metadata={"test_id": "async_text_test", "category": "test"}
+            metadata={"test_id": "async_text_test", "category": "test"},
         )
-
+
         # Verify the document was created
         assert doc.external_id is not None
         assert doc.filename == filename
         assert "test_id" in doc.metadata
         assert doc.metadata["test_id"] == "async_text_test"
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_ingest_file(self, db):
         """Test ingesting a file from disk"""
         # Use one of our test documents
         file_path = TEST_DOCS_DIR / "sample1.txt"
-
+
         # Test file ingestion
-        doc = await db.ingest_file(
-
-            metadata={"test_id": "async_file_test", "category": "test"}
-        )
-
+        doc = await db.ingest_file(file=file_path, metadata={"test_id": "async_file_test", "category": "test"})
+
         # Verify the document was created
         assert doc.external_id is not None
         assert doc.filename == "sample1.txt"
         assert "test_id" in doc.metadata
         assert doc.metadata["test_id"] == "async_file_test"
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_retrieve_chunks(self, db):
         """Test retrieving chunks with a query"""
@@ -85,9 +89,9 @@ class TestAsyncMorphik:
         doc = await db.ingest_text(
             content="Artificial intelligence and machine learning are transforming industries worldwide.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_retrieval_test", "category": "test"}
+            metadata={"test_id": "async_retrieval_test", "category": "test"},
         )
-
+
         # Wait for processing to complete
         max_retries = 10
         for _ in range(max_retries):
@@ -98,93 +102,89 @@ class TestAsyncMorphik:
                 await asyncio.sleep(2)  # Wait before checking again
             except Exception:
                 await asyncio.sleep(2)
-
+
         # Test retrieval
         chunks = await db.retrieve_chunks(
-            query="What is artificial intelligence?",
-            filters={"test_id": "async_retrieval_test"}
+            query="What is artificial intelligence?", filters={"test_id": "async_retrieval_test"}
         )
-
+
         # Verify results (may be empty if processing is slow)
         if len(chunks) > 0:
             assert chunks[0].document_id == doc.external_id
             assert chunks[0].score > 0
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_folder_operations(self, db):
         """Test folder operations"""
         # Create a unique folder name
         folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
-
+
         # Create a folder
-        folder = await db.create_folder(
-
-            description="Test folder for SDK tests"
-        )
-
+        folder = await db.create_folder(name=folder_name, description="Test folder for SDK tests")
+
         # Verify folder was created
         assert folder.name == folder_name
         assert folder.id is not None
-
+
         # Test ingesting a document into the folder
         doc = await folder.ingest_text(
             content="This is a test document in a folder.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_folder_test", "category": "test"}
+            metadata={"test_id": "async_folder_test", "category": "test"},
         )
-
+
         # Verify the document was created
         assert doc.external_id is not None
-
+
         # List documents in the folder
         docs = await folder.list_documents()
-
+
         # There should be at least our test document
         doc_ids = [d.external_id for d in docs]
         assert doc.external_id in doc_ids
-
+
         # Clean up - first delete the document
         await db.delete_document(doc.external_id)
-
+
         # TODO: Add folder deletion when API supports it
-
+
     @pytest.mark.asyncio
     async def test_user_scope(self, db):
         """Test user scoped operations"""
         # Create a unique user ID
         user_id = f"test_user_{uuid.uuid4().hex[:8]}"
-
+
         # Create a user scope
         user_scope = db.signin(user_id)
-
+
         # Verify user scope
         assert user_scope.end_user_id == user_id
-
+
         # Test ingesting a document as the user
         doc = await user_scope.ingest_text(
             content="This is a test document from a specific user.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_user_test", "category": "test"}
+            metadata={"test_id": "async_user_test", "category": "test"},
        )
-
+
         # Verify the document was created
         assert doc.external_id is not None
         assert "test_id" in doc.metadata
         assert doc.metadata["test_id"] == "async_user_test"
-
+
         # List documents for this user
         docs = await user_scope.list_documents()
-
+
         # There should be at least our test document
         doc_ids = [d.external_id for d in docs]
         assert doc.external_id in doc_ids
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_batch_operations(self, db):
         """Test batch operations"""
@@ -192,85 +192,83 @@ class TestAsyncMorphik:
         files = [
             TEST_DOCS_DIR / "sample1.txt",
             TEST_DOCS_DIR / "sample2.txt",
-            TEST_DOCS_DIR / "sample3.txt"
+            TEST_DOCS_DIR / "sample3.txt",
         ]
-
+
         # Test batch ingestion
         docs = await db.ingest_files(
-            files=files,
-            metadata={"test_id": "async_batch_test", "category": "test"},
-            parallel=True
+            files=files, metadata={"test_id": "async_batch_test", "category": "test"}, parallel=True
         )
-
+
         # Verify documents were created
         assert len(docs) == 3
         file_names = [doc.filename for doc in docs]
         assert "sample1.txt" in file_names
         assert "sample2.txt" in file_names
         assert "sample3.txt" in file_names
-
+
         # Get documents in batch
         doc_ids = [doc.external_id for doc in docs]
         batch_docs = await db.batch_get_documents(doc_ids)
-
+
         # Verify batch retrieval
         assert len(batch_docs) == len(doc_ids)
         retrieved_ids = [doc.external_id for doc in batch_docs]
         for doc_id in doc_ids:
             assert doc_id in retrieved_ids
-
+
         # Clean up
         for doc_id in doc_ids:
             await db.delete_document(doc_id)
-
+
     @pytest.mark.asyncio
     async def test_folder_with_user_scope(self, db):
         """Test combination of folder and user scope"""
         # Create unique names
         folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
         user_id = f"test_user_{uuid.uuid4().hex[:8]}"
-
+
         # Create a folder
         folder = await db.create_folder(name=folder_name)
-
+
         # Create a user scope within the folder
         user_scope = folder.signin(user_id)
-
+
         # Verify scopes
         assert user_scope.folder_name == folder_name
         assert user_scope.end_user_id == user_id
-
+
         # Test ingestion in this combined scope
         doc = await user_scope.ingest_text(
             content="This is a test document in a folder from a specific user.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_folder_user_test", "category": "test"}
+            metadata={"test_id": "async_folder_user_test", "category": "test"},
         )
-
+
         # Verify the document was created
         assert doc.external_id is not None
-
+
         # List documents in this scope
         docs = await user_scope.list_documents()
-
+
         # There should be at least our test document
         doc_ids = [d.external_id for d in docs]
         assert doc.external_id in doc_ids
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_query_endpoint(self, db):
         """Test the query endpoint for RAG capabilities"""
         # First ingest a document
         doc = await db.ingest_text(
             content="Artificial intelligence and machine learning are transforming industries worldwide. "
-
+            "AI systems can now process natural language, recognize images, and make complex decisions.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_query_test", "category": "test"}
+            metadata={"test_id": "async_query_test", "category": "test"},
         )
-
+
         try:
             # Wait for processing to complete
             for _ in range(10):
@@ -278,7 +276,7 @@ class TestAsyncMorphik:
                 if status.get("status") == "completed":
                     break
                 await asyncio.sleep(2)
-
+
             # Only proceed with test if document is processed
             if status.get("status") == "completed":
                 # Test the query endpoint
@@ -286,15 +284,101 @@ class TestAsyncMorphik:
                     query="What can AI systems do?",
                     filters={"test_id": "async_query_test"},
                     k=1,
-                    temperature=0.7
+                    temperature=0.7,
                 )
-
+
                 # Verify response
                 assert response.completion is not None
                 assert len(response.completion) > 0
                 assert len(response.sources) > 0
                 assert response.sources[0].document_id == doc.external_id
-
+
         finally:
             # Clean up
-            await db.delete_document(doc.external_id)
+            await db.delete_document(doc.external_id)
+
+    @pytest.mark.asyncio
+    async def test_query_with_pydantic_schema(self, db):
+        """Test the query endpoint with a Pydantic schema for structured output (async)."""
+        content = (
+            "Morphik async client supports coroutines. "
+            "It uses httpx for async requests. "
+            "Key features include non-blocking IO."
+        )
+        doc = await db.ingest_text(
+            content=content,
+            filename=f"test_schema_async_{uuid.uuid4().hex[:8]}.txt",
+            metadata={"test_id": "async_schema_pydantic_test"},
+        )
+
+        try:
+            await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)
+
+            response = await db.query(
+                query="Summarize this async document and list key points.",
+                filters={"test_id": "async_schema_pydantic_test"},
+                k=1,
+                schema=StructuredOutputSchema,
+            )
+
+            assert response.completion is not None
+            # Expect completion to be the dictionary itself
+            assert isinstance(response.completion, dict)
+            output_data = response.completion
+            assert "summary" in output_data
+            assert "key_points" in output_data
+            assert isinstance(output_data["summary"], str)
+            assert isinstance(output_data["key_points"], list)
+
+        finally:
+            await db.delete_document(doc.external_id)
+
+    @pytest.mark.asyncio
+    async def test_query_with_dict_schema(self, db):
+        """Test the query endpoint with a dict schema for structured output (async)."""
+        content = "Asyncio provides infrastructure for writing single-threaded concurrent code."
+        doc = await db.ingest_text(
+            content=content,
+            filename=f"test_schema_dict_async_{uuid.uuid4().hex[:8]}.txt",
+            metadata={"test_id": "async_schema_dict_test"},
+        )
+
+        dict_schema = {
+            "type": "object",
+            "properties": {
+                "topic": {"type": "string", "description": "The main topic"},
+                "feature": {"type": "string", "description": "A key feature"},
+            },
+            "required": ["topic"],
+        }
+
+        try:
+            await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)
+
+            response = await db.query(
+                query="Extract the topic and a feature.",
+                filters={"test_id": "async_schema_dict_test"},
+                k=1,
+                schema=dict_schema,
+            )
+
+            assert response.completion is not None
+            # Expect completion to be the dictionary itself
+            assert isinstance(response.completion, dict)
+            output_data = response.completion
+            assert "topic" in output_data
+            # Allow None if not required and type is string
+            if "feature" in dict_schema.get("required", []):
+                assert "feature" in output_data
+            elif output_data.get("feature") is None:
+                pass  # Allow None for non-required string
+            else:
+                assert isinstance(output_data.get("feature"), str)
+
+            if "topic" not in dict_schema.get("required", []) and output_data.get("topic") is None:
+                pass  # Allow None for non-required string
+            else:
+                assert isinstance(output_data.get("topic"), str)
+
+        finally:
+            await db.delete_document(doc.external_id)
morphik/tests/test_docs/sample2.txt
CHANGED
@@ -1,4 +1,4 @@
-Vector databases are specialized databases designed for storing and retrieving 
+Vector databases are specialized databases designed for storing and retrieving
 high-dimensional vectors. They are becoming increasingly important in the
 world of machine learning and AI applications.
 
@@ -12,4 +12,4 @@ Key features of vector databases include:
 4. Scalability for large vector collections
 
 Common applications include semantic search, recommendation systems,
-image retrieval, and natural language processing tasks.
+image retrieval, and natural language processing tasks.