morphik 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ import asyncio
2
+ import os
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ import pytest
7
+ from pydantic import BaseModel, Field
8
+
9
+ from morphik.async_ import AsyncMorphik
10
+
11
+ # Set to your local Morphik server - use localhost by default
12
+ # Default client connects to localhost:8000 automatically
13
+
14
# Live-server tests are opt-out: export SKIP_LIVE_TESTS=1 to skip this whole module.
pytestmark = pytest.mark.skipif(
    os.getenv("SKIP_LIVE_TESTS") == "1",
    reason="Skip tests that require a running Morphik server",
)

# Directory (next to this file) holding the sample documents used by the file-ingestion tests.
TEST_DOCS_DIR = Path(__file__).parent.joinpath("test_docs")
22
+
23
+
24
# Shape requested for structured query output: a summary string plus a list of key points.
# Documented with a comment (not a docstring) so the generated JSON schema stays unchanged.
class StructuredOutputSchema(BaseModel):
    summary: str = Field(default=..., description="A short summary of the input text")
    key_points: list[str] = Field(default=..., description="A list of key points from the text")
27
+
28
+
29
class TestAsyncMorphik:
    """
    Tests for the asynchronous Morphik SDK client with a live server.

    Every test creates uniquely named resources and deletes the documents it
    ingested inside a ``finally`` block, so a failing assertion does not leave
    test data behind on the server.

    To run these tests, start a local Morphik server and then run:
    pytest morphik/tests/test_async.py -v
    """

    # NOTE(review): this is an async generator fixture declared with plain
    # ``pytest.fixture``. pytest-asyncio only handles that in "auto" mode; in
    # "strict" mode it would need ``pytest_asyncio.fixture`` - confirm the
    # project's asyncio_mode setting.
    @pytest.fixture
    async def db(self):
        """Create an AsyncMorphik client for testing and close it afterwards."""
        # Connects to localhost:8000 by default, increase timeout
        client = AsyncMorphik(timeout=120)
        yield client
        await client.close()

    @staticmethod
    async def _poll_until_completed(db, document_id, *, attempts=10, delay=2, ignore_errors=False):
        """Poll the document status until it reports ``"completed"``.

        Args:
            db: The client used to query the status.
            document_id: External ID of the document to watch.
            attempts: Maximum number of status checks.
            delay: Seconds to sleep after each unsuccessful check.
            ignore_errors: When True, exceptions raised by a status check are
                swallowed and the check is retried; otherwise they propagate.

        Returns:
            True if the status became ``"completed"`` within ``attempts``
            checks, False otherwise.
        """
        for _ in range(attempts):
            try:
                status = await db.get_document_status(document_id)
                if status.get("status") == "completed":
                    return True
            except Exception:
                if not ignore_errors:
                    raise
            await asyncio.sleep(delay)
        return False

    @pytest.mark.asyncio
    async def test_ingest_text(self, db):
        """Test ingesting a text document"""
        # Generate a unique filename to avoid conflicts
        filename = f"test_{uuid.uuid4().hex[:8]}.txt"

        doc = await db.ingest_text(
            content="This is a test document for the Morphik SDK.",
            filename=filename,
            metadata={"test_id": "async_text_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert doc.filename == filename
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "async_text_test"
        finally:
            # Clean up even when an assertion above fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_ingest_file(self, db):
        """Test ingesting a file from disk"""
        # Use one of our test documents
        file_path = TEST_DOCS_DIR / "sample1.txt"

        doc = await db.ingest_file(file=file_path, metadata={"test_id": "async_file_test", "category": "test"})

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert doc.filename == "sample1.txt"
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "async_file_test"
        finally:
            # Clean up even when an assertion above fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_retrieve_chunks(self, db):
        """Test retrieving chunks with a query"""
        # First ingest a document
        doc = await db.ingest_text(
            content="Artificial intelligence and machine learning are transforming industries worldwide.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_retrieval_test", "category": "test"},
        )

        try:
            # Best-effort wait: errors from the status call are ignored and retried,
            # and retrieval is attempted whether or not "completed" was observed.
            await self._poll_until_completed(db, doc.external_id, ignore_errors=True)

            chunks = await db.retrieve_chunks(
                query="What is artificial intelligence?", filters={"test_id": "async_retrieval_test"}
            )

            # Results may be empty if processing is slow; report that as a skip
            # instead of a pass that verified nothing.
            if not chunks:
                pytest.skip("No chunks returned; document processing may still be in progress")

            assert chunks[0].document_id == doc.external_id
            assert chunks[0].score > 0
        finally:
            # Clean up even when an assertion above fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_folder_operations(self, db):
        """Test folder operations"""
        # Create a unique folder name
        folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"

        folder = await db.create_folder(name=folder_name, description="Test folder for SDK tests")

        # Verify folder was created
        assert folder.name == folder_name
        assert folder.id is not None

        # Test ingesting a document into the folder
        doc = await folder.ingest_text(
            content="This is a test document in a folder.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_folder_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None

            # The folder listing should contain at least our test document
            docs = await folder.list_documents()
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up the document even when an assertion above fails
            await db.delete_document(doc.external_id)

        # TODO: Add folder deletion when API supports it

    @pytest.mark.asyncio
    async def test_user_scope(self, db):
        """Test user scoped operations"""
        # Create a unique user ID
        user_id = f"test_user_{uuid.uuid4().hex[:8]}"

        user_scope = db.signin(user_id)

        # Verify user scope
        assert user_scope.end_user_id == user_id

        # Test ingesting a document as the user
        doc = await user_scope.ingest_text(
            content="This is a test document from a specific user.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_user_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "async_user_test"

            # The user's listing should contain at least our test document
            docs = await user_scope.list_documents()
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up even when an assertion above fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_batch_operations(self, db):
        """Test batch operations"""
        files = [
            TEST_DOCS_DIR / "sample1.txt",
            TEST_DOCS_DIR / "sample2.txt",
            TEST_DOCS_DIR / "sample3.txt",
        ]

        # Test batch ingestion
        docs = await db.ingest_files(
            files=files, metadata={"test_id": "async_batch_test", "category": "test"}, parallel=True
        )
        doc_ids = [doc.external_id for doc in docs]

        try:
            # Verify documents were created
            assert len(docs) == 3
            file_names = [doc.filename for doc in docs]
            assert "sample1.txt" in file_names
            assert "sample2.txt" in file_names
            assert "sample3.txt" in file_names

            # Get documents in batch
            batch_docs = await db.batch_get_documents(doc_ids)

            # Verify batch retrieval
            assert len(batch_docs) == len(doc_ids)
            retrieved_ids = [doc.external_id for doc in batch_docs]
            for doc_id in doc_ids:
                assert doc_id in retrieved_ids
        finally:
            # Delete whatever was ingested, even when an assertion above fails
            for doc_id in doc_ids:
                await db.delete_document(doc_id)

    @pytest.mark.asyncio
    async def test_folder_with_user_scope(self, db):
        """Test combination of folder and user scope"""
        # Create unique names
        folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
        user_id = f"test_user_{uuid.uuid4().hex[:8]}"

        folder = await db.create_folder(name=folder_name)

        # Create a user scope within the folder
        user_scope = folder.signin(user_id)

        # Verify scopes
        assert user_scope.folder_name == folder_name
        assert user_scope.end_user_id == user_id

        # Test ingestion in this combined scope
        doc = await user_scope.ingest_text(
            content="This is a test document in a folder from a specific user.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_folder_user_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None

            # The combined scope's listing should contain at least our test document
            docs = await user_scope.list_documents()
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up even when an assertion above fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_query_endpoint(self, db):
        """Test the query endpoint for RAG capabilities"""
        # First ingest a document
        doc = await db.ingest_text(
            content="Artificial intelligence and machine learning are transforming industries worldwide. "
            "AI systems can now process natural language, recognize images, and make complex decisions.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_query_test", "category": "test"},
        )

        try:
            # Only proceed with the query if the document finished processing;
            # otherwise report a skip rather than a pass that verified nothing.
            if not await self._poll_until_completed(db, doc.external_id):
                pytest.skip("Document did not finish processing in time")

            response = await db.query(
                query="What can AI systems do?",
                filters={"test_id": "async_query_test"},
                k=1,
                temperature=0.7,
            )

            # Verify response
            assert response.completion is not None
            assert len(response.completion) > 0
            assert len(response.sources) > 0
            assert response.sources[0].document_id == doc.external_id
        finally:
            # Clean up
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_query_with_pydantic_schema(self, db):
        """Test the query endpoint with a Pydantic schema for structured output (async)."""
        content = (
            "Morphik async client supports coroutines. "
            "It uses httpx for async requests. "
            "Key features include non-blocking IO."
        )
        doc = await db.ingest_text(
            content=content,
            filename=f"test_schema_async_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_schema_pydantic_test"},
        )

        try:
            await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)

            response = await db.query(
                query="Summarize this async document and list key points.",
                filters={"test_id": "async_schema_pydantic_test"},
                k=1,
                schema=StructuredOutputSchema,
            )

            assert response.completion is not None
            # Expect completion to be the dictionary itself
            assert isinstance(response.completion, dict)
            output_data = response.completion
            assert "summary" in output_data
            assert "key_points" in output_data
            assert isinstance(output_data["summary"], str)
            assert isinstance(output_data["key_points"], list)
        finally:
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_query_with_dict_schema(self, db):
        """Test the query endpoint with a dict schema for structured output (async)."""
        content = "Asyncio provides infrastructure for writing single-threaded concurrent code."
        doc = await db.ingest_text(
            content=content,
            filename=f"test_schema_dict_async_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_schema_dict_test"},
        )

        dict_schema = {
            "type": "object",
            "properties": {
                "topic": {"type": "string", "description": "The main topic"},
                "feature": {"type": "string", "description": "A key feature"},
            },
            "required": ["topic"],
        }

        try:
            await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)

            response = await db.query(
                query="Extract the topic and a feature.",
                filters={"test_id": "async_schema_dict_test"},
                k=1,
                schema=dict_schema,
            )

            assert response.completion is not None
            # Expect completion to be the dictionary itself
            assert isinstance(response.completion, dict)
            output_data = response.completion

            # "topic" is listed as required in the schema, so it must be present and a string
            assert "topic" in output_data
            assert isinstance(output_data.get("topic"), str)

            # "feature" is optional: it may be missing or None, otherwise it must be a string
            feature = output_data.get("feature")
            assert feature is None or isinstance(feature, str)
        finally:
            await db.delete_document(doc.external_id)
@@ -0,0 +1,11 @@
1
+ This is a sample text file for testing the Morphik SDK.
2
+
3
+ It contains information about artificial intelligence and machine learning.
4
+ Large language models like Claude can process and understand natural language.
5
+
6
+ These models are trained on vast amounts of text data and can generate
7
+ human-like responses to various prompts and questions.
8
+
9
+ The field of AI has seen rapid advancement in recent years, with models
10
+ becoming increasingly capable of understanding context and generating
11
+ coherent, relevant text.
@@ -0,0 +1,15 @@
1
+ Vector databases are specialized databases designed for storing and retrieving
2
+ high-dimensional vectors. They are becoming increasingly important in the
3
+ world of machine learning and AI applications.
4
+
5
+ Unlike traditional databases that work with structured data, vector databases
6
+ are optimized for similarity search operations on embedding vectors.
7
+
8
+ Key features of vector databases include:
9
+ 1. Efficient similarity search algorithms
10
+ 2. Support for high-dimensional vectors
11
+ 3. Specialized indexing for fast retrieval
12
+ 4. Scalability for large vector collections
13
+
14
+ Common applications include semantic search, recommendation systems,
15
+ image retrieval, and natural language processing tasks.
@@ -0,0 +1,17 @@
1
+ Retrieval Augmented Generation (RAG) combines the power of large language
2
+ models with external knowledge retrieval systems.
3
+
4
+ In a RAG system, when a query is received:
5
+ 1. Relevant documents are retrieved from a knowledge base
6
+ 2. These documents are provided as context to the language model
7
+ 3. The model generates a response informed by both its trained knowledge
8
+ and the retrieved information
9
+
10
+ RAG has several advantages:
11
+ - More accurate responses with up-to-date information
12
+ - Ability to cite sources for generated content
13
+ - Reduced hallucinations compared to standalone LLMs
14
+ - More controllable knowledge base
15
+
16
+ This approach is now widely used in enterprise AI applications where
17
+ accuracy and source attribution are critical.