morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,12 @@
1
- import os
2
- import pytest
3
1
  import asyncio
2
+ import os
4
3
  import uuid
5
4
  from pathlib import Path
6
5
 
7
- from morphik.async_ import AsyncMorphik, AsyncFolder, AsyncUserScope
8
- from morphik.models import Document, CompletionResponse
6
+ import pytest
7
+ from pydantic import BaseModel, Field
8
+
9
+ from morphik.async_ import AsyncMorphik
9
10
 
10
11
  # Set to your local Morphik server - use localhost by default
11
12
  # Default client connects to localhost:8000 automatically
@@ -13,71 +14,74 @@ from morphik.models import Document, CompletionResponse
13
14
  # Skip these tests if the SKIP_LIVE_TESTS environment variable is set
14
15
  pytestmark = pytest.mark.skipif(
15
16
  os.environ.get("SKIP_LIVE_TESTS") == "1",
16
- reason="Skip tests that require a running Morphik server"
17
+ reason="Skip tests that require a running Morphik server",
17
18
  )
18
19
 
19
20
  # Get the test files directory
20
21
  TEST_DOCS_DIR = Path(__file__).parent / "test_docs"
21
22
 
22
23
 
24
+ class StructuredOutputSchema(BaseModel):
25
+ summary: str = Field(..., description="A short summary of the input text")
26
+ key_points: list[str] = Field(..., description="A list of key points from the text")
27
+
28
+
23
29
  class TestAsyncMorphik:
24
30
  """
25
31
  Tests for the asynchronous Morphik SDK client with a live server.
26
-
32
+
27
33
  To run these tests, start a local Morphik server and then run:
28
- MORPHIK_TEST_URL=http://localhost:8000 pytest morphik/tests/test_async.py -v
34
+ pytest morphik/tests/test_async.py -v
29
35
  """
30
-
36
+
31
37
  @pytest.fixture
32
38
  async def db(self):
33
39
  """Create an AsyncMorphik client for testing"""
34
- client = AsyncMorphik() # Connects to localhost:8000 by default
40
+ # Connects to localhost:8000 by default, increase timeout
41
+ client = AsyncMorphik(timeout=120)
35
42
  yield client
36
43
  await client.close()
37
-
44
+
38
45
  @pytest.mark.asyncio
39
46
  async def test_ingest_text(self, db):
40
47
  """Test ingesting a text document"""
41
48
  # Generate a unique filename to avoid conflicts
42
49
  filename = f"test_{uuid.uuid4().hex[:8]}.txt"
43
-
50
+
44
51
  # Test basic text ingestion
45
52
  doc = await db.ingest_text(
46
53
  content="This is a test document for the Morphik SDK.",
47
54
  filename=filename,
48
- metadata={"test_id": "async_text_test", "category": "test"}
55
+ metadata={"test_id": "async_text_test", "category": "test"},
49
56
  )
50
-
57
+
51
58
  # Verify the document was created
52
59
  assert doc.external_id is not None
53
60
  assert doc.filename == filename
54
61
  assert "test_id" in doc.metadata
55
62
  assert doc.metadata["test_id"] == "async_text_test"
56
-
63
+
57
64
  # Clean up
58
65
  await db.delete_document(doc.external_id)
59
-
66
+
60
67
  @pytest.mark.asyncio
61
68
  async def test_ingest_file(self, db):
62
69
  """Test ingesting a file from disk"""
63
70
  # Use one of our test documents
64
71
  file_path = TEST_DOCS_DIR / "sample1.txt"
65
-
72
+
66
73
  # Test file ingestion
67
- doc = await db.ingest_file(
68
- file=file_path,
69
- metadata={"test_id": "async_file_test", "category": "test"}
70
- )
71
-
74
+ doc = await db.ingest_file(file=file_path, metadata={"test_id": "async_file_test", "category": "test"})
75
+
72
76
  # Verify the document was created
73
77
  assert doc.external_id is not None
74
78
  assert doc.filename == "sample1.txt"
75
79
  assert "test_id" in doc.metadata
76
80
  assert doc.metadata["test_id"] == "async_file_test"
77
-
81
+
78
82
  # Clean up
79
83
  await db.delete_document(doc.external_id)
80
-
84
+
81
85
  @pytest.mark.asyncio
82
86
  async def test_retrieve_chunks(self, db):
83
87
  """Test retrieving chunks with a query"""
@@ -85,9 +89,9 @@ class TestAsyncMorphik:
85
89
  doc = await db.ingest_text(
86
90
  content="Artificial intelligence and machine learning are transforming industries worldwide.",
87
91
  filename=f"test_{uuid.uuid4().hex[:8]}.txt",
88
- metadata={"test_id": "async_retrieval_test", "category": "test"}
92
+ metadata={"test_id": "async_retrieval_test", "category": "test"},
89
93
  )
90
-
94
+
91
95
  # Wait for processing to complete
92
96
  max_retries = 10
93
97
  for _ in range(max_retries):
@@ -98,93 +102,89 @@ class TestAsyncMorphik:
98
102
  await asyncio.sleep(2) # Wait before checking again
99
103
  except Exception:
100
104
  await asyncio.sleep(2)
101
-
105
+
102
106
  # Test retrieval
103
107
  chunks = await db.retrieve_chunks(
104
- query="What is artificial intelligence?",
105
- filters={"test_id": "async_retrieval_test"}
108
+ query="What is artificial intelligence?", filters={"test_id": "async_retrieval_test"}
106
109
  )
107
-
110
+
108
111
  # Verify results (may be empty if processing is slow)
109
112
  if len(chunks) > 0:
110
113
  assert chunks[0].document_id == doc.external_id
111
114
  assert chunks[0].score > 0
112
-
115
+
113
116
  # Clean up
114
117
  await db.delete_document(doc.external_id)
115
-
118
+
116
119
  @pytest.mark.asyncio
117
120
  async def test_folder_operations(self, db):
118
121
  """Test folder operations"""
119
122
  # Create a unique folder name
120
123
  folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
121
-
124
+
122
125
  # Create a folder
123
- folder = await db.create_folder(
124
- name=folder_name,
125
- description="Test folder for SDK tests"
126
- )
127
-
126
+ folder = await db.create_folder(name=folder_name, description="Test folder for SDK tests")
127
+
128
128
  # Verify folder was created
129
129
  assert folder.name == folder_name
130
130
  assert folder.id is not None
131
-
131
+
132
132
  # Test ingesting a document into the folder
133
133
  doc = await folder.ingest_text(
134
134
  content="This is a test document in a folder.",
135
135
  filename=f"test_{uuid.uuid4().hex[:8]}.txt",
136
- metadata={"test_id": "async_folder_test", "category": "test"}
136
+ metadata={"test_id": "async_folder_test", "category": "test"},
137
137
  )
138
-
138
+
139
139
  # Verify the document was created
140
140
  assert doc.external_id is not None
141
-
141
+
142
142
  # List documents in the folder
143
143
  docs = await folder.list_documents()
144
-
144
+
145
145
  # There should be at least our test document
146
146
  doc_ids = [d.external_id for d in docs]
147
147
  assert doc.external_id in doc_ids
148
-
148
+
149
149
  # Clean up - first delete the document
150
150
  await db.delete_document(doc.external_id)
151
-
151
+
152
152
  # TODO: Add folder deletion when API supports it
153
-
153
+
154
154
  @pytest.mark.asyncio
155
155
  async def test_user_scope(self, db):
156
156
  """Test user scoped operations"""
157
157
  # Create a unique user ID
158
158
  user_id = f"test_user_{uuid.uuid4().hex[:8]}"
159
-
159
+
160
160
  # Create a user scope
161
161
  user_scope = db.signin(user_id)
162
-
162
+
163
163
  # Verify user scope
164
164
  assert user_scope.end_user_id == user_id
165
-
165
+
166
166
  # Test ingesting a document as the user
167
167
  doc = await user_scope.ingest_text(
168
168
  content="This is a test document from a specific user.",
169
169
  filename=f"test_{uuid.uuid4().hex[:8]}.txt",
170
- metadata={"test_id": "async_user_test", "category": "test"}
170
+ metadata={"test_id": "async_user_test", "category": "test"},
171
171
  )
172
-
172
+
173
173
  # Verify the document was created
174
174
  assert doc.external_id is not None
175
175
  assert "test_id" in doc.metadata
176
176
  assert doc.metadata["test_id"] == "async_user_test"
177
-
177
+
178
178
  # List documents for this user
179
179
  docs = await user_scope.list_documents()
180
-
180
+
181
181
  # There should be at least our test document
182
182
  doc_ids = [d.external_id for d in docs]
183
183
  assert doc.external_id in doc_ids
184
-
184
+
185
185
  # Clean up
186
186
  await db.delete_document(doc.external_id)
187
-
187
+
188
188
  @pytest.mark.asyncio
189
189
  async def test_batch_operations(self, db):
190
190
  """Test batch operations"""
@@ -192,85 +192,83 @@ class TestAsyncMorphik:
192
192
  files = [
193
193
  TEST_DOCS_DIR / "sample1.txt",
194
194
  TEST_DOCS_DIR / "sample2.txt",
195
- TEST_DOCS_DIR / "sample3.txt"
195
+ TEST_DOCS_DIR / "sample3.txt",
196
196
  ]
197
-
197
+
198
198
  # Test batch ingestion
199
199
  docs = await db.ingest_files(
200
- files=files,
201
- metadata={"test_id": "async_batch_test", "category": "test"},
202
- parallel=True
200
+ files=files, metadata={"test_id": "async_batch_test", "category": "test"}, parallel=True
203
201
  )
204
-
202
+
205
203
  # Verify documents were created
206
204
  assert len(docs) == 3
207
205
  file_names = [doc.filename for doc in docs]
208
206
  assert "sample1.txt" in file_names
209
207
  assert "sample2.txt" in file_names
210
208
  assert "sample3.txt" in file_names
211
-
209
+
212
210
  # Get documents in batch
213
211
  doc_ids = [doc.external_id for doc in docs]
214
212
  batch_docs = await db.batch_get_documents(doc_ids)
215
-
213
+
216
214
  # Verify batch retrieval
217
215
  assert len(batch_docs) == len(doc_ids)
218
216
  retrieved_ids = [doc.external_id for doc in batch_docs]
219
217
  for doc_id in doc_ids:
220
218
  assert doc_id in retrieved_ids
221
-
219
+
222
220
  # Clean up
223
221
  for doc_id in doc_ids:
224
222
  await db.delete_document(doc_id)
225
-
223
+
226
224
  @pytest.mark.asyncio
227
225
  async def test_folder_with_user_scope(self, db):
228
226
  """Test combination of folder and user scope"""
229
227
  # Create unique names
230
228
  folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
231
229
  user_id = f"test_user_{uuid.uuid4().hex[:8]}"
232
-
230
+
233
231
  # Create a folder
234
232
  folder = await db.create_folder(name=folder_name)
235
-
233
+
236
234
  # Create a user scope within the folder
237
235
  user_scope = folder.signin(user_id)
238
-
236
+
239
237
  # Verify scopes
240
238
  assert user_scope.folder_name == folder_name
241
239
  assert user_scope.end_user_id == user_id
242
-
240
+
243
241
  # Test ingestion in this combined scope
244
242
  doc = await user_scope.ingest_text(
245
243
  content="This is a test document in a folder from a specific user.",
246
244
  filename=f"test_{uuid.uuid4().hex[:8]}.txt",
247
- metadata={"test_id": "async_folder_user_test", "category": "test"}
245
+ metadata={"test_id": "async_folder_user_test", "category": "test"},
248
246
  )
249
-
247
+
250
248
  # Verify the document was created
251
249
  assert doc.external_id is not None
252
-
250
+
253
251
  # List documents in this scope
254
252
  docs = await user_scope.list_documents()
255
-
253
+
256
254
  # There should be at least our test document
257
255
  doc_ids = [d.external_id for d in docs]
258
256
  assert doc.external_id in doc_ids
259
-
257
+
260
258
  # Clean up
261
259
  await db.delete_document(doc.external_id)
262
-
260
+
263
261
  @pytest.mark.asyncio
264
262
  async def test_query_endpoint(self, db):
265
263
  """Test the query endpoint for RAG capabilities"""
266
264
  # First ingest a document
267
265
  doc = await db.ingest_text(
268
266
  content="Artificial intelligence and machine learning are transforming industries worldwide. "
269
- "AI systems can now process natural language, recognize images, and make complex decisions.",
267
+ "AI systems can now process natural language, recognize images, and make complex decisions.",
270
268
  filename=f"test_{uuid.uuid4().hex[:8]}.txt",
271
- metadata={"test_id": "async_query_test", "category": "test"}
269
+ metadata={"test_id": "async_query_test", "category": "test"},
272
270
  )
273
-
271
+
274
272
  try:
275
273
  # Wait for processing to complete
276
274
  for _ in range(10):
@@ -278,7 +276,7 @@ class TestAsyncMorphik:
278
276
  if status.get("status") == "completed":
279
277
  break
280
278
  await asyncio.sleep(2)
281
-
279
+
282
280
  # Only proceed with test if document is processed
283
281
  if status.get("status") == "completed":
284
282
  # Test the query endpoint
@@ -286,15 +284,101 @@ class TestAsyncMorphik:
286
284
  query="What can AI systems do?",
287
285
  filters={"test_id": "async_query_test"},
288
286
  k=1,
289
- temperature=0.7
287
+ temperature=0.7,
290
288
  )
291
-
289
+
292
290
  # Verify response
293
291
  assert response.completion is not None
294
292
  assert len(response.completion) > 0
295
293
  assert len(response.sources) > 0
296
294
  assert response.sources[0].document_id == doc.external_id
297
-
295
+
298
296
  finally:
299
297
  # Clean up
300
- await db.delete_document(doc.external_id)
298
+ await db.delete_document(doc.external_id)
299
+
300
+ @pytest.mark.asyncio
301
+ async def test_query_with_pydantic_schema(self, db):
302
+ """Test the query endpoint with a Pydantic schema for structured output (async)."""
303
+ content = (
304
+ "Morphik async client supports coroutines. "
305
+ "It uses httpx for async requests. "
306
+ "Key features include non-blocking IO."
307
+ )
308
+ doc = await db.ingest_text(
309
+ content=content,
310
+ filename=f"test_schema_async_{uuid.uuid4().hex[:8]}.txt",
311
+ metadata={"test_id": "async_schema_pydantic_test"},
312
+ )
313
+
314
+ try:
315
+ await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)
316
+
317
+ response = await db.query(
318
+ query="Summarize this async document and list key points.",
319
+ filters={"test_id": "async_schema_pydantic_test"},
320
+ k=1,
321
+ schema=StructuredOutputSchema,
322
+ )
323
+
324
+ assert response.completion is not None
325
+ # Expect completion to be the dictionary itself
326
+ assert isinstance(response.completion, dict)
327
+ output_data = response.completion
328
+ assert "summary" in output_data
329
+ assert "key_points" in output_data
330
+ assert isinstance(output_data["summary"], str)
331
+ assert isinstance(output_data["key_points"], list)
332
+
333
+ finally:
334
+ await db.delete_document(doc.external_id)
335
+
336
+ @pytest.mark.asyncio
337
+ async def test_query_with_dict_schema(self, db):
338
+ """Test the query endpoint with a dict schema for structured output (async)."""
339
+ content = "Asyncio provides infrastructure for writing single-threaded concurrent code."
340
+ doc = await db.ingest_text(
341
+ content=content,
342
+ filename=f"test_schema_dict_async_{uuid.uuid4().hex[:8]}.txt",
343
+ metadata={"test_id": "async_schema_dict_test"},
344
+ )
345
+
346
+ dict_schema = {
347
+ "type": "object",
348
+ "properties": {
349
+ "topic": {"type": "string", "description": "The main topic"},
350
+ "feature": {"type": "string", "description": "A key feature"},
351
+ },
352
+ "required": ["topic"],
353
+ }
354
+
355
+ try:
356
+ await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)
357
+
358
+ response = await db.query(
359
+ query="Extract the topic and a feature.",
360
+ filters={"test_id": "async_schema_dict_test"},
361
+ k=1,
362
+ schema=dict_schema,
363
+ )
364
+
365
+ assert response.completion is not None
366
+ # Expect completion to be the dictionary itself
367
+ assert isinstance(response.completion, dict)
368
+ output_data = response.completion
369
+ assert "topic" in output_data
370
+ # Allow None if not required and type is string
371
+ if "feature" in dict_schema.get("required", []):
372
+ assert "feature" in output_data
373
+ elif output_data.get("feature") is None:
374
+ pass # Allow None for non-required string
375
+ else:
376
+ assert isinstance(output_data.get("feature"), str)
377
+
378
+ if "topic" not in dict_schema.get("required", []) and output_data.get("topic") is None:
379
+ pass # Allow None for non-required string
380
+ else:
381
+ assert isinstance(output_data.get("topic"), str)
382
+
383
+ finally:
384
+ await db.delete_document(doc.external_id)
@@ -8,4 +8,4 @@ human-like responses to various prompts and questions.
8
8
 
9
9
  The field of AI has seen rapid advancement in recent years, with models
10
10
  becoming increasingly capable of understanding context and generating
11
- coherent, relevant text.
11
+ coherent, relevant text.
@@ -1,4 +1,4 @@
1
- Vector databases are specialized databases designed for storing and retrieving
1
+ Vector databases are specialized databases designed for storing and retrieving
2
2
  high-dimensional vectors. They are becoming increasingly important in the
3
3
  world of machine learning and AI applications.
4
4
 
@@ -12,4 +12,4 @@ Key features of vector databases include:
12
12
  4. Scalability for large vector collections
13
13
 
14
14
  Common applications include semantic search, recommendation systems,
15
- image retrieval, and natural language processing tasks.
15
+ image retrieval, and natural language processing tasks.
@@ -14,4 +14,4 @@ RAG has several advantages:
14
14
  - More controllable knowledge base
15
15
 
16
16
  This approach is now widely used in enterprise AI applications where
17
- accuracy and source attribution are critical.
17
+ accuracy and source attribution are critical.