morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +28 -19
- morphik/async_.py +121 -110
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +156 -109
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/tests/test_async.py
CHANGED
@@ -1,11 +1,12 @@
-import os
-import pytest
 import asyncio
+import os
 import uuid
 from pathlib import Path
 
-
-from
+import pytest
+from pydantic import BaseModel, Field
+
+from morphik.async_ import AsyncMorphik
 
 # Set to your local Morphik server - use localhost by default
 # Default client connects to localhost:8000 automatically
@@ -13,71 +14,74 @@ from morphik.models import Document, CompletionResponse
 # Skip these tests if the SKIP_LIVE_TESTS environment variable is set
 pytestmark = pytest.mark.skipif(
     os.environ.get("SKIP_LIVE_TESTS") == "1",
-    reason="Skip tests that require a running Morphik server"
+    reason="Skip tests that require a running Morphik server",
 )
 
 # Get the test files directory
 TEST_DOCS_DIR = Path(__file__).parent / "test_docs"
 
 
+class StructuredOutputSchema(BaseModel):
+    summary: str = Field(..., description="A short summary of the input text")
+    key_points: list[str] = Field(..., description="A list of key points from the text")
+
+
 class TestAsyncMorphik:
     """
     Tests for the asynchronous Morphik SDK client with a live server.
-
+
     To run these tests, start a local Morphik server and then run:
-
+    pytest morphik/tests/test_async.py -v
     """
-
+
     @pytest.fixture
     async def db(self):
         """Create an AsyncMorphik client for testing"""
-
+        # Connects to localhost:8000 by default, increase timeout
+        client = AsyncMorphik(timeout=120)
         yield client
         await client.close()
-
+
     @pytest.mark.asyncio
     async def test_ingest_text(self, db):
         """Test ingesting a text document"""
         # Generate a unique filename to avoid conflicts
         filename = f"test_{uuid.uuid4().hex[:8]}.txt"
-
+
         # Test basic text ingestion
         doc = await db.ingest_text(
             content="This is a test document for the Morphik SDK.",
             filename=filename,
-            metadata={"test_id": "async_text_test", "category": "test"}
+            metadata={"test_id": "async_text_test", "category": "test"},
         )
-
+
         # Verify the document was created
         assert doc.external_id is not None
         assert doc.filename == filename
         assert "test_id" in doc.metadata
         assert doc.metadata["test_id"] == "async_text_test"
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_ingest_file(self, db):
         """Test ingesting a file from disk"""
         # Use one of our test documents
         file_path = TEST_DOCS_DIR / "sample1.txt"
-
+
         # Test file ingestion
-        doc = await db.ingest_file(
-
-            metadata={"test_id": "async_file_test", "category": "test"}
-        )
-
+        doc = await db.ingest_file(file=file_path, metadata={"test_id": "async_file_test", "category": "test"})
+
         # Verify the document was created
         assert doc.external_id is not None
         assert doc.filename == "sample1.txt"
         assert "test_id" in doc.metadata
         assert doc.metadata["test_id"] == "async_file_test"
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_retrieve_chunks(self, db):
         """Test retrieving chunks with a query"""
@@ -85,9 +89,9 @@ class TestAsyncMorphik:
         doc = await db.ingest_text(
             content="Artificial intelligence and machine learning are transforming industries worldwide.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_retrieval_test", "category": "test"}
+            metadata={"test_id": "async_retrieval_test", "category": "test"},
         )
-
+
         # Wait for processing to complete
         max_retries = 10
         for _ in range(max_retries):
@@ -98,93 +102,89 @@ class TestAsyncMorphik:
                 await asyncio.sleep(2)  # Wait before checking again
             except Exception:
                 await asyncio.sleep(2)
-
+
         # Test retrieval
         chunks = await db.retrieve_chunks(
-            query="What is artificial intelligence?",
-            filters={"test_id": "async_retrieval_test"}
+            query="What is artificial intelligence?", filters={"test_id": "async_retrieval_test"}
         )
-
+
         # Verify results (may be empty if processing is slow)
         if len(chunks) > 0:
             assert chunks[0].document_id == doc.external_id
             assert chunks[0].score > 0
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_folder_operations(self, db):
         """Test folder operations"""
         # Create a unique folder name
         folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
-
+
         # Create a folder
-        folder = await db.create_folder(
-
-            description="Test folder for SDK tests"
-        )
-
+        folder = await db.create_folder(name=folder_name, description="Test folder for SDK tests")
+
         # Verify folder was created
         assert folder.name == folder_name
         assert folder.id is not None
-
+
         # Test ingesting a document into the folder
         doc = await folder.ingest_text(
             content="This is a test document in a folder.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_folder_test", "category": "test"}
+            metadata={"test_id": "async_folder_test", "category": "test"},
         )
-
+
         # Verify the document was created
         assert doc.external_id is not None
-
+
         # List documents in the folder
         docs = await folder.list_documents()
-
+
         # There should be at least our test document
         doc_ids = [d.external_id for d in docs]
         assert doc.external_id in doc_ids
-
+
         # Clean up - first delete the document
         await db.delete_document(doc.external_id)
-
+
         # TODO: Add folder deletion when API supports it
-
+
     @pytest.mark.asyncio
     async def test_user_scope(self, db):
         """Test user scoped operations"""
         # Create a unique user ID
         user_id = f"test_user_{uuid.uuid4().hex[:8]}"
-
+
         # Create a user scope
         user_scope = db.signin(user_id)
-
+
         # Verify user scope
         assert user_scope.end_user_id == user_id
-
+
         # Test ingesting a document as the user
         doc = await user_scope.ingest_text(
             content="This is a test document from a specific user.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_user_test", "category": "test"}
+            metadata={"test_id": "async_user_test", "category": "test"},
        )
-
+
         # Verify the document was created
         assert doc.external_id is not None
         assert "test_id" in doc.metadata
         assert doc.metadata["test_id"] == "async_user_test"
-
+
         # List documents for this user
         docs = await user_scope.list_documents()
-
+
         # There should be at least our test document
         doc_ids = [d.external_id for d in docs]
         assert doc.external_id in doc_ids
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_batch_operations(self, db):
         """Test batch operations"""
@@ -192,85 +192,83 @@ class TestAsyncMorphik:
         files = [
             TEST_DOCS_DIR / "sample1.txt",
             TEST_DOCS_DIR / "sample2.txt",
-            TEST_DOCS_DIR / "sample3.txt"
+            TEST_DOCS_DIR / "sample3.txt",
         ]
-
+
         # Test batch ingestion
         docs = await db.ingest_files(
-            files=files,
-            metadata={"test_id": "async_batch_test", "category": "test"},
-            parallel=True
+            files=files, metadata={"test_id": "async_batch_test", "category": "test"}, parallel=True
         )
-
+
         # Verify documents were created
         assert len(docs) == 3
         file_names = [doc.filename for doc in docs]
         assert "sample1.txt" in file_names
         assert "sample2.txt" in file_names
         assert "sample3.txt" in file_names
-
+
         # Get documents in batch
         doc_ids = [doc.external_id for doc in docs]
         batch_docs = await db.batch_get_documents(doc_ids)
-
+
         # Verify batch retrieval
         assert len(batch_docs) == len(doc_ids)
         retrieved_ids = [doc.external_id for doc in batch_docs]
         for doc_id in doc_ids:
             assert doc_id in retrieved_ids
-
+
         # Clean up
         for doc_id in doc_ids:
             await db.delete_document(doc_id)
-
+
     @pytest.mark.asyncio
     async def test_folder_with_user_scope(self, db):
         """Test combination of folder and user scope"""
         # Create unique names
         folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
         user_id = f"test_user_{uuid.uuid4().hex[:8]}"
-
+
         # Create a folder
         folder = await db.create_folder(name=folder_name)
-
+
         # Create a user scope within the folder
         user_scope = folder.signin(user_id)
-
+
         # Verify scopes
         assert user_scope.folder_name == folder_name
         assert user_scope.end_user_id == user_id
-
+
         # Test ingestion in this combined scope
         doc = await user_scope.ingest_text(
             content="This is a test document in a folder from a specific user.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_folder_user_test", "category": "test"}
+            metadata={"test_id": "async_folder_user_test", "category": "test"},
         )
-
+
         # Verify the document was created
         assert doc.external_id is not None
-
+
         # List documents in this scope
         docs = await user_scope.list_documents()
-
+
         # There should be at least our test document
         doc_ids = [d.external_id for d in docs]
         assert doc.external_id in doc_ids
-
+
         # Clean up
         await db.delete_document(doc.external_id)
-
+
     @pytest.mark.asyncio
     async def test_query_endpoint(self, db):
         """Test the query endpoint for RAG capabilities"""
         # First ingest a document
         doc = await db.ingest_text(
             content="Artificial intelligence and machine learning are transforming industries worldwide. "
-
+            "AI systems can now process natural language, recognize images, and make complex decisions.",
             filename=f"test_{uuid.uuid4().hex[:8]}.txt",
-            metadata={"test_id": "async_query_test", "category": "test"}
+            metadata={"test_id": "async_query_test", "category": "test"},
         )
-
+
         try:
             # Wait for processing to complete
             for _ in range(10):
@@ -278,7 +276,7 @@ class TestAsyncMorphik:
                 if status.get("status") == "completed":
                     break
                 await asyncio.sleep(2)
-
+
             # Only proceed with test if document is processed
             if status.get("status") == "completed":
                 # Test the query endpoint
@@ -286,15 +284,101 @@ class TestAsyncMorphik:
                     query="What can AI systems do?",
                     filters={"test_id": "async_query_test"},
                     k=1,
-                    temperature=0.7
+                    temperature=0.7,
                 )
-
+
                 # Verify response
                 assert response.completion is not None
                 assert len(response.completion) > 0
                 assert len(response.sources) > 0
                 assert response.sources[0].document_id == doc.external_id
-
+
         finally:
             # Clean up
-            await db.delete_document(doc.external_id)
+            await db.delete_document(doc.external_id)
+
+    @pytest.mark.asyncio
+    async def test_query_with_pydantic_schema(self, db):
+        """Test the query endpoint with a Pydantic schema for structured output (async)."""
+        content = (
+            "Morphik async client supports coroutines. "
+            "It uses httpx for async requests. "
+            "Key features include non-blocking IO."
+        )
+        doc = await db.ingest_text(
+            content=content,
+            filename=f"test_schema_async_{uuid.uuid4().hex[:8]}.txt",
+            metadata={"test_id": "async_schema_pydantic_test"},
+        )
+
+        try:
+            await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)
+
+            response = await db.query(
+                query="Summarize this async document and list key points.",
+                filters={"test_id": "async_schema_pydantic_test"},
+                k=1,
+                schema=StructuredOutputSchema,
+            )
+
+            assert response.completion is not None
+            # Expect completion to be the dictionary itself
+            assert isinstance(response.completion, dict)
+            output_data = response.completion
+            assert "summary" in output_data
+            assert "key_points" in output_data
+            assert isinstance(output_data["summary"], str)
+            assert isinstance(output_data["key_points"], list)
+
+        finally:
+            await db.delete_document(doc.external_id)
+
+    @pytest.mark.asyncio
+    async def test_query_with_dict_schema(self, db):
+        """Test the query endpoint with a dict schema for structured output (async)."""
+        content = "Asyncio provides infrastructure for writing single-threaded concurrent code."
+        doc = await db.ingest_text(
+            content=content,
+            filename=f"test_schema_dict_async_{uuid.uuid4().hex[:8]}.txt",
+            metadata={"test_id": "async_schema_dict_test"},
+        )
+
+        dict_schema = {
+            "type": "object",
+            "properties": {
+                "topic": {"type": "string", "description": "The main topic"},
+                "feature": {"type": "string", "description": "A key feature"},
+            },
+            "required": ["topic"],
+        }
+
+        try:
+            await db.wait_for_document_completion(doc.external_id, timeout_seconds=60)
+
+            response = await db.query(
+                query="Extract the topic and a feature.",
+                filters={"test_id": "async_schema_dict_test"},
+                k=1,
+                schema=dict_schema,
+            )
+
+            assert response.completion is not None
+            # Expect completion to be the dictionary itself
+            assert isinstance(response.completion, dict)
+            output_data = response.completion
+            assert "topic" in output_data
+            # Allow None if not required and type is string
+            if "feature" in dict_schema.get("required", []):
+                assert "feature" in output_data
+            elif output_data.get("feature") is None:
+                pass  # Allow None for non-required string
+            else:
+                assert isinstance(output_data.get("feature"), str)
+
+            if "topic" not in dict_schema.get("required", []) and output_data.get("topic") is None:
+                pass  # Allow None for non-required string
+            else:
+                assert isinstance(output_data.get("topic"), str)
+
+        finally:
+            await db.delete_document(doc.external_id)
morphik/tests/test_docs/sample2.txt
CHANGED
@@ -1,4 +1,4 @@
-Vector databases are specialized databases designed for storing and retrieving 
+Vector databases are specialized databases designed for storing and retrieving
 high-dimensional vectors. They are becoming increasingly important in the
 world of machine learning and AI applications.
 
@@ -12,4 +12,4 @@ Key features of vector databases include:
 4. Scalability for large vector collections
 
 Common applications include semantic search, recommendation systems,
-image retrieval, and natural language processing tasks.
+image retrieval, and natural language processing tasks.