morphik 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,300 @@
1
+ import os
2
+ import pytest
3
+ import asyncio
4
+ import uuid
5
+ from pathlib import Path
6
+
7
+ from morphik.async_ import AsyncMorphik, AsyncFolder, AsyncUserScope
8
+ from morphik.models import Document, CompletionResponse
9
+
10
+ # Set to your local Morphik server - use localhost by default
11
+ # Default client connects to localhost:8000 automatically
12
+
13
+ # Skip these tests if the SKIP_LIVE_TESTS environment variable is set
14
# Module-wide opt-out: every test in this file is skipped when the
# SKIP_LIVE_TESTS environment variable is set to "1".
pytestmark = pytest.mark.skipif(
    os.getenv("SKIP_LIVE_TESTS") == "1",
    reason="Skip tests that require a running Morphik server",
)

# Directory (next to this file) holding the sample documents that the
# file-ingestion tests read.
TEST_DOCS_DIR = Path(__file__).parent.joinpath("test_docs")
21
+
22
+
23
class TestAsyncMorphik:
    """
    Tests for the asynchronous Morphik SDK client with a live server.

    To run these tests, start a local Morphik server and then run:
    pytest morphik/tests/test_async.py -v

    The client is created with its default arguments, so the tests talk to
    whatever server the SDK targets by default.
    NOTE(review): this module never reads MORPHIK_TEST_URL itself; confirm
    whether the client honors that variable before advertising it here.

    Every test removes the documents it ingests in a ``finally`` block so a
    failed assertion does not leave test data behind on the server.
    """

    @pytest.fixture
    async def db(self):
        """Create an AsyncMorphik client for testing and close it afterwards."""
        # NOTE(review): in pytest-asyncio "strict" mode async fixtures must be
        # declared with @pytest_asyncio.fixture; confirm the project runs in
        # "auto" mode, otherwise tests receive an un-awaited async generator.
        client = AsyncMorphik()  # Connects to localhost:8000 by default
        yield client
        await client.close()

    @staticmethod
    async def _wait_until_processed(db, document_id, *, retries=10, delay=2, ignore_errors=False):
        """Poll the document status until it is reported as "completed".

        Args:
            db: Client exposing an awaitable ``get_document_status``.
            document_id: External id of the document to poll.
            retries: Maximum number of status checks.
            delay: Seconds to sleep between checks.
            ignore_errors: When True, an exception raised by a status check is
                treated as "not ready yet" instead of propagating.

        Returns:
            True if the status became "completed" within ``retries`` checks,
            False otherwise.
        """
        for _ in range(retries):
            try:
                status = await db.get_document_status(document_id)
            except Exception:
                # Best-effort polling is opt-in; by default errors surface.
                if not ignore_errors:
                    raise
            else:
                if status.get("status") == "completed":
                    return True
            await asyncio.sleep(delay)  # Wait before checking again
        return False

    @pytest.mark.asyncio
    async def test_ingest_text(self, db):
        """Test ingesting a text document"""
        # Generate a unique filename to avoid conflicts
        filename = f"test_{uuid.uuid4().hex[:8]}.txt"

        # Test basic text ingestion
        doc = await db.ingest_text(
            content="This is a test document for the Morphik SDK.",
            filename=filename,
            metadata={"test_id": "async_text_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert doc.filename == filename
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "async_text_test"
        finally:
            # Clean up even when an assertion fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_ingest_file(self, db):
        """Test ingesting a file from disk"""
        # Use one of our test documents
        file_path = TEST_DOCS_DIR / "sample1.txt"

        # Test file ingestion
        doc = await db.ingest_file(
            file=file_path,
            metadata={"test_id": "async_file_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert doc.filename == "sample1.txt"
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "async_file_test"
        finally:
            # Clean up even when an assertion fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_retrieve_chunks(self, db):
        """Test retrieving chunks with a query"""
        # First ingest a document
        doc = await db.ingest_text(
            content="Artificial intelligence and machine learning are transforming industries worldwide.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_retrieval_test", "category": "test"},
        )

        try:
            # Wait for processing to complete; status errors are tolerated
            # here because the assertions below are already best-effort.
            await self._wait_until_processed(db, doc.external_id, ignore_errors=True)

            # Test retrieval
            chunks = await db.retrieve_chunks(
                query="What is artificial intelligence?",
                filters={"test_id": "async_retrieval_test"},
            )

            # Verify results (may be empty if processing is slow)
            if chunks:
                assert chunks[0].document_id == doc.external_id
                assert chunks[0].score > 0
        finally:
            # Clean up even when an assertion fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_folder_operations(self, db):
        """Test folder operations"""
        # Create a unique folder name
        folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"

        # Create a folder
        folder = await db.create_folder(
            name=folder_name,
            description="Test folder for SDK tests",
        )

        # Verify folder was created
        assert folder.name == folder_name
        assert folder.id is not None

        # Test ingesting a document into the folder
        doc = await folder.ingest_text(
            content="This is a test document in a folder.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_folder_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None

            # List documents in the folder
            docs = await folder.list_documents()

            # There should be at least our test document
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up - delete the document even when an assertion fails
            await db.delete_document(doc.external_id)

        # TODO: Add folder deletion when API supports it

    @pytest.mark.asyncio
    async def test_user_scope(self, db):
        """Test user scoped operations"""
        # Create a unique user ID
        user_id = f"test_user_{uuid.uuid4().hex[:8]}"

        # Create a user scope
        user_scope = db.signin(user_id)

        # Verify user scope
        assert user_scope.end_user_id == user_id

        # Test ingesting a document as the user
        doc = await user_scope.ingest_text(
            content="This is a test document from a specific user.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_user_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "async_user_test"

            # List documents for this user
            docs = await user_scope.list_documents()

            # There should be at least our test document
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up even when an assertion fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_batch_operations(self, db):
        """Test batch operations"""
        # Ingest multiple files
        files = [
            TEST_DOCS_DIR / "sample1.txt",
            TEST_DOCS_DIR / "sample2.txt",
            TEST_DOCS_DIR / "sample3.txt",
        ]

        # Test batch ingestion
        docs = await db.ingest_files(
            files=files,
            metadata={"test_id": "async_batch_test", "category": "test"},
            parallel=True,
        )
        doc_ids = [doc.external_id for doc in docs]

        try:
            # Verify documents were created
            assert len(docs) == 3
            file_names = [doc.filename for doc in docs]
            assert "sample1.txt" in file_names
            assert "sample2.txt" in file_names
            assert "sample3.txt" in file_names

            # Get documents in batch
            batch_docs = await db.batch_get_documents(doc_ids)

            # Verify batch retrieval
            assert len(batch_docs) == len(doc_ids)
            retrieved_ids = [doc.external_id for doc in batch_docs]
            for doc_id in doc_ids:
                assert doc_id in retrieved_ids
        finally:
            # Clean up every ingested document even when an assertion fails
            for doc_id in doc_ids:
                await db.delete_document(doc_id)

    @pytest.mark.asyncio
    async def test_folder_with_user_scope(self, db):
        """Test combination of folder and user scope"""
        # Create unique names
        folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
        user_id = f"test_user_{uuid.uuid4().hex[:8]}"

        # Create a folder
        folder = await db.create_folder(name=folder_name)

        # Create a user scope within the folder
        user_scope = folder.signin(user_id)

        # Verify scopes
        assert user_scope.folder_name == folder_name
        assert user_scope.end_user_id == user_id

        # Test ingestion in this combined scope
        doc = await user_scope.ingest_text(
            content="This is a test document in a folder from a specific user.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_folder_user_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None

            # List documents in this scope
            docs = await user_scope.list_documents()

            # There should be at least our test document
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up even when an assertion fails
            await db.delete_document(doc.external_id)

    @pytest.mark.asyncio
    async def test_query_endpoint(self, db):
        """Test the query endpoint for RAG capabilities"""
        # First ingest a document
        doc = await db.ingest_text(
            content="Artificial intelligence and machine learning are transforming industries worldwide. "
            "AI systems can now process natural language, recognize images, and make complex decisions.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "async_query_test", "category": "test"},
        )

        try:
            # Wait for processing to complete. Report a skip rather than a
            # silent pass when the document never becomes queryable.
            if not await self._wait_until_processed(db, doc.external_id):
                pytest.skip("Document was not processed in time; query assertions were not run")

            # Test the query endpoint
            response = await db.query(
                query="What can AI systems do?",
                filters={"test_id": "async_query_test"},
                k=1,
                temperature=0.7,
            )

            # Verify response
            assert response.completion is not None
            assert len(response.completion) > 0
            assert len(response.sources) > 0
            assert response.sources[0].document_id == doc.external_id
        finally:
            # Clean up
            await db.delete_document(doc.external_id)
@@ -0,0 +1,11 @@
1
+ This is a sample text file for testing the Morphik SDK.
2
+
3
+ It contains information about artificial intelligence and machine learning.
4
+ Large language models like Claude can process and understand natural language.
5
+
6
+ These models are trained on vast amounts of text data and can generate
7
+ human-like responses to various prompts and questions.
8
+
9
+ The field of AI has seen rapid advancement in recent years, with models
10
+ becoming increasingly capable of understanding context and generating
11
+ coherent, relevant text.
@@ -0,0 +1,15 @@
1
+ Vector databases are specialized databases designed for storing and retrieving
2
+ high-dimensional vectors. They are becoming increasingly important in the
3
+ world of machine learning and AI applications.
4
+
5
+ Unlike traditional databases that work with structured data, vector databases
6
+ are optimized for similarity search operations on embedding vectors.
7
+
8
+ Key features of vector databases include:
9
+ 1. Efficient similarity search algorithms
10
+ 2. Support for high-dimensional vectors
11
+ 3. Specialized indexing for fast retrieval
12
+ 4. Scalability for large vector collections
13
+
14
+ Common applications include semantic search, recommendation systems,
15
+ image retrieval, and natural language processing tasks.
@@ -0,0 +1,17 @@
1
+ Retrieval Augmented Generation (RAG) combines the power of large language
2
+ models with external knowledge retrieval systems.
3
+
4
+ In a RAG system, when a query is received:
5
+ 1. Relevant documents are retrieved from a knowledge base
6
+ 2. These documents are provided as context to the language model
7
+ 3. The model generates a response informed by both its trained knowledge
8
+ and the retrieved information
9
+
10
+ RAG has several advantages:
11
+ - More accurate responses with up-to-date information
12
+ - Ability to cite sources for generated content
13
+ - Reduced hallucinations compared to standalone LLMs
14
+ - More controllable knowledge base
15
+
16
+ This approach is now widely used in enterprise AI applications where
17
+ accuracy and source attribution are critical.
@@ -0,0 +1,293 @@
1
+ import os
2
+ import pytest
3
+ import time
4
+ import uuid
5
+ from pathlib import Path
6
+
7
+ from morphik.sync import Morphik, Folder, UserScope
8
+ from morphik.models import Document, CompletionResponse
9
+
10
+ # Set to your local Morphik server - use localhost by default
11
+ # Default client connects to localhost:8000 automatically
12
+
13
+ # Skip these tests if the SKIP_LIVE_TESTS environment variable is set
14
# Module-wide opt-out: every test in this file is skipped when the
# SKIP_LIVE_TESTS environment variable is set to "1".
pytestmark = pytest.mark.skipif(
    os.getenv("SKIP_LIVE_TESTS") == "1",
    reason="Skip tests that require a running Morphik server",
)

# Directory (next to this file) holding the sample documents that the
# file-ingestion tests read.
TEST_DOCS_DIR = Path(__file__).parent.joinpath("test_docs")
21
+
22
+
23
class TestMorphik:
    """
    Tests for the synchronous Morphik SDK client with a live server.

    To run these tests, start a local Morphik server and then run:
    pytest morphik/tests/test_sync.py -v

    The client is created with its default arguments, so the tests talk to
    whatever server the SDK targets by default.
    NOTE(review): this module never reads MORPHIK_TEST_URL itself; confirm
    whether the client honors that variable before advertising it here.

    Every test removes the documents it ingests in a ``finally`` block so a
    failed assertion does not leave test data behind on the server.
    """

    @pytest.fixture
    def db(self):
        """Create a Morphik client for testing and close it afterwards."""
        client = Morphik()  # Connects to localhost:8000 by default
        yield client
        client.close()

    @staticmethod
    def _wait_until_processed(db, document_id, *, retries=10, delay=2, ignore_errors=False):
        """Poll the document status until it is reported as "completed".

        Args:
            db: Client exposing ``get_document_status``.
            document_id: External id of the document to poll.
            retries: Maximum number of status checks.
            delay: Seconds to sleep between checks.
            ignore_errors: When True, an exception raised by a status check is
                treated as "not ready yet" instead of propagating.

        Returns:
            True if the status became "completed" within ``retries`` checks,
            False otherwise.
        """
        for _ in range(retries):
            try:
                status = db.get_document_status(document_id)
            except Exception:
                # Best-effort polling is opt-in; by default errors surface.
                if not ignore_errors:
                    raise
            else:
                if status.get("status") == "completed":
                    return True
            time.sleep(delay)  # Wait before checking again
        return False

    def test_ingest_text(self, db):
        """Test ingesting a text document"""
        # Generate a unique filename to avoid conflicts
        filename = f"test_{uuid.uuid4().hex[:8]}.txt"

        # Test basic text ingestion
        doc = db.ingest_text(
            content="This is a test document for the Morphik SDK.",
            filename=filename,
            metadata={"test_id": "sync_text_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert doc.filename == filename
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "sync_text_test"
        finally:
            # Clean up even when an assertion fails
            db.delete_document(doc.external_id)

    def test_ingest_file(self, db):
        """Test ingesting a file from disk"""
        # Use one of our test documents
        file_path = TEST_DOCS_DIR / "sample1.txt"

        # Test file ingestion
        doc = db.ingest_file(
            file=file_path,
            metadata={"test_id": "sync_file_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert doc.filename == "sample1.txt"
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "sync_file_test"
        finally:
            # Clean up even when an assertion fails
            db.delete_document(doc.external_id)

    def test_retrieve_chunks(self, db):
        """Test retrieving chunks with a query"""
        # First ingest a document
        doc = db.ingest_text(
            content="Artificial intelligence and machine learning are transforming industries worldwide.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "sync_retrieval_test", "category": "test"},
        )

        try:
            # Wait for processing to complete; status errors are tolerated
            # here because the assertions below are already best-effort.
            self._wait_until_processed(db, doc.external_id, ignore_errors=True)

            # Test retrieval
            chunks = db.retrieve_chunks(
                query="What is artificial intelligence?",
                filters={"test_id": "sync_retrieval_test"},
            )

            # Verify results (may be empty if processing is slow)
            if chunks:
                assert chunks[0].document_id == doc.external_id
                assert chunks[0].score > 0
        finally:
            # Clean up even when an assertion fails
            db.delete_document(doc.external_id)

    def test_folder_operations(self, db):
        """Test folder operations"""
        # Create a unique folder name
        folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"

        # Create a folder
        folder = db.create_folder(
            name=folder_name,
            description="Test folder for SDK tests",
        )

        # Verify folder was created
        assert folder.name == folder_name
        assert folder.id is not None

        # Test ingesting a document into the folder
        doc = folder.ingest_text(
            content="This is a test document in a folder.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "sync_folder_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None

            # List documents in the folder
            docs = folder.list_documents()

            # There should be at least our test document
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up - delete the document even when an assertion fails
            db.delete_document(doc.external_id)

        # TODO: Add folder deletion when API supports it

    def test_user_scope(self, db):
        """Test user scoped operations"""
        # Create a unique user ID
        user_id = f"test_user_{uuid.uuid4().hex[:8]}"

        # Create a user scope
        user_scope = db.signin(user_id)

        # Verify user scope
        assert user_scope.end_user_id == user_id

        # Test ingesting a document as the user
        doc = user_scope.ingest_text(
            content="This is a test document from a specific user.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "sync_user_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None
            assert "test_id" in doc.metadata
            assert doc.metadata["test_id"] == "sync_user_test"

            # List documents for this user
            docs = user_scope.list_documents()

            # There should be at least our test document
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up even when an assertion fails
            db.delete_document(doc.external_id)

    def test_batch_operations(self, db):
        """Test batch operations"""
        # Ingest multiple files
        files = [
            TEST_DOCS_DIR / "sample1.txt",
            TEST_DOCS_DIR / "sample2.txt",
            TEST_DOCS_DIR / "sample3.txt",
        ]

        # Test batch ingestion
        docs = db.ingest_files(
            files=files,
            metadata={"test_id": "sync_batch_test", "category": "test"},
            parallel=True,
        )
        doc_ids = [doc.external_id for doc in docs]

        try:
            # Verify documents were created
            assert len(docs) == 3
            file_names = [doc.filename for doc in docs]
            assert "sample1.txt" in file_names
            assert "sample2.txt" in file_names
            assert "sample3.txt" in file_names

            # Get documents in batch
            batch_docs = db.batch_get_documents(doc_ids)

            # Verify batch retrieval
            assert len(batch_docs) == len(doc_ids)
            retrieved_ids = [doc.external_id for doc in batch_docs]
            for doc_id in doc_ids:
                assert doc_id in retrieved_ids
        finally:
            # Clean up every ingested document even when an assertion fails
            for doc_id in doc_ids:
                db.delete_document(doc_id)

    def test_folder_with_user_scope(self, db):
        """Test combination of folder and user scope"""
        # Create unique names
        folder_name = f"test_folder_{uuid.uuid4().hex[:8]}"
        user_id = f"test_user_{uuid.uuid4().hex[:8]}"

        # Create a folder
        folder = db.create_folder(name=folder_name)

        # Create a user scope within the folder
        user_scope = folder.signin(user_id)

        # Verify scopes
        assert user_scope.folder_name == folder_name
        assert user_scope.end_user_id == user_id

        # Test ingestion in this combined scope
        doc = user_scope.ingest_text(
            content="This is a test document in a folder from a specific user.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "sync_folder_user_test", "category": "test"},
        )

        try:
            # Verify the document was created
            assert doc.external_id is not None

            # List documents in this scope
            docs = user_scope.list_documents()

            # There should be at least our test document
            doc_ids = [d.external_id for d in docs]
            assert doc.external_id in doc_ids
        finally:
            # Clean up even when an assertion fails
            db.delete_document(doc.external_id)

    def test_query_endpoint(self, db):
        """Test the query endpoint for RAG capabilities"""
        # First ingest a document
        doc = db.ingest_text(
            content="Artificial intelligence and machine learning are transforming industries worldwide. "
            "AI systems can now process natural language, recognize images, and make complex decisions.",
            filename=f"test_{uuid.uuid4().hex[:8]}.txt",
            metadata={"test_id": "sync_query_test", "category": "test"},
        )

        try:
            # Wait for processing to complete. Report a skip rather than a
            # silent pass when the document never becomes queryable.
            if not self._wait_until_processed(db, doc.external_id):
                pytest.skip("Document was not processed in time; query assertions were not run")

            # Test the query endpoint
            response = db.query(
                query="What can AI systems do?",
                filters={"test_id": "sync_query_test"},
                k=1,
                temperature=0.7,
            )

            # Verify response
            assert response.completion is not None
            assert len(response.completion) > 0
            assert len(response.sources) > 0
            assert response.sources[0].document_id == doc.external_id
        finally:
            # Clean up
            db.delete_document(doc.external_id)