morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +28 -19
- morphik/async_.py +121 -110
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +156 -109
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/sync.py
CHANGED
@@ -2,27 +2,23 @@ import json
|
|
2
2
|
import logging
|
3
3
|
from io import BytesIO, IOBase
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
-
|
7
|
-
from PIL import Image
|
8
|
-
from PIL.Image import Image as PILImage
|
5
|
+
from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
|
9
6
|
|
10
7
|
import httpx
|
8
|
+
from pydantic import BaseModel
|
11
9
|
|
10
|
+
from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
|
12
11
|
from .models import (
|
12
|
+
ChunkSource,
|
13
|
+
CompletionResponse, # Prompt override models
|
13
14
|
Document,
|
14
15
|
DocumentResult,
|
15
|
-
CompletionResponse,
|
16
|
-
IngestTextRequest,
|
17
|
-
ChunkSource,
|
18
|
-
Graph,
|
19
16
|
FolderInfo,
|
20
|
-
|
17
|
+
Graph,
|
21
18
|
GraphPromptOverrides,
|
19
|
+
IngestTextRequest,
|
22
20
|
QueryPromptOverrides,
|
23
21
|
)
|
24
|
-
from .rules import Rule
|
25
|
-
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
26
22
|
|
27
23
|
logger = logging.getLogger(__name__)
|
28
24
|
|
@@ -71,16 +67,16 @@ class Folder:
|
|
71
67
|
def name(self) -> str:
|
72
68
|
"""Returns the folder name."""
|
73
69
|
return self._name
|
74
|
-
|
70
|
+
|
75
71
|
@property
|
76
72
|
def id(self) -> Optional[str]:
|
77
73
|
"""Returns the folder ID if available."""
|
78
74
|
return self._id
|
79
|
-
|
75
|
+
|
80
76
|
def get_info(self) -> Dict[str, Any]:
|
81
77
|
"""
|
82
78
|
Get detailed information about this folder.
|
83
|
-
|
79
|
+
|
84
80
|
Returns:
|
85
81
|
Dict[str, Any]: Detailed folder information
|
86
82
|
"""
|
@@ -93,9 +89,8 @@ class Folder:
|
|
93
89
|
break
|
94
90
|
if not self._id:
|
95
91
|
raise ValueError(f"Folder '{self._name}' not found")
|
96
|
-
|
92
|
+
|
97
93
|
return self._client._request("GET", f"folders/{self._id}")
|
98
|
-
|
99
94
|
|
100
95
|
def signin(self, end_user_id: str) -> "UserScope":
|
101
96
|
"""
|
@@ -168,9 +163,7 @@ class Folder:
|
|
168
163
|
files = {"file": (filename, file_obj)}
|
169
164
|
|
170
165
|
# Create form data
|
171
|
-
form_data = self._client._logic._prepare_ingest_file_form_data(
|
172
|
-
metadata, rules, self._name, None
|
173
|
-
)
|
166
|
+
form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
|
174
167
|
|
175
168
|
# use_colpali should be a query parameter as defined in the API
|
176
169
|
response = self._client._request(
|
@@ -219,9 +212,9 @@ class Folder:
|
|
219
212
|
)
|
220
213
|
|
221
214
|
response = self._client._request(
|
222
|
-
"POST",
|
223
|
-
"ingest/files",
|
224
|
-
data=data,
|
215
|
+
"POST",
|
216
|
+
"ingest/files",
|
217
|
+
data=data,
|
225
218
|
files=file_objects,
|
226
219
|
params={"use_colpali": str(use_colpali).lower()},
|
227
220
|
)
|
@@ -231,9 +224,7 @@ class Folder:
|
|
231
224
|
for error in response["errors"]:
|
232
225
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
233
226
|
|
234
|
-
docs = [
|
235
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
236
|
-
]
|
227
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
237
228
|
for doc in docs:
|
238
229
|
doc._client = self._client
|
239
230
|
return docs
|
@@ -368,6 +359,7 @@ class Folder:
|
|
368
359
|
hop_depth: int = 1,
|
369
360
|
include_paths: bool = False,
|
370
361
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
362
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
371
363
|
) -> CompletionResponse:
|
372
364
|
"""
|
373
365
|
Generate completion using relevant chunks as context within this folder.
|
@@ -384,6 +376,7 @@ class Folder:
|
|
384
376
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
385
377
|
include_paths: Whether to include relationship paths in the response
|
386
378
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
379
|
+
schema: Optional schema for structured output
|
387
380
|
|
388
381
|
Returns:
|
389
382
|
CompletionResponse: Generated completion
|
@@ -402,7 +395,20 @@ class Folder:
|
|
402
395
|
prompt_overrides,
|
403
396
|
self._name,
|
404
397
|
None,
|
398
|
+
schema,
|
405
399
|
)
|
400
|
+
|
401
|
+
# Add schema to payload if provided
|
402
|
+
if schema:
|
403
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
404
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
405
|
+
payload["schema"] = schema.model_json_schema()
|
406
|
+
else:
|
407
|
+
payload["schema"] = schema
|
408
|
+
|
409
|
+
# Add a hint to the query to return in JSON format
|
410
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
411
|
+
|
406
412
|
response = self._client._request("POST", "query", data=payload)
|
407
413
|
return self._client._logic._parse_completion_response(response)
|
408
414
|
|
@@ -420,9 +426,7 @@ class Folder:
|
|
420
426
|
Returns:
|
421
427
|
List[Document]: List of documents
|
422
428
|
"""
|
423
|
-
params, data = self._client._logic._prepare_list_documents_request(
|
424
|
-
skip, limit, filters, self._name, None
|
425
|
-
)
|
429
|
+
params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
|
426
430
|
response = self._client._request("POST", "documents", data=data, params=params)
|
427
431
|
docs = self._client._logic._parse_document_list_response(response)
|
428
432
|
for doc in docs:
|
@@ -447,9 +451,7 @@ class Folder:
|
|
447
451
|
doc._client = self._client
|
448
452
|
return docs
|
449
453
|
|
450
|
-
def batch_get_chunks(
|
451
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
452
|
-
) -> List[FinalChunkResult]:
|
454
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
453
455
|
"""
|
454
456
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
|
455
457
|
|
@@ -550,13 +552,8 @@ class Folder:
|
|
550
552
|
Returns:
|
551
553
|
Dict[str, str]: Deletion status
|
552
554
|
"""
|
553
|
-
# Get the document by filename with folder scope
|
554
|
-
request = {"filename": filename, "folder_name": self._name}
|
555
|
-
|
556
555
|
# First get the document ID
|
557
|
-
response = self._client._request(
|
558
|
-
"GET", f"documents/filename/{filename}", params={"folder_name": self._name}
|
559
|
-
)
|
556
|
+
response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
|
560
557
|
doc = self._client._logic._parse_document_response(response)
|
561
558
|
|
562
559
|
# Then delete by ID
|
@@ -677,7 +674,7 @@ class UserScope:
|
|
677
674
|
# Add folder name if scoped to a folder
|
678
675
|
if self._folder_name:
|
679
676
|
form_data["folder_name"] = self._folder_name
|
680
|
-
|
677
|
+
|
681
678
|
# use_colpali should be a query parameter as defined in the API
|
682
679
|
response = self._client._request(
|
683
680
|
"POST",
|
@@ -732,9 +729,7 @@ class UserScope:
|
|
732
729
|
if rules:
|
733
730
|
if all(isinstance(r, list) for r in rules):
|
734
731
|
# List of lists - per-file rules
|
735
|
-
converted_rules = [
|
736
|
-
[self._client._convert_rule(r) for r in rule_list] for rule_list in rules
|
737
|
-
]
|
732
|
+
converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
|
738
733
|
else:
|
739
734
|
# Flat list - shared rules for all files
|
740
735
|
converted_rules = [self._client._convert_rule(r) for r in rules]
|
@@ -754,9 +749,9 @@ class UserScope:
|
|
754
749
|
data["folder_name"] = self._folder_name
|
755
750
|
|
756
751
|
response = self._client._request(
|
757
|
-
"POST",
|
758
|
-
"ingest/files",
|
759
|
-
data=data,
|
752
|
+
"POST",
|
753
|
+
"ingest/files",
|
754
|
+
data=data,
|
760
755
|
files=file_objects,
|
761
756
|
params={"use_colpali": str(use_colpali).lower()},
|
762
757
|
)
|
@@ -766,9 +761,7 @@ class UserScope:
|
|
766
761
|
for error in response["errors"]:
|
767
762
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
768
763
|
|
769
|
-
docs = [
|
770
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
771
|
-
]
|
764
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
772
765
|
for doc in docs:
|
773
766
|
doc._client = self._client
|
774
767
|
return docs
|
@@ -911,6 +904,7 @@ class UserScope:
|
|
911
904
|
hop_depth: int = 1,
|
912
905
|
include_paths: bool = False,
|
913
906
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
907
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
914
908
|
) -> CompletionResponse:
|
915
909
|
"""
|
916
910
|
Generate completion using relevant chunks as context as this end user.
|
@@ -927,6 +921,7 @@ class UserScope:
|
|
927
921
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
928
922
|
include_paths: Whether to include relationship paths in the response
|
929
923
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
924
|
+
schema: Optional schema for structured output
|
930
925
|
|
931
926
|
Returns:
|
932
927
|
CompletionResponse: Generated completion
|
@@ -945,7 +940,20 @@ class UserScope:
|
|
945
940
|
prompt_overrides,
|
946
941
|
self._folder_name,
|
947
942
|
self._end_user_id,
|
943
|
+
schema,
|
948
944
|
)
|
945
|
+
|
946
|
+
# Add schema to payload if provided
|
947
|
+
if schema:
|
948
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
949
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
950
|
+
payload["schema"] = schema.model_json_schema()
|
951
|
+
else:
|
952
|
+
payload["schema"] = schema
|
953
|
+
|
954
|
+
# Add a hint to the query to return in JSON format
|
955
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
956
|
+
|
949
957
|
response = self._client._request("POST", "query", data=payload)
|
950
958
|
return self._client._logic._parse_completion_response(response)
|
951
959
|
|
@@ -970,7 +978,7 @@ class UserScope:
|
|
970
978
|
if self._folder_name:
|
971
979
|
params["folder_name"] = self._folder_name
|
972
980
|
|
973
|
-
response = self._client._request("POST",
|
981
|
+
response = self._client._request("POST", "documents", data=filters or {}, params=params)
|
974
982
|
|
975
983
|
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
976
984
|
for doc in docs:
|
@@ -999,9 +1007,7 @@ class UserScope:
|
|
999
1007
|
doc._client = self._client
|
1000
1008
|
return docs
|
1001
1009
|
|
1002
|
-
def batch_get_chunks(
|
1003
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
1004
|
-
) -> List[FinalChunkResult]:
|
1010
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
1005
1011
|
"""
|
1006
1012
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
|
1007
1013
|
|
@@ -1173,12 +1179,12 @@ class Morphik:
|
|
1173
1179
|
# Remove Content-Type if it exists - httpx will set the correct multipart boundary
|
1174
1180
|
if "Content-Type" in headers:
|
1175
1181
|
del headers["Content-Type"]
|
1176
|
-
|
1182
|
+
|
1177
1183
|
# For file uploads with form data, use form data (not json)
|
1178
1184
|
request_data = {"files": files}
|
1179
1185
|
if data:
|
1180
1186
|
request_data["data"] = data
|
1181
|
-
|
1187
|
+
|
1182
1188
|
# Files are now properly handled
|
1183
1189
|
else:
|
1184
1190
|
# JSON for everything else
|
@@ -1192,8 +1198,13 @@ class Morphik:
|
|
1192
1198
|
params=params,
|
1193
1199
|
**request_data,
|
1194
1200
|
)
|
1195
|
-
|
1196
|
-
|
1201
|
+
try:
|
1202
|
+
response.raise_for_status()
|
1203
|
+
return response.json()
|
1204
|
+
except httpx.HTTPStatusError as e:
|
1205
|
+
# Print error response for debugging
|
1206
|
+
print(f"Error response: {e.response.status_code} - {e.response.text}")
|
1207
|
+
raise
|
1197
1208
|
|
1198
1209
|
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
|
1199
1210
|
"""Convert a rule to a dictionary format"""
|
@@ -1210,18 +1221,16 @@ class Morphik:
|
|
1210
1221
|
Returns:
|
1211
1222
|
Folder: A folder object ready for scoped operations
|
1212
1223
|
"""
|
1213
|
-
payload = {
|
1214
|
-
"name": name
|
1215
|
-
}
|
1224
|
+
payload = {"name": name}
|
1216
1225
|
if description:
|
1217
1226
|
payload["description"] = description
|
1218
|
-
|
1227
|
+
|
1219
1228
|
response = self._request("POST", "folders", data=payload)
|
1220
1229
|
folder_info = FolderInfo(**response)
|
1221
|
-
|
1230
|
+
|
1222
1231
|
# Return a usable Folder object with the ID from the response
|
1223
1232
|
return Folder(self, name, folder_id=folder_info.id)
|
1224
|
-
|
1233
|
+
|
1225
1234
|
def get_folder_by_name(self, name: str) -> Folder:
|
1226
1235
|
"""
|
1227
1236
|
Get a folder by name to scope operations.
|
@@ -1233,7 +1242,7 @@ class Morphik:
|
|
1233
1242
|
Folder: A folder object for scoped operations
|
1234
1243
|
"""
|
1235
1244
|
return Folder(self, name)
|
1236
|
-
|
1245
|
+
|
1237
1246
|
def get_folder(self, folder_id: str) -> Folder:
|
1238
1247
|
"""
|
1239
1248
|
Get a folder by ID.
|
@@ -1250,13 +1259,13 @@ class Morphik:
|
|
1250
1259
|
def list_folders(self) -> List[Folder]:
|
1251
1260
|
"""
|
1252
1261
|
List all folders the user has access to as Folder objects.
|
1253
|
-
|
1262
|
+
|
1254
1263
|
Returns:
|
1255
1264
|
List[Folder]: List of Folder objects ready for operations
|
1256
1265
|
"""
|
1257
1266
|
folder_infos = self._request("GET", "folders")
|
1258
1267
|
return [Folder(self, info["name"], info["id"]) for info in folder_infos]
|
1259
|
-
|
1268
|
+
|
1260
1269
|
def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1261
1270
|
"""
|
1262
1271
|
Add a document to a folder.
|
@@ -1270,7 +1279,7 @@ class Morphik:
|
|
1270
1279
|
"""
|
1271
1280
|
response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
|
1272
1281
|
return response
|
1273
|
-
|
1282
|
+
|
1274
1283
|
def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1275
1284
|
"""
|
1276
1285
|
Remove a document from a folder.
|
@@ -1314,7 +1323,8 @@ class Morphik:
|
|
1314
1323
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1315
1324
|
- MetadataExtractionRule: Extract metadata using a schema
|
1316
1325
|
- NaturalLanguageRule: Transform content using natural language
|
1317
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1326
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1327
|
+
(slower, but significantly better retrieval accuracy for text and images)
|
1318
1328
|
Returns:
|
1319
1329
|
Document: Metadata of the ingested document
|
1320
1330
|
|
@@ -1367,7 +1377,8 @@ class Morphik:
|
|
1367
1377
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1368
1378
|
- MetadataExtractionRule: Extract metadata using a schema
|
1369
1379
|
- NaturalLanguageRule: Transform content using natural language
|
1370
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
1380
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
1381
|
+
(slower, but significantly better retrieval accuracy for images)
|
1371
1382
|
|
1372
1383
|
Returns:
|
1373
1384
|
Document: Metadata of the ingested document
|
@@ -1450,14 +1461,12 @@ class Morphik:
|
|
1450
1461
|
try:
|
1451
1462
|
# Prepare form data
|
1452
1463
|
# Prepare form data - use_colpali should be a query parameter, not form data
|
1453
|
-
data = self._logic._prepare_ingest_files_form_data(
|
1454
|
-
metadata, rules, use_colpali, parallel, None, None
|
1455
|
-
)
|
1464
|
+
data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
|
1456
1465
|
|
1457
1466
|
response = self._request(
|
1458
|
-
"POST",
|
1459
|
-
"ingest/files",
|
1460
|
-
data=data,
|
1467
|
+
"POST",
|
1468
|
+
"ingest/files",
|
1469
|
+
data=data,
|
1461
1470
|
files=file_objects,
|
1462
1471
|
params={"use_colpali": str(use_colpali).lower()},
|
1463
1472
|
)
|
@@ -1542,7 +1551,8 @@ class Morphik:
|
|
1542
1551
|
filters: Optional metadata filters
|
1543
1552
|
k: Number of results (default: 4)
|
1544
1553
|
min_score: Minimum similarity threshold (default: 0.0)
|
1545
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
|
1554
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
|
1555
|
+
(only works for documents ingested with `use_colpali=True`)
|
1546
1556
|
Returns:
|
1547
1557
|
List[ChunkResult]
|
1548
1558
|
|
@@ -1554,9 +1564,7 @@ class Morphik:
|
|
1554
1564
|
)
|
1555
1565
|
```
|
1556
1566
|
"""
|
1557
|
-
payload = self._logic._prepare_retrieve_chunks_request(
|
1558
|
-
query, filters, k, min_score, use_colpali, None, None
|
1559
|
-
)
|
1567
|
+
payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
|
1560
1568
|
response = self._request("POST", "retrieve/chunks", data=payload)
|
1561
1569
|
return self._logic._parse_chunk_result_list_response(response)
|
1562
1570
|
|
@@ -1576,7 +1584,8 @@ class Morphik:
|
|
1576
1584
|
filters: Optional metadata filters
|
1577
1585
|
k: Number of results (default: 4)
|
1578
1586
|
min_score: Minimum similarity threshold (default: 0.0)
|
1579
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
|
1587
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
|
1588
|
+
(only works for documents ingested with `use_colpali=True`)
|
1580
1589
|
Returns:
|
1581
1590
|
List[DocumentResult]
|
1582
1591
|
|
@@ -1588,9 +1597,7 @@ class Morphik:
|
|
1588
1597
|
)
|
1589
1598
|
```
|
1590
1599
|
"""
|
1591
|
-
payload = self._logic._prepare_retrieve_docs_request(
|
1592
|
-
query, filters, k, min_score, use_colpali, None, None
|
1593
|
-
)
|
1600
|
+
payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
|
1594
1601
|
response = self._request("POST", "retrieve/docs", data=payload)
|
1595
1602
|
return self._logic._parse_document_result_list_response(response)
|
1596
1603
|
|
@@ -1607,6 +1614,7 @@ class Morphik:
|
|
1607
1614
|
hop_depth: int = 1,
|
1608
1615
|
include_paths: bool = False,
|
1609
1616
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
1617
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
1610
1618
|
) -> CompletionResponse:
|
1611
1619
|
"""
|
1612
1620
|
Generate completion using relevant chunks as context.
|
@@ -1618,12 +1626,14 @@ class Morphik:
|
|
1618
1626
|
min_score: Minimum similarity threshold (default: 0.0)
|
1619
1627
|
max_tokens: Maximum tokens in completion
|
1620
1628
|
temperature: Model temperature
|
1621
|
-
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1629
|
+
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1630
|
+
(only works for documents ingested with `use_colpali=True`)
|
1622
1631
|
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
1623
1632
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
1624
1633
|
include_paths: Whether to include relationship paths in the response
|
1625
1634
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
1626
1635
|
Either a QueryPromptOverrides object or a dictionary with the same structure
|
1636
|
+
schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
|
1627
1637
|
Returns:
|
1628
1638
|
CompletionResponse
|
1629
1639
|
|
@@ -1671,6 +1681,27 @@ class Morphik:
|
|
1671
1681
|
if response.metadata and "graph" in response.metadata:
|
1672
1682
|
for path in response.metadata["graph"]["paths"]:
|
1673
1683
|
print(" -> ".join(path))
|
1684
|
+
|
1685
|
+
# Using structured output with a Pydantic model
|
1686
|
+
from pydantic import BaseModel
|
1687
|
+
|
1688
|
+
class ResearchFindings(BaseModel):
|
1689
|
+
main_finding: str
|
1690
|
+
supporting_evidence: List[str]
|
1691
|
+
limitations: List[str]
|
1692
|
+
|
1693
|
+
response = db.query(
|
1694
|
+
"Summarize the key research findings from these documents",
|
1695
|
+
schema=ResearchFindings
|
1696
|
+
)
|
1697
|
+
|
1698
|
+
# Access structured output
|
1699
|
+
if response.structured_output:
|
1700
|
+
findings = response.structured_output
|
1701
|
+
print(f"Main finding: {findings.main_finding}")
|
1702
|
+
print("Supporting evidence:")
|
1703
|
+
for evidence in findings.supporting_evidence:
|
1704
|
+
print(f"- {evidence}")
|
1674
1705
|
```
|
1675
1706
|
"""
|
1676
1707
|
payload = self._logic._prepare_query_request(
|
@@ -1687,7 +1718,20 @@ class Morphik:
|
|
1687
1718
|
prompt_overrides,
|
1688
1719
|
None,
|
1689
1720
|
None,
|
1721
|
+
schema,
|
1690
1722
|
)
|
1723
|
+
|
1724
|
+
# Add schema to payload if provided
|
1725
|
+
if schema:
|
1726
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
1727
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
1728
|
+
payload["schema"] = schema.model_json_schema()
|
1729
|
+
else:
|
1730
|
+
payload["schema"] = schema
|
1731
|
+
|
1732
|
+
# Add a hint to the query to return in JSON format
|
1733
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
1734
|
+
|
1691
1735
|
response = self._request("POST", "query", data=payload)
|
1692
1736
|
return self._logic._parse_completion_response(response)
|
1693
1737
|
|
@@ -1741,17 +1785,17 @@ class Morphik:
|
|
1741
1785
|
doc = self._logic._parse_document_response(response)
|
1742
1786
|
doc._client = self
|
1743
1787
|
return doc
|
1744
|
-
|
1788
|
+
|
1745
1789
|
def get_document_status(self, document_id: str) -> Dict[str, Any]:
|
1746
1790
|
"""
|
1747
1791
|
Get the current processing status of a document.
|
1748
|
-
|
1792
|
+
|
1749
1793
|
Args:
|
1750
1794
|
document_id: ID of the document to check
|
1751
|
-
|
1795
|
+
|
1752
1796
|
Returns:
|
1753
1797
|
Dict[str, Any]: Status information including current status, potential errors, and other metadata
|
1754
|
-
|
1798
|
+
|
1755
1799
|
Example:
|
1756
1800
|
```python
|
1757
1801
|
status = db.get_document_status("doc_123")
|
@@ -1765,23 +1809,23 @@ class Morphik:
|
|
1765
1809
|
"""
|
1766
1810
|
response = self._request("GET", f"documents/{document_id}/status")
|
1767
1811
|
return response
|
1768
|
-
|
1812
|
+
|
1769
1813
|
def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
|
1770
1814
|
"""
|
1771
1815
|
Wait for a document's processing to complete.
|
1772
|
-
|
1816
|
+
|
1773
1817
|
Args:
|
1774
1818
|
document_id: ID of the document to wait for
|
1775
1819
|
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
1776
1820
|
check_interval_seconds: Time between status checks (default: 2 seconds)
|
1777
|
-
|
1821
|
+
|
1778
1822
|
Returns:
|
1779
1823
|
Document: Updated document with the latest status
|
1780
|
-
|
1824
|
+
|
1781
1825
|
Raises:
|
1782
1826
|
TimeoutError: If processing doesn't complete within the timeout period
|
1783
1827
|
ValueError: If processing fails with an error
|
1784
|
-
|
1828
|
+
|
1785
1829
|
Example:
|
1786
1830
|
```python
|
1787
1831
|
# Upload a file and wait for processing to complete
|
@@ -1796,20 +1840,21 @@ class Morphik:
|
|
1796
1840
|
```
|
1797
1841
|
"""
|
1798
1842
|
import time
|
1843
|
+
|
1799
1844
|
start_time = time.time()
|
1800
|
-
|
1845
|
+
|
1801
1846
|
while (time.time() - start_time) < timeout_seconds:
|
1802
1847
|
status = self.get_document_status(document_id)
|
1803
|
-
|
1848
|
+
|
1804
1849
|
if status["status"] == "completed":
|
1805
1850
|
# Get the full document now that it's complete
|
1806
1851
|
return self.get_document(document_id)
|
1807
1852
|
elif status["status"] == "failed":
|
1808
1853
|
raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
|
1809
|
-
|
1854
|
+
|
1810
1855
|
# Wait before checking again
|
1811
1856
|
time.sleep(check_interval_seconds)
|
1812
|
-
|
1857
|
+
|
1813
1858
|
raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
|
1814
1859
|
|
1815
1860
|
def get_document_by_filename(self, filename: str) -> Document:
|
@@ -1963,9 +2008,7 @@ class Morphik:
|
|
1963
2008
|
form_data["use_colpali"] = str(use_colpali).lower()
|
1964
2009
|
|
1965
2010
|
# Use the dedicated file update endpoint
|
1966
|
-
response = self._request(
|
1967
|
-
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
1968
|
-
)
|
2011
|
+
response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
|
1969
2012
|
|
1970
2013
|
doc = self._logic._parse_document_response(response)
|
1971
2014
|
doc._client = self
|
@@ -2191,9 +2234,7 @@ class Morphik:
|
|
2191
2234
|
doc._client = self
|
2192
2235
|
return docs
|
2193
2236
|
|
2194
|
-
def batch_get_chunks(
|
2195
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
2196
|
-
) -> List[FinalChunkResult]:
|
2237
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
2197
2238
|
"""
|
2198
2239
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
2199
2240
|
|
@@ -2249,8 +2290,10 @@ class Morphik:
|
|
2249
2290
|
name: Name of the cache to create
|
2250
2291
|
model: Name of the model to use (e.g. "llama2")
|
2251
2292
|
gguf_file: Name of the GGUF file to use for the model
|
2252
|
-
filters: Optional metadata filters to determine which documents to include.
|
2253
|
-
|
2293
|
+
filters: Optional metadata filters to determine which documents to include.
|
2294
|
+
These filters will be applied in addition to any specific docs provided.
|
2295
|
+
docs: Optional list of specific document IDs to include.
|
2296
|
+
These docs will be included in addition to any documents matching the filters.
|
2254
2297
|
|
2255
2298
|
Returns:
|
2256
2299
|
Dict[str, Any]: Created cache configuration
|
@@ -2355,12 +2398,16 @@ class Morphik:
|
|
2355
2398
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
2356
2399
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
2357
2400
|
|
2358
|
-
request
|
2359
|
-
|
2360
|
-
|
2361
|
-
|
2362
|
-
|
2363
|
-
|
2401
|
+
# Initialize request with required fields
|
2402
|
+
request = {"name": name}
|
2403
|
+
|
2404
|
+
# Add optional fields only if they are not None
|
2405
|
+
if filters is not None:
|
2406
|
+
request["filters"] = filters
|
2407
|
+
if documents is not None:
|
2408
|
+
request["documents"] = documents
|
2409
|
+
if prompt_overrides is not None:
|
2410
|
+
request["prompt_overrides"] = prompt_overrides
|
2364
2411
|
|
2365
2412
|
response = self._request("POST", "graph/create", request)
|
2366
2413
|
return self._logic._parse_graph_response(response)
|
morphik/tests/README.md
CHANGED
@@ -38,4 +38,4 @@ python example_usage.py --async
|
|
38
38
|
## Environment Variables
|
39
39
|
|
40
40
|
- `MORPHIK_TEST_URL` - The URL of the Morphik server to use for tests (default: http://localhost:8000)
|
41
|
-
- `SKIP_LIVE_TESTS` - Set to "1" to skip tests that require a running server
|
41
|
+
- `SKIP_LIVE_TESTS` - Set to "1" to skip tests that require a running server
|