morphik 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +2 -2
- morphik/_internal.py +29 -20
- morphik/async_.py +154 -116
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +189 -108
- morphik/tests/README.md +41 -0
- morphik/tests/__init__.py +0 -0
- morphik/tests/example_usage.py +280 -0
- morphik/tests/test_async.py +384 -0
- morphik/tests/test_docs/sample1.txt +11 -0
- morphik/tests/test_docs/sample2.txt +15 -0
- morphik/tests/test_docs/sample3.txt +17 -0
- morphik/tests/test_sync.py +371 -0
- morphik-0.1.5.dist-info/METADATA +149 -0
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.3.dist-info/METADATA +0 -47
- morphik-0.1.3.dist-info/RECORD +0 -10
- {morphik-0.1.3.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/sync.py
CHANGED
@@ -2,27 +2,23 @@ import json
|
|
2
2
|
import logging
|
3
3
|
from io import BytesIO, IOBase
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
-
|
7
|
-
from PIL import Image
|
8
|
-
from PIL.Image import Image as PILImage
|
5
|
+
from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
|
9
6
|
|
10
7
|
import httpx
|
8
|
+
from pydantic import BaseModel
|
11
9
|
|
10
|
+
from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
|
12
11
|
from .models import (
|
12
|
+
ChunkSource,
|
13
|
+
CompletionResponse, # Prompt override models
|
13
14
|
Document,
|
14
15
|
DocumentResult,
|
15
|
-
CompletionResponse,
|
16
|
-
IngestTextRequest,
|
17
|
-
ChunkSource,
|
18
|
-
Graph,
|
19
16
|
FolderInfo,
|
20
|
-
|
17
|
+
Graph,
|
21
18
|
GraphPromptOverrides,
|
19
|
+
IngestTextRequest,
|
22
20
|
QueryPromptOverrides,
|
23
21
|
)
|
24
|
-
from .rules import Rule
|
25
|
-
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
26
22
|
|
27
23
|
logger = logging.getLogger(__name__)
|
28
24
|
|
@@ -71,16 +67,16 @@ class Folder:
|
|
71
67
|
def name(self) -> str:
|
72
68
|
"""Returns the folder name."""
|
73
69
|
return self._name
|
74
|
-
|
70
|
+
|
75
71
|
@property
|
76
72
|
def id(self) -> Optional[str]:
|
77
73
|
"""Returns the folder ID if available."""
|
78
74
|
return self._id
|
79
|
-
|
75
|
+
|
80
76
|
def get_info(self) -> Dict[str, Any]:
|
81
77
|
"""
|
82
78
|
Get detailed information about this folder.
|
83
|
-
|
79
|
+
|
84
80
|
Returns:
|
85
81
|
Dict[str, Any]: Detailed folder information
|
86
82
|
"""
|
@@ -93,9 +89,8 @@ class Folder:
|
|
93
89
|
break
|
94
90
|
if not self._id:
|
95
91
|
raise ValueError(f"Folder '{self._name}' not found")
|
96
|
-
|
92
|
+
|
97
93
|
return self._client._request("GET", f"folders/{self._id}")
|
98
|
-
|
99
94
|
|
100
95
|
def signin(self, end_user_id: str) -> "UserScope":
|
101
96
|
"""
|
@@ -168,15 +163,15 @@ class Folder:
|
|
168
163
|
files = {"file": (filename, file_obj)}
|
169
164
|
|
170
165
|
# Create form data
|
171
|
-
form_data = self._client._logic._prepare_ingest_file_form_data(
|
172
|
-
metadata, rules, self._name, None
|
173
|
-
)
|
166
|
+
form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
|
174
167
|
|
168
|
+
# use_colpali should be a query parameter as defined in the API
|
175
169
|
response = self._client._request(
|
176
170
|
"POST",
|
177
|
-
|
171
|
+
"ingest/file",
|
178
172
|
data=form_data,
|
179
173
|
files=files,
|
174
|
+
params={"use_colpali": str(use_colpali).lower()},
|
180
175
|
)
|
181
176
|
doc = self._client._logic._parse_document_response(response)
|
182
177
|
doc._client = self._client
|
@@ -216,16 +211,20 @@ class Folder:
|
|
216
211
|
metadata, rules, use_colpali, parallel, self._name, None
|
217
212
|
)
|
218
213
|
|
219
|
-
response = self._client._request(
|
214
|
+
response = self._client._request(
|
215
|
+
"POST",
|
216
|
+
"ingest/files",
|
217
|
+
data=data,
|
218
|
+
files=file_objects,
|
219
|
+
params={"use_colpali": str(use_colpali).lower()},
|
220
|
+
)
|
220
221
|
|
221
222
|
if response.get("errors"):
|
222
223
|
# Log errors but don't raise exception
|
223
224
|
for error in response["errors"]:
|
224
225
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
225
226
|
|
226
|
-
docs = [
|
227
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
228
|
-
]
|
227
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
229
228
|
for doc in docs:
|
230
229
|
doc._client = self._client
|
231
230
|
return docs
|
@@ -360,6 +359,7 @@ class Folder:
|
|
360
359
|
hop_depth: int = 1,
|
361
360
|
include_paths: bool = False,
|
362
361
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
362
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
363
363
|
) -> CompletionResponse:
|
364
364
|
"""
|
365
365
|
Generate completion using relevant chunks as context within this folder.
|
@@ -376,6 +376,7 @@ class Folder:
|
|
376
376
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
377
377
|
include_paths: Whether to include relationship paths in the response
|
378
378
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
379
|
+
schema: Optional schema for structured output
|
379
380
|
|
380
381
|
Returns:
|
381
382
|
CompletionResponse: Generated completion
|
@@ -394,7 +395,20 @@ class Folder:
|
|
394
395
|
prompt_overrides,
|
395
396
|
self._name,
|
396
397
|
None,
|
398
|
+
schema,
|
397
399
|
)
|
400
|
+
|
401
|
+
# Add schema to payload if provided
|
402
|
+
if schema:
|
403
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
404
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
405
|
+
payload["schema"] = schema.model_json_schema()
|
406
|
+
else:
|
407
|
+
payload["schema"] = schema
|
408
|
+
|
409
|
+
# Add a hint to the query to return in JSON format
|
410
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
411
|
+
|
398
412
|
response = self._client._request("POST", "query", data=payload)
|
399
413
|
return self._client._logic._parse_completion_response(response)
|
400
414
|
|
@@ -412,9 +426,7 @@ class Folder:
|
|
412
426
|
Returns:
|
413
427
|
List[Document]: List of documents
|
414
428
|
"""
|
415
|
-
params, data = self._client._logic._prepare_list_documents_request(
|
416
|
-
skip, limit, filters, self._name, None
|
417
|
-
)
|
429
|
+
params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
|
418
430
|
response = self._client._request("POST", "documents", data=data, params=params)
|
419
431
|
docs = self._client._logic._parse_document_list_response(response)
|
420
432
|
for doc in docs:
|
@@ -439,9 +451,7 @@ class Folder:
|
|
439
451
|
doc._client = self._client
|
440
452
|
return docs
|
441
453
|
|
442
|
-
def batch_get_chunks(
|
443
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
444
|
-
) -> List[FinalChunkResult]:
|
454
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
445
455
|
"""
|
446
456
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
|
447
457
|
|
@@ -542,13 +552,8 @@ class Folder:
|
|
542
552
|
Returns:
|
543
553
|
Dict[str, str]: Deletion status
|
544
554
|
"""
|
545
|
-
# Get the document by filename with folder scope
|
546
|
-
request = {"filename": filename, "folder_name": self._name}
|
547
|
-
|
548
555
|
# First get the document ID
|
549
|
-
response = self._client._request(
|
550
|
-
"GET", f"documents/filename/{filename}", params={"folder_name": self._name}
|
551
|
-
)
|
556
|
+
response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
|
552
557
|
doc = self._client._logic._parse_document_response(response)
|
553
558
|
|
554
559
|
# Then delete by ID
|
@@ -670,11 +675,13 @@ class UserScope:
|
|
670
675
|
if self._folder_name:
|
671
676
|
form_data["folder_name"] = self._folder_name
|
672
677
|
|
678
|
+
# use_colpali should be a query parameter as defined in the API
|
673
679
|
response = self._client._request(
|
674
680
|
"POST",
|
675
|
-
|
681
|
+
"ingest/file",
|
676
682
|
data=form_data,
|
677
683
|
files=files,
|
684
|
+
params={"use_colpali": str(use_colpali).lower()},
|
678
685
|
)
|
679
686
|
doc = self._client._logic._parse_document_response(response)
|
680
687
|
doc._client = self._client
|
@@ -722,9 +729,7 @@ class UserScope:
|
|
722
729
|
if rules:
|
723
730
|
if all(isinstance(r, list) for r in rules):
|
724
731
|
# List of lists - per-file rules
|
725
|
-
converted_rules = [
|
726
|
-
[self._client._convert_rule(r) for r in rule_list] for rule_list in rules
|
727
|
-
]
|
732
|
+
converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
|
728
733
|
else:
|
729
734
|
# Flat list - shared rules for all files
|
730
735
|
converted_rules = [self._client._convert_rule(r) for r in rules]
|
@@ -734,7 +739,7 @@ class UserScope:
|
|
734
739
|
data = {
|
735
740
|
"metadata": json.dumps(metadata or {}),
|
736
741
|
"rules": json.dumps(converted_rules),
|
737
|
-
|
742
|
+
# Remove use_colpali from form data - it should be a query param
|
738
743
|
"parallel": str(parallel).lower(),
|
739
744
|
"end_user_id": self._end_user_id, # Add end user ID here
|
740
745
|
}
|
@@ -743,16 +748,20 @@ class UserScope:
|
|
743
748
|
if self._folder_name:
|
744
749
|
data["folder_name"] = self._folder_name
|
745
750
|
|
746
|
-
response = self._client._request(
|
751
|
+
response = self._client._request(
|
752
|
+
"POST",
|
753
|
+
"ingest/files",
|
754
|
+
data=data,
|
755
|
+
files=file_objects,
|
756
|
+
params={"use_colpali": str(use_colpali).lower()},
|
757
|
+
)
|
747
758
|
|
748
759
|
if response.get("errors"):
|
749
760
|
# Log errors but don't raise exception
|
750
761
|
for error in response["errors"]:
|
751
762
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
752
763
|
|
753
|
-
docs = [
|
754
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
755
|
-
]
|
764
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
756
765
|
for doc in docs:
|
757
766
|
doc._client = self._client
|
758
767
|
return docs
|
@@ -895,6 +904,7 @@ class UserScope:
|
|
895
904
|
hop_depth: int = 1,
|
896
905
|
include_paths: bool = False,
|
897
906
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
907
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
898
908
|
) -> CompletionResponse:
|
899
909
|
"""
|
900
910
|
Generate completion using relevant chunks as context as this end user.
|
@@ -911,6 +921,7 @@ class UserScope:
|
|
911
921
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
912
922
|
include_paths: Whether to include relationship paths in the response
|
913
923
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
924
|
+
schema: Optional schema for structured output
|
914
925
|
|
915
926
|
Returns:
|
916
927
|
CompletionResponse: Generated completion
|
@@ -929,7 +940,20 @@ class UserScope:
|
|
929
940
|
prompt_overrides,
|
930
941
|
self._folder_name,
|
931
942
|
self._end_user_id,
|
943
|
+
schema,
|
932
944
|
)
|
945
|
+
|
946
|
+
# Add schema to payload if provided
|
947
|
+
if schema:
|
948
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
949
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
950
|
+
payload["schema"] = schema.model_json_schema()
|
951
|
+
else:
|
952
|
+
payload["schema"] = schema
|
953
|
+
|
954
|
+
# Add a hint to the query to return in JSON format
|
955
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
956
|
+
|
933
957
|
response = self._client._request("POST", "query", data=payload)
|
934
958
|
return self._client._logic._parse_completion_response(response)
|
935
959
|
|
@@ -954,7 +978,7 @@ class UserScope:
|
|
954
978
|
if self._folder_name:
|
955
979
|
params["folder_name"] = self._folder_name
|
956
980
|
|
957
|
-
response = self._client._request("POST",
|
981
|
+
response = self._client._request("POST", "documents", data=filters or {}, params=params)
|
958
982
|
|
959
983
|
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
960
984
|
for doc in docs:
|
@@ -983,9 +1007,7 @@ class UserScope:
|
|
983
1007
|
doc._client = self._client
|
984
1008
|
return docs
|
985
1009
|
|
986
|
-
def batch_get_chunks(
|
987
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
988
|
-
) -> List[FinalChunkResult]:
|
1010
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
989
1011
|
"""
|
990
1012
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
|
991
1013
|
|
@@ -1153,9 +1175,17 @@ class Morphik:
|
|
1153
1175
|
|
1154
1176
|
# Configure request data based on type
|
1155
1177
|
if files:
|
1156
|
-
#
|
1157
|
-
|
1158
|
-
|
1178
|
+
# When uploading files, we need to make sure not to set Content-Type
|
1179
|
+
# Remove Content-Type if it exists - httpx will set the correct multipart boundary
|
1180
|
+
if "Content-Type" in headers:
|
1181
|
+
del headers["Content-Type"]
|
1182
|
+
|
1183
|
+
# For file uploads with form data, use form data (not json)
|
1184
|
+
request_data = {"files": files}
|
1185
|
+
if data:
|
1186
|
+
request_data["data"] = data
|
1187
|
+
|
1188
|
+
# Files are now properly handled
|
1159
1189
|
else:
|
1160
1190
|
# JSON for everything else
|
1161
1191
|
headers["Content-Type"] = "application/json"
|
@@ -1168,8 +1198,13 @@ class Morphik:
|
|
1168
1198
|
params=params,
|
1169
1199
|
**request_data,
|
1170
1200
|
)
|
1171
|
-
|
1172
|
-
|
1201
|
+
try:
|
1202
|
+
response.raise_for_status()
|
1203
|
+
return response.json()
|
1204
|
+
except httpx.HTTPStatusError as e:
|
1205
|
+
# Print error response for debugging
|
1206
|
+
print(f"Error response: {e.response.status_code} - {e.response.text}")
|
1207
|
+
raise
|
1173
1208
|
|
1174
1209
|
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
|
1175
1210
|
"""Convert a rule to a dictionary format"""
|
@@ -1186,18 +1221,16 @@ class Morphik:
|
|
1186
1221
|
Returns:
|
1187
1222
|
Folder: A folder object ready for scoped operations
|
1188
1223
|
"""
|
1189
|
-
payload = {
|
1190
|
-
"name": name
|
1191
|
-
}
|
1224
|
+
payload = {"name": name}
|
1192
1225
|
if description:
|
1193
1226
|
payload["description"] = description
|
1194
|
-
|
1227
|
+
|
1195
1228
|
response = self._request("POST", "folders", data=payload)
|
1196
1229
|
folder_info = FolderInfo(**response)
|
1197
|
-
|
1230
|
+
|
1198
1231
|
# Return a usable Folder object with the ID from the response
|
1199
1232
|
return Folder(self, name, folder_id=folder_info.id)
|
1200
|
-
|
1233
|
+
|
1201
1234
|
def get_folder_by_name(self, name: str) -> Folder:
|
1202
1235
|
"""
|
1203
1236
|
Get a folder by name to scope operations.
|
@@ -1209,7 +1242,7 @@ class Morphik:
|
|
1209
1242
|
Folder: A folder object for scoped operations
|
1210
1243
|
"""
|
1211
1244
|
return Folder(self, name)
|
1212
|
-
|
1245
|
+
|
1213
1246
|
def get_folder(self, folder_id: str) -> Folder:
|
1214
1247
|
"""
|
1215
1248
|
Get a folder by ID.
|
@@ -1226,13 +1259,13 @@ class Morphik:
|
|
1226
1259
|
def list_folders(self) -> List[Folder]:
|
1227
1260
|
"""
|
1228
1261
|
List all folders the user has access to as Folder objects.
|
1229
|
-
|
1262
|
+
|
1230
1263
|
Returns:
|
1231
1264
|
List[Folder]: List of Folder objects ready for operations
|
1232
1265
|
"""
|
1233
1266
|
folder_infos = self._request("GET", "folders")
|
1234
1267
|
return [Folder(self, info["name"], info["id"]) for info in folder_infos]
|
1235
|
-
|
1268
|
+
|
1236
1269
|
def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1237
1270
|
"""
|
1238
1271
|
Add a document to a folder.
|
@@ -1246,7 +1279,7 @@ class Morphik:
|
|
1246
1279
|
"""
|
1247
1280
|
response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
|
1248
1281
|
return response
|
1249
|
-
|
1282
|
+
|
1250
1283
|
def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1251
1284
|
"""
|
1252
1285
|
Remove a document from a folder.
|
@@ -1290,7 +1323,8 @@ class Morphik:
|
|
1290
1323
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1291
1324
|
- MetadataExtractionRule: Extract metadata using a schema
|
1292
1325
|
- NaturalLanguageRule: Transform content using natural language
|
1293
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1326
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1327
|
+
(slower, but significantly better retrieval accuracy for text and images)
|
1294
1328
|
Returns:
|
1295
1329
|
Document: Metadata of the ingested document
|
1296
1330
|
|
@@ -1343,7 +1377,8 @@ class Morphik:
|
|
1343
1377
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1344
1378
|
- MetadataExtractionRule: Extract metadata using a schema
|
1345
1379
|
- NaturalLanguageRule: Transform content using natural language
|
1346
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
1380
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
1381
|
+
(slower, but significantly better retrieval accuracy for images)
|
1347
1382
|
|
1348
1383
|
Returns:
|
1349
1384
|
Document: Metadata of the ingested document
|
@@ -1380,11 +1415,13 @@ class Morphik:
|
|
1380
1415
|
# Create form data
|
1381
1416
|
form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)
|
1382
1417
|
|
1418
|
+
# use_colpali should be a query parameter as defined in the API
|
1383
1419
|
response = self._request(
|
1384
1420
|
"POST",
|
1385
|
-
|
1421
|
+
"ingest/file",
|
1386
1422
|
data=form_data,
|
1387
1423
|
files=files,
|
1424
|
+
params={"use_colpali": str(use_colpali).lower()},
|
1388
1425
|
)
|
1389
1426
|
doc = self._logic._parse_document_response(response)
|
1390
1427
|
doc._client = self
|
@@ -1423,11 +1460,16 @@ class Morphik:
|
|
1423
1460
|
|
1424
1461
|
try:
|
1425
1462
|
# Prepare form data
|
1426
|
-
data
|
1427
|
-
|
1428
|
-
)
|
1463
|
+
# Prepare form data - use_colpali should be a query parameter, not form data
|
1464
|
+
data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
|
1429
1465
|
|
1430
|
-
response = self._request(
|
1466
|
+
response = self._request(
|
1467
|
+
"POST",
|
1468
|
+
"ingest/files",
|
1469
|
+
data=data,
|
1470
|
+
files=file_objects,
|
1471
|
+
params={"use_colpali": str(use_colpali).lower()},
|
1472
|
+
)
|
1431
1473
|
|
1432
1474
|
if response.get("errors"):
|
1433
1475
|
# Log errors but don't raise exception
|
@@ -1509,7 +1551,8 @@ class Morphik:
|
|
1509
1551
|
filters: Optional metadata filters
|
1510
1552
|
k: Number of results (default: 4)
|
1511
1553
|
min_score: Minimum similarity threshold (default: 0.0)
|
1512
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
|
1554
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
|
1555
|
+
(only works for documents ingested with `use_colpali=True`)
|
1513
1556
|
Returns:
|
1514
1557
|
List[ChunkResult]
|
1515
1558
|
|
@@ -1521,9 +1564,7 @@ class Morphik:
|
|
1521
1564
|
)
|
1522
1565
|
```
|
1523
1566
|
"""
|
1524
|
-
payload = self._logic._prepare_retrieve_chunks_request(
|
1525
|
-
query, filters, k, min_score, use_colpali, None, None
|
1526
|
-
)
|
1567
|
+
payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
|
1527
1568
|
response = self._request("POST", "retrieve/chunks", data=payload)
|
1528
1569
|
return self._logic._parse_chunk_result_list_response(response)
|
1529
1570
|
|
@@ -1543,7 +1584,8 @@ class Morphik:
|
|
1543
1584
|
filters: Optional metadata filters
|
1544
1585
|
k: Number of results (default: 4)
|
1545
1586
|
min_score: Minimum similarity threshold (default: 0.0)
|
1546
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
|
1587
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
|
1588
|
+
(only works for documents ingested with `use_colpali=True`)
|
1547
1589
|
Returns:
|
1548
1590
|
List[DocumentResult]
|
1549
1591
|
|
@@ -1555,9 +1597,7 @@ class Morphik:
|
|
1555
1597
|
)
|
1556
1598
|
```
|
1557
1599
|
"""
|
1558
|
-
payload = self._logic._prepare_retrieve_docs_request(
|
1559
|
-
query, filters, k, min_score, use_colpali, None, None
|
1560
|
-
)
|
1600
|
+
payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
|
1561
1601
|
response = self._request("POST", "retrieve/docs", data=payload)
|
1562
1602
|
return self._logic._parse_document_result_list_response(response)
|
1563
1603
|
|
@@ -1574,6 +1614,7 @@ class Morphik:
|
|
1574
1614
|
hop_depth: int = 1,
|
1575
1615
|
include_paths: bool = False,
|
1576
1616
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
1617
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
1577
1618
|
) -> CompletionResponse:
|
1578
1619
|
"""
|
1579
1620
|
Generate completion using relevant chunks as context.
|
@@ -1585,12 +1626,14 @@ class Morphik:
|
|
1585
1626
|
min_score: Minimum similarity threshold (default: 0.0)
|
1586
1627
|
max_tokens: Maximum tokens in completion
|
1587
1628
|
temperature: Model temperature
|
1588
|
-
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1629
|
+
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1630
|
+
(only works for documents ingested with `use_colpali=True`)
|
1589
1631
|
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
1590
1632
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
1591
1633
|
include_paths: Whether to include relationship paths in the response
|
1592
1634
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
1593
1635
|
Either a QueryPromptOverrides object or a dictionary with the same structure
|
1636
|
+
schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
|
1594
1637
|
Returns:
|
1595
1638
|
CompletionResponse
|
1596
1639
|
|
@@ -1638,6 +1681,27 @@ class Morphik:
|
|
1638
1681
|
if response.metadata and "graph" in response.metadata:
|
1639
1682
|
for path in response.metadata["graph"]["paths"]:
|
1640
1683
|
print(" -> ".join(path))
|
1684
|
+
|
1685
|
+
# Using structured output with a Pydantic model
|
1686
|
+
from pydantic import BaseModel
|
1687
|
+
|
1688
|
+
class ResearchFindings(BaseModel):
|
1689
|
+
main_finding: str
|
1690
|
+
supporting_evidence: List[str]
|
1691
|
+
limitations: List[str]
|
1692
|
+
|
1693
|
+
response = db.query(
|
1694
|
+
"Summarize the key research findings from these documents",
|
1695
|
+
schema=ResearchFindings
|
1696
|
+
)
|
1697
|
+
|
1698
|
+
# Access structured output
|
1699
|
+
if response.structured_output:
|
1700
|
+
findings = response.structured_output
|
1701
|
+
print(f"Main finding: {findings.main_finding}")
|
1702
|
+
print("Supporting evidence:")
|
1703
|
+
for evidence in findings.supporting_evidence:
|
1704
|
+
print(f"- {evidence}")
|
1641
1705
|
```
|
1642
1706
|
"""
|
1643
1707
|
payload = self._logic._prepare_query_request(
|
@@ -1654,7 +1718,20 @@ class Morphik:
|
|
1654
1718
|
prompt_overrides,
|
1655
1719
|
None,
|
1656
1720
|
None,
|
1721
|
+
schema,
|
1657
1722
|
)
|
1723
|
+
|
1724
|
+
# Add schema to payload if provided
|
1725
|
+
if schema:
|
1726
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
1727
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
1728
|
+
payload["schema"] = schema.model_json_schema()
|
1729
|
+
else:
|
1730
|
+
payload["schema"] = schema
|
1731
|
+
|
1732
|
+
# Add a hint to the query to return in JSON format
|
1733
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
1734
|
+
|
1658
1735
|
response = self._request("POST", "query", data=payload)
|
1659
1736
|
return self._logic._parse_completion_response(response)
|
1660
1737
|
|
@@ -1708,17 +1785,17 @@ class Morphik:
|
|
1708
1785
|
doc = self._logic._parse_document_response(response)
|
1709
1786
|
doc._client = self
|
1710
1787
|
return doc
|
1711
|
-
|
1788
|
+
|
1712
1789
|
def get_document_status(self, document_id: str) -> Dict[str, Any]:
|
1713
1790
|
"""
|
1714
1791
|
Get the current processing status of a document.
|
1715
|
-
|
1792
|
+
|
1716
1793
|
Args:
|
1717
1794
|
document_id: ID of the document to check
|
1718
|
-
|
1795
|
+
|
1719
1796
|
Returns:
|
1720
1797
|
Dict[str, Any]: Status information including current status, potential errors, and other metadata
|
1721
|
-
|
1798
|
+
|
1722
1799
|
Example:
|
1723
1800
|
```python
|
1724
1801
|
status = db.get_document_status("doc_123")
|
@@ -1732,23 +1809,23 @@ class Morphik:
|
|
1732
1809
|
"""
|
1733
1810
|
response = self._request("GET", f"documents/{document_id}/status")
|
1734
1811
|
return response
|
1735
|
-
|
1812
|
+
|
1736
1813
|
def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
|
1737
1814
|
"""
|
1738
1815
|
Wait for a document's processing to complete.
|
1739
|
-
|
1816
|
+
|
1740
1817
|
Args:
|
1741
1818
|
document_id: ID of the document to wait for
|
1742
1819
|
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
1743
1820
|
check_interval_seconds: Time between status checks (default: 2 seconds)
|
1744
|
-
|
1821
|
+
|
1745
1822
|
Returns:
|
1746
1823
|
Document: Updated document with the latest status
|
1747
|
-
|
1824
|
+
|
1748
1825
|
Raises:
|
1749
1826
|
TimeoutError: If processing doesn't complete within the timeout period
|
1750
1827
|
ValueError: If processing fails with an error
|
1751
|
-
|
1828
|
+
|
1752
1829
|
Example:
|
1753
1830
|
```python
|
1754
1831
|
# Upload a file and wait for processing to complete
|
@@ -1763,20 +1840,21 @@ class Morphik:
|
|
1763
1840
|
```
|
1764
1841
|
"""
|
1765
1842
|
import time
|
1843
|
+
|
1766
1844
|
start_time = time.time()
|
1767
|
-
|
1845
|
+
|
1768
1846
|
while (time.time() - start_time) < timeout_seconds:
|
1769
1847
|
status = self.get_document_status(document_id)
|
1770
|
-
|
1848
|
+
|
1771
1849
|
if status["status"] == "completed":
|
1772
1850
|
# Get the full document now that it's complete
|
1773
1851
|
return self.get_document(document_id)
|
1774
1852
|
elif status["status"] == "failed":
|
1775
1853
|
raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
|
1776
|
-
|
1854
|
+
|
1777
1855
|
# Wait before checking again
|
1778
1856
|
time.sleep(check_interval_seconds)
|
1779
|
-
|
1857
|
+
|
1780
1858
|
raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
|
1781
1859
|
|
1782
1860
|
def get_document_by_filename(self, filename: str) -> Document:
|
@@ -1930,9 +2008,7 @@ class Morphik:
|
|
1930
2008
|
form_data["use_colpali"] = str(use_colpali).lower()
|
1931
2009
|
|
1932
2010
|
# Use the dedicated file update endpoint
|
1933
|
-
response = self._request(
|
1934
|
-
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
1935
|
-
)
|
2011
|
+
response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
|
1936
2012
|
|
1937
2013
|
doc = self._logic._parse_document_response(response)
|
1938
2014
|
doc._client = self
|
@@ -2151,15 +2227,14 @@ class Morphik:
|
|
2151
2227
|
print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
|
2152
2228
|
```
|
2153
2229
|
"""
|
2154
|
-
|
2230
|
+
# API expects a dict with document_ids key, not a direct list
|
2231
|
+
response = self._request("POST", "batch/documents", data={"document_ids": document_ids})
|
2155
2232
|
docs = self._logic._parse_document_list_response(response)
|
2156
2233
|
for doc in docs:
|
2157
2234
|
doc._client = self
|
2158
2235
|
return docs
|
2159
2236
|
|
2160
|
-
def batch_get_chunks(
|
2161
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
2162
|
-
) -> List[FinalChunkResult]:
|
2237
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
2163
2238
|
"""
|
2164
2239
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
2165
2240
|
|
@@ -2215,8 +2290,10 @@ class Morphik:
|
|
2215
2290
|
name: Name of the cache to create
|
2216
2291
|
model: Name of the model to use (e.g. "llama2")
|
2217
2292
|
gguf_file: Name of the GGUF file to use for the model
|
2218
|
-
filters: Optional metadata filters to determine which documents to include.
|
2219
|
-
|
2293
|
+
filters: Optional metadata filters to determine which documents to include.
|
2294
|
+
These filters will be applied in addition to any specific docs provided.
|
2295
|
+
docs: Optional list of specific document IDs to include.
|
2296
|
+
These docs will be included in addition to any documents matching the filters.
|
2220
2297
|
|
2221
2298
|
Returns:
|
2222
2299
|
Dict[str, Any]: Created cache configuration
|
@@ -2321,12 +2398,16 @@ class Morphik:
|
|
2321
2398
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
2322
2399
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
2323
2400
|
|
2324
|
-
request
|
2325
|
-
|
2326
|
-
|
2327
|
-
|
2328
|
-
|
2329
|
-
|
2401
|
+
# Initialize request with required fields
|
2402
|
+
request = {"name": name}
|
2403
|
+
|
2404
|
+
# Add optional fields only if they are not None
|
2405
|
+
if filters is not None:
|
2406
|
+
request["filters"] = filters
|
2407
|
+
if documents is not None:
|
2408
|
+
request["documents"] = documents
|
2409
|
+
if prompt_overrides is not None:
|
2410
|
+
request["prompt_overrides"] = prompt_overrides
|
2330
2411
|
|
2331
2412
|
response = self._request("POST", "graph/create", request)
|
2332
2413
|
return self._logic._parse_graph_response(response)
|