morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +28 -19
- morphik/async_.py +121 -110
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +156 -109
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/async_.py
CHANGED
@@ -2,25 +2,23 @@ import json
 import logging
 from io import BytesIO, IOBase
 from pathlib import Path
-from typing import
+from typing import Any, BinaryIO, Dict, List, Optional, Type, Union

 import httpx
-from
+from pydantic import BaseModel

+from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
 from .models import (
+    ChunkSource,
+    CompletionResponse,  # Prompt override models
     Document,
     DocumentResult,
-    CompletionResponse,
-    IngestTextRequest,
-    ChunkSource,
-    Graph,
     FolderInfo,
-
+    Graph,
     GraphPromptOverrides,
+    IngestTextRequest,
     QueryPromptOverrides,
 )
-from .rules import Rule
-from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict

 logger = logging.getLogger(__name__)

@@ -69,16 +67,16 @@ class AsyncFolder:
     def name(self) -> str:
         """Returns the folder name."""
         return self._name
-
+
     @property
     def id(self) -> Optional[str]:
         """Returns the folder ID if available."""
         return self._id
-
+
     async def get_info(self) -> Dict[str, Any]:
         """
         Get detailed information about this folder.
-
+
         Returns:
             Dict[str, Any]: Detailed folder information
         """
@@ -91,9 +89,8 @@ class AsyncFolder:
                     break
         if not self._id:
             raise ValueError(f"Folder '{self._name}' not found")
-
+
         return await self._client._request("GET", f"folders/{self._id}")
-

     def signin(self, end_user_id: str) -> "AsyncUserScope":
         """
@@ -166,9 +163,7 @@ class AsyncFolder:
         files = {"file": (filename, file_obj)}

         # Create form data
-        form_data = self._client._logic._prepare_ingest_file_form_data(
-            metadata, rules, self._name, None
-        )
+        form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)

         response = await self._client._request(
             "POST",
@@ -216,9 +211,9 @@ class AsyncFolder:
         )

         response = await self._client._request(
-            "POST",
-            "ingest/files",
-            data=data,
+            "POST",
+            "ingest/files",
+            data=data,
             files=file_objects,
             params={"use_colpali": str(use_colpali).lower()},
         )
@@ -228,9 +223,7 @@ class AsyncFolder:
             for error in response["errors"]:
                 logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

-        docs = [
-            self._client._logic._parse_document_response(doc) for doc in response["documents"]
-        ]
+        docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
         for doc in docs:
             doc._client = self._client
         return docs
@@ -353,6 +346,7 @@ class AsyncFolder:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context within this folder.
@@ -369,9 +363,10 @@ class AsyncFolder:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            schema: Optional schema for structured output

         Returns:
-            CompletionResponse: Generated completion
+            CompletionResponse: Generated completion or structured output
         """
         payload = self._client._logic._prepare_query_request(
             query,
@@ -387,6 +382,7 @@ class AsyncFolder:
             prompt_overrides,
             self._name,
             None,
+            schema,
         )
         response = await self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
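With the `schema` parameter now wired through `AsyncFolder.query`, a folder-scoped call can request structured output directly. A minimal sketch of how this might be used; the folder name, schema fields, and query text are illustrative, and the client `db` is assumed to be constructed elsewhere:

```python
from pydantic import BaseModel

from morphik.async_ import AsyncMorphik


class DeviceSpec(BaseModel):
    # Illustrative structured-output schema
    device_name: str
    battery_life_hours: float


async def folder_query_example(db: AsyncMorphik) -> None:
    # `db` is an AsyncMorphik client constructed elsewhere
    folder = db.get_folder_by_name("product-docs")  # illustrative folder name
    response = await folder.query(
        "Extract the device specification from the datasheets",
        schema=DeviceSpec,  # new in 0.1.5: a Pydantic model class or a JSON schema dict
    )
    if response.structured_output:
        print(response.structured_output)
```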
@@ -405,9 +401,7 @@ class AsyncFolder:
         Returns:
             List[Document]: List of documents
         """
-        params, data = self._client._logic._prepare_list_documents_request(
-            skip, limit, filters, self._name, None
-        )
+        params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
         response = await self._client._request("POST", "documents", data=data, params=params)
         docs = self._client._logic._parse_document_list_response(response)
         for doc in docs:
@@ -434,9 +428,7 @@ class AsyncFolder:
             doc._client = self._client
         return docs

-    async def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.

@@ -510,9 +502,6 @@ class AsyncFolder:
         Returns:
             Dict[str, str]: Deletion status
         """
-        # Get the document by filename with folder scope
-        request = {"filename": filename, "folder_name": self._name}
-
         # First get the document ID
         response = await self._client._request(
             "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
|
|
685
674
|
if rules:
|
686
675
|
if all(isinstance(r, list) for r in rules):
|
687
676
|
# List of lists - per-file rules
|
688
|
-
converted_rules = [
|
689
|
-
[self._client._convert_rule(r) for r in rule_list] for rule_list in rules
|
690
|
-
]
|
677
|
+
converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
|
691
678
|
else:
|
692
679
|
# Flat list - shared rules for all files
|
693
680
|
converted_rules = [self._client._convert_rule(r) for r in rules]
|
@@ -707,9 +694,9 @@ class AsyncUserScope:
             data["folder_name"] = self._folder_name

         response = await self._client._request(
-            "POST",
-            "ingest/files",
-            data=data,
+            "POST",
+            "ingest/files",
+            data=data,
             files=file_objects,
             params={"use_colpali": str(use_colpali).lower()},
         )
@@ -719,9 +706,7 @@ class AsyncUserScope:
             for error in response["errors"]:
                 logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

-        docs = [
-            self._client._logic._parse_document_response(doc) for doc in response["documents"]
-        ]
+        docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
         for doc in docs:
             doc._client = self._client
         return docs
@@ -844,9 +829,10 @@ class AsyncUserScope:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
-        Generate completion using relevant chunks as context
+        Generate completion using relevant chunks as context, scoped to the end user.

         Args:
             query: Query text
@@ -860,9 +846,10 @@ class AsyncUserScope:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            schema: Optional schema for structured output

         Returns:
-            CompletionResponse: Generated completion
+            CompletionResponse: Generated completion or structured output
         """
         payload = self._client._logic._prepare_query_request(
             query,
@@ -876,8 +863,9 @@ class AsyncUserScope:
             hop_depth,
             include_paths,
             prompt_overrides,
-            self.
-            self.
+            self.folder_name,
+            self.end_user_id,
+            schema,
         )
         response = await self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
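`AsyncUserScope.query` picks up the same `schema` parameter, and the request now forwards the folder name and end-user ID alongside it. A sketch of the scoping chain the hunks above imply; the folder name and user ID are illustrative:

```python
from morphik.async_ import AsyncMorphik


async def user_scoped_query_example(db: AsyncMorphik) -> None:
    # Scope to a folder, then to a single end user (names are illustrative)
    folder = db.get_folder_by_name("support-tickets")
    user_scope = folder.signin("user-42")

    response = await user_scope.query("Summarize this user's open tickets")
    print(response)
```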
@@ -927,9 +915,7 @@ class AsyncUserScope:
             doc._client = self._client
         return docs

-    async def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.

@@ -939,9 +925,7 @@ class AsyncUserScope:
         Returns:
             List[FinalChunkResult]: List of chunk results
         """
-        request = self._client._logic._prepare_batch_get_chunks_request(
-            sources, self._folder_name, self._end_user_id
-        )
+        request = self._client._logic._prepare_batch_get_chunks_request(sources, self._folder_name, self._end_user_id)
        response = await self._client._request("POST", "batch/chunks", data=request)
        return self._client._logic._parse_chunk_result_list_response(response)

@@ -1018,9 +1002,7 @@ class AsyncUserScope:
             params["folder_name"] = self._folder_name

         # First get the document ID
-        response = await self._client._request(
-            "GET", f"documents/filename/{filename}", params=params
-        )
+        response = await self._client._request("GET", f"documents/filename/{filename}", params=params)
         doc = self._client._logic._parse_document_response(response)

         # Then delete by ID
@@ -1077,7 +1059,7 @@ class AsyncMorphik:
             # Remove Content-Type if it exists - httpx will set the correct multipart boundary
             if "Content-Type" in headers:
                 del headers["Content-Type"]
-
+
             # For file uploads with form data, use form data (not json)
             request_data = {"files": files}
             if data:
@@ -1112,18 +1094,16 @@ class AsyncMorphik:
         Returns:
             AsyncFolder: A folder object ready for scoped operations
         """
-        payload = {
-            "name": name
-        }
+        payload = {"name": name}
         if description:
             payload["description"] = description
-
+
         response = await self._request("POST", "folders", data=payload)
         folder_info = FolderInfo(**response)
-
+
         # Return a usable AsyncFolder object with the ID from the response
         return AsyncFolder(self, name, folder_id=folder_info.id)
-
+
     def get_folder_by_name(self, name: str) -> AsyncFolder:
         """
         Get a folder by name to scope operations.
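The folder-creation body now builds its payload in one expression and returns an `AsyncFolder` carrying the server-assigned ID. A usage sketch; `create_folder` is an assumed name for the method whose body is shown above, and the folder name and description are illustrative:

```python
from morphik.async_ import AsyncMorphik


async def folder_setup_example(db: AsyncMorphik) -> None:
    # `create_folder` is the assumed name of the method whose body appears above
    folder = await db.create_folder("quarterly-reports", description="Q1 filings")
    print(folder.id)  # populated from the FolderInfo returned by the server

    # Name-based lookup returns a folder object without a server round trip;
    # the ID is resolved lazily (see get_info earlier in this diff)
    same_folder = db.get_folder_by_name("quarterly-reports")
    print(same_folder.name)
```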
@@ -1135,7 +1115,7 @@ class AsyncMorphik:
             AsyncFolder: A folder object for scoped operations
         """
         return AsyncFolder(self, name)
-
+
     async def get_folder(self, folder_id: str) -> AsyncFolder:
         """
         Get a folder by ID.
@@ -1148,7 +1128,7 @@ class AsyncMorphik:
         """
         response = await self._request("GET", f"folders/{folder_id}")
         return AsyncFolder(self, response["name"], folder_id)
-
+
     async def list_folders(self) -> List[AsyncFolder]:
         """
         List all folders the user has access to as AsyncFolder objects.
@@ -1158,7 +1138,7 @@ class AsyncMorphik:
         """
         response = await self._request("GET", "folders")
         return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
-
+
     async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Add a document to a folder.
@@ -1172,7 +1152,7 @@ class AsyncMorphik:
         """
         response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
         return response
-
+
     async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Remove a document from a folder.
@@ -1216,7 +1196,8 @@ class AsyncMorphik:
             rules: Optional list of rules to apply during ingestion. Can be:
                 - MetadataExtractionRule: Extract metadata using a schema
                 - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the text
+            use_colpali: Whether to use ColPali-style embedding model to ingest the text
+                (slower, but significantly better retrieval accuracy for text and images)
         Returns:
             Document: Metadata of the ingested document

@@ -1314,14 +1295,12 @@ class AsyncMorphik:

         try:
             # Prepare form data
-            data = self._logic._prepare_ingest_files_form_data(
-                metadata, rules, use_colpali, parallel, None, None
-            )
+            data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)

             response = await self._request(
-                "POST",
-                "ingest/files",
-                data=data,
+                "POST",
+                "ingest/files",
+                data=data,
                 files=file_objects,
                 params={"use_colpali": str(use_colpali).lower()},
             )
@@ -1407,7 +1386,8 @@ class AsyncMorphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
+            use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[FinalChunkResult]

@@ -1419,9 +1399,7 @@ class AsyncMorphik:
             )
             ```
         """
-        payload = self._logic._prepare_retrieve_chunks_request(
-            query, filters, k, min_score, use_colpali, None, None
-        )
+        payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
         response = await self._request("POST", "retrieve/chunks", data=payload)
         return self._logic._parse_chunk_result_list_response(response)

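The reflowed `retrieve_chunks` call is behavior-preserving; the clarified docstring spells out that ColPali-style retrieval only applies to documents that were ingested with `use_colpali=True`. A call sketch using the parameters named in the docstring; the query text and filter keys are illustrative:

```python
from morphik.async_ import AsyncMorphik


async def retrieve_chunks_example(db: AsyncMorphik) -> None:
    chunks = await db.retrieve_chunks(
        "What are the key findings?",
        filters={"department": "research"},  # illustrative metadata filter
        k=4,
        min_score=0.0,
        use_colpali=True,  # only effective for documents ingested with use_colpali=True
    )
    for chunk in chunks:
        print(chunk)
```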
@@ -1441,7 +1419,8 @@ class AsyncMorphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve documents
+            use_colpali: Whether to use ColPali-style embedding model to retrieve documents
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[DocumentResult]

@@ -1453,9 +1432,7 @@ class AsyncMorphik:
             )
             ```
         """
-        payload = self._logic._prepare_retrieve_docs_request(
-            query, filters, k, min_score, use_colpali, None, None
-        )
+        payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
         response = await self._request("POST", "retrieve/docs", data=payload)
         return self._logic._parse_document_result_list_response(response)

@@ -1472,6 +1449,7 @@ class AsyncMorphik:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context.
@@ -1483,12 +1461,14 @@ class AsyncMorphik:
             min_score: Minimum similarity threshold (default: 0.0)
             max_tokens: Maximum tokens in completion
             temperature: Model temperature
-            use_colpali: Whether to use ColPali-style embedding model to generate the completion
+            use_colpali: Whether to use ColPali-style embedding model to generate the completion
+                (only works for documents ingested with `use_colpali=True`)
             graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
                 Either a QueryPromptOverrides object or a dictionary with the same structure
+            schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
         Returns:
             CompletionResponse

@@ -1536,6 +1516,27 @@ class AsyncMorphik:
             if response.metadata and "graph" in response.metadata:
                 for path in response.metadata["graph"]["paths"]:
                     print(" -> ".join(path))
+
+            # Using structured output with a Pydantic model
+            from pydantic import BaseModel
+
+            class ResearchFindings(BaseModel):
+                main_finding: str
+                supporting_evidence: List[str]
+                limitations: List[str]
+
+            response = await db.query(
+                "Summarize the key research findings from these documents",
+                schema=ResearchFindings
+            )
+
+            # Access structured output
+            if response.structured_output:
+                findings = response.structured_output
+                print(f"Main finding: {findings.main_finding}")
+                print("Supporting evidence:")
+                for evidence in findings.supporting_evidence:
+                    print(f"- {evidence}")
             ```
         """
         payload = self._logic._prepare_query_request(
@@ -1552,7 +1553,20 @@ class AsyncMorphik:
             prompt_overrides,
             None,
             None,
+            schema,
         )
+
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
+
         response = await self._request("POST", "query", data=payload)
         return self._logic._parse_completion_response(response)

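The client-side handling above serializes a Pydantic model class with `model_json_schema()`, passes a dict through unchanged, and appends a JSON-format hint to the query text. The docstring example covers the Pydantic path; a sketch of the dict path, with an illustrative schema:

```python
from morphik.async_ import AsyncMorphik


async def dict_schema_query_example(db: AsyncMorphik) -> None:
    # Hand-written JSON schema dict; the field names are illustrative
    findings_schema = {
        "type": "object",
        "properties": {
            "main_finding": {"type": "string"},
            "supporting_evidence": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["main_finding"],
    }
    response = await db.query(
        "Summarize the key research findings from these documents",
        schema=findings_schema,  # forwarded as payload["schema"]; a JSON-format hint is appended to the query
    )
    print(response.structured_output)
```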
@@ -1606,17 +1620,17 @@ class AsyncMorphik:
         doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
-
+
     async def get_document_status(self, document_id: str) -> Dict[str, Any]:
         """
         Get the current processing status of a document.
-
+
         Args:
             document_id: ID of the document to check
-
+
         Returns:
             Dict[str, Any]: Status information including current status, potential errors, and other metadata
-
+
         Example:
             ```python
             status = await db.get_document_status("doc_123")
@@ -1630,23 +1644,25 @@ class AsyncMorphik:
         """
         response = await self._request("GET", f"documents/{document_id}/status")
         return response
-
-    async def wait_for_document_completion(
+
+    async def wait_for_document_completion(
+        self, document_id: str, timeout_seconds=300, check_interval_seconds=2
+    ) -> Document:
         """
         Wait for a document's processing to complete.
-
+
         Args:
             document_id: ID of the document to wait for
             timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
             check_interval_seconds: Time between status checks (default: 2 seconds)
-
+
         Returns:
             Document: Updated document with the latest status
-
+
         Raises:
             TimeoutError: If processing doesn't complete within the timeout period
             ValueError: If processing fails with an error
-
+
         Example:
             ```python
             # Upload a file and wait for processing to complete
|
|
1661
1677
|
```
|
1662
1678
|
"""
|
1663
1679
|
import asyncio
|
1680
|
+
|
1664
1681
|
start_time = asyncio.get_event_loop().time()
|
1665
|
-
|
1682
|
+
|
1666
1683
|
while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
|
1667
1684
|
status = await self.get_document_status(document_id)
|
1668
|
-
|
1685
|
+
|
1669
1686
|
if status["status"] == "completed":
|
1670
1687
|
# Get the full document now that it's complete
|
1671
1688
|
return await self.get_document(document_id)
|
1672
1689
|
elif status["status"] == "failed":
|
1673
1690
|
raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
|
1674
|
-
|
1691
|
+
|
1675
1692
|
# Wait before checking again
|
1676
1693
|
await asyncio.sleep(check_interval_seconds)
|
1677
|
-
|
1694
|
+
|
1678
1695
|
raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
|
1679
1696
|
|
1680
1697
|
async def get_document_by_filename(self, filename: str) -> Document:
|
@@ -1828,9 +1845,7 @@ class AsyncMorphik:
|
|
1828
1845
|
form_data["use_colpali"] = str(use_colpali).lower()
|
1829
1846
|
|
1830
1847
|
# Use the dedicated file update endpoint
|
1831
|
-
response = await self._request(
|
1832
|
-
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
1833
|
-
)
|
1848
|
+
response = await self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
|
1834
1849
|
|
1835
1850
|
doc = self._logic._parse_document_response(response)
|
1836
1851
|
doc._client = self
|
@@ -1866,9 +1881,7 @@ class AsyncMorphik:
|
|
1866
1881
|
```
|
1867
1882
|
"""
|
1868
1883
|
# Use the dedicated metadata update endpoint
|
1869
|
-
response = await self._request(
|
1870
|
-
"POST", f"documents/{document_id}/update_metadata", data=metadata
|
1871
|
-
)
|
1884
|
+
response = await self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
|
1872
1885
|
doc = self._logic._parse_document_response(response)
|
1873
1886
|
doc._client = self
|
1874
1887
|
return doc
|
@@ -2059,9 +2072,7 @@ class AsyncMorphik:
             doc._client = self
         return docs

-    async def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation.

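`batch_get_chunks` keeps its behavior; only the signature was collapsed onto one line. A usage sketch, assuming sources may be passed as plain dicts that mirror `ChunkSource` (the key names below are an assumption based on the "document ID and chunk number" wording in the docstring):

```python
from morphik.async_ import AsyncMorphik


async def batch_chunks_example(db: AsyncMorphik) -> None:
    # Each source names one chunk; dict keys assumed to mirror ChunkSource
    sources = [
        {"document_id": "doc_123", "chunk_number": 0},
        {"document_id": "doc_123", "chunk_number": 3},
    ]
    chunks = await db.batch_get_chunks(sources)
    for chunk in chunks:
        print(chunk)
```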
@@ -2110,8 +2121,10 @@ class AsyncMorphik:
             name: Name of the cache to create
             model: Name of the model to use (e.g. "llama2")
             gguf_file: Name of the GGUF file to use for the model
-            filters: Optional metadata filters to determine which documents to include.
-
+            filters: Optional metadata filters to determine which documents to include.
+                These filters will be applied in addition to any specific docs provided.
+            docs: Optional list of specific document IDs to include.
+                These docs will be included in addition to any documents matching the filters.

         Returns:
             Dict[str, Any]: Created cache configuration
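The expanded docstring clarifies that `filters` and `docs` are additive: documents matching the filters and the explicitly listed IDs are both included in the cache. A call sketch; the method name `create_cache`, the GGUF file name, and the filter values are assumptions based on the docstring above:

```python
from morphik.async_ import AsyncMorphik


async def create_cache_example(db: AsyncMorphik) -> None:
    cache = await db.create_cache(
        name="research-cache",  # illustrative cache name
        model="llama2",  # model name, following the docstring example
        gguf_file="llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical GGUF file name
        filters={"category": "research"},  # documents matching these filters are included
        docs=["doc_123", "doc_456"],  # plus these explicitly listed document IDs
    )
    print(cache)
```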
@@ -2212,9 +2225,7 @@ class AsyncMorphik:
             )
             ```
         """
-        request = self._logic._prepare_create_graph_request(
-            name, filters, documents, prompt_overrides, None, None
-        )
+        request = self._logic._prepare_create_graph_request(name, filters, documents, prompt_overrides, None, None)
         response = await self._request("POST", "graph/create", data=request)
         return self._logic._parse_graph_response(response)

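Finally, the graph-creation request is reflowed onto one line; the prepared request still carries `name`, `filters`, `documents`, and `prompt_overrides` before being posted to `graph/create`. A call sketch; the method name `create_graph` is assumed from the request helper, and the names, filters, and document IDs are illustrative:

```python
from morphik.async_ import AsyncMorphik


async def create_graph_example(db: AsyncMorphik) -> None:
    graph = await db.create_graph(
        name="research-graph",  # illustrative graph name
        filters={"category": "research"},  # include documents matching these filters
        documents=["doc_123", "doc_456"],  # plus these explicit document IDs
    )
    print(graph)
```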