morphik 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +2 -2
- morphik/_internal.py +29 -20
- morphik/async_.py +154 -116
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +189 -108
- morphik/tests/README.md +41 -0
- morphik/tests/__init__.py +0 -0
- morphik/tests/example_usage.py +280 -0
- morphik/tests/test_async.py +384 -0
- morphik/tests/test_docs/sample1.txt +11 -0
- morphik/tests/test_docs/sample2.txt +15 -0
- morphik/tests/test_docs/sample3.txt +17 -0
- morphik/tests/test_sync.py +371 -0
- morphik-0.1.5.dist-info/METADATA +149 -0
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.3.dist-info/METADATA +0 -47
- morphik-0.1.3.dist-info/RECORD +0 -10
- {morphik-0.1.3.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/async_.py
CHANGED
@@ -2,25 +2,23 @@ import json
 import logging
 from io import BytesIO, IOBase
 from pathlib import Path
-from typing import
+from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
 
 import httpx
-from
+from pydantic import BaseModel
 
+from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
 from .models import (
+    ChunkSource,
+    CompletionResponse,  # Prompt override models
     Document,
     DocumentResult,
-    CompletionResponse,
-    IngestTextRequest,
-    ChunkSource,
-    Graph,
     FolderInfo,
-
+    Graph,
     GraphPromptOverrides,
+    IngestTextRequest,
     QueryPromptOverrides,
 )
-from .rules import Rule
-from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
 
 logger = logging.getLogger(__name__)
 
@@ -69,16 +67,16 @@ class AsyncFolder:
     def name(self) -> str:
         """Returns the folder name."""
         return self._name
-
+
     @property
     def id(self) -> Optional[str]:
         """Returns the folder ID if available."""
         return self._id
-
+
     async def get_info(self) -> Dict[str, Any]:
         """
         Get detailed information about this folder.
-
+
         Returns:
             Dict[str, Any]: Detailed folder information
         """
@@ -91,9 +89,8 @@ class AsyncFolder:
                 break
         if not self._id:
             raise ValueError(f"Folder '{self._name}' not found")
-
+
         return await self._client._request("GET", f"folders/{self._id}")
-
 
     def signin(self, end_user_id: str) -> "AsyncUserScope":
         """
@@ -166,15 +163,14 @@ class AsyncFolder:
         files = {"file": (filename, file_obj)}
 
         # Create form data
-        form_data = self._client._logic._prepare_ingest_file_form_data(
-            metadata, rules, self._name, None
-        )
+        form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
 
         response = await self._client._request(
             "POST",
-
+            "ingest/file",
             data=form_data,
             files=files,
+            params={"use_colpali": str(use_colpali).lower()},
         )
         doc = self._client._logic._parse_document_response(response)
         doc._client = self._client
@@ -215,7 +211,11 @@ class AsyncFolder:
         )
 
         response = await self._client._request(
-            "POST",
+            "POST",
+            "ingest/files",
+            data=data,
+            files=file_objects,
+            params={"use_colpali": str(use_colpali).lower()},
         )
 
         if response.get("errors"):
@@ -223,9 +223,7 @@ class AsyncFolder:
             for error in response["errors"]:
                 logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
 
-        docs = [
-            self._client._logic._parse_document_response(doc) for doc in response["documents"]
-        ]
+        docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
         for doc in docs:
             doc._client = self._client
         return docs
@@ -348,6 +346,7 @@ class AsyncFolder:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context within this folder.
@@ -364,9 +363,10 @@ class AsyncFolder:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            schema: Optional schema for structured output
 
         Returns:
-            CompletionResponse: Generated completion
+            CompletionResponse: Generated completion or structured output
         """
         payload = self._client._logic._prepare_query_request(
             query,
@@ -382,6 +382,7 @@ class AsyncFolder:
             prompt_overrides,
             self._name,
             None,
+            schema,
         )
         response = await self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
@@ -400,9 +401,7 @@ class AsyncFolder:
         Returns:
             List[Document]: List of documents
         """
-        params, data = self._client._logic._prepare_list_documents_request(
-            skip, limit, filters, self._name, None
-        )
+        params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
         response = await self._client._request("POST", "documents", data=data, params=params)
         docs = self._client._logic._parse_document_list_response(response)
         for doc in docs:
@@ -419,18 +418,17 @@ class AsyncFolder:
         Returns:
             List[Document]: List of document metadata for found documents
         """
-
-
-
+        # API expects a dict with document_ids key
+        request = {"document_ids": document_ids}
+        if self._name:
+            request["folder_name"] = self._name
         response = await self._client._request("POST", "batch/documents", data=request)
         docs = self._client._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self._client
         return docs
 
-    async def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
 
@@ -504,9 +502,6 @@ class AsyncFolder:
         Returns:
             Dict[str, str]: Deletion status
         """
-        # Get the document by filename with folder scope
-        request = {"filename": filename, "folder_name": self._name}
-
         # First get the document ID
         response = await self._client._request(
             "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
@@ -679,9 +674,7 @@ class AsyncUserScope:
         if rules:
             if all(isinstance(r, list) for r in rules):
                 # List of lists - per-file rules
-                converted_rules = [
-                    [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
-                ]
+                converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
             else:
                 # Flat list - shared rules for all files
                 converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -701,7 +694,11 @@ class AsyncUserScope:
             data["folder_name"] = self._folder_name
 
         response = await self._client._request(
-            "POST",
+            "POST",
+            "ingest/files",
+            data=data,
+            files=file_objects,
+            params={"use_colpali": str(use_colpali).lower()},
         )
 
         if response.get("errors"):
@@ -709,9 +706,7 @@ class AsyncUserScope:
             for error in response["errors"]:
                 logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
 
-        docs = [
-            self._client._logic._parse_document_response(doc) for doc in response["documents"]
-        ]
+        docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
         for doc in docs:
             doc._client = self._client
         return docs
@@ -834,9 +829,10 @@ class AsyncUserScope:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
    ) -> CompletionResponse:
         """
-        Generate completion using relevant chunks as context
+        Generate completion using relevant chunks as context, scoped to the end user.
 
         Args:
             query: Query text
@@ -850,9 +846,10 @@ class AsyncUserScope:
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+            schema: Optional schema for structured output
 
         Returns:
-            CompletionResponse: Generated completion
+            CompletionResponse: Generated completion or structured output
         """
         payload = self._client._logic._prepare_query_request(
             query,
@@ -866,8 +863,9 @@ class AsyncUserScope:
             hop_depth,
             include_paths,
             prompt_overrides,
-            self.
-            self.
+            self.folder_name,
+            self.end_user_id,
+            schema,
         )
         response = await self._client._request("POST", "query", data=payload)
         return self._client._logic._parse_completion_response(response)
@@ -905,18 +903,19 @@ class AsyncUserScope:
         Returns:
             List[Document]: List of document metadata for found documents
         """
-
-
-
+        # API expects a dict with document_ids key
+        request = {"document_ids": document_ids}
+        if self._end_user_id:
+            request["end_user_id"] = self._end_user_id
+        if self._folder_name:
+            request["folder_name"] = self._folder_name
         response = await self._client._request("POST", "batch/documents", data=request)
         docs = self._client._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self._client
         return docs
 
-    async def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
 
@@ -926,9 +925,7 @@ class AsyncUserScope:
         Returns:
             List[FinalChunkResult]: List of chunk results
         """
-        request = self._client._logic._prepare_batch_get_chunks_request(
-            sources, self._folder_name, self._end_user_id
-        )
+        request = self._client._logic._prepare_batch_get_chunks_request(sources, self._folder_name, self._end_user_id)
         response = await self._client._request("POST", "batch/chunks", data=request)
         return self._client._logic._parse_chunk_result_list_response(response)
 
@@ -1005,9 +1002,7 @@ class AsyncUserScope:
             params["folder_name"] = self._folder_name
 
         # First get the document ID
-        response = await self._client._request(
-            "GET", f"documents/filename/{filename}", params=params
-        )
+        response = await self._client._request("GET", f"documents/filename/{filename}", params=params)
         doc = self._client._logic._parse_document_response(response)
 
         # Then delete by ID
@@ -1060,9 +1055,15 @@ class AsyncMorphik:
 
         # Configure request data based on type
         if files:
-            #
-
-
+            # When uploading files, we need to make sure not to set Content-Type
+            # Remove Content-Type if it exists - httpx will set the correct multipart boundary
+            if "Content-Type" in headers:
+                del headers["Content-Type"]
+
+            # For file uploads with form data, use form data (not json)
+            request_data = {"files": files}
+            if data:
+                request_data["data"] = data
         else:
             # JSON for everything else
             headers["Content-Type"] = "application/json"
@@ -1093,18 +1094,16 @@ class AsyncMorphik:
         Returns:
             AsyncFolder: A folder object ready for scoped operations
         """
-        payload = {
-            "name": name
-        }
+        payload = {"name": name}
         if description:
             payload["description"] = description
-
+
         response = await self._request("POST", "folders", data=payload)
         folder_info = FolderInfo(**response)
-
+
         # Return a usable AsyncFolder object with the ID from the response
         return AsyncFolder(self, name, folder_id=folder_info.id)
-
+
     def get_folder_by_name(self, name: str) -> AsyncFolder:
         """
         Get a folder by name to scope operations.
@@ -1116,7 +1115,7 @@ class AsyncMorphik:
             AsyncFolder: A folder object for scoped operations
         """
         return AsyncFolder(self, name)
-
+
     async def get_folder(self, folder_id: str) -> AsyncFolder:
         """
         Get a folder by ID.
@@ -1129,7 +1128,7 @@ class AsyncMorphik:
         """
         response = await self._request("GET", f"folders/{folder_id}")
         return AsyncFolder(self, response["name"], folder_id)
-
+
     async def list_folders(self) -> List[AsyncFolder]:
         """
         List all folders the user has access to as AsyncFolder objects.
@@ -1139,7 +1138,7 @@ class AsyncMorphik:
         """
         response = await self._request("GET", "folders")
         return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
-
+
     async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Add a document to a folder.
@@ -1153,7 +1152,7 @@ class AsyncMorphik:
         """
         response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
         return response
-
+
     async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
         """
         Remove a document from a folder.
@@ -1197,7 +1196,8 @@ class AsyncMorphik:
             rules: Optional list of rules to apply during ingestion. Can be:
                 - MetadataExtractionRule: Extract metadata using a schema
                 - NaturalLanguageRule: Transform content using natural language
-            use_colpali: Whether to use ColPali-style embedding model to ingest the text
+            use_colpali: Whether to use ColPali-style embedding model to ingest the text
+                (slower, but significantly better retrieval accuracy for text and images)
         Returns:
             Document: Metadata of the ingested document
 
@@ -1253,9 +1253,10 @@ class AsyncMorphik:
 
         response = await self._request(
             "POST",
-
+            "ingest/file",
             data=form_data,
             files=files,
+            params={"use_colpali": str(use_colpali).lower()},
         )
         doc = self._logic._parse_document_response(response)
         doc._client = self
@@ -1294,11 +1295,15 @@ class AsyncMorphik:
 
         try:
             # Prepare form data
-            data = self._logic._prepare_ingest_files_form_data(
-                metadata, rules, use_colpali, parallel, None, None
-            )
+            data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
 
-            response = await self._request(
+            response = await self._request(
+                "POST",
+                "ingest/files",
+                data=data,
+                files=file_objects,
+                params={"use_colpali": str(use_colpali).lower()},
+            )
 
             if response.get("errors"):
                 # Log errors but don't raise exception
@@ -1306,7 +1311,7 @@ class AsyncMorphik:
                 logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
 
             # Parse the documents from the response
-            docs = [self.
+            docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
             for doc in docs:
                 doc._client = self
             return docs
@@ -1381,7 +1386,8 @@ class AsyncMorphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
+            use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[FinalChunkResult]
 
@@ -1393,9 +1399,7 @@ class AsyncMorphik:
             )
             ```
         """
-        payload = self._logic._prepare_retrieve_chunks_request(
-            query, filters, k, min_score, use_colpali, None, None
-        )
+        payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
         response = await self._request("POST", "retrieve/chunks", data=payload)
         return self._logic._parse_chunk_result_list_response(response)
 
@@ -1415,7 +1419,8 @@ class AsyncMorphik:
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)
-            use_colpali: Whether to use ColPali-style embedding model to retrieve documents
+            use_colpali: Whether to use ColPali-style embedding model to retrieve documents
+                (only works for documents ingested with `use_colpali=True`)
         Returns:
             List[DocumentResult]
 
@@ -1427,9 +1432,7 @@ class AsyncMorphik:
             )
             ```
         """
-        payload = self._logic._prepare_retrieve_docs_request(
-            query, filters, k, min_score, use_colpali, None, None
-        )
+        payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
         response = await self._request("POST", "retrieve/docs", data=payload)
         return self._logic._parse_document_result_list_response(response)
 
@@ -1446,6 +1449,7 @@ class AsyncMorphik:
         hop_depth: int = 1,
         include_paths: bool = False,
         prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
     ) -> CompletionResponse:
         """
         Generate completion using relevant chunks as context.
@@ -1457,12 +1461,14 @@ class AsyncMorphik:
             min_score: Minimum similarity threshold (default: 0.0)
             max_tokens: Maximum tokens in completion
             temperature: Model temperature
-            use_colpali: Whether to use ColPali-style embedding model to generate the completion
+            use_colpali: Whether to use ColPali-style embedding model to generate the completion
+                (only works for documents ingested with `use_colpali=True`)
             graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
             hop_depth: Number of relationship hops to traverse in the graph (1-3)
             include_paths: Whether to include relationship paths in the response
             prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
                 Either a QueryPromptOverrides object or a dictionary with the same structure
+            schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
         Returns:
             CompletionResponse
 
@@ -1510,6 +1516,27 @@ class AsyncMorphik:
             if response.metadata and "graph" in response.metadata:
                 for path in response.metadata["graph"]["paths"]:
                     print(" -> ".join(path))
+
+            # Using structured output with a Pydantic model
+            from pydantic import BaseModel
+
+            class ResearchFindings(BaseModel):
+                main_finding: str
+                supporting_evidence: List[str]
+                limitations: List[str]
+
+            response = await db.query(
+                "Summarize the key research findings from these documents",
+                schema=ResearchFindings
+            )
+
+            # Access structured output
+            if response.structured_output:
+                findings = response.structured_output
+                print(f"Main finding: {findings.main_finding}")
+                print("Supporting evidence:")
+                for evidence in findings.supporting_evidence:
+                    print(f"- {evidence}")
             ```
         """
         payload = self._logic._prepare_query_request(
@@ -1526,7 +1553,20 @@ class AsyncMorphik:
             prompt_overrides,
             None,
             None,
+            schema,
         )
+
+        # Add schema to payload if provided
+        if schema:
+            # If schema is a Pydantic model class, we need to serialize it to a schema dict
+            if isinstance(schema, type) and issubclass(schema, BaseModel):
+                payload["schema"] = schema.model_json_schema()
+            else:
+                payload["schema"] = schema
+
+            # Add a hint to the query to return in JSON format
+            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
+
         response = await self._request("POST", "query", data=payload)
         return self._logic._parse_completion_response(response)
 
@@ -1580,17 +1620,17 @@ class AsyncMorphik:
         doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
-
+
     async def get_document_status(self, document_id: str) -> Dict[str, Any]:
         """
         Get the current processing status of a document.
-
+
         Args:
             document_id: ID of the document to check
-
+
         Returns:
             Dict[str, Any]: Status information including current status, potential errors, and other metadata
-
+
         Example:
             ```python
             status = await db.get_document_status("doc_123")
@@ -1604,23 +1644,25 @@ class AsyncMorphik:
         """
         response = await self._request("GET", f"documents/{document_id}/status")
         return response
-
-    async def wait_for_document_completion(
+
+    async def wait_for_document_completion(
+        self, document_id: str, timeout_seconds=300, check_interval_seconds=2
+    ) -> Document:
         """
         Wait for a document's processing to complete.
-
+
         Args:
             document_id: ID of the document to wait for
             timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
             check_interval_seconds: Time between status checks (default: 2 seconds)
-
+
         Returns:
             Document: Updated document with the latest status
-
+
         Raises:
             TimeoutError: If processing doesn't complete within the timeout period
             ValueError: If processing fails with an error
-
+
         Example:
             ```python
             # Upload a file and wait for processing to complete
@@ -1635,20 +1677,21 @@ class AsyncMorphik:
             ```
         """
         import asyncio
+
         start_time = asyncio.get_event_loop().time()
-
+
         while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
             status = await self.get_document_status(document_id)
-
+
             if status["status"] == "completed":
                 # Get the full document now that it's complete
                 return await self.get_document(document_id)
             elif status["status"] == "failed":
                 raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
-
+
             # Wait before checking again
             await asyncio.sleep(check_interval_seconds)
-
+
         raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
 
     async def get_document_by_filename(self, filename: str) -> Document:
@@ -1802,9 +1845,7 @@ class AsyncMorphik:
             form_data["use_colpali"] = str(use_colpali).lower()
 
         # Use the dedicated file update endpoint
-        response = await self._request(
-            "POST", f"documents/{document_id}/update_file", data=form_data, files=files
-        )
+        response = await self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
 
         doc = self._logic._parse_document_response(response)
         doc._client = self
@@ -1840,9 +1881,7 @@ class AsyncMorphik:
             ```
         """
         # Use the dedicated metadata update endpoint
-        response = await self._request(
-            "POST", f"documents/{document_id}/update_metadata", data=metadata
-        )
+        response = await self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
         doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
@@ -2025,16 +2064,15 @@ class AsyncMorphik:
                 print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
             ```
         """
-
+        # API expects a dict with document_ids key, not a direct list
+        request = {"document_ids": document_ids}
         response = await self._request("POST", "batch/documents", data=request)
         docs = self._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self
         return docs
 
-    async def batch_get_chunks(
-        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
-    ) -> List[FinalChunkResult]:
+    async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation.
 
@@ -2083,8 +2121,10 @@ class AsyncMorphik:
             name: Name of the cache to create
             model: Name of the model to use (e.g. "llama2")
             gguf_file: Name of the GGUF file to use for the model
-            filters: Optional metadata filters to determine which documents to include.
-
+            filters: Optional metadata filters to determine which documents to include.
+                These filters will be applied in addition to any specific docs provided.
+            docs: Optional list of specific document IDs to include.
+                These docs will be included in addition to any documents matching the filters.
 
         Returns:
             Dict[str, Any]: Created cache configuration
@@ -2185,9 +2225,7 @@ class AsyncMorphik:
             )
             ```
         """
-        request = self._logic._prepare_create_graph_request(
-            name, filters, documents, prompt_overrides, None, None
-        )
+        request = self._logic._prepare_create_graph_request(name, filters, documents, prompt_overrides, None, None)
         response = await self._request("POST", "graph/create", data=request)
         return self._logic._parse_graph_response(response)
 
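
The headline API change in this release is the optional `schema` argument to `query`, which requests structured output via the `structured_output` field on the response. Below is a minimal usage sketch against the async client; the connection URI and constructor call are illustrative assumptions, while the `schema=` argument, the `ResearchFindings` fields, and `structured_output` attribute come straight from the docstring example added in 0.1.5.

```python
import asyncio
from typing import List

from pydantic import BaseModel

from morphik.async_ import AsyncMorphik


class ResearchFindings(BaseModel):
    # Field names mirror the docstring example shipped in 0.1.5
    main_finding: str
    supporting_evidence: List[str]
    limitations: List[str]


async def main() -> None:
    # Hypothetical connection URI; replace with your own deployment's URI.
    db = AsyncMorphik("morphik://owner_id:token@api.morphik.ai")

    # Pass a Pydantic model class (or a JSON schema dict) to request structured output.
    response = await db.query(
        "Summarize the key research findings from these documents",
        schema=ResearchFindings,
    )

    if response.structured_output:
        findings = response.structured_output
        print(f"Main finding: {findings.main_finding}")


asyncio.run(main())
```

Per the updated docstring, a plain JSON schema dict can be passed in place of the Pydantic model; the client serializes a model class with `model_json_schema()` before sending the request.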