morphik 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +2 -2
- morphik/_internal.py +36 -27
- morphik/async_.py +294 -127
- morphik/models.py +79 -58
- morphik/rules.py +28 -5
- morphik/sync.py +352 -144
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/METADATA +4 -8
- morphik-0.1.6.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/WHEEL +0 -0
morphik/async_.py
CHANGED
@@ -2,25 +2,23 @@ import json
|
|
2
2
|
import logging
|
3
3
|
from io import BytesIO, IOBase
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import
|
5
|
+
from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
|
6
6
|
|
7
7
|
import httpx
|
8
|
-
from
|
8
|
+
from pydantic import BaseModel
|
9
9
|
|
10
|
+
from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
|
10
11
|
from .models import (
|
12
|
+
ChunkSource,
|
13
|
+
CompletionResponse, # Prompt override models
|
11
14
|
Document,
|
12
15
|
DocumentResult,
|
13
|
-
CompletionResponse,
|
14
|
-
IngestTextRequest,
|
15
|
-
ChunkSource,
|
16
|
-
Graph,
|
17
16
|
FolderInfo,
|
18
|
-
|
17
|
+
Graph,
|
19
18
|
GraphPromptOverrides,
|
19
|
+
IngestTextRequest,
|
20
20
|
QueryPromptOverrides,
|
21
21
|
)
|
22
|
-
from .rules import Rule
|
23
|
-
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
24
22
|
|
25
23
|
logger = logging.getLogger(__name__)
|
26
24
|
|
@@ -69,16 +67,16 @@ class AsyncFolder:
|
|
69
67
|
def name(self) -> str:
|
70
68
|
"""Returns the folder name."""
|
71
69
|
return self._name
|
72
|
-
|
70
|
+
|
73
71
|
@property
|
74
72
|
def id(self) -> Optional[str]:
|
75
73
|
"""Returns the folder ID if available."""
|
76
74
|
return self._id
|
77
|
-
|
75
|
+
|
78
76
|
async def get_info(self) -> Dict[str, Any]:
|
79
77
|
"""
|
80
78
|
Get detailed information about this folder.
|
81
|
-
|
79
|
+
|
82
80
|
Returns:
|
83
81
|
Dict[str, Any]: Detailed folder information
|
84
82
|
"""
|
@@ -91,9 +89,8 @@ class AsyncFolder:
|
|
91
89
|
break
|
92
90
|
if not self._id:
|
93
91
|
raise ValueError(f"Folder '{self._name}' not found")
|
94
|
-
|
92
|
+
|
95
93
|
return await self._client._request("GET", f"folders/{self._id}")
|
96
|
-
|
97
94
|
|
98
95
|
def signin(self, end_user_id: str) -> "AsyncUserScope":
|
99
96
|
"""
|
@@ -166,9 +163,7 @@ class AsyncFolder:
|
|
166
163
|
files = {"file": (filename, file_obj)}
|
167
164
|
|
168
165
|
# Create form data
|
169
|
-
form_data = self._client._logic._prepare_ingest_file_form_data(
|
170
|
-
metadata, rules, self._name, None
|
171
|
-
)
|
166
|
+
form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
|
172
167
|
|
173
168
|
response = await self._client._request(
|
174
169
|
"POST",
|
@@ -216,9 +211,9 @@ class AsyncFolder:
|
|
216
211
|
)
|
217
212
|
|
218
213
|
response = await self._client._request(
|
219
|
-
"POST",
|
220
|
-
"ingest/files",
|
221
|
-
data=data,
|
214
|
+
"POST",
|
215
|
+
"ingest/files",
|
216
|
+
data=data,
|
222
217
|
files=file_objects,
|
223
218
|
params={"use_colpali": str(use_colpali).lower()},
|
224
219
|
)
|
@@ -228,9 +223,7 @@ class AsyncFolder:
|
|
228
223
|
for error in response["errors"]:
|
229
224
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
230
225
|
|
231
|
-
docs = [
|
232
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
233
|
-
]
|
226
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
234
227
|
for doc in docs:
|
235
228
|
doc._client = self._client
|
236
229
|
return docs
|
@@ -293,6 +286,7 @@ class AsyncFolder:
|
|
293
286
|
k: int = 4,
|
294
287
|
min_score: float = 0.0,
|
295
288
|
use_colpali: bool = True,
|
289
|
+
additional_folders: Optional[List[str]] = None,
|
296
290
|
) -> List[FinalChunkResult]:
|
297
291
|
"""
|
298
292
|
Retrieve relevant chunks within this folder.
|
@@ -303,12 +297,14 @@ class AsyncFolder:
|
|
303
297
|
k: Number of results (default: 4)
|
304
298
|
min_score: Minimum similarity threshold (default: 0.0)
|
305
299
|
use_colpali: Whether to use ColPali-style embedding model
|
300
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
306
301
|
|
307
302
|
Returns:
|
308
303
|
List[FinalChunkResult]: List of relevant chunks
|
309
304
|
"""
|
305
|
+
effective_folder = self._merge_folders(additional_folders)
|
310
306
|
payload = self._client._logic._prepare_retrieve_chunks_request(
|
311
|
-
query, filters, k, min_score, use_colpali,
|
307
|
+
query, filters, k, min_score, use_colpali, effective_folder, None
|
312
308
|
)
|
313
309
|
response = await self._client._request("POST", "retrieve/chunks", data=payload)
|
314
310
|
return self._client._logic._parse_chunk_result_list_response(response)
|
@@ -320,6 +316,7 @@ class AsyncFolder:
|
|
320
316
|
k: int = 4,
|
321
317
|
min_score: float = 0.0,
|
322
318
|
use_colpali: bool = True,
|
319
|
+
additional_folders: Optional[List[str]] = None,
|
323
320
|
) -> List[DocumentResult]:
|
324
321
|
"""
|
325
322
|
Retrieve relevant documents within this folder.
|
@@ -330,12 +327,14 @@ class AsyncFolder:
|
|
330
327
|
k: Number of results (default: 4)
|
331
328
|
min_score: Minimum similarity threshold (default: 0.0)
|
332
329
|
use_colpali: Whether to use ColPali-style embedding model
|
330
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
333
331
|
|
334
332
|
Returns:
|
335
333
|
List[DocumentResult]: List of relevant documents
|
336
334
|
"""
|
335
|
+
effective_folder = self._merge_folders(additional_folders)
|
337
336
|
payload = self._client._logic._prepare_retrieve_docs_request(
|
338
|
-
query, filters, k, min_score, use_colpali,
|
337
|
+
query, filters, k, min_score, use_colpali, effective_folder, None
|
339
338
|
)
|
340
339
|
response = await self._client._request("POST", "retrieve/docs", data=payload)
|
341
340
|
return self._client._logic._parse_document_result_list_response(response)
|
@@ -353,6 +352,8 @@ class AsyncFolder:
|
|
353
352
|
hop_depth: int = 1,
|
354
353
|
include_paths: bool = False,
|
355
354
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
355
|
+
additional_folders: Optional[List[str]] = None,
|
356
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
356
357
|
) -> CompletionResponse:
|
357
358
|
"""
|
358
359
|
Generate completion using relevant chunks as context within this folder.
|
@@ -369,10 +370,13 @@ class AsyncFolder:
|
|
369
370
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
370
371
|
include_paths: Whether to include relationship paths in the response
|
371
372
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
373
|
+
schema: Optional schema for structured output
|
374
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
372
375
|
|
373
376
|
Returns:
|
374
|
-
CompletionResponse: Generated completion
|
377
|
+
CompletionResponse: Generated completion or structured output
|
375
378
|
"""
|
379
|
+
effective_folder = self._merge_folders(additional_folders)
|
376
380
|
payload = self._client._logic._prepare_query_request(
|
377
381
|
query,
|
378
382
|
filters,
|
@@ -385,14 +389,31 @@ class AsyncFolder:
|
|
385
389
|
hop_depth,
|
386
390
|
include_paths,
|
387
391
|
prompt_overrides,
|
388
|
-
|
392
|
+
effective_folder,
|
389
393
|
None,
|
394
|
+
schema,
|
390
395
|
)
|
396
|
+
|
397
|
+
# Add schema to payload if provided
|
398
|
+
if schema:
|
399
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
400
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
401
|
+
payload["schema"] = schema.model_json_schema()
|
402
|
+
else:
|
403
|
+
payload["schema"] = schema
|
404
|
+
|
405
|
+
# Add a hint to the query to return in JSON format
|
406
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
407
|
+
|
391
408
|
response = await self._client._request("POST", "query", data=payload)
|
392
409
|
return self._client._logic._parse_completion_response(response)
|
393
410
|
|
394
411
|
async def list_documents(
|
395
|
-
self,
|
412
|
+
self,
|
413
|
+
skip: int = 0,
|
414
|
+
limit: int = 100,
|
415
|
+
filters: Optional[Dict[str, Any]] = None,
|
416
|
+
additional_folders: Optional[List[str]] = None,
|
396
417
|
) -> List[Document]:
|
397
418
|
"""
|
398
419
|
List accessible documents within this folder.
|
@@ -401,33 +422,34 @@ class AsyncFolder:
|
|
401
422
|
skip: Number of documents to skip
|
402
423
|
limit: Maximum number of documents to return
|
403
424
|
filters: Optional filters
|
425
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
404
426
|
|
405
427
|
Returns:
|
406
428
|
List[Document]: List of documents
|
407
429
|
"""
|
408
|
-
|
409
|
-
|
410
|
-
)
|
430
|
+
effective_folder = self._merge_folders(additional_folders)
|
431
|
+
params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, effective_folder, None)
|
411
432
|
response = await self._client._request("POST", "documents", data=data, params=params)
|
412
433
|
docs = self._client._logic._parse_document_list_response(response)
|
413
434
|
for doc in docs:
|
414
435
|
doc._client = self._client
|
415
436
|
return docs
|
416
437
|
|
417
|
-
async def batch_get_documents(
|
438
|
+
async def batch_get_documents(
|
439
|
+
self, document_ids: List[str], additional_folders: Optional[List[str]] = None
|
440
|
+
) -> List[Document]:
|
418
441
|
"""
|
419
442
|
Retrieve multiple documents by their IDs in a single batch operation within this folder.
|
420
443
|
|
421
444
|
Args:
|
422
445
|
document_ids: List of document IDs to retrieve
|
446
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
423
447
|
|
424
448
|
Returns:
|
425
449
|
List[Document]: List of document metadata for found documents
|
426
450
|
"""
|
427
|
-
|
428
|
-
request = {"document_ids": document_ids}
|
429
|
-
if self._name:
|
430
|
-
request["folder_name"] = self._name
|
451
|
+
merged = self._merge_folders(additional_folders)
|
452
|
+
request = {"document_ids": document_ids, "folder_name": merged}
|
431
453
|
response = await self._client._request("POST", "batch/documents", data=request)
|
432
454
|
docs = self._client._logic._parse_document_list_response(response)
|
433
455
|
for doc in docs:
|
@@ -435,18 +457,22 @@ class AsyncFolder:
|
|
435
457
|
return docs
|
436
458
|
|
437
459
|
async def batch_get_chunks(
|
438
|
-
self,
|
460
|
+
self,
|
461
|
+
sources: List[Union[ChunkSource, Dict[str, Any]]],
|
462
|
+
additional_folders: Optional[List[str]] = None,
|
439
463
|
) -> List[FinalChunkResult]:
|
440
464
|
"""
|
441
465
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
|
442
466
|
|
443
467
|
Args:
|
444
468
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
469
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
445
470
|
|
446
471
|
Returns:
|
447
472
|
List[FinalChunkResult]: List of chunk results
|
448
473
|
"""
|
449
|
-
|
474
|
+
merged = self._merge_folders(additional_folders)
|
475
|
+
request = self._client._logic._prepare_batch_get_chunks_request(sources, merged, None)
|
450
476
|
response = await self._client._request("POST", "batch/chunks", data=request)
|
451
477
|
return self._client._logic._parse_chunk_result_list_response(response)
|
452
478
|
|
@@ -473,7 +499,9 @@ class AsyncFolder:
|
|
473
499
|
name, filters, documents, prompt_overrides, self._name, None
|
474
500
|
)
|
475
501
|
response = await self._client._request("POST", "graph/create", data=request)
|
476
|
-
|
502
|
+
graph = self._logic._parse_graph_response(response)
|
503
|
+
graph._client = self # Attach AsyncMorphik client for polling helpers
|
504
|
+
return graph
|
477
505
|
|
478
506
|
async def update_graph(
|
479
507
|
self,
|
@@ -498,7 +526,9 @@ class AsyncFolder:
|
|
498
526
|
name, additional_filters, additional_documents, prompt_overrides, self._name, None
|
499
527
|
)
|
500
528
|
response = await self._client._request("POST", f"graph/{name}/update", data=request)
|
501
|
-
|
529
|
+
graph = self._logic._parse_graph_response(response)
|
530
|
+
graph._client = self
|
531
|
+
return graph
|
502
532
|
|
503
533
|
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
504
534
|
"""
|
@@ -510,9 +540,6 @@ class AsyncFolder:
|
|
510
540
|
Returns:
|
511
541
|
Dict[str, str]: Deletion status
|
512
542
|
"""
|
513
|
-
# Get the document by filename with folder scope
|
514
|
-
request = {"filename": filename, "folder_name": self._name}
|
515
|
-
|
516
543
|
# First get the document ID
|
517
544
|
response = await self._client._request(
|
518
545
|
"GET", f"documents/filename/{filename}", params={"folder_name": self._name}
|
@@ -522,6 +549,18 @@ class AsyncFolder:
|
|
522
549
|
# Then delete by ID
|
523
550
|
return await self._client.delete_document(doc.external_id)
|
524
551
|
|
552
|
+
# Helper --------------------------------------------------------------
|
553
|
+
def _merge_folders(self, additional_folders: Optional[List[str]] = None) -> Union[str, List[str]]:
|
554
|
+
"""Return the effective folder scope for this folder instance.
|
555
|
+
|
556
|
+
If *additional_folders* is provided it will be combined with the scoped
|
557
|
+
folder (*self._name*) and returned as a list. Otherwise just
|
558
|
+
*self._name* is returned so the API keeps backward-compatibility with
|
559
|
+
accepting a single string."""
|
560
|
+
if not additional_folders:
|
561
|
+
return self._name
|
562
|
+
return [self._name] + additional_folders
|
563
|
+
|
525
564
|
|
526
565
|
class AsyncUserScope:
|
527
566
|
"""
|
@@ -685,9 +724,7 @@ class AsyncUserScope:
|
|
685
724
|
if rules:
|
686
725
|
if all(isinstance(r, list) for r in rules):
|
687
726
|
# List of lists - per-file rules
|
688
|
-
converted_rules = [
|
689
|
-
[self._client._convert_rule(r) for r in rule_list] for rule_list in rules
|
690
|
-
]
|
727
|
+
converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
|
691
728
|
else:
|
692
729
|
# Flat list - shared rules for all files
|
693
730
|
converted_rules = [self._client._convert_rule(r) for r in rules]
|
@@ -707,9 +744,9 @@ class AsyncUserScope:
|
|
707
744
|
data["folder_name"] = self._folder_name
|
708
745
|
|
709
746
|
response = await self._client._request(
|
710
|
-
"POST",
|
711
|
-
"ingest/files",
|
712
|
-
data=data,
|
747
|
+
"POST",
|
748
|
+
"ingest/files",
|
749
|
+
data=data,
|
713
750
|
files=file_objects,
|
714
751
|
params={"use_colpali": str(use_colpali).lower()},
|
715
752
|
)
|
@@ -719,9 +756,7 @@ class AsyncUserScope:
|
|
719
756
|
for error in response["errors"]:
|
720
757
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
721
758
|
|
722
|
-
docs = [
|
723
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
724
|
-
]
|
759
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
725
760
|
for doc in docs:
|
726
761
|
doc._client = self._client
|
727
762
|
return docs
|
@@ -784,6 +819,7 @@ class AsyncUserScope:
|
|
784
819
|
k: int = 4,
|
785
820
|
min_score: float = 0.0,
|
786
821
|
use_colpali: bool = True,
|
822
|
+
additional_folders: Optional[List[str]] = None,
|
787
823
|
) -> List[FinalChunkResult]:
|
788
824
|
"""
|
789
825
|
Retrieve relevant chunks as this end user.
|
@@ -794,12 +830,14 @@ class AsyncUserScope:
|
|
794
830
|
k: Number of results (default: 4)
|
795
831
|
min_score: Minimum similarity threshold (default: 0.0)
|
796
832
|
use_colpali: Whether to use ColPali-style embedding model
|
833
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
797
834
|
|
798
835
|
Returns:
|
799
836
|
List[FinalChunkResult]: List of relevant chunks
|
800
837
|
"""
|
838
|
+
effective_folder = self._merge_folders(additional_folders)
|
801
839
|
payload = self._client._logic._prepare_retrieve_chunks_request(
|
802
|
-
query, filters, k, min_score, use_colpali,
|
840
|
+
query, filters, k, min_score, use_colpali, effective_folder, self._end_user_id
|
803
841
|
)
|
804
842
|
response = await self._client._request("POST", "retrieve/chunks", data=payload)
|
805
843
|
return self._client._logic._parse_chunk_result_list_response(response)
|
@@ -811,6 +849,7 @@ class AsyncUserScope:
|
|
811
849
|
k: int = 4,
|
812
850
|
min_score: float = 0.0,
|
813
851
|
use_colpali: bool = True,
|
852
|
+
additional_folders: Optional[List[str]] = None,
|
814
853
|
) -> List[DocumentResult]:
|
815
854
|
"""
|
816
855
|
Retrieve relevant documents as this end user.
|
@@ -821,12 +860,14 @@ class AsyncUserScope:
|
|
821
860
|
k: Number of results (default: 4)
|
822
861
|
min_score: Minimum similarity threshold (default: 0.0)
|
823
862
|
use_colpali: Whether to use ColPali-style embedding model
|
863
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
824
864
|
|
825
865
|
Returns:
|
826
866
|
List[DocumentResult]: List of relevant documents
|
827
867
|
"""
|
868
|
+
effective_folder = self._merge_folders(additional_folders)
|
828
869
|
payload = self._client._logic._prepare_retrieve_docs_request(
|
829
|
-
query, filters, k, min_score, use_colpali,
|
870
|
+
query, filters, k, min_score, use_colpali, effective_folder, self._end_user_id
|
830
871
|
)
|
831
872
|
response = await self._client._request("POST", "retrieve/docs", data=payload)
|
832
873
|
return self._client._logic._parse_document_result_list_response(response)
|
@@ -844,9 +885,11 @@ class AsyncUserScope:
|
|
844
885
|
hop_depth: int = 1,
|
845
886
|
include_paths: bool = False,
|
846
887
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
888
|
+
additional_folders: Optional[List[str]] = None,
|
889
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
847
890
|
) -> CompletionResponse:
|
848
891
|
"""
|
849
|
-
Generate completion using relevant chunks as context
|
892
|
+
Generate completion using relevant chunks as context, scoped to the end user.
|
850
893
|
|
851
894
|
Args:
|
852
895
|
query: Query text
|
@@ -860,10 +903,13 @@ class AsyncUserScope:
|
|
860
903
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
861
904
|
include_paths: Whether to include relationship paths in the response
|
862
905
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
906
|
+
schema: Optional schema for structured output
|
907
|
+
additional_folders: Optional list of additional folder names to further scope operations
|
863
908
|
|
864
909
|
Returns:
|
865
|
-
CompletionResponse: Generated completion
|
910
|
+
CompletionResponse: Generated completion or structured output
|
866
911
|
"""
|
912
|
+
effective_folder = self._merge_folders(additional_folders)
|
867
913
|
payload = self._client._logic._prepare_query_request(
|
868
914
|
query,
|
869
915
|
filters,
|
@@ -876,14 +922,31 @@ class AsyncUserScope:
|
|
876
922
|
hop_depth,
|
877
923
|
include_paths,
|
878
924
|
prompt_overrides,
|
879
|
-
|
925
|
+
effective_folder,
|
880
926
|
self._end_user_id,
|
927
|
+
schema,
|
881
928
|
)
|
929
|
+
|
930
|
+
# Add schema to payload if provided
|
931
|
+
if schema:
|
932
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
933
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
934
|
+
payload["schema"] = schema.model_json_schema()
|
935
|
+
else:
|
936
|
+
payload["schema"] = schema
|
937
|
+
|
938
|
+
# Add a hint to the query to return in JSON format
|
939
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
940
|
+
|
882
941
|
response = await self._client._request("POST", "query", data=payload)
|
883
942
|
return self._client._logic._parse_completion_response(response)
|
884
943
|
|
885
944
|
async def list_documents(
|
886
|
-
self,
|
945
|
+
self,
|
946
|
+
skip: int = 0,
|
947
|
+
limit: int = 100,
|
948
|
+
filters: Optional[Dict[str, Any]] = None,
|
949
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
887
950
|
) -> List[Document]:
|
888
951
|
"""
|
889
952
|
List accessible documents for this end user.
|
@@ -892,12 +955,13 @@ class AsyncUserScope:
|
|
892
955
|
skip: Number of documents to skip
|
893
956
|
limit: Maximum number of documents to return
|
894
957
|
filters: Optional filters
|
958
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
895
959
|
|
896
960
|
Returns:
|
897
961
|
List[Document]: List of documents
|
898
962
|
"""
|
899
963
|
params, data = self._client._logic._prepare_list_documents_request(
|
900
|
-
skip, limit, filters,
|
964
|
+
skip, limit, filters, folder_name, self._end_user_id
|
901
965
|
)
|
902
966
|
response = await self._client._request("POST", "documents", data=data, params=params)
|
903
967
|
docs = self._client._logic._parse_document_list_response(response)
|
@@ -905,12 +969,15 @@ class AsyncUserScope:
|
|
905
969
|
doc._client = self._client
|
906
970
|
return docs
|
907
971
|
|
908
|
-
async def batch_get_documents(
|
972
|
+
async def batch_get_documents(
|
973
|
+
self, document_ids: List[str], folder_name: Optional[Union[str, List[str]]] = None
|
974
|
+
) -> List[Document]:
|
909
975
|
"""
|
910
976
|
Retrieve multiple documents by their IDs in a single batch operation for this end user.
|
911
977
|
|
912
978
|
Args:
|
913
979
|
document_ids: List of document IDs to retrieve
|
980
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
914
981
|
|
915
982
|
Returns:
|
916
983
|
List[Document]: List of document metadata for found documents
|
@@ -928,20 +995,21 @@ class AsyncUserScope:
|
|
928
995
|
return docs
|
929
996
|
|
930
997
|
async def batch_get_chunks(
|
931
|
-
self,
|
998
|
+
self,
|
999
|
+
sources: List[Union[ChunkSource, Dict[str, Any]]],
|
1000
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
932
1001
|
) -> List[FinalChunkResult]:
|
933
1002
|
"""
|
934
1003
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
|
935
1004
|
|
936
1005
|
Args:
|
937
1006
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
1007
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
938
1008
|
|
939
1009
|
Returns:
|
940
1010
|
List[FinalChunkResult]: List of chunk results
|
941
1011
|
"""
|
942
|
-
request = self._client._logic._prepare_batch_get_chunks_request(
|
943
|
-
sources, self._folder_name, self._end_user_id
|
944
|
-
)
|
1012
|
+
request = self._client._logic._prepare_batch_get_chunks_request(sources, self._folder_name, self._end_user_id)
|
945
1013
|
response = await self._client._request("POST", "batch/chunks", data=request)
|
946
1014
|
return self._client._logic._parse_chunk_result_list_response(response)
|
947
1015
|
|
@@ -968,7 +1036,9 @@ class AsyncUserScope:
|
|
968
1036
|
name, filters, documents, prompt_overrides, self._folder_name, self._end_user_id
|
969
1037
|
)
|
970
1038
|
response = await self._client._request("POST", "graph/create", data=request)
|
971
|
-
|
1039
|
+
graph = self._logic._parse_graph_response(response)
|
1040
|
+
graph._client = self
|
1041
|
+
return graph
|
972
1042
|
|
973
1043
|
async def update_graph(
|
974
1044
|
self,
|
@@ -998,7 +1068,9 @@ class AsyncUserScope:
|
|
998
1068
|
self._end_user_id,
|
999
1069
|
)
|
1000
1070
|
response = await self._client._request("POST", f"graph/{name}/update", data=request)
|
1001
|
-
|
1071
|
+
graph = self._logic._parse_graph_response(response)
|
1072
|
+
graph._client = self
|
1073
|
+
return graph
|
1002
1074
|
|
1003
1075
|
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
1004
1076
|
"""
|
@@ -1018,9 +1090,7 @@ class AsyncUserScope:
|
|
1018
1090
|
params["folder_name"] = self._folder_name
|
1019
1091
|
|
1020
1092
|
# First get the document ID
|
1021
|
-
response = await self._client._request(
|
1022
|
-
"GET", f"documents/filename/{filename}", params=params
|
1023
|
-
)
|
1093
|
+
response = await self._client._request("GET", f"documents/filename/{filename}", params=params)
|
1024
1094
|
doc = self._client._logic._parse_document_response(response)
|
1025
1095
|
|
1026
1096
|
# Then delete by ID
|
@@ -1077,7 +1147,7 @@ class AsyncMorphik:
|
|
1077
1147
|
# Remove Content-Type if it exists - httpx will set the correct multipart boundary
|
1078
1148
|
if "Content-Type" in headers:
|
1079
1149
|
del headers["Content-Type"]
|
1080
|
-
|
1150
|
+
|
1081
1151
|
# For file uploads with form data, use form data (not json)
|
1082
1152
|
request_data = {"files": files}
|
1083
1153
|
if data:
|
@@ -1112,18 +1182,16 @@ class AsyncMorphik:
|
|
1112
1182
|
Returns:
|
1113
1183
|
AsyncFolder: A folder object ready for scoped operations
|
1114
1184
|
"""
|
1115
|
-
payload = {
|
1116
|
-
"name": name
|
1117
|
-
}
|
1185
|
+
payload = {"name": name}
|
1118
1186
|
if description:
|
1119
1187
|
payload["description"] = description
|
1120
|
-
|
1188
|
+
|
1121
1189
|
response = await self._request("POST", "folders", data=payload)
|
1122
1190
|
folder_info = FolderInfo(**response)
|
1123
|
-
|
1191
|
+
|
1124
1192
|
# Return a usable AsyncFolder object with the ID from the response
|
1125
1193
|
return AsyncFolder(self, name, folder_id=folder_info.id)
|
1126
|
-
|
1194
|
+
|
1127
1195
|
def get_folder_by_name(self, name: str) -> AsyncFolder:
|
1128
1196
|
"""
|
1129
1197
|
Get a folder by name to scope operations.
|
@@ -1135,7 +1203,7 @@ class AsyncMorphik:
|
|
1135
1203
|
AsyncFolder: A folder object for scoped operations
|
1136
1204
|
"""
|
1137
1205
|
return AsyncFolder(self, name)
|
1138
|
-
|
1206
|
+
|
1139
1207
|
async def get_folder(self, folder_id: str) -> AsyncFolder:
|
1140
1208
|
"""
|
1141
1209
|
Get a folder by ID.
|
@@ -1148,7 +1216,7 @@ class AsyncMorphik:
|
|
1148
1216
|
"""
|
1149
1217
|
response = await self._request("GET", f"folders/{folder_id}")
|
1150
1218
|
return AsyncFolder(self, response["name"], folder_id)
|
1151
|
-
|
1219
|
+
|
1152
1220
|
async def list_folders(self) -> List[AsyncFolder]:
|
1153
1221
|
"""
|
1154
1222
|
List all folders the user has access to as AsyncFolder objects.
|
@@ -1158,7 +1226,7 @@ class AsyncMorphik:
|
|
1158
1226
|
"""
|
1159
1227
|
response = await self._request("GET", "folders")
|
1160
1228
|
return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
|
1161
|
-
|
1229
|
+
|
1162
1230
|
async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1163
1231
|
"""
|
1164
1232
|
Add a document to a folder.
|
@@ -1172,7 +1240,7 @@ class AsyncMorphik:
|
|
1172
1240
|
"""
|
1173
1241
|
response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
|
1174
1242
|
return response
|
1175
|
-
|
1243
|
+
|
1176
1244
|
async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1177
1245
|
"""
|
1178
1246
|
Remove a document from a folder.
|
@@ -1216,7 +1284,8 @@ class AsyncMorphik:
|
|
1216
1284
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1217
1285
|
- MetadataExtractionRule: Extract metadata using a schema
|
1218
1286
|
- NaturalLanguageRule: Transform content using natural language
|
1219
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1287
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1288
|
+
(slower, but significantly better retrieval accuracy for text and images)
|
1220
1289
|
Returns:
|
1221
1290
|
Document: Metadata of the ingested document
|
1222
1291
|
|
@@ -1314,14 +1383,12 @@ class AsyncMorphik:
|
|
1314
1383
|
|
1315
1384
|
try:
|
1316
1385
|
# Prepare form data
|
1317
|
-
data = self._logic._prepare_ingest_files_form_data(
|
1318
|
-
metadata, rules, use_colpali, parallel, None, None
|
1319
|
-
)
|
1386
|
+
data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
|
1320
1387
|
|
1321
1388
|
response = await self._request(
|
1322
|
-
"POST",
|
1323
|
-
"ingest/files",
|
1324
|
-
data=data,
|
1389
|
+
"POST",
|
1390
|
+
"ingest/files",
|
1391
|
+
data=data,
|
1325
1392
|
files=file_objects,
|
1326
1393
|
params={"use_colpali": str(use_colpali).lower()},
|
1327
1394
|
)
|
@@ -1398,6 +1465,7 @@ class AsyncMorphik:
|
|
1398
1465
|
k: int = 4,
|
1399
1466
|
min_score: float = 0.0,
|
1400
1467
|
use_colpali: bool = True,
|
1468
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1401
1469
|
) -> List[FinalChunkResult]:
|
1402
1470
|
"""
|
1403
1471
|
Search for relevant chunks.
|
@@ -1407,7 +1475,8 @@ class AsyncMorphik:
|
|
1407
1475
|
filters: Optional metadata filters
|
1408
1476
|
k: Number of results (default: 4)
|
1409
1477
|
min_score: Minimum similarity threshold (default: 0.0)
|
1410
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
|
1478
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
|
1479
|
+
(only works for documents ingested with `use_colpali=True`)
|
1411
1480
|
Returns:
|
1412
1481
|
List[FinalChunkResult]
|
1413
1482
|
|
@@ -1419,8 +1488,9 @@ class AsyncMorphik:
|
|
1419
1488
|
)
|
1420
1489
|
```
|
1421
1490
|
"""
|
1491
|
+
effective_folder = folder_name if folder_name is not None else None
|
1422
1492
|
payload = self._logic._prepare_retrieve_chunks_request(
|
1423
|
-
query, filters, k, min_score, use_colpali,
|
1493
|
+
query, filters, k, min_score, use_colpali, effective_folder, None
|
1424
1494
|
)
|
1425
1495
|
response = await self._request("POST", "retrieve/chunks", data=payload)
|
1426
1496
|
return self._logic._parse_chunk_result_list_response(response)
|
@@ -1432,6 +1502,7 @@ class AsyncMorphik:
|
|
1432
1502
|
k: int = 4,
|
1433
1503
|
min_score: float = 0.0,
|
1434
1504
|
use_colpali: bool = True,
|
1505
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1435
1506
|
) -> List[DocumentResult]:
|
1436
1507
|
"""
|
1437
1508
|
Retrieve relevant documents.
|
@@ -1441,7 +1512,8 @@ class AsyncMorphik:
|
|
1441
1512
|
filters: Optional metadata filters
|
1442
1513
|
k: Number of results (default: 4)
|
1443
1514
|
min_score: Minimum similarity threshold (default: 0.0)
|
1444
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve documents
|
1515
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve documents
|
1516
|
+
(only works for documents ingested with `use_colpali=True`)
|
1445
1517
|
Returns:
|
1446
1518
|
List[DocumentResult]
|
1447
1519
|
|
@@ -1453,8 +1525,9 @@ class AsyncMorphik:
|
|
1453
1525
|
)
|
1454
1526
|
```
|
1455
1527
|
"""
|
1528
|
+
effective_folder = folder_name if folder_name is not None else None
|
1456
1529
|
payload = self._logic._prepare_retrieve_docs_request(
|
1457
|
-
query, filters, k, min_score, use_colpali,
|
1530
|
+
query, filters, k, min_score, use_colpali, effective_folder, None
|
1458
1531
|
)
|
1459
1532
|
response = await self._request("POST", "retrieve/docs", data=payload)
|
1460
1533
|
return self._logic._parse_document_result_list_response(response)
|
@@ -1472,6 +1545,8 @@ class AsyncMorphik:
|
|
1472
1545
|
hop_depth: int = 1,
|
1473
1546
|
include_paths: bool = False,
|
1474
1547
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
1548
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1549
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
1475
1550
|
) -> CompletionResponse:
|
1476
1551
|
"""
|
1477
1552
|
Generate completion using relevant chunks as context.
|
@@ -1483,12 +1558,14 @@ class AsyncMorphik:
|
|
1483
1558
|
min_score: Minimum similarity threshold (default: 0.0)
|
1484
1559
|
max_tokens: Maximum tokens in completion
|
1485
1560
|
temperature: Model temperature
|
1486
|
-
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1561
|
+
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1562
|
+
(only works for documents ingested with `use_colpali=True`)
|
1487
1563
|
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
1488
1564
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
1489
1565
|
include_paths: Whether to include relationship paths in the response
|
1490
1566
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
1491
1567
|
Either a QueryPromptOverrides object or a dictionary with the same structure
|
1568
|
+
schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
|
1492
1569
|
Returns:
|
1493
1570
|
CompletionResponse
|
1494
1571
|
|
@@ -1536,8 +1613,30 @@ class AsyncMorphik:
|
|
1536
1613
|
if response.metadata and "graph" in response.metadata:
|
1537
1614
|
for path in response.metadata["graph"]["paths"]:
|
1538
1615
|
print(" -> ".join(path))
|
1616
|
+
|
1617
|
+
# Using structured output with a Pydantic model
|
1618
|
+
from pydantic import BaseModel
|
1619
|
+
|
1620
|
+
class ResearchFindings(BaseModel):
|
1621
|
+
main_finding: str
|
1622
|
+
supporting_evidence: List[str]
|
1623
|
+
limitations: List[str]
|
1624
|
+
|
1625
|
+
response = await db.query(
|
1626
|
+
"Summarize the key research findings from these documents",
|
1627
|
+
schema=ResearchFindings
|
1628
|
+
)
|
1629
|
+
|
1630
|
+
# Access structured output
|
1631
|
+
if response.structured_output:
|
1632
|
+
findings = response.structured_output
|
1633
|
+
print(f"Main finding: {findings.main_finding}")
|
1634
|
+
print("Supporting evidence:")
|
1635
|
+
for evidence in findings.supporting_evidence:
|
1636
|
+
print(f"- {evidence}")
|
1539
1637
|
```
|
1540
1638
|
"""
|
1639
|
+
effective_folder = folder_name if folder_name is not None else None
|
1541
1640
|
payload = self._logic._prepare_query_request(
|
1542
1641
|
query,
|
1543
1642
|
filters,
|
@@ -1550,14 +1649,31 @@ class AsyncMorphik:
|
|
1550
1649
|
hop_depth,
|
1551
1650
|
include_paths,
|
1552
1651
|
prompt_overrides,
|
1652
|
+
effective_folder,
|
1553
1653
|
None,
|
1554
|
-
|
1654
|
+
schema,
|
1555
1655
|
)
|
1656
|
+
|
1657
|
+
# Add schema to payload if provided
|
1658
|
+
if schema:
|
1659
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
1660
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
1661
|
+
payload["schema"] = schema.model_json_schema()
|
1662
|
+
else:
|
1663
|
+
payload["schema"] = schema
|
1664
|
+
|
1665
|
+
# Add a hint to the query to return in JSON format
|
1666
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
1667
|
+
|
1556
1668
|
response = await self._request("POST", "query", data=payload)
|
1557
1669
|
return self._logic._parse_completion_response(response)
|
1558
1670
|
|
1559
1671
|
async def list_documents(
|
1560
|
-
self,
|
1672
|
+
self,
|
1673
|
+
skip: int = 0,
|
1674
|
+
limit: int = 100,
|
1675
|
+
filters: Optional[Dict[str, Any]] = None,
|
1676
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1561
1677
|
) -> List[Document]:
|
1562
1678
|
"""
|
1563
1679
|
List accessible documents.
|
@@ -1566,6 +1682,7 @@ class AsyncMorphik:
|
|
1566
1682
|
skip: Number of documents to skip
|
1567
1683
|
limit: Maximum number of documents to return
|
1568
1684
|
filters: Optional filters
|
1685
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
1569
1686
|
|
1570
1687
|
Returns:
|
1571
1688
|
List[Document]: List of accessible documents
|
@@ -1579,7 +1696,7 @@ class AsyncMorphik:
|
|
1579
1696
|
next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
|
1580
1697
|
```
|
1581
1698
|
"""
|
1582
|
-
params, data = self._logic._prepare_list_documents_request(skip, limit, filters,
|
1699
|
+
params, data = self._logic._prepare_list_documents_request(skip, limit, filters, folder_name, None)
|
1583
1700
|
response = await self._request("POST", "documents", data=data, params=params)
|
1584
1701
|
docs = self._logic._parse_document_list_response(response)
|
1585
1702
|
for doc in docs:
|
@@ -1606,17 +1723,17 @@ class AsyncMorphik:
|
|
1606
1723
|
doc = self._logic._parse_document_response(response)
|
1607
1724
|
doc._client = self
|
1608
1725
|
return doc
|
1609
|
-
|
1726
|
+
|
1610
1727
|
async def get_document_status(self, document_id: str) -> Dict[str, Any]:
|
1611
1728
|
"""
|
1612
1729
|
Get the current processing status of a document.
|
1613
|
-
|
1730
|
+
|
1614
1731
|
Args:
|
1615
1732
|
document_id: ID of the document to check
|
1616
|
-
|
1733
|
+
|
1617
1734
|
Returns:
|
1618
1735
|
Dict[str, Any]: Status information including current status, potential errors, and other metadata
|
1619
|
-
|
1736
|
+
|
1620
1737
|
Example:
|
1621
1738
|
```python
|
1622
1739
|
status = await db.get_document_status("doc_123")
|
@@ -1630,23 +1747,25 @@ class AsyncMorphik:
|
|
1630
1747
|
"""
|
1631
1748
|
response = await self._request("GET", f"documents/{document_id}/status")
|
1632
1749
|
return response
|
1633
|
-
|
1634
|
-
async def wait_for_document_completion(
|
1750
|
+
|
1751
|
+
async def wait_for_document_completion(
|
1752
|
+
self, document_id: str, timeout_seconds=300, check_interval_seconds=2
|
1753
|
+
) -> Document:
|
1635
1754
|
"""
|
1636
1755
|
Wait for a document's processing to complete.
|
1637
|
-
|
1756
|
+
|
1638
1757
|
Args:
|
1639
1758
|
document_id: ID of the document to wait for
|
1640
1759
|
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
1641
1760
|
check_interval_seconds: Time between status checks (default: 2 seconds)
|
1642
|
-
|
1761
|
+
|
1643
1762
|
Returns:
|
1644
1763
|
Document: Updated document with the latest status
|
1645
|
-
|
1764
|
+
|
1646
1765
|
Raises:
|
1647
1766
|
TimeoutError: If processing doesn't complete within the timeout period
|
1648
1767
|
ValueError: If processing fails with an error
|
1649
|
-
|
1768
|
+
|
1650
1769
|
Example:
|
1651
1770
|
```python
|
1652
1771
|
# Upload a file and wait for processing to complete
|
@@ -1661,20 +1780,21 @@ class AsyncMorphik:
|
|
1661
1780
|
```
|
1662
1781
|
"""
|
1663
1782
|
import asyncio
|
1783
|
+
|
1664
1784
|
start_time = asyncio.get_event_loop().time()
|
1665
|
-
|
1785
|
+
|
1666
1786
|
while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
|
1667
1787
|
status = await self.get_document_status(document_id)
|
1668
|
-
|
1788
|
+
|
1669
1789
|
if status["status"] == "completed":
|
1670
1790
|
# Get the full document now that it's complete
|
1671
1791
|
return await self.get_document(document_id)
|
1672
1792
|
elif status["status"] == "failed":
|
1673
1793
|
raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
|
1674
|
-
|
1794
|
+
|
1675
1795
|
# Wait before checking again
|
1676
1796
|
await asyncio.sleep(check_interval_seconds)
|
1677
|
-
|
1797
|
+
|
1678
1798
|
raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
|
1679
1799
|
|
1680
1800
|
async def get_document_by_filename(self, filename: str) -> Document:
|
@@ -1828,9 +1948,7 @@ class AsyncMorphik:
|
|
1828
1948
|
form_data["use_colpali"] = str(use_colpali).lower()
|
1829
1949
|
|
1830
1950
|
# Use the dedicated file update endpoint
|
1831
|
-
response = await self._request(
|
1832
|
-
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
1833
|
-
)
|
1951
|
+
response = await self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
|
1834
1952
|
|
1835
1953
|
doc = self._logic._parse_document_response(response)
|
1836
1954
|
doc._client = self
|
@@ -1866,9 +1984,7 @@ class AsyncMorphik:
|
|
1866
1984
|
```
|
1867
1985
|
"""
|
1868
1986
|
# Use the dedicated metadata update endpoint
|
1869
|
-
response = await self._request(
|
1870
|
-
"POST", f"documents/{document_id}/update_metadata", data=metadata
|
1871
|
-
)
|
1987
|
+
response = await self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
|
1872
1988
|
doc = self._logic._parse_document_response(response)
|
1873
1989
|
doc._client = self
|
1874
1990
|
return doc
|
@@ -2034,12 +2150,15 @@ class AsyncMorphik:
|
|
2034
2150
|
|
2035
2151
|
return result
|
2036
2152
|
|
2037
|
-
async def batch_get_documents(
|
2153
|
+
async def batch_get_documents(
|
2154
|
+
self, document_ids: List[str], folder_name: Optional[Union[str, List[str]]] = None
|
2155
|
+
) -> List[Document]:
|
2038
2156
|
"""
|
2039
2157
|
Retrieve multiple documents by their IDs in a single batch operation.
|
2040
2158
|
|
2041
2159
|
Args:
|
2042
2160
|
document_ids: List of document IDs to retrieve
|
2161
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
2043
2162
|
|
2044
2163
|
Returns:
|
2045
2164
|
List[Document]: List of document metadata for found documents
|
@@ -2053,6 +2172,8 @@ class AsyncMorphik:
|
|
2053
2172
|
"""
|
2054
2173
|
# API expects a dict with document_ids key, not a direct list
|
2055
2174
|
request = {"document_ids": document_ids}
|
2175
|
+
if folder_name:
|
2176
|
+
request["folder_name"] = folder_name
|
2056
2177
|
response = await self._request("POST", "batch/documents", data=request)
|
2057
2178
|
docs = self._logic._parse_document_list_response(response)
|
2058
2179
|
for doc in docs:
|
@@ -2060,13 +2181,16 @@ class AsyncMorphik:
|
|
2060
2181
|
return docs
|
2061
2182
|
|
2062
2183
|
async def batch_get_chunks(
|
2063
|
-
self,
|
2184
|
+
self,
|
2185
|
+
sources: List[Union[ChunkSource, Dict[str, Any]]],
|
2186
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
2064
2187
|
) -> List[FinalChunkResult]:
|
2065
2188
|
"""
|
2066
2189
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
2067
2190
|
|
2068
2191
|
Args:
|
2069
2192
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
2193
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
2070
2194
|
|
2071
2195
|
Returns:
|
2072
2196
|
List[FinalChunkResult]: List of chunk results
|
@@ -2091,7 +2215,7 @@ class AsyncMorphik:
|
|
2091
2215
|
print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
|
2092
2216
|
```
|
2093
2217
|
"""
|
2094
|
-
request = self._logic._prepare_batch_get_chunks_request(sources,
|
2218
|
+
request = self._logic._prepare_batch_get_chunks_request(sources, folder_name, None)
|
2095
2219
|
response = await self._request("POST", "batch/chunks", data=request)
|
2096
2220
|
return self._logic._parse_chunk_result_list_response(response)
|
2097
2221
|
|
@@ -2110,8 +2234,10 @@ class AsyncMorphik:
|
|
2110
2234
|
name: Name of the cache to create
|
2111
2235
|
model: Name of the model to use (e.g. "llama2")
|
2112
2236
|
gguf_file: Name of the GGUF file to use for the model
|
2113
|
-
filters: Optional metadata filters to determine which documents to include.
|
2114
|
-
|
2237
|
+
filters: Optional metadata filters to determine which documents to include.
|
2238
|
+
These filters will be applied in addition to any specific docs provided.
|
2239
|
+
docs: Optional list of specific document IDs to include.
|
2240
|
+
These docs will be included in addition to any documents matching the filters.
|
2115
2241
|
|
2116
2242
|
Returns:
|
2117
2243
|
Dict[str, Any]: Created cache configuration
|
@@ -2212,11 +2338,11 @@ class AsyncMorphik:
|
|
2212
2338
|
)
|
2213
2339
|
```
|
2214
2340
|
"""
|
2215
|
-
request = self._logic._prepare_create_graph_request(
|
2216
|
-
name, filters, documents, prompt_overrides, None, None
|
2217
|
-
)
|
2341
|
+
request = self._logic._prepare_create_graph_request(name, filters, documents, prompt_overrides, None, None)
|
2218
2342
|
response = await self._request("POST", "graph/create", data=request)
|
2219
|
-
|
2343
|
+
graph = self._logic._parse_graph_response(response)
|
2344
|
+
graph._client = self # Attach AsyncMorphik client for polling helpers
|
2345
|
+
return graph
|
2220
2346
|
|
2221
2347
|
async def get_graph(self, name: str) -> Graph:
|
2222
2348
|
"""
|
@@ -2236,7 +2362,9 @@ class AsyncMorphik:
|
|
2236
2362
|
```
|
2237
2363
|
"""
|
2238
2364
|
response = await self._request("GET", f"graph/{name}")
|
2239
|
-
|
2365
|
+
graph = self._logic._parse_graph_response(response)
|
2366
|
+
graph._client = self
|
2367
|
+
return graph
|
2240
2368
|
|
2241
2369
|
async def list_graphs(self) -> List[Graph]:
|
2242
2370
|
"""
|
@@ -2254,7 +2382,10 @@ class AsyncMorphik:
|
|
2254
2382
|
```
|
2255
2383
|
"""
|
2256
2384
|
response = await self._request("GET", "graphs")
|
2257
|
-
|
2385
|
+
graphs = self._logic._parse_graph_list_response(response)
|
2386
|
+
for g in graphs:
|
2387
|
+
g._client = self
|
2388
|
+
return graphs
|
2258
2389
|
|
2259
2390
|
async def update_graph(
|
2260
2391
|
self,
|
@@ -2311,7 +2442,9 @@ class AsyncMorphik:
|
|
2311
2442
|
name, additional_filters, additional_documents, prompt_overrides, None, None
|
2312
2443
|
)
|
2313
2444
|
response = await self._request("POST", f"graph/{name}/update", data=request)
|
2314
|
-
|
2445
|
+
graph = self._logic._parse_graph_response(response)
|
2446
|
+
graph._client = self
|
2447
|
+
return graph
|
2315
2448
|
|
2316
2449
|
async def delete_document(self, document_id: str) -> Dict[str, str]:
|
2317
2450
|
"""
|
@@ -2373,3 +2506,37 @@ class AsyncMorphik:
|
|
2373
2506
|
|
2374
2507
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
2375
2508
|
await self.close()
|
2509
|
+
|
2510
|
+
async def create_app(self, app_id: str, name: str, expiry_days: int = 30) -> Dict[str, str]:
    """Create a new application in Morphik Cloud and obtain its auth URI (async).

    Args:
        app_id: Identifier of the application, sent as ``app_id``.
        name: Name of the application, sent as ``name``.
        expiry_days: Sent as ``expiry_days`` (default: 30).
            NOTE(review): presumably the lifetime of the issued credentials
            in days -- confirm against the server API.

    Returns:
        Dict[str, str]: The response of the ``ee/create_app`` endpoint,
        passed through unchanged.
    """
    request_body = dict(app_id=app_id, name=name, expiry_days=expiry_days)
    response = await self._request("POST", "ee/create_app", data=request_body)
    return response
|
2515
|
+
|
2516
|
+
async def wait_for_graph_completion(
    self,
    graph_name: str,
    timeout_seconds: int = 300,
    check_interval_seconds: int = 5,
) -> Graph:
    """Wait until the specified graph finishes processing (async).

    The graph is fetched with ``get_graph`` at least once, and then again
    every ``check_interval_seconds`` until it reports completion or failure
    or until ``timeout_seconds`` have elapsed. The pause between checks is
    shortened when needed so that the wait never runs past the timeout.

    Args:
        graph_name: Name of the graph to monitor.
        timeout_seconds: Maximum seconds to wait.
        check_interval_seconds: Seconds between status checks.

    Returns:
        Graph: The completed graph object.

    Raises:
        RuntimeError: If the graph reports failure; the message is the
            graph's ``error`` when set, otherwise a generic message.
        TimeoutError: If the graph is still processing when the timeout
            expires.
    """
    import asyncio

    # Inside a coroutine the running loop is always available;
    # get_running_loop() is the supported way to reach it.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout_seconds

    while True:
        # Check before looking at the clock so that even a zero timeout
        # still observes the graph's current state once.
        graph = await self.get_graph(graph_name)
        if graph.is_completed:
            return graph
        if graph.is_failed:
            raise RuntimeError(graph.error or "Graph processing failed")

        remaining = deadline - loop.time()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        await asyncio.sleep(min(check_interval_seconds, remaining))

    raise TimeoutError("Timed out waiting for graph completion")
|