morphik 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +2 -2
- morphik/_internal.py +36 -27
- morphik/async_.py +294 -127
- morphik/models.py +79 -58
- morphik/rules.py +28 -5
- morphik/sync.py +352 -144
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/METADATA +4 -8
- morphik-0.1.6.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/WHEEL +0 -0
morphik/sync.py
CHANGED
@@ -2,27 +2,23 @@ import json
|
|
2
2
|
import logging
|
3
3
|
from io import BytesIO, IOBase
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
-
|
7
|
-
from PIL import Image
|
8
|
-
from PIL.Image import Image as PILImage
|
5
|
+
from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
|
9
6
|
|
10
7
|
import httpx
|
8
|
+
from pydantic import BaseModel
|
11
9
|
|
10
|
+
from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
|
12
11
|
from .models import (
|
12
|
+
ChunkSource,
|
13
|
+
CompletionResponse, # Prompt override models
|
13
14
|
Document,
|
14
15
|
DocumentResult,
|
15
|
-
CompletionResponse,
|
16
|
-
IngestTextRequest,
|
17
|
-
ChunkSource,
|
18
|
-
Graph,
|
19
16
|
FolderInfo,
|
20
|
-
|
17
|
+
Graph,
|
21
18
|
GraphPromptOverrides,
|
19
|
+
IngestTextRequest,
|
22
20
|
QueryPromptOverrides,
|
23
21
|
)
|
24
|
-
from .rules import Rule
|
25
|
-
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
26
22
|
|
27
23
|
logger = logging.getLogger(__name__)
|
28
24
|
|
@@ -71,16 +67,16 @@ class Folder:
|
|
71
67
|
def name(self) -> str:
|
72
68
|
"""Returns the folder name."""
|
73
69
|
return self._name
|
74
|
-
|
70
|
+
|
75
71
|
@property
|
76
72
|
def id(self) -> Optional[str]:
|
77
73
|
"""Returns the folder ID if available."""
|
78
74
|
return self._id
|
79
|
-
|
75
|
+
|
80
76
|
def get_info(self) -> Dict[str, Any]:
|
81
77
|
"""
|
82
78
|
Get detailed information about this folder.
|
83
|
-
|
79
|
+
|
84
80
|
Returns:
|
85
81
|
Dict[str, Any]: Detailed folder information
|
86
82
|
"""
|
@@ -93,9 +89,8 @@ class Folder:
|
|
93
89
|
break
|
94
90
|
if not self._id:
|
95
91
|
raise ValueError(f"Folder '{self._name}' not found")
|
96
|
-
|
92
|
+
|
97
93
|
return self._client._request("GET", f"folders/{self._id}")
|
98
|
-
|
99
94
|
|
100
95
|
def signin(self, end_user_id: str) -> "UserScope":
|
101
96
|
"""
|
@@ -168,9 +163,7 @@ class Folder:
|
|
168
163
|
files = {"file": (filename, file_obj)}
|
169
164
|
|
170
165
|
# Create form data
|
171
|
-
form_data = self._client._logic._prepare_ingest_file_form_data(
|
172
|
-
metadata, rules, self._name, None
|
173
|
-
)
|
166
|
+
form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
|
174
167
|
|
175
168
|
# use_colpali should be a query parameter as defined in the API
|
176
169
|
response = self._client._request(
|
@@ -219,9 +212,9 @@ class Folder:
|
|
219
212
|
)
|
220
213
|
|
221
214
|
response = self._client._request(
|
222
|
-
"POST",
|
223
|
-
"ingest/files",
|
224
|
-
data=data,
|
215
|
+
"POST",
|
216
|
+
"ingest/files",
|
217
|
+
data=data,
|
225
218
|
files=file_objects,
|
226
219
|
params={"use_colpali": str(use_colpali).lower()},
|
227
220
|
)
|
@@ -231,9 +224,7 @@ class Folder:
|
|
231
224
|
for error in response["errors"]:
|
232
225
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
233
226
|
|
234
|
-
docs = [
|
235
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
236
|
-
]
|
227
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
237
228
|
for doc in docs:
|
238
229
|
doc._client = self._client
|
239
230
|
return docs
|
@@ -296,6 +287,7 @@ class Folder:
|
|
296
287
|
k: int = 4,
|
297
288
|
min_score: float = 0.0,
|
298
289
|
use_colpali: bool = True,
|
290
|
+
additional_folders: Optional[List[str]] = None,
|
299
291
|
) -> List[FinalChunkResult]:
|
300
292
|
"""
|
301
293
|
Retrieve relevant chunks within this folder.
|
@@ -306,17 +298,19 @@ class Folder:
|
|
306
298
|
k: Number of results (default: 4)
|
307
299
|
min_score: Minimum similarity threshold (default: 0.0)
|
308
300
|
use_colpali: Whether to use ColPali-style embedding model
|
301
|
+
additional_folders: Optional list of extra folders to include in the scope
|
309
302
|
|
310
303
|
Returns:
|
311
304
|
List[FinalChunkResult]: List of relevant chunks
|
312
305
|
"""
|
306
|
+
effective_folder = self._merge_folders(additional_folders)
|
313
307
|
request = {
|
314
308
|
"query": query,
|
315
309
|
"filters": filters,
|
316
310
|
"k": k,
|
317
311
|
"min_score": min_score,
|
318
312
|
"use_colpali": use_colpali,
|
319
|
-
"folder_name":
|
313
|
+
"folder_name": effective_folder,
|
320
314
|
}
|
321
315
|
|
322
316
|
response = self._client._request("POST", "retrieve/chunks", request)
|
@@ -329,6 +323,7 @@ class Folder:
|
|
329
323
|
k: int = 4,
|
330
324
|
min_score: float = 0.0,
|
331
325
|
use_colpali: bool = True,
|
326
|
+
additional_folders: Optional[List[str]] = None,
|
332
327
|
) -> List[DocumentResult]:
|
333
328
|
"""
|
334
329
|
Retrieve relevant documents within this folder.
|
@@ -339,17 +334,19 @@ class Folder:
|
|
339
334
|
k: Number of results (default: 4)
|
340
335
|
min_score: Minimum similarity threshold (default: 0.0)
|
341
336
|
use_colpali: Whether to use ColPali-style embedding model
|
337
|
+
additional_folders: Optional list of extra folders to include in the scope
|
342
338
|
|
343
339
|
Returns:
|
344
340
|
List[DocumentResult]: List of relevant documents
|
345
341
|
"""
|
342
|
+
effective_folder = self._merge_folders(additional_folders)
|
346
343
|
request = {
|
347
344
|
"query": query,
|
348
345
|
"filters": filters,
|
349
346
|
"k": k,
|
350
347
|
"min_score": min_score,
|
351
348
|
"use_colpali": use_colpali,
|
352
|
-
"folder_name":
|
349
|
+
"folder_name": effective_folder,
|
353
350
|
}
|
354
351
|
|
355
352
|
response = self._client._request("POST", "retrieve/docs", request)
|
@@ -368,6 +365,8 @@ class Folder:
|
|
368
365
|
hop_depth: int = 1,
|
369
366
|
include_paths: bool = False,
|
370
367
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
368
|
+
additional_folders: Optional[List[str]] = None,
|
369
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
371
370
|
) -> CompletionResponse:
|
372
371
|
"""
|
373
372
|
Generate completion using relevant chunks as context within this folder.
|
@@ -384,10 +383,13 @@ class Folder:
|
|
384
383
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
385
384
|
include_paths: Whether to include relationship paths in the response
|
386
385
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
386
|
+
additional_folders: Optional list of extra folders to include in the scope
|
387
|
+
schema: Optional schema for structured output
|
387
388
|
|
388
389
|
Returns:
|
389
390
|
CompletionResponse: Generated completion
|
390
391
|
"""
|
392
|
+
effective_folder = self._merge_folders(additional_folders)
|
391
393
|
payload = self._client._logic._prepare_query_request(
|
392
394
|
query,
|
393
395
|
filters,
|
@@ -400,14 +402,31 @@ class Folder:
|
|
400
402
|
hop_depth,
|
401
403
|
include_paths,
|
402
404
|
prompt_overrides,
|
403
|
-
|
404
|
-
None,
|
405
|
+
effective_folder,
|
406
|
+
None, # end_user_id not supported at this level
|
407
|
+
schema,
|
405
408
|
)
|
409
|
+
|
410
|
+
# Add schema to payload if provided
|
411
|
+
if schema:
|
412
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
413
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
414
|
+
payload["schema"] = schema.model_json_schema()
|
415
|
+
else:
|
416
|
+
payload["schema"] = schema
|
417
|
+
|
418
|
+
# Add a hint to the query to return in JSON format
|
419
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
420
|
+
|
406
421
|
response = self._client._request("POST", "query", data=payload)
|
407
422
|
return self._client._logic._parse_completion_response(response)
|
408
423
|
|
409
424
|
def list_documents(
|
410
|
-
self,
|
425
|
+
self,
|
426
|
+
skip: int = 0,
|
427
|
+
limit: int = 100,
|
428
|
+
filters: Optional[Dict[str, Any]] = None,
|
429
|
+
additional_folders: Optional[List[str]] = None,
|
411
430
|
) -> List[Document]:
|
412
431
|
"""
|
413
432
|
List accessible documents within this folder.
|
@@ -416,30 +435,34 @@ class Folder:
|
|
416
435
|
skip: Number of documents to skip
|
417
436
|
limit: Maximum number of documents to return
|
418
437
|
filters: Optional filters
|
438
|
+
additional_folders: Optional list of extra folders to include in the scope
|
419
439
|
|
420
440
|
Returns:
|
421
441
|
List[Document]: List of documents
|
422
442
|
"""
|
423
|
-
|
424
|
-
|
425
|
-
)
|
443
|
+
effective_folder = self._merge_folders(additional_folders)
|
444
|
+
params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, effective_folder, None)
|
426
445
|
response = self._client._request("POST", "documents", data=data, params=params)
|
427
446
|
docs = self._client._logic._parse_document_list_response(response)
|
428
447
|
for doc in docs:
|
429
448
|
doc._client = self._client
|
430
449
|
return docs
|
431
450
|
|
432
|
-
def batch_get_documents(
|
451
|
+
def batch_get_documents(
|
452
|
+
self, document_ids: List[str], additional_folders: Optional[List[str]] = None
|
453
|
+
) -> List[Document]:
|
433
454
|
"""
|
434
455
|
Retrieve multiple documents by their IDs in a single batch operation within this folder.
|
435
456
|
|
436
457
|
Args:
|
437
458
|
document_ids: List of document IDs to retrieve
|
459
|
+
additional_folders: Optional list of extra folders to include in the scope
|
438
460
|
|
439
461
|
Returns:
|
440
462
|
List[Document]: List of document metadata for found documents
|
441
463
|
"""
|
442
|
-
|
464
|
+
merged = self._merge_folders(additional_folders)
|
465
|
+
request = {"document_ids": document_ids, "folder_name": merged}
|
443
466
|
|
444
467
|
response = self._client._request("POST", "batch/documents", data=request)
|
445
468
|
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
@@ -448,13 +471,16 @@ class Folder:
|
|
448
471
|
return docs
|
449
472
|
|
450
473
|
def batch_get_chunks(
|
451
|
-
self,
|
474
|
+
self,
|
475
|
+
sources: List[Union[ChunkSource, Dict[str, Any]]],
|
476
|
+
additional_folders: Optional[List[str]] = None,
|
452
477
|
) -> List[FinalChunkResult]:
|
453
478
|
"""
|
454
479
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
|
455
480
|
|
456
481
|
Args:
|
457
482
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
483
|
+
additional_folders: Optional list of extra folders to include in the scope
|
458
484
|
|
459
485
|
Returns:
|
460
486
|
List[FinalChunkResult]: List of chunk results
|
@@ -467,8 +493,8 @@ class Folder:
|
|
467
493
|
else:
|
468
494
|
source_dicts.append(source.model_dump())
|
469
495
|
|
470
|
-
|
471
|
-
request = {"sources": source_dicts, "folder_name":
|
496
|
+
merged = self._merge_folders(additional_folders)
|
497
|
+
request = {"sources": source_dicts, "folder_name": merged}
|
472
498
|
|
473
499
|
response = self._client._request("POST", "batch/chunks", data=request)
|
474
500
|
return self._client._logic._parse_chunk_result_list_response(response)
|
@@ -505,7 +531,9 @@ class Folder:
|
|
505
531
|
}
|
506
532
|
|
507
533
|
response = self._client._request("POST", "graph/create", request)
|
508
|
-
|
534
|
+
graph = self._logic._parse_graph_response(response)
|
535
|
+
graph._client = self
|
536
|
+
return graph
|
509
537
|
|
510
538
|
def update_graph(
|
511
539
|
self,
|
@@ -538,7 +566,9 @@ class Folder:
|
|
538
566
|
}
|
539
567
|
|
540
568
|
response = self._client._request("POST", f"graph/{name}/update", request)
|
541
|
-
|
569
|
+
graph = self._logic._parse_graph_response(response)
|
570
|
+
graph._client = self
|
571
|
+
return graph
|
542
572
|
|
543
573
|
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
544
574
|
"""
|
@@ -550,18 +580,28 @@ class Folder:
|
|
550
580
|
Returns:
|
551
581
|
Dict[str, str]: Deletion status
|
552
582
|
"""
|
553
|
-
# Get the document by filename with folder scope
|
554
|
-
request = {"filename": filename, "folder_name": self._name}
|
555
|
-
|
556
583
|
# First get the document ID
|
557
|
-
response = self._client._request(
|
558
|
-
"GET", f"documents/filename/{filename}", params={"folder_name": self._name}
|
559
|
-
)
|
584
|
+
response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
|
560
585
|
doc = self._client._logic._parse_document_response(response)
|
561
586
|
|
562
587
|
# Then delete by ID
|
563
588
|
return self._client.delete_document(doc.external_id)
|
564
589
|
|
590
|
+
# Helper --------------------------------------------------------------
|
591
|
+
def _merge_folders(self, additional_folders: Optional[List[str]] = None) -> Union[str, List[str]]:
|
592
|
+
"""Return the effective folder scope.
|
593
|
+
|
594
|
+
If *additional_folders* is provided it will be combined with the folder's
|
595
|
+
own *self._name* and returned as a list (to preserve ordering and allow
|
596
|
+
duplicates to be removed server-side). Otherwise just *self._name* is
|
597
|
+
returned so we keep backward-compatibility with the original API that
|
598
|
+
expected a single string.
|
599
|
+
"""
|
600
|
+
if not additional_folders:
|
601
|
+
return self._name
|
602
|
+
# Pre-pend the scoped folder to the list provided by the caller.
|
603
|
+
return [self._name] + additional_folders
|
604
|
+
|
565
605
|
|
566
606
|
class UserScope:
|
567
607
|
"""
|
@@ -677,7 +717,7 @@ class UserScope:
|
|
677
717
|
# Add folder name if scoped to a folder
|
678
718
|
if self._folder_name:
|
679
719
|
form_data["folder_name"] = self._folder_name
|
680
|
-
|
720
|
+
|
681
721
|
# use_colpali should be a query parameter as defined in the API
|
682
722
|
response = self._client._request(
|
683
723
|
"POST",
|
@@ -732,9 +772,7 @@ class UserScope:
|
|
732
772
|
if rules:
|
733
773
|
if all(isinstance(r, list) for r in rules):
|
734
774
|
# List of lists - per-file rules
|
735
|
-
converted_rules = [
|
736
|
-
[self._client._convert_rule(r) for r in rule_list] for rule_list in rules
|
737
|
-
]
|
775
|
+
converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
|
738
776
|
else:
|
739
777
|
# Flat list - shared rules for all files
|
740
778
|
converted_rules = [self._client._convert_rule(r) for r in rules]
|
@@ -754,9 +792,9 @@ class UserScope:
|
|
754
792
|
data["folder_name"] = self._folder_name
|
755
793
|
|
756
794
|
response = self._client._request(
|
757
|
-
"POST",
|
758
|
-
"ingest/files",
|
759
|
-
data=data,
|
795
|
+
"POST",
|
796
|
+
"ingest/files",
|
797
|
+
data=data,
|
760
798
|
files=file_objects,
|
761
799
|
params={"use_colpali": str(use_colpali).lower()},
|
762
800
|
)
|
@@ -766,9 +804,7 @@ class UserScope:
|
|
766
804
|
for error in response["errors"]:
|
767
805
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
768
806
|
|
769
|
-
docs = [
|
770
|
-
self._client._logic._parse_document_response(doc) for doc in response["documents"]
|
771
|
-
]
|
807
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
|
772
808
|
for doc in docs:
|
773
809
|
doc._client = self._client
|
774
810
|
return docs
|
@@ -831,6 +867,7 @@ class UserScope:
|
|
831
867
|
k: int = 4,
|
832
868
|
min_score: float = 0.0,
|
833
869
|
use_colpali: bool = True,
|
870
|
+
additional_folders: Optional[List[str]] = None,
|
834
871
|
) -> List[FinalChunkResult]:
|
835
872
|
"""
|
836
873
|
Retrieve relevant chunks as this end user.
|
@@ -841,10 +878,12 @@ class UserScope:
|
|
841
878
|
k: Number of results (default: 4)
|
842
879
|
min_score: Minimum similarity threshold (default: 0.0)
|
843
880
|
use_colpali: Whether to use ColPali-style embedding model
|
881
|
+
additional_folders: Optional list of extra folders to include in the scope
|
844
882
|
|
845
883
|
Returns:
|
846
884
|
List[FinalChunkResult]: List of relevant chunks
|
847
885
|
"""
|
886
|
+
effective_folder = self._merge_folders(additional_folders)
|
848
887
|
request = {
|
849
888
|
"query": query,
|
850
889
|
"filters": filters,
|
@@ -852,6 +891,7 @@ class UserScope:
|
|
852
891
|
"min_score": min_score,
|
853
892
|
"use_colpali": use_colpali,
|
854
893
|
"end_user_id": self._end_user_id, # Add end user ID here
|
894
|
+
"folder_name": effective_folder, # Add folder name if provided
|
855
895
|
}
|
856
896
|
|
857
897
|
# Add folder name if scoped to a folder
|
@@ -868,6 +908,7 @@ class UserScope:
|
|
868
908
|
k: int = 4,
|
869
909
|
min_score: float = 0.0,
|
870
910
|
use_colpali: bool = True,
|
911
|
+
additional_folders: Optional[List[str]] = None,
|
871
912
|
) -> List[DocumentResult]:
|
872
913
|
"""
|
873
914
|
Retrieve relevant documents as this end user.
|
@@ -878,10 +919,12 @@ class UserScope:
|
|
878
919
|
k: Number of results (default: 4)
|
879
920
|
min_score: Minimum similarity threshold (default: 0.0)
|
880
921
|
use_colpali: Whether to use ColPali-style embedding model
|
922
|
+
additional_folders: Optional list of extra folders to include in the scope
|
881
923
|
|
882
924
|
Returns:
|
883
925
|
List[DocumentResult]: List of relevant documents
|
884
926
|
"""
|
927
|
+
effective_folder = self._merge_folders(additional_folders)
|
885
928
|
request = {
|
886
929
|
"query": query,
|
887
930
|
"filters": filters,
|
@@ -889,6 +932,7 @@ class UserScope:
|
|
889
932
|
"min_score": min_score,
|
890
933
|
"use_colpali": use_colpali,
|
891
934
|
"end_user_id": self._end_user_id, # Add end user ID here
|
935
|
+
"folder_name": effective_folder, # Add folder name if provided
|
892
936
|
}
|
893
937
|
|
894
938
|
# Add folder name if scoped to a folder
|
@@ -911,6 +955,8 @@ class UserScope:
|
|
911
955
|
hop_depth: int = 1,
|
912
956
|
include_paths: bool = False,
|
913
957
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
958
|
+
additional_folders: Optional[List[str]] = None,
|
959
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
914
960
|
) -> CompletionResponse:
|
915
961
|
"""
|
916
962
|
Generate completion using relevant chunks as context as this end user.
|
@@ -927,10 +973,13 @@ class UserScope:
|
|
927
973
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
928
974
|
include_paths: Whether to include relationship paths in the response
|
929
975
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
976
|
+
additional_folders: Optional list of extra folders to include in the scope
|
977
|
+
schema: Optional schema for structured output
|
930
978
|
|
931
979
|
Returns:
|
932
980
|
CompletionResponse: Generated completion
|
933
981
|
"""
|
982
|
+
effective_folder = self._merge_folders(additional_folders)
|
934
983
|
payload = self._client._logic._prepare_query_request(
|
935
984
|
query,
|
936
985
|
filters,
|
@@ -943,14 +992,31 @@ class UserScope:
|
|
943
992
|
hop_depth,
|
944
993
|
include_paths,
|
945
994
|
prompt_overrides,
|
946
|
-
|
995
|
+
effective_folder,
|
947
996
|
self._end_user_id,
|
997
|
+
schema,
|
948
998
|
)
|
999
|
+
|
1000
|
+
# Add schema to payload if provided
|
1001
|
+
if schema:
|
1002
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
1003
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
1004
|
+
payload["schema"] = schema.model_json_schema()
|
1005
|
+
else:
|
1006
|
+
payload["schema"] = schema
|
1007
|
+
|
1008
|
+
# Add a hint to the query to return in JSON format
|
1009
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
1010
|
+
|
949
1011
|
response = self._client._request("POST", "query", data=payload)
|
950
1012
|
return self._client._logic._parse_completion_response(response)
|
951
1013
|
|
952
1014
|
def list_documents(
|
953
|
-
self,
|
1015
|
+
self,
|
1016
|
+
skip: int = 0,
|
1017
|
+
limit: int = 100,
|
1018
|
+
filters: Optional[Dict[str, Any]] = None,
|
1019
|
+
additional_folders: Optional[List[str]] = None,
|
954
1020
|
) -> List[Document]:
|
955
1021
|
"""
|
956
1022
|
List accessible documents for this end user.
|
@@ -959,6 +1025,7 @@ class UserScope:
|
|
959
1025
|
skip: Number of documents to skip
|
960
1026
|
limit: Maximum number of documents to return
|
961
1027
|
filters: Optional filters
|
1028
|
+
additional_folders: Optional list of extra folders to include in the scope
|
962
1029
|
|
963
1030
|
Returns:
|
964
1031
|
List[Document]: List of documents
|
@@ -970,28 +1037,36 @@ class UserScope:
|
|
970
1037
|
if self._folder_name:
|
971
1038
|
params["folder_name"] = self._folder_name
|
972
1039
|
|
973
|
-
|
1040
|
+
# Merge any additional folders into the request params
|
1041
|
+
effective_folder = self._merge_folders(additional_folders)
|
1042
|
+
if effective_folder:
|
1043
|
+
params["folder_name"] = effective_folder
|
1044
|
+
|
1045
|
+
response = self._client._request("POST", "documents", data=filters or {}, params=params)
|
974
1046
|
|
975
1047
|
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
976
1048
|
for doc in docs:
|
977
1049
|
doc._client = self._client
|
978
1050
|
return docs
|
979
1051
|
|
980
|
-
def batch_get_documents(
|
1052
|
+
def batch_get_documents(
|
1053
|
+
self, document_ids: List[str], additional_folders: Optional[List[str]] = None
|
1054
|
+
) -> List[Document]:
|
981
1055
|
"""
|
982
1056
|
Retrieve multiple documents by their IDs in a single batch operation for this end user.
|
983
1057
|
|
984
1058
|
Args:
|
985
1059
|
document_ids: List of document IDs to retrieve
|
1060
|
+
additional_folders: Optional list of extra folders to include in the scope
|
986
1061
|
|
987
1062
|
Returns:
|
988
1063
|
List[Document]: List of document metadata for found documents
|
989
1064
|
"""
|
1065
|
+
merged = self._merge_folders(additional_folders)
|
990
1066
|
request = {"document_ids": document_ids, "end_user_id": self._end_user_id}
|
991
1067
|
|
992
|
-
|
993
|
-
|
994
|
-
request["folder_name"] = self._folder_name
|
1068
|
+
if merged:
|
1069
|
+
request["folder_name"] = merged
|
995
1070
|
|
996
1071
|
response = self._client._request("POST", "batch/documents", data=request)
|
997
1072
|
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
@@ -1000,13 +1075,16 @@ class UserScope:
|
|
1000
1075
|
return docs
|
1001
1076
|
|
1002
1077
|
def batch_get_chunks(
|
1003
|
-
self,
|
1078
|
+
self,
|
1079
|
+
sources: List[Union[ChunkSource, Dict[str, Any]]],
|
1080
|
+
additional_folders: Optional[List[str]] = None,
|
1004
1081
|
) -> List[FinalChunkResult]:
|
1005
1082
|
"""
|
1006
1083
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
|
1007
1084
|
|
1008
1085
|
Args:
|
1009
1086
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
1087
|
+
additional_folders: Optional list of extra folders to include in the scope
|
1010
1088
|
|
1011
1089
|
Returns:
|
1012
1090
|
List[FinalChunkResult]: List of chunk results
|
@@ -1019,12 +1097,11 @@ class UserScope:
|
|
1019
1097
|
else:
|
1020
1098
|
source_dicts.append(source.model_dump())
|
1021
1099
|
|
1022
|
-
|
1100
|
+
merged = self._merge_folders(additional_folders)
|
1023
1101
|
request = {"sources": source_dicts, "end_user_id": self._end_user_id}
|
1024
1102
|
|
1025
|
-
|
1026
|
-
|
1027
|
-
request["folder_name"] = self._folder_name
|
1103
|
+
if merged:
|
1104
|
+
request["folder_name"] = merged
|
1028
1105
|
|
1029
1106
|
response = self._client._request("POST", "batch/chunks", data=request)
|
1030
1107
|
return self._client._logic._parse_chunk_result_list_response(response)
|
@@ -1065,7 +1142,9 @@ class UserScope:
|
|
1065
1142
|
request["folder_name"] = self._folder_name
|
1066
1143
|
|
1067
1144
|
response = self._client._request("POST", "graph/create", request)
|
1068
|
-
|
1145
|
+
graph = self._logic._parse_graph_response(response)
|
1146
|
+
graph._client = self
|
1147
|
+
return graph
|
1069
1148
|
|
1070
1149
|
def update_graph(
|
1071
1150
|
self,
|
@@ -1102,7 +1181,9 @@ class UserScope:
|
|
1102
1181
|
request["folder_name"] = self._folder_name
|
1103
1182
|
|
1104
1183
|
response = self._client._request("POST", f"graph/{name}/update", request)
|
1105
|
-
|
1184
|
+
graph = self._logic._parse_graph_response(response)
|
1185
|
+
graph._client = self
|
1186
|
+
return graph
|
1106
1187
|
|
1107
1188
|
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
1108
1189
|
"""
|
@@ -1128,6 +1209,22 @@ class UserScope:
|
|
1128
1209
|
# Then delete by ID
|
1129
1210
|
return self._client.delete_document(doc.external_id)
|
1130
1211
|
|
1212
|
+
# Helper --------------------------------------------------------------
|
1213
|
+
def _merge_folders(self, additional_folders: Optional[List[str]] = None) -> Union[str, List[str], None]:
|
1214
|
+
"""Return combined folder scope for user.
|
1215
|
+
|
1216
|
+
When this user scope is already tied to *self._folder_name* we combine it
|
1217
|
+
with any *additional_folders* passed by the caller. Otherwise just the
|
1218
|
+
*additional_folders* (or None) is returned so that upstream logic is
|
1219
|
+
unchanged.
|
1220
|
+
"""
|
1221
|
+
base = self._folder_name
|
1222
|
+
if additional_folders:
|
1223
|
+
if base:
|
1224
|
+
return [base] + additional_folders
|
1225
|
+
return additional_folders
|
1226
|
+
return base
|
1227
|
+
|
1131
1228
|
|
1132
1229
|
class Morphik:
|
1133
1230
|
"""
|
@@ -1173,12 +1270,12 @@ class Morphik:
|
|
1173
1270
|
# Remove Content-Type if it exists - httpx will set the correct multipart boundary
|
1174
1271
|
if "Content-Type" in headers:
|
1175
1272
|
del headers["Content-Type"]
|
1176
|
-
|
1273
|
+
|
1177
1274
|
# For file uploads with form data, use form data (not json)
|
1178
1275
|
request_data = {"files": files}
|
1179
1276
|
if data:
|
1180
1277
|
request_data["data"] = data
|
1181
|
-
|
1278
|
+
|
1182
1279
|
# Files are now properly handled
|
1183
1280
|
else:
|
1184
1281
|
# JSON for everything else
|
@@ -1192,8 +1289,13 @@ class Morphik:
|
|
1192
1289
|
params=params,
|
1193
1290
|
**request_data,
|
1194
1291
|
)
|
1195
|
-
|
1196
|
-
|
1292
|
+
try:
|
1293
|
+
response.raise_for_status()
|
1294
|
+
return response.json()
|
1295
|
+
except httpx.HTTPStatusError as e:
|
1296
|
+
# Print error response for debugging
|
1297
|
+
print(f"Error response: {e.response.status_code} - {e.response.text}")
|
1298
|
+
raise
|
1197
1299
|
|
1198
1300
|
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
|
1199
1301
|
"""Convert a rule to a dictionary format"""
|
@@ -1210,18 +1312,16 @@ class Morphik:
|
|
1210
1312
|
Returns:
|
1211
1313
|
Folder: A folder object ready for scoped operations
|
1212
1314
|
"""
|
1213
|
-
payload = {
|
1214
|
-
"name": name
|
1215
|
-
}
|
1315
|
+
payload = {"name": name}
|
1216
1316
|
if description:
|
1217
1317
|
payload["description"] = description
|
1218
|
-
|
1318
|
+
|
1219
1319
|
response = self._request("POST", "folders", data=payload)
|
1220
1320
|
folder_info = FolderInfo(**response)
|
1221
|
-
|
1321
|
+
|
1222
1322
|
# Return a usable Folder object with the ID from the response
|
1223
1323
|
return Folder(self, name, folder_id=folder_info.id)
|
1224
|
-
|
1324
|
+
|
1225
1325
|
def get_folder_by_name(self, name: str) -> Folder:
|
1226
1326
|
"""
|
1227
1327
|
Get a folder by name to scope operations.
|
@@ -1233,7 +1333,7 @@ class Morphik:
|
|
1233
1333
|
Folder: A folder object for scoped operations
|
1234
1334
|
"""
|
1235
1335
|
return Folder(self, name)
|
1236
|
-
|
1336
|
+
|
1237
1337
|
def get_folder(self, folder_id: str) -> Folder:
|
1238
1338
|
"""
|
1239
1339
|
Get a folder by ID.
|
@@ -1250,13 +1350,13 @@ class Morphik:
|
|
1250
1350
|
def list_folders(self) -> List[Folder]:
|
1251
1351
|
"""
|
1252
1352
|
List all folders the user has access to as Folder objects.
|
1253
|
-
|
1353
|
+
|
1254
1354
|
Returns:
|
1255
1355
|
List[Folder]: List of Folder objects ready for operations
|
1256
1356
|
"""
|
1257
1357
|
folder_infos = self._request("GET", "folders")
|
1258
1358
|
return [Folder(self, info["name"], info["id"]) for info in folder_infos]
|
1259
|
-
|
1359
|
+
|
1260
1360
|
def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1261
1361
|
"""
|
1262
1362
|
Add a document to a folder.
|
@@ -1270,7 +1370,7 @@ class Morphik:
|
|
1270
1370
|
"""
|
1271
1371
|
response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
|
1272
1372
|
return response
|
1273
|
-
|
1373
|
+
|
1274
1374
|
def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1275
1375
|
"""
|
1276
1376
|
Remove a document from a folder.
|
@@ -1314,7 +1414,8 @@ class Morphik:
|
|
1314
1414
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1315
1415
|
- MetadataExtractionRule: Extract metadata using a schema
|
1316
1416
|
- NaturalLanguageRule: Transform content using natural language
|
1317
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1417
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
1418
|
+
(slower, but significantly better retrieval accuracy for text and images)
|
1318
1419
|
Returns:
|
1319
1420
|
Document: Metadata of the ingested document
|
1320
1421
|
|
@@ -1367,7 +1468,8 @@ class Morphik:
|
|
1367
1468
|
rules: Optional list of rules to apply during ingestion. Can be:
|
1368
1469
|
- MetadataExtractionRule: Extract metadata using a schema
|
1369
1470
|
- NaturalLanguageRule: Transform content using natural language
|
1370
|
-
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
1471
|
+
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
1472
|
+
(slower, but significantly better retrieval accuracy for images)
|
1371
1473
|
|
1372
1474
|
Returns:
|
1373
1475
|
Document: Metadata of the ingested document
|
@@ -1450,14 +1552,12 @@ class Morphik:
|
|
1450
1552
|
try:
|
1451
1553
|
# Prepare form data
|
1452
1554
|
# Prepare form data - use_colpali should be a query parameter, not form data
|
1453
|
-
data = self._logic._prepare_ingest_files_form_data(
|
1454
|
-
metadata, rules, use_colpali, parallel, None, None
|
1455
|
-
)
|
1555
|
+
data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
|
1456
1556
|
|
1457
1557
|
response = self._request(
|
1458
|
-
"POST",
|
1459
|
-
"ingest/files",
|
1460
|
-
data=data,
|
1558
|
+
"POST",
|
1559
|
+
"ingest/files",
|
1560
|
+
data=data,
|
1461
1561
|
files=file_objects,
|
1462
1562
|
params={"use_colpali": str(use_colpali).lower()},
|
1463
1563
|
)
|
@@ -1533,6 +1633,7 @@ class Morphik:
|
|
1533
1633
|
k: int = 4,
|
1534
1634
|
min_score: float = 0.0,
|
1535
1635
|
use_colpali: bool = True,
|
1636
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1536
1637
|
) -> List[FinalChunkResult]:
|
1537
1638
|
"""
|
1538
1639
|
Retrieve relevant chunks.
|
@@ -1542,7 +1643,8 @@ class Morphik:
|
|
1542
1643
|
filters: Optional metadata filters
|
1543
1644
|
k: Number of results (default: 4)
|
1544
1645
|
min_score: Minimum similarity threshold (default: 0.0)
|
1545
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
|
1646
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
|
1647
|
+
(only works for documents ingested with `use_colpali=True`)
|
1546
1648
|
Returns:
|
1547
1649
|
List[ChunkResult]
|
1548
1650
|
|
@@ -1555,7 +1657,7 @@ class Morphik:
|
|
1555
1657
|
```
|
1556
1658
|
"""
|
1557
1659
|
payload = self._logic._prepare_retrieve_chunks_request(
|
1558
|
-
query, filters, k, min_score, use_colpali,
|
1660
|
+
query, filters, k, min_score, use_colpali, folder_name, None
|
1559
1661
|
)
|
1560
1662
|
response = self._request("POST", "retrieve/chunks", data=payload)
|
1561
1663
|
return self._logic._parse_chunk_result_list_response(response)
|
@@ -1567,6 +1669,7 @@ class Morphik:
|
|
1567
1669
|
k: int = 4,
|
1568
1670
|
min_score: float = 0.0,
|
1569
1671
|
use_colpali: bool = True,
|
1672
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1570
1673
|
) -> List[DocumentResult]:
|
1571
1674
|
"""
|
1572
1675
|
Retrieve relevant documents.
|
@@ -1576,7 +1679,8 @@ class Morphik:
|
|
1576
1679
|
filters: Optional metadata filters
|
1577
1680
|
k: Number of results (default: 4)
|
1578
1681
|
min_score: Minimum similarity threshold (default: 0.0)
|
1579
|
-
use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
|
1682
|
+
use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
|
1683
|
+
(only works for documents ingested with `use_colpali=True`)
|
1580
1684
|
Returns:
|
1581
1685
|
List[DocumentResult]
|
1582
1686
|
|
@@ -1589,7 +1693,7 @@ class Morphik:
|
|
1589
1693
|
```
|
1590
1694
|
"""
|
1591
1695
|
payload = self._logic._prepare_retrieve_docs_request(
|
1592
|
-
query, filters, k, min_score, use_colpali,
|
1696
|
+
query, filters, k, min_score, use_colpali, folder_name, None
|
1593
1697
|
)
|
1594
1698
|
response = self._request("POST", "retrieve/docs", data=payload)
|
1595
1699
|
return self._logic._parse_document_result_list_response(response)
|
@@ -1607,6 +1711,8 @@ class Morphik:
|
|
1607
1711
|
hop_depth: int = 1,
|
1608
1712
|
include_paths: bool = False,
|
1609
1713
|
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
1714
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1715
|
+
schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
|
1610
1716
|
) -> CompletionResponse:
|
1611
1717
|
"""
|
1612
1718
|
Generate completion using relevant chunks as context.
|
@@ -1618,12 +1724,15 @@ class Morphik:
|
|
1618
1724
|
min_score: Minimum similarity threshold (default: 0.0)
|
1619
1725
|
max_tokens: Maximum tokens in completion
|
1620
1726
|
temperature: Model temperature
|
1621
|
-
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1727
|
+
use_colpali: Whether to use ColPali-style embedding model to generate the completion
|
1728
|
+
(only works for documents ingested with `use_colpali=True`)
|
1622
1729
|
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
1623
1730
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
1624
1731
|
include_paths: Whether to include relationship paths in the response
|
1625
1732
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
1626
1733
|
Either a QueryPromptOverrides object or a dictionary with the same structure
|
1734
|
+
folder_name: Optional folder name to further scope operations
|
1735
|
+
schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
|
1627
1736
|
Returns:
|
1628
1737
|
CompletionResponse
|
1629
1738
|
|
@@ -1671,8 +1780,30 @@ class Morphik:
|
|
1671
1780
|
if response.metadata and "graph" in response.metadata:
|
1672
1781
|
for path in response.metadata["graph"]["paths"]:
|
1673
1782
|
print(" -> ".join(path))
|
1783
|
+
|
1784
|
+
# Using structured output with a Pydantic model
|
1785
|
+
from pydantic import BaseModel
|
1786
|
+
|
1787
|
+
class ResearchFindings(BaseModel):
|
1788
|
+
main_finding: str
|
1789
|
+
supporting_evidence: List[str]
|
1790
|
+
limitations: List[str]
|
1791
|
+
|
1792
|
+
response = db.query(
|
1793
|
+
"Summarize the key research findings from these documents",
|
1794
|
+
schema=ResearchFindings
|
1795
|
+
)
|
1796
|
+
|
1797
|
+
# Access structured output
|
1798
|
+
if response.structured_output:
|
1799
|
+
findings = response.structured_output
|
1800
|
+
print(f"Main finding: {findings.main_finding}")
|
1801
|
+
print("Supporting evidence:")
|
1802
|
+
for evidence in findings.supporting_evidence:
|
1803
|
+
print(f"- {evidence}")
|
1674
1804
|
```
|
1675
1805
|
"""
|
1806
|
+
# Directly forward the supplied folder_name (may be None, str, or List[str])
|
1676
1807
|
payload = self._logic._prepare_query_request(
|
1677
1808
|
query,
|
1678
1809
|
filters,
|
@@ -1685,14 +1816,31 @@ class Morphik:
|
|
1685
1816
|
hop_depth,
|
1686
1817
|
include_paths,
|
1687
1818
|
prompt_overrides,
|
1688
|
-
|
1689
|
-
None,
|
1819
|
+
folder_name,
|
1820
|
+
None, # end_user_id not supported at this level
|
1821
|
+
schema,
|
1690
1822
|
)
|
1823
|
+
|
1824
|
+
# Add schema to payload if provided
|
1825
|
+
if schema:
|
1826
|
+
# If schema is a Pydantic model class, we need to serialize it to a schema dict
|
1827
|
+
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
1828
|
+
payload["schema"] = schema.model_json_schema()
|
1829
|
+
else:
|
1830
|
+
payload["schema"] = schema
|
1831
|
+
|
1832
|
+
# Add a hint to the query to return in JSON format
|
1833
|
+
payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
|
1834
|
+
|
1691
1835
|
response = self._request("POST", "query", data=payload)
|
1692
1836
|
return self._logic._parse_completion_response(response)
|
1693
1837
|
|
1694
1838
|
def list_documents(
|
1695
|
-
self,
|
1839
|
+
self,
|
1840
|
+
skip: int = 0,
|
1841
|
+
limit: int = 100,
|
1842
|
+
filters: Optional[Dict[str, Any]] = None,
|
1843
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
1696
1844
|
) -> List[Document]:
|
1697
1845
|
"""
|
1698
1846
|
List accessible documents.
|
@@ -1701,6 +1849,7 @@ class Morphik:
|
|
1701
1849
|
skip: Number of documents to skip
|
1702
1850
|
limit: Maximum number of documents to return
|
1703
1851
|
filters: Optional filters
|
1852
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
1704
1853
|
|
1705
1854
|
Returns:
|
1706
1855
|
List[Document]: List of accessible documents
|
@@ -1714,7 +1863,7 @@ class Morphik:
|
|
1714
1863
|
next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
|
1715
1864
|
```
|
1716
1865
|
"""
|
1717
|
-
params, data = self._logic._prepare_list_documents_request(skip, limit, filters,
|
1866
|
+
params, data = self._logic._prepare_list_documents_request(skip, limit, filters, folder_name, None)
|
1718
1867
|
response = self._request("POST", "documents", data=data, params=params)
|
1719
1868
|
docs = self._logic._parse_document_list_response(response)
|
1720
1869
|
for doc in docs:
|
@@ -1741,17 +1890,17 @@ class Morphik:
|
|
1741
1890
|
doc = self._logic._parse_document_response(response)
|
1742
1891
|
doc._client = self
|
1743
1892
|
return doc
|
1744
|
-
|
1893
|
+
|
1745
1894
|
def get_document_status(self, document_id: str) -> Dict[str, Any]:
|
1746
1895
|
"""
|
1747
1896
|
Get the current processing status of a document.
|
1748
|
-
|
1897
|
+
|
1749
1898
|
Args:
|
1750
1899
|
document_id: ID of the document to check
|
1751
|
-
|
1900
|
+
|
1752
1901
|
Returns:
|
1753
1902
|
Dict[str, Any]: Status information including current status, potential errors, and other metadata
|
1754
|
-
|
1903
|
+
|
1755
1904
|
Example:
|
1756
1905
|
```python
|
1757
1906
|
status = db.get_document_status("doc_123")
|
@@ -1765,23 +1914,23 @@ class Morphik:
|
|
1765
1914
|
"""
|
1766
1915
|
response = self._request("GET", f"documents/{document_id}/status")
|
1767
1916
|
return response
|
1768
|
-
|
1917
|
+
|
1769
1918
|
def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
|
1770
1919
|
"""
|
1771
1920
|
Wait for a document's processing to complete.
|
1772
|
-
|
1921
|
+
|
1773
1922
|
Args:
|
1774
1923
|
document_id: ID of the document to wait for
|
1775
1924
|
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
1776
1925
|
check_interval_seconds: Time between status checks (default: 2 seconds)
|
1777
|
-
|
1926
|
+
|
1778
1927
|
Returns:
|
1779
1928
|
Document: Updated document with the latest status
|
1780
|
-
|
1929
|
+
|
1781
1930
|
Raises:
|
1782
1931
|
TimeoutError: If processing doesn't complete within the timeout period
|
1783
1932
|
ValueError: If processing fails with an error
|
1784
|
-
|
1933
|
+
|
1785
1934
|
Example:
|
1786
1935
|
```python
|
1787
1936
|
# Upload a file and wait for processing to complete
|
@@ -1796,20 +1945,21 @@ class Morphik:
|
|
1796
1945
|
```
|
1797
1946
|
"""
|
1798
1947
|
import time
|
1948
|
+
|
1799
1949
|
start_time = time.time()
|
1800
|
-
|
1950
|
+
|
1801
1951
|
while (time.time() - start_time) < timeout_seconds:
|
1802
1952
|
status = self.get_document_status(document_id)
|
1803
|
-
|
1953
|
+
|
1804
1954
|
if status["status"] == "completed":
|
1805
1955
|
# Get the full document now that it's complete
|
1806
1956
|
return self.get_document(document_id)
|
1807
1957
|
elif status["status"] == "failed":
|
1808
1958
|
raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
|
1809
|
-
|
1959
|
+
|
1810
1960
|
# Wait before checking again
|
1811
1961
|
time.sleep(check_interval_seconds)
|
1812
|
-
|
1962
|
+
|
1813
1963
|
raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
|
1814
1964
|
|
1815
1965
|
def get_document_by_filename(self, filename: str) -> Document:
|
@@ -1963,9 +2113,7 @@ class Morphik:
|
|
1963
2113
|
form_data["use_colpali"] = str(use_colpali).lower()
|
1964
2114
|
|
1965
2115
|
# Use the dedicated file update endpoint
|
1966
|
-
response = self._request(
|
1967
|
-
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
1968
|
-
)
|
2116
|
+
response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
|
1969
2117
|
|
1970
2118
|
doc = self._logic._parse_document_response(response)
|
1971
2119
|
doc._client = self
|
@@ -2167,12 +2315,15 @@ class Morphik:
|
|
2167
2315
|
|
2168
2316
|
return result
|
2169
2317
|
|
2170
|
-
def batch_get_documents(
|
2318
|
+
def batch_get_documents(
|
2319
|
+
self, document_ids: List[str], folder_name: Optional[Union[str, List[str]]] = None
|
2320
|
+
) -> List[Document]:
|
2171
2321
|
"""
|
2172
|
-
Retrieve multiple documents by their IDs
|
2322
|
+
Retrieve multiple documents by their IDs.
|
2173
2323
|
|
2174
2324
|
Args:
|
2175
2325
|
document_ids: List of document IDs to retrieve
|
2326
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
2176
2327
|
|
2177
2328
|
Returns:
|
2178
2329
|
List[Document]: List of document metadata for found documents
|
@@ -2184,21 +2335,23 @@ class Morphik:
|
|
2184
2335
|
print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
|
2185
2336
|
```
|
2186
2337
|
"""
|
2187
|
-
#
|
2188
|
-
|
2338
|
+
# Build request respecting folder scoping if provided
|
2339
|
+
request = self._logic._prepare_batch_get_documents_request(document_ids, folder_name, None)
|
2340
|
+
response = self._request("POST", "batch/documents", data=request)
|
2189
2341
|
docs = self._logic._parse_document_list_response(response)
|
2190
2342
|
for doc in docs:
|
2191
2343
|
doc._client = self
|
2192
2344
|
return docs
|
2193
2345
|
|
2194
2346
|
def batch_get_chunks(
|
2195
|
-
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
2347
|
+
self, sources: List[Union[ChunkSource, Dict[str, Any]]], folder_name: Optional[Union[str, List[str]]] = None
|
2196
2348
|
) -> List[FinalChunkResult]:
|
2197
2349
|
"""
|
2198
|
-
Retrieve specific chunks by their document ID and chunk number
|
2350
|
+
Retrieve specific chunks by their document ID and chunk number.
|
2199
2351
|
|
2200
2352
|
Args:
|
2201
2353
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
2354
|
+
folder_name: Optional folder name (or list of names) to scope the request
|
2202
2355
|
|
2203
2356
|
Returns:
|
2204
2357
|
List[FinalChunkResult]: List of chunk results
|
@@ -2223,15 +2376,8 @@ class Morphik:
|
|
2223
2376
|
print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
|
2224
2377
|
```
|
2225
2378
|
"""
|
2226
|
-
|
2227
|
-
|
2228
|
-
for source in sources:
|
2229
|
-
if isinstance(source, dict):
|
2230
|
-
source_dicts.append(source)
|
2231
|
-
else:
|
2232
|
-
source_dicts.append(source.model_dump())
|
2233
|
-
|
2234
|
-
response = self._request("POST", "batch/chunks", data=source_dicts)
|
2379
|
+
request = self._logic._prepare_batch_get_chunks_request(sources, folder_name, None)
|
2380
|
+
response = self._request("POST", "batch/chunks", data=request)
|
2235
2381
|
return self._logic._parse_chunk_result_list_response(response)
|
2236
2382
|
|
2237
2383
|
def create_cache(
|
@@ -2249,8 +2395,10 @@ class Morphik:
|
|
2249
2395
|
name: Name of the cache to create
|
2250
2396
|
model: Name of the model to use (e.g. "llama2")
|
2251
2397
|
gguf_file: Name of the GGUF file to use for the model
|
2252
|
-
filters: Optional metadata filters to determine which documents to include.
|
2253
|
-
|
2398
|
+
filters: Optional metadata filters to determine which documents to include.
|
2399
|
+
These filters will be applied in addition to any specific docs provided.
|
2400
|
+
docs: Optional list of specific document IDs to include.
|
2401
|
+
These docs will be included in addition to any documents matching the filters.
|
2254
2402
|
|
2255
2403
|
Returns:
|
2256
2404
|
Dict[str, Any]: Created cache configuration
|
@@ -2355,15 +2503,21 @@ class Morphik:
|
|
2355
2503
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
2356
2504
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
2357
2505
|
|
2358
|
-
request
|
2359
|
-
|
2360
|
-
|
2361
|
-
|
2362
|
-
|
2363
|
-
|
2506
|
+
# Initialize request with required fields
|
2507
|
+
request = {"name": name}
|
2508
|
+
|
2509
|
+
# Add optional fields only if they are not None
|
2510
|
+
if filters is not None:
|
2511
|
+
request["filters"] = filters
|
2512
|
+
if documents is not None:
|
2513
|
+
request["documents"] = documents
|
2514
|
+
if prompt_overrides is not None:
|
2515
|
+
request["prompt_overrides"] = prompt_overrides
|
2364
2516
|
|
2365
2517
|
response = self._request("POST", "graph/create", request)
|
2366
|
-
|
2518
|
+
graph = self._logic._parse_graph_response(response)
|
2519
|
+
graph._client = self
|
2520
|
+
return graph
|
2367
2521
|
|
2368
2522
|
def get_graph(self, name: str) -> Graph:
|
2369
2523
|
"""
|
@@ -2383,7 +2537,9 @@ class Morphik:
|
|
2383
2537
|
```
|
2384
2538
|
"""
|
2385
2539
|
response = self._request("GET", f"graph/{name}")
|
2386
|
-
|
2540
|
+
graph = self._logic._parse_graph_response(response)
|
2541
|
+
graph._client = self
|
2542
|
+
return graph
|
2387
2543
|
|
2388
2544
|
def list_graphs(self) -> List[Graph]:
|
2389
2545
|
"""
|
@@ -2401,7 +2557,10 @@ class Morphik:
|
|
2401
2557
|
```
|
2402
2558
|
"""
|
2403
2559
|
response = self._request("GET", "graphs")
|
2404
|
-
|
2560
|
+
graphs = self._logic._parse_graph_list_response(response)
|
2561
|
+
for g in graphs:
|
2562
|
+
g._client = self
|
2563
|
+
return graphs
|
2405
2564
|
|
2406
2565
|
def update_graph(
|
2407
2566
|
self,
|
@@ -2465,7 +2624,9 @@ class Morphik:
|
|
2465
2624
|
}
|
2466
2625
|
|
2467
2626
|
response = self._request("POST", f"graph/{name}/update", request)
|
2468
|
-
|
2627
|
+
graph = self._logic._parse_graph_response(response)
|
2628
|
+
graph._client = self
|
2629
|
+
return graph
|
2469
2630
|
|
2470
2631
|
def delete_document(self, document_id: str) -> Dict[str, str]:
|
2471
2632
|
"""
|
@@ -2527,3 +2688,50 @@ class Morphik:
|
|
2527
2688
|
|
2528
2689
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
2529
2690
|
self.close()
|
2691
|
+
|
2692
|
+
def create_app(self, app_id: str, name: str, expiry_days: int = 30) -> Dict[str, str]:
|
2693
|
+
"""Create a new application in Morphik Cloud and obtain its auth URI.
|
2694
|
+
|
2695
|
+
This wraps the enterprise endpoint ``/ee/create_app`` which
|
2696
|
+
returns a dictionary ``{\"uri\": ..., \"app_id\": ...}``.
|
2697
|
+
|
2698
|
+
Parameters
|
2699
|
+
----------
|
2700
|
+
app_id:
|
2701
|
+
Identifier for the new application.
|
2702
|
+
name:
|
2703
|
+
Human-readable application name (will be slugified by the server).
|
2704
|
+
expiry_days:
|
2705
|
+
Token validity period. Defaults to 30 days.
|
2706
|
+
"""
|
2707
|
+
|
2708
|
+
payload = {"app_id": app_id, "name": name, "expiry_days": expiry_days}
|
2709
|
+
return self._request("POST", "ee/create_app", data=payload)
|
2710
|
+
|
2711
|
+
def wait_for_graph_completion(
|
2712
|
+
self,
|
2713
|
+
graph_name: str,
|
2714
|
+
timeout_seconds: int = 300,
|
2715
|
+
check_interval_seconds: int = 5,
|
2716
|
+
) -> Graph:
|
2717
|
+
"""Block until the specified graph finishes processing.
|
2718
|
+
|
2719
|
+
Args:
|
2720
|
+
graph_name: Name of the graph to monitor.
|
2721
|
+
timeout_seconds: Maximum seconds to wait.
|
2722
|
+
check_interval_seconds: Seconds between status checks.
|
2723
|
+
|
2724
|
+
Returns:
|
2725
|
+
Graph: The completed graph object.
|
2726
|
+
"""
|
2727
|
+
import time
|
2728
|
+
|
2729
|
+
start = time.time()
|
2730
|
+
while time.time() - start < timeout_seconds:
|
2731
|
+
graph = self.get_graph(graph_name)
|
2732
|
+
if graph.is_completed:
|
2733
|
+
return graph
|
2734
|
+
if graph.is_failed:
|
2735
|
+
raise RuntimeError(graph.error or "Graph processing failed")
|
2736
|
+
time.sleep(check_interval_seconds)
|
2737
|
+
raise TimeoutError("Timed out waiting for graph completion")
|