morphik 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/async_.py CHANGED
@@ -2,25 +2,23 @@ import json
2
2
  import logging
3
3
  from io import BytesIO, IOBase
4
4
  from pathlib import Path
5
- from typing import Dict, Any, List, Optional, Union, BinaryIO
5
+ from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
6
6
 
7
7
  import httpx
8
- from PIL.Image import Image as PILImage
8
+ from pydantic import BaseModel
9
9
 
10
+ from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
10
11
  from .models import (
12
+ ChunkSource,
13
+ CompletionResponse, # Prompt override models
11
14
  Document,
12
15
  DocumentResult,
13
- CompletionResponse,
14
- IngestTextRequest,
15
- ChunkSource,
16
- Graph,
17
16
  FolderInfo,
18
- # Prompt override models
17
+ Graph,
19
18
  GraphPromptOverrides,
19
+ IngestTextRequest,
20
20
  QueryPromptOverrides,
21
21
  )
22
- from .rules import Rule
23
- from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
24
22
 
25
23
  logger = logging.getLogger(__name__)
26
24
 
@@ -69,16 +67,16 @@ class AsyncFolder:
69
67
  def name(self) -> str:
70
68
  """Returns the folder name."""
71
69
  return self._name
72
-
70
+
73
71
  @property
74
72
  def id(self) -> Optional[str]:
75
73
  """Returns the folder ID if available."""
76
74
  return self._id
77
-
75
+
78
76
  async def get_info(self) -> Dict[str, Any]:
79
77
  """
80
78
  Get detailed information about this folder.
81
-
79
+
82
80
  Returns:
83
81
  Dict[str, Any]: Detailed folder information
84
82
  """
@@ -91,9 +89,8 @@ class AsyncFolder:
91
89
  break
92
90
  if not self._id:
93
91
  raise ValueError(f"Folder '{self._name}' not found")
94
-
92
+
95
93
  return await self._client._request("GET", f"folders/{self._id}")
96
-
97
94
 
98
95
  def signin(self, end_user_id: str) -> "AsyncUserScope":
99
96
  """
@@ -166,9 +163,7 @@ class AsyncFolder:
166
163
  files = {"file": (filename, file_obj)}
167
164
 
168
165
  # Create form data
169
- form_data = self._client._logic._prepare_ingest_file_form_data(
170
- metadata, rules, self._name, None
171
- )
166
+ form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
172
167
 
173
168
  response = await self._client._request(
174
169
  "POST",
@@ -216,9 +211,9 @@ class AsyncFolder:
216
211
  )
217
212
 
218
213
  response = await self._client._request(
219
- "POST",
220
- "ingest/files",
221
- data=data,
214
+ "POST",
215
+ "ingest/files",
216
+ data=data,
222
217
  files=file_objects,
223
218
  params={"use_colpali": str(use_colpali).lower()},
224
219
  )
@@ -228,9 +223,7 @@ class AsyncFolder:
228
223
  for error in response["errors"]:
229
224
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
230
225
 
231
- docs = [
232
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
233
- ]
226
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
234
227
  for doc in docs:
235
228
  doc._client = self._client
236
229
  return docs
@@ -293,6 +286,7 @@ class AsyncFolder:
293
286
  k: int = 4,
294
287
  min_score: float = 0.0,
295
288
  use_colpali: bool = True,
289
+ additional_folders: Optional[List[str]] = None,
296
290
  ) -> List[FinalChunkResult]:
297
291
  """
298
292
  Retrieve relevant chunks within this folder.
@@ -303,12 +297,14 @@ class AsyncFolder:
303
297
  k: Number of results (default: 4)
304
298
  min_score: Minimum similarity threshold (default: 0.0)
305
299
  use_colpali: Whether to use ColPali-style embedding model
300
+ additional_folders: Optional list of additional folder names to further scope operations
306
301
 
307
302
  Returns:
308
303
  List[FinalChunkResult]: List of relevant chunks
309
304
  """
305
+ effective_folder = self._merge_folders(additional_folders)
310
306
  payload = self._client._logic._prepare_retrieve_chunks_request(
311
- query, filters, k, min_score, use_colpali, self._name, None
307
+ query, filters, k, min_score, use_colpali, effective_folder, None
312
308
  )
313
309
  response = await self._client._request("POST", "retrieve/chunks", data=payload)
314
310
  return self._client._logic._parse_chunk_result_list_response(response)
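The new `additional_folders` argument lets a folder-scoped retrieval also search other folders; the scoped folder name is always kept and the extra names are appended (see the `_merge_folders` helper later in this diff). A minimal sketch, assuming an existing `AsyncMorphik` client `db` inside an async function; the folder names are hypothetical.

```python
# Minimal sketch: folder-scoped retrieval widened with additional_folders.
# Assumes an existing AsyncMorphik client `db`; folder names are hypothetical.
folder = db.get_folder_by_name("reports-2024")
chunks = await folder.retrieve_chunks(
    "quarterly revenue trends",
    k=4,
    additional_folders=["reports-2023"],  # appended to this folder's scope
)
for chunk in chunks:
    print(f"{chunk.document_id} #{chunk.chunk_number}: {chunk.content[:50]}...")
```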
@@ -320,6 +316,7 @@ class AsyncFolder:
320
316
  k: int = 4,
321
317
  min_score: float = 0.0,
322
318
  use_colpali: bool = True,
319
+ additional_folders: Optional[List[str]] = None,
323
320
  ) -> List[DocumentResult]:
324
321
  """
325
322
  Retrieve relevant documents within this folder.
@@ -330,12 +327,14 @@ class AsyncFolder:
330
327
  k: Number of results (default: 4)
331
328
  min_score: Minimum similarity threshold (default: 0.0)
332
329
  use_colpali: Whether to use ColPali-style embedding model
330
+ additional_folders: Optional list of additional folder names to further scope operations
333
331
 
334
332
  Returns:
335
333
  List[DocumentResult]: List of relevant documents
336
334
  """
335
+ effective_folder = self._merge_folders(additional_folders)
337
336
  payload = self._client._logic._prepare_retrieve_docs_request(
338
- query, filters, k, min_score, use_colpali, self._name, None
337
+ query, filters, k, min_score, use_colpali, effective_folder, None
339
338
  )
340
339
  response = await self._client._request("POST", "retrieve/docs", data=payload)
341
340
  return self._client._logic._parse_document_result_list_response(response)
@@ -353,6 +352,8 @@ class AsyncFolder:
353
352
  hop_depth: int = 1,
354
353
  include_paths: bool = False,
355
354
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
355
+ additional_folders: Optional[List[str]] = None,
356
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
356
357
  ) -> CompletionResponse:
357
358
  """
358
359
  Generate completion using relevant chunks as context within this folder.
@@ -369,10 +370,13 @@ class AsyncFolder:
369
370
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
370
371
  include_paths: Whether to include relationship paths in the response
371
372
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
373
+ schema: Optional schema for structured output
374
+ additional_folders: Optional list of additional folder names to further scope operations
372
375
 
373
376
  Returns:
374
- CompletionResponse: Generated completion
377
+ CompletionResponse: Generated completion or structured output
375
378
  """
379
+ effective_folder = self._merge_folders(additional_folders)
376
380
  payload = self._client._logic._prepare_query_request(
377
381
  query,
378
382
  filters,
@@ -385,14 +389,31 @@ class AsyncFolder:
385
389
  hop_depth,
386
390
  include_paths,
387
391
  prompt_overrides,
388
- self._name,
392
+ effective_folder,
389
393
  None,
394
+ schema,
390
395
  )
396
+
397
+ # Add schema to payload if provided
398
+ if schema:
399
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
400
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
401
+ payload["schema"] = schema.model_json_schema()
402
+ else:
403
+ payload["schema"] = schema
404
+
405
+ # Add a hint to the query to return in JSON format
406
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
407
+
391
408
  response = await self._client._request("POST", "query", data=payload)
392
409
  return self._client._logic._parse_completion_response(response)
393
410
 
394
411
  async def list_documents(
395
- self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
412
+ self,
413
+ skip: int = 0,
414
+ limit: int = 100,
415
+ filters: Optional[Dict[str, Any]] = None,
416
+ additional_folders: Optional[List[str]] = None,
396
417
  ) -> List[Document]:
397
418
  """
398
419
  List accessible documents within this folder.
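The folder-scoped `query` now accepts a `schema` (a Pydantic model class or a JSON-schema dict): the serialized schema is added to the payload and a JSON-format hint is appended to the query. A minimal sketch under the same assumptions as above (existing client `db`, hypothetical names); the `structured_output` attribute used here is the one shown in the `AsyncMorphik.query` docstring example later in this diff.

```python
# Minimal sketch: structured output from a folder-scoped query.
# Model and field names are hypothetical.
from pydantic import BaseModel

class Invoice(BaseModel):
    vendor: str
    total: float

folder = db.get_folder_by_name("invoices")
response = await folder.query(
    "Extract the vendor and total from the latest invoice",
    schema=Invoice,  # a plain JSON-schema dict is also accepted
)
if response.structured_output:
    print(response.structured_output)
```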
@@ -401,33 +422,34 @@ class AsyncFolder:
401
422
  skip: Number of documents to skip
402
423
  limit: Maximum number of documents to return
403
424
  filters: Optional filters
425
+ additional_folders: Optional list of additional folder names to further scope operations
404
426
 
405
427
  Returns:
406
428
  List[Document]: List of documents
407
429
  """
408
- params, data = self._client._logic._prepare_list_documents_request(
409
- skip, limit, filters, self._name, None
410
- )
430
+ effective_folder = self._merge_folders(additional_folders)
431
+ params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, effective_folder, None)
411
432
  response = await self._client._request("POST", "documents", data=data, params=params)
412
433
  docs = self._client._logic._parse_document_list_response(response)
413
434
  for doc in docs:
414
435
  doc._client = self._client
415
436
  return docs
416
437
 
417
- async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
438
+ async def batch_get_documents(
439
+ self, document_ids: List[str], additional_folders: Optional[List[str]] = None
440
+ ) -> List[Document]:
418
441
  """
419
442
  Retrieve multiple documents by their IDs in a single batch operation within this folder.
420
443
 
421
444
  Args:
422
445
  document_ids: List of document IDs to retrieve
446
+ additional_folders: Optional list of additional folder names to further scope operations
423
447
 
424
448
  Returns:
425
449
  List[Document]: List of document metadata for found documents
426
450
  """
427
- # API expects a dict with document_ids key
428
- request = {"document_ids": document_ids}
429
- if self._name:
430
- request["folder_name"] = self._name
451
+ merged = self._merge_folders(additional_folders)
452
+ request = {"document_ids": document_ids, "folder_name": merged}
431
453
  response = await self._client._request("POST", "batch/documents", data=request)
432
454
  docs = self._client._logic._parse_document_list_response(response)
433
455
  for doc in docs:
@@ -435,18 +457,22 @@ class AsyncFolder:
435
457
  return docs
436
458
 
437
459
  async def batch_get_chunks(
438
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
460
+ self,
461
+ sources: List[Union[ChunkSource, Dict[str, Any]]],
462
+ additional_folders: Optional[List[str]] = None,
439
463
  ) -> List[FinalChunkResult]:
440
464
  """
441
465
  Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
442
466
 
443
467
  Args:
444
468
  sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
469
+ additional_folders: Optional list of additional folder names to further scope operations
445
470
 
446
471
  Returns:
447
472
  List[FinalChunkResult]: List of chunk results
448
473
  """
449
- request = self._client._logic._prepare_batch_get_chunks_request(sources, self._name, None)
474
+ merged = self._merge_folders(additional_folders)
475
+ request = self._client._logic._prepare_batch_get_chunks_request(sources, merged, None)
450
476
  response = await self._client._request("POST", "batch/chunks", data=request)
451
477
  return self._client._logic._parse_chunk_result_list_response(response)
452
478
 
@@ -473,7 +499,9 @@ class AsyncFolder:
473
499
  name, filters, documents, prompt_overrides, self._name, None
474
500
  )
475
501
  response = await self._client._request("POST", "graph/create", data=request)
476
- return self._client._logic._parse_graph_response(response)
502
+ graph = self._client._logic._parse_graph_response(response)
503
+ graph._client = self._client # Attach AsyncMorphik client for polling helpers
504
+ return graph
477
505
 
478
506
  async def update_graph(
479
507
  self,
@@ -498,7 +526,9 @@ class AsyncFolder:
498
526
  name, additional_filters, additional_documents, prompt_overrides, self._name, None
499
527
  )
500
528
  response = await self._client._request("POST", f"graph/{name}/update", data=request)
501
- return self._client._logic._parse_graph_response(response)
529
+ graph = self._client._logic._parse_graph_response(response)
530
+ graph._client = self._client
531
+ return graph
502
532
 
503
533
  async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
504
534
  """
@@ -510,9 +540,6 @@ class AsyncFolder:
510
540
  Returns:
511
541
  Dict[str, str]: Deletion status
512
542
  """
513
- # Get the document by filename with folder scope
514
- request = {"filename": filename, "folder_name": self._name}
515
-
516
543
  # First get the document ID
517
544
  response = await self._client._request(
518
545
  "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
@@ -522,6 +549,18 @@ class AsyncFolder:
522
549
  # Then delete by ID
523
550
  return await self._client.delete_document(doc.external_id)
524
551
 
552
+ # Helper --------------------------------------------------------------
553
+ def _merge_folders(self, additional_folders: Optional[List[str]] = None) -> Union[str, List[str]]:
554
+ """Return the effective folder scope for this folder instance.
555
+
556
+ If *additional_folders* is provided, it is combined with the scoped
557
+ folder (*self._name*) and returned as a list. Otherwise just
558
+ *self._name* is returned, so the API stays backward compatible with
559
+ accepting a single string."""
560
+ if not additional_folders:
561
+ return self._name
562
+ return [self._name] + additional_folders
563
+
525
564
 
526
565
  class AsyncUserScope:
527
566
  """
@@ -685,9 +724,7 @@ class AsyncUserScope:
685
724
  if rules:
686
725
  if all(isinstance(r, list) for r in rules):
687
726
  # List of lists - per-file rules
688
- converted_rules = [
689
- [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
690
- ]
727
+ converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
691
728
  else:
692
729
  # Flat list - shared rules for all files
693
730
  converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -707,9 +744,9 @@ class AsyncUserScope:
707
744
  data["folder_name"] = self._folder_name
708
745
 
709
746
  response = await self._client._request(
710
- "POST",
711
- "ingest/files",
712
- data=data,
747
+ "POST",
748
+ "ingest/files",
749
+ data=data,
713
750
  files=file_objects,
714
751
  params={"use_colpali": str(use_colpali).lower()},
715
752
  )
@@ -719,9 +756,7 @@ class AsyncUserScope:
719
756
  for error in response["errors"]:
720
757
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
721
758
 
722
- docs = [
723
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
724
- ]
759
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
725
760
  for doc in docs:
726
761
  doc._client = self._client
727
762
  return docs
@@ -784,6 +819,7 @@ class AsyncUserScope:
784
819
  k: int = 4,
785
820
  min_score: float = 0.0,
786
821
  use_colpali: bool = True,
822
+ additional_folders: Optional[List[str]] = None,
787
823
  ) -> List[FinalChunkResult]:
788
824
  """
789
825
  Retrieve relevant chunks as this end user.
@@ -794,12 +830,14 @@ class AsyncUserScope:
794
830
  k: Number of results (default: 4)
795
831
  min_score: Minimum similarity threshold (default: 0.0)
796
832
  use_colpali: Whether to use ColPali-style embedding model
833
+ additional_folders: Optional list of additional folder names to further scope operations
797
834
 
798
835
  Returns:
799
836
  List[FinalChunkResult]: List of relevant chunks
800
837
  """
838
+ effective_folder = self._merge_folders(additional_folders)
801
839
  payload = self._client._logic._prepare_retrieve_chunks_request(
802
- query, filters, k, min_score, use_colpali, self._folder_name, self._end_user_id
840
+ query, filters, k, min_score, use_colpali, effective_folder, self._end_user_id
803
841
  )
804
842
  response = await self._client._request("POST", "retrieve/chunks", data=payload)
805
843
  return self._client._logic._parse_chunk_result_list_response(response)
@@ -811,6 +849,7 @@ class AsyncUserScope:
811
849
  k: int = 4,
812
850
  min_score: float = 0.0,
813
851
  use_colpali: bool = True,
852
+ additional_folders: Optional[List[str]] = None,
814
853
  ) -> List[DocumentResult]:
815
854
  """
816
855
  Retrieve relevant documents as this end user.
@@ -821,12 +860,14 @@ class AsyncUserScope:
821
860
  k: Number of results (default: 4)
822
861
  min_score: Minimum similarity threshold (default: 0.0)
823
862
  use_colpali: Whether to use ColPali-style embedding model
863
+ additional_folders: Optional list of additional folder names to further scope operations
824
864
 
825
865
  Returns:
826
866
  List[DocumentResult]: List of relevant documents
827
867
  """
868
+ effective_folder = self._merge_folders(additional_folders)
828
869
  payload = self._client._logic._prepare_retrieve_docs_request(
829
- query, filters, k, min_score, use_colpali, self._folder_name, self._end_user_id
870
+ query, filters, k, min_score, use_colpali, effective_folder, self._end_user_id
830
871
  )
831
872
  response = await self._client._request("POST", "retrieve/docs", data=payload)
832
873
  return self._client._logic._parse_document_result_list_response(response)
@@ -844,9 +885,11 @@ class AsyncUserScope:
844
885
  hop_depth: int = 1,
845
886
  include_paths: bool = False,
846
887
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
888
+ additional_folders: Optional[List[str]] = None,
889
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
847
890
  ) -> CompletionResponse:
848
891
  """
849
- Generate completion using relevant chunks as context as this end user.
892
+ Generate completion using relevant chunks as context, scoped to the end user.
850
893
 
851
894
  Args:
852
895
  query: Query text
@@ -860,10 +903,13 @@ class AsyncUserScope:
860
903
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
861
904
  include_paths: Whether to include relationship paths in the response
862
905
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
906
+ schema: Optional schema for structured output
907
+ additional_folders: Optional list of additional folder names to further scope operations
863
908
 
864
909
  Returns:
865
- CompletionResponse: Generated completion
910
+ CompletionResponse: Generated completion or structured output
866
911
  """
912
+ effective_folder = self._merge_folders(additional_folders)
867
913
  payload = self._client._logic._prepare_query_request(
868
914
  query,
869
915
  filters,
@@ -876,14 +922,31 @@ class AsyncUserScope:
876
922
  hop_depth,
877
923
  include_paths,
878
924
  prompt_overrides,
879
- self._folder_name,
925
+ effective_folder,
880
926
  self._end_user_id,
927
+ schema,
881
928
  )
929
+
930
+ # Add schema to payload if provided
931
+ if schema:
932
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
933
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
934
+ payload["schema"] = schema.model_json_schema()
935
+ else:
936
+ payload["schema"] = schema
937
+
938
+ # Add a hint to the query to return in JSON format
939
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
940
+
882
941
  response = await self._client._request("POST", "query", data=payload)
883
942
  return self._client._logic._parse_completion_response(response)
884
943
 
885
944
  async def list_documents(
886
- self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
945
+ self,
946
+ skip: int = 0,
947
+ limit: int = 100,
948
+ filters: Optional[Dict[str, Any]] = None,
949
+ folder_name: Optional[Union[str, List[str]]] = None,
887
950
  ) -> List[Document]:
888
951
  """
889
952
  List accessible documents for this end user.
@@ -892,12 +955,13 @@ class AsyncUserScope:
892
955
  skip: Number of documents to skip
893
956
  limit: Maximum number of documents to return
894
957
  filters: Optional filters
958
+ folder_name: Optional folder name (or list of names) to scope the request
895
959
 
896
960
  Returns:
897
961
  List[Document]: List of documents
898
962
  """
899
963
  params, data = self._client._logic._prepare_list_documents_request(
900
- skip, limit, filters, self._folder_name, self._end_user_id
964
+ skip, limit, filters, folder_name, self._end_user_id
901
965
  )
902
966
  response = await self._client._request("POST", "documents", data=data, params=params)
903
967
  docs = self._client._logic._parse_document_list_response(response)
@@ -905,12 +969,15 @@ class AsyncUserScope:
905
969
  doc._client = self._client
906
970
  return docs
907
971
 
908
- async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
972
+ async def batch_get_documents(
973
+ self, document_ids: List[str], folder_name: Optional[Union[str, List[str]]] = None
974
+ ) -> List[Document]:
909
975
  """
910
976
  Retrieve multiple documents by their IDs in a single batch operation for this end user.
911
977
 
912
978
  Args:
913
979
  document_ids: List of document IDs to retrieve
980
+ folder_name: Optional folder name (or list of names) to scope the request
914
981
 
915
982
  Returns:
916
983
  List[Document]: List of document metadata for found documents
@@ -928,20 +995,21 @@ class AsyncUserScope:
928
995
  return docs
929
996
 
930
997
  async def batch_get_chunks(
931
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
998
+ self,
999
+ sources: List[Union[ChunkSource, Dict[str, Any]]],
1000
+ folder_name: Optional[Union[str, List[str]]] = None,
932
1001
  ) -> List[FinalChunkResult]:
933
1002
  """
934
1003
  Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
935
1004
 
936
1005
  Args:
937
1006
  sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
1007
+ folder_name: Optional folder name (or list of names) to scope the request
938
1008
 
939
1009
  Returns:
940
1010
  List[FinalChunkResult]: List of chunk results
941
1011
  """
942
- request = self._client._logic._prepare_batch_get_chunks_request(
943
- sources, self._folder_name, self._end_user_id
944
- )
1012
+ request = self._client._logic._prepare_batch_get_chunks_request(sources, folder_name or self._folder_name, self._end_user_id)
945
1013
  response = await self._client._request("POST", "batch/chunks", data=request)
946
1014
  return self._client._logic._parse_chunk_result_list_response(response)
947
1015
 
@@ -968,7 +1036,9 @@ class AsyncUserScope:
968
1036
  name, filters, documents, prompt_overrides, self._folder_name, self._end_user_id
969
1037
  )
970
1038
  response = await self._client._request("POST", "graph/create", data=request)
971
- return self._client._logic._parse_graph_response(response)
1039
+ graph = self._client._logic._parse_graph_response(response)
1040
+ graph._client = self._client
1041
+ return graph
972
1042
 
973
1043
  async def update_graph(
974
1044
  self,
@@ -998,7 +1068,9 @@ class AsyncUserScope:
998
1068
  self._end_user_id,
999
1069
  )
1000
1070
  response = await self._client._request("POST", f"graph/{name}/update", data=request)
1001
- return self._client._logic._parse_graph_response(response)
1071
+ graph = self._client._logic._parse_graph_response(response)
1072
+ graph._client = self._client
1073
+ return graph
1002
1074
 
1003
1075
  async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
1004
1076
  """
@@ -1018,9 +1090,7 @@ class AsyncUserScope:
1018
1090
  params["folder_name"] = self._folder_name
1019
1091
 
1020
1092
  # First get the document ID
1021
- response = await self._client._request(
1022
- "GET", f"documents/filename/{filename}", params=params
1023
- )
1093
+ response = await self._client._request("GET", f"documents/filename/{filename}", params=params)
1024
1094
  doc = self._client._logic._parse_document_response(response)
1025
1095
 
1026
1096
  # Then delete by ID
@@ -1077,7 +1147,7 @@ class AsyncMorphik:
1077
1147
  # Remove Content-Type if it exists - httpx will set the correct multipart boundary
1078
1148
  if "Content-Type" in headers:
1079
1149
  del headers["Content-Type"]
1080
-
1150
+
1081
1151
  # For file uploads with form data, use form data (not json)
1082
1152
  request_data = {"files": files}
1083
1153
  if data:
@@ -1112,18 +1182,16 @@ class AsyncMorphik:
1112
1182
  Returns:
1113
1183
  AsyncFolder: A folder object ready for scoped operations
1114
1184
  """
1115
- payload = {
1116
- "name": name
1117
- }
1185
+ payload = {"name": name}
1118
1186
  if description:
1119
1187
  payload["description"] = description
1120
-
1188
+
1121
1189
  response = await self._request("POST", "folders", data=payload)
1122
1190
  folder_info = FolderInfo(**response)
1123
-
1191
+
1124
1192
  # Return a usable AsyncFolder object with the ID from the response
1125
1193
  return AsyncFolder(self, name, folder_id=folder_info.id)
1126
-
1194
+
1127
1195
  def get_folder_by_name(self, name: str) -> AsyncFolder:
1128
1196
  """
1129
1197
  Get a folder by name to scope operations.
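The folder-creation endpoint now returns an `AsyncFolder` that already carries the server-assigned ID, so it can be used for scoped calls immediately. A minimal sketch, assuming the method shown above is exposed as `create_folder` (its name lies outside this hunk) and an existing client `db`; names are hypothetical.

```python
# Minimal sketch: create a folder and use the returned AsyncFolder directly.
folder = await db.create_folder("research", description="Papers and notes")
print(folder.id)  # populated from the server response
docs = await folder.list_documents(limit=10)
for doc in docs:
    print(doc.external_id)
```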
@@ -1135,7 +1203,7 @@ class AsyncMorphik:
1135
1203
  AsyncFolder: A folder object for scoped operations
1136
1204
  """
1137
1205
  return AsyncFolder(self, name)
1138
-
1206
+
1139
1207
  async def get_folder(self, folder_id: str) -> AsyncFolder:
1140
1208
  """
1141
1209
  Get a folder by ID.
@@ -1148,7 +1216,7 @@ class AsyncMorphik:
1148
1216
  """
1149
1217
  response = await self._request("GET", f"folders/{folder_id}")
1150
1218
  return AsyncFolder(self, response["name"], folder_id)
1151
-
1219
+
1152
1220
  async def list_folders(self) -> List[AsyncFolder]:
1153
1221
  """
1154
1222
  List all folders the user has access to as AsyncFolder objects.
@@ -1158,7 +1226,7 @@ class AsyncMorphik:
1158
1226
  """
1159
1227
  response = await self._request("GET", "folders")
1160
1228
  return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
1161
-
1229
+
1162
1230
  async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1163
1231
  """
1164
1232
  Add a document to a folder.
@@ -1172,7 +1240,7 @@ class AsyncMorphik:
1172
1240
  """
1173
1241
  response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
1174
1242
  return response
1175
-
1243
+
1176
1244
  async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1177
1245
  """
1178
1246
  Remove a document from a folder.
@@ -1216,7 +1284,8 @@ class AsyncMorphik:
1216
1284
  rules: Optional list of rules to apply during ingestion. Can be:
1217
1285
  - MetadataExtractionRule: Extract metadata using a schema
1218
1286
  - NaturalLanguageRule: Transform content using natural language
1219
- use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images)
1287
+ use_colpali: Whether to use ColPali-style embedding model to ingest the text
1288
+ (slower, but significantly better retrieval accuracy for text and images)
1220
1289
  Returns:
1221
1290
  Document: Metadata of the ingested document
1222
1291
 
@@ -1314,14 +1383,12 @@ class AsyncMorphik:
1314
1383
 
1315
1384
  try:
1316
1385
  # Prepare form data
1317
- data = self._logic._prepare_ingest_files_form_data(
1318
- metadata, rules, use_colpali, parallel, None, None
1319
- )
1386
+ data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
1320
1387
 
1321
1388
  response = await self._request(
1322
- "POST",
1323
- "ingest/files",
1324
- data=data,
1389
+ "POST",
1390
+ "ingest/files",
1391
+ data=data,
1325
1392
  files=file_objects,
1326
1393
  params={"use_colpali": str(use_colpali).lower()},
1327
1394
  )
@@ -1398,6 +1465,7 @@ class AsyncMorphik:
1398
1465
  k: int = 4,
1399
1466
  min_score: float = 0.0,
1400
1467
  use_colpali: bool = True,
1468
+ folder_name: Optional[Union[str, List[str]]] = None,
1401
1469
  ) -> List[FinalChunkResult]:
1402
1470
  """
1403
1471
  Search for relevant chunks.
@@ -1407,7 +1475,8 @@ class AsyncMorphik:
1407
1475
  filters: Optional metadata filters
1408
1476
  k: Number of results (default: 4)
1409
1477
  min_score: Minimum similarity threshold (default: 0.0)
1410
- use_colpali: Whether to use ColPali-style embedding model to retrieve chunks (only works for documents ingested with `use_colpali=True`)
1478
+ use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
1479
+ (only works for documents ingested with `use_colpali=True`)
1411
1480
  Returns:
1412
1481
  List[FinalChunkResult]
1413
1482
 
@@ -1419,8 +1488,9 @@ class AsyncMorphik:
1419
1488
  )
1420
1489
  ```
1421
1490
  """
1491
+ effective_folder = folder_name if folder_name is not None else None
1422
1492
  payload = self._logic._prepare_retrieve_chunks_request(
1423
- query, filters, k, min_score, use_colpali, None, None
1493
+ query, filters, k, min_score, use_colpali, effective_folder, None
1424
1494
  )
1425
1495
  response = await self._request("POST", "retrieve/chunks", data=payload)
1426
1496
  return self._logic._parse_chunk_result_list_response(response)
@@ -1432,6 +1502,7 @@ class AsyncMorphik:
1432
1502
  k: int = 4,
1433
1503
  min_score: float = 0.0,
1434
1504
  use_colpali: bool = True,
1505
+ folder_name: Optional[Union[str, List[str]]] = None,
1435
1506
  ) -> List[DocumentResult]:
1436
1507
  """
1437
1508
  Retrieve relevant documents.
@@ -1441,7 +1512,8 @@ class AsyncMorphik:
1441
1512
  filters: Optional metadata filters
1442
1513
  k: Number of results (default: 4)
1443
1514
  min_score: Minimum similarity threshold (default: 0.0)
1444
- use_colpali: Whether to use ColPali-style embedding model to retrieve documents (only works for documents ingested with `use_colpali=True`)
1515
+ use_colpali: Whether to use ColPali-style embedding model to retrieve documents
1516
+ (only works for documents ingested with `use_colpali=True`)
1445
1517
  Returns:
1446
1518
  List[DocumentResult]
1447
1519
 
@@ -1453,8 +1525,9 @@ class AsyncMorphik:
1453
1525
  )
1454
1526
  ```
1455
1527
  """
1528
+ effective_folder = folder_name if folder_name is not None else None
1456
1529
  payload = self._logic._prepare_retrieve_docs_request(
1457
- query, filters, k, min_score, use_colpali, None, None
1530
+ query, filters, k, min_score, use_colpali, effective_folder, None
1458
1531
  )
1459
1532
  response = await self._request("POST", "retrieve/docs", data=payload)
1460
1533
  return self._logic._parse_document_result_list_response(response)
@@ -1472,6 +1545,8 @@ class AsyncMorphik:
1472
1545
  hop_depth: int = 1,
1473
1546
  include_paths: bool = False,
1474
1547
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
1548
+ folder_name: Optional[Union[str, List[str]]] = None,
1549
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
1475
1550
  ) -> CompletionResponse:
1476
1551
  """
1477
1552
  Generate completion using relevant chunks as context.
@@ -1483,12 +1558,14 @@ class AsyncMorphik:
1483
1558
  min_score: Minimum similarity threshold (default: 0.0)
1484
1559
  max_tokens: Maximum tokens in completion
1485
1560
  temperature: Model temperature
1486
- use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
1561
+ use_colpali: Whether to use ColPali-style embedding model to generate the completion
1562
+ (only works for documents ingested with `use_colpali=True`)
1487
1563
  graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
1488
1564
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
1489
1565
  include_paths: Whether to include relationship paths in the response
1490
1566
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
1491
1567
  Either a QueryPromptOverrides object or a dictionary with the same structure
1568
+ schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
1492
1569
  Returns:
1493
1570
  CompletionResponse
1494
1571
 
@@ -1536,8 +1613,30 @@ class AsyncMorphik:
1536
1613
  if response.metadata and "graph" in response.metadata:
1537
1614
  for path in response.metadata["graph"]["paths"]:
1538
1615
  print(" -> ".join(path))
1616
+
1617
+ # Using structured output with a Pydantic model
1618
+ from pydantic import BaseModel
1619
+
1620
+ class ResearchFindings(BaseModel):
1621
+ main_finding: str
1622
+ supporting_evidence: List[str]
1623
+ limitations: List[str]
1624
+
1625
+ response = await db.query(
1626
+ "Summarize the key research findings from these documents",
1627
+ schema=ResearchFindings
1628
+ )
1629
+
1630
+ # Access structured output
1631
+ if response.structured_output:
1632
+ findings = response.structured_output
1633
+ print(f"Main finding: {findings.main_finding}")
1634
+ print("Supporting evidence:")
1635
+ for evidence in findings.supporting_evidence:
1636
+ print(f"- {evidence}")
1539
1637
  ```
1540
1638
  """
1639
+ effective_folder = folder_name if folder_name is not None else None
1541
1640
  payload = self._logic._prepare_query_request(
1542
1641
  query,
1543
1642
  filters,
@@ -1550,14 +1649,31 @@ class AsyncMorphik:
1550
1649
  hop_depth,
1551
1650
  include_paths,
1552
1651
  prompt_overrides,
1652
+ effective_folder,
1553
1653
  None,
1554
- None,
1654
+ schema,
1555
1655
  )
1656
+
1657
+ # Add schema to payload if provided
1658
+ if schema:
1659
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
1660
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
1661
+ payload["schema"] = schema.model_json_schema()
1662
+ else:
1663
+ payload["schema"] = schema
1664
+
1665
+ # Add a hint to the query to return in JSON format
1666
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
1667
+
1556
1668
  response = await self._request("POST", "query", data=payload)
1557
1669
  return self._logic._parse_completion_response(response)
1558
1670
 
1559
1671
  async def list_documents(
1560
- self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
1672
+ self,
1673
+ skip: int = 0,
1674
+ limit: int = 100,
1675
+ filters: Optional[Dict[str, Any]] = None,
1676
+ folder_name: Optional[Union[str, List[str]]] = None,
1561
1677
  ) -> List[Document]:
1562
1678
  """
1563
1679
  List accessible documents.
@@ -1566,6 +1682,7 @@ class AsyncMorphik:
1566
1682
  skip: Number of documents to skip
1567
1683
  limit: Maximum number of documents to return
1568
1684
  filters: Optional filters
1685
+ folder_name: Optional folder name (or list of names) to scope the request
1569
1686
 
1570
1687
  Returns:
1571
1688
  List[Document]: List of accessible documents
@@ -1579,7 +1696,7 @@ class AsyncMorphik:
1579
1696
  next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
1580
1697
  ```
1581
1698
  """
1582
- params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
1699
+ params, data = self._logic._prepare_list_documents_request(skip, limit, filters, folder_name, None)
1583
1700
  response = await self._request("POST", "documents", data=data, params=params)
1584
1701
  docs = self._logic._parse_document_list_response(response)
1585
1702
  for doc in docs:
@@ -1606,17 +1723,17 @@ class AsyncMorphik:
1606
1723
  doc = self._logic._parse_document_response(response)
1607
1724
  doc._client = self
1608
1725
  return doc
1609
-
1726
+
1610
1727
  async def get_document_status(self, document_id: str) -> Dict[str, Any]:
1611
1728
  """
1612
1729
  Get the current processing status of a document.
1613
-
1730
+
1614
1731
  Args:
1615
1732
  document_id: ID of the document to check
1616
-
1733
+
1617
1734
  Returns:
1618
1735
  Dict[str, Any]: Status information including current status, potential errors, and other metadata
1619
-
1736
+
1620
1737
  Example:
1621
1738
  ```python
1622
1739
  status = await db.get_document_status("doc_123")
@@ -1630,23 +1747,25 @@ class AsyncMorphik:
1630
1747
  """
1631
1748
  response = await self._request("GET", f"documents/{document_id}/status")
1632
1749
  return response
1633
-
1634
- async def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
1750
+
1751
+ async def wait_for_document_completion(
1752
+ self, document_id: str, timeout_seconds=300, check_interval_seconds=2
1753
+ ) -> Document:
1635
1754
  """
1636
1755
  Wait for a document's processing to complete.
1637
-
1756
+
1638
1757
  Args:
1639
1758
  document_id: ID of the document to wait for
1640
1759
  timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
1641
1760
  check_interval_seconds: Time between status checks (default: 2 seconds)
1642
-
1761
+
1643
1762
  Returns:
1644
1763
  Document: Updated document with the latest status
1645
-
1764
+
1646
1765
  Raises:
1647
1766
  TimeoutError: If processing doesn't complete within the timeout period
1648
1767
  ValueError: If processing fails with an error
1649
-
1768
+
1650
1769
  Example:
1651
1770
  ```python
1652
1771
  # Upload a file and wait for processing to complete
@@ -1661,20 +1780,21 @@ class AsyncMorphik:
1661
1780
  ```
1662
1781
  """
1663
1782
  import asyncio
1783
+
1664
1784
  start_time = asyncio.get_event_loop().time()
1665
-
1785
+
1666
1786
  while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
1667
1787
  status = await self.get_document_status(document_id)
1668
-
1788
+
1669
1789
  if status["status"] == "completed":
1670
1790
  # Get the full document now that it's complete
1671
1791
  return await self.get_document(document_id)
1672
1792
  elif status["status"] == "failed":
1673
1793
  raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
1674
-
1794
+
1675
1795
  # Wait before checking again
1676
1796
  await asyncio.sleep(check_interval_seconds)
1677
-
1797
+
1678
1798
  raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
1679
1799
 
1680
1800
  async def get_document_by_filename(self, filename: str) -> Document:
@@ -1828,9 +1948,7 @@ class AsyncMorphik:
1828
1948
  form_data["use_colpali"] = str(use_colpali).lower()
1829
1949
 
1830
1950
  # Use the dedicated file update endpoint
1831
- response = await self._request(
1832
- "POST", f"documents/{document_id}/update_file", data=form_data, files=files
1833
- )
1951
+ response = await self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
1834
1952
 
1835
1953
  doc = self._logic._parse_document_response(response)
1836
1954
  doc._client = self
@@ -1866,9 +1984,7 @@ class AsyncMorphik:
1866
1984
  ```
1867
1985
  """
1868
1986
  # Use the dedicated metadata update endpoint
1869
- response = await self._request(
1870
- "POST", f"documents/{document_id}/update_metadata", data=metadata
1871
- )
1987
+ response = await self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
1872
1988
  doc = self._logic._parse_document_response(response)
1873
1989
  doc._client = self
1874
1990
  return doc
@@ -2034,12 +2150,15 @@ class AsyncMorphik:
2034
2150
 
2035
2151
  return result
2036
2152
 
2037
- async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
2153
+ async def batch_get_documents(
2154
+ self, document_ids: List[str], folder_name: Optional[Union[str, List[str]]] = None
2155
+ ) -> List[Document]:
2038
2156
  """
2039
2157
  Retrieve multiple documents by their IDs in a single batch operation.
2040
2158
 
2041
2159
  Args:
2042
2160
  document_ids: List of document IDs to retrieve
2161
+ folder_name: Optional folder name (or list of names) to scope the request
2043
2162
 
2044
2163
  Returns:
2045
2164
  List[Document]: List of document metadata for found documents
@@ -2053,6 +2172,8 @@ class AsyncMorphik:
2053
2172
  """
2054
2173
  # API expects a dict with document_ids key, not a direct list
2055
2174
  request = {"document_ids": document_ids}
2175
+ if folder_name:
2176
+ request["folder_name"] = folder_name
2056
2177
  response = await self._request("POST", "batch/documents", data=request)
2057
2178
  docs = self._logic._parse_document_list_response(response)
2058
2179
  for doc in docs:
@@ -2060,13 +2181,16 @@ class AsyncMorphik:
2060
2181
  return docs
2061
2182
 
2062
2183
  async def batch_get_chunks(
2063
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
2184
+ self,
2185
+ sources: List[Union[ChunkSource, Dict[str, Any]]],
2186
+ folder_name: Optional[Union[str, List[str]]] = None,
2064
2187
  ) -> List[FinalChunkResult]:
2065
2188
  """
2066
2189
  Retrieve specific chunks by their document ID and chunk number in a single batch operation.
2067
2190
 
2068
2191
  Args:
2069
2192
  sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
2193
+ folder_name: Optional folder name (or list of names) to scope the request
2070
2194
 
2071
2195
  Returns:
2072
2196
  List[FinalChunkResult]: List of chunk results
@@ -2091,7 +2215,7 @@ class AsyncMorphik:
2091
2215
  print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
2092
2216
  ```
2093
2217
  """
2094
- request = self._logic._prepare_batch_get_chunks_request(sources, None, None)
2218
+ request = self._logic._prepare_batch_get_chunks_request(sources, folder_name, None)
2095
2219
  response = await self._request("POST", "batch/chunks", data=request)
2096
2220
  return self._logic._parse_chunk_result_list_response(response)
2097
2221
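Both batch helpers on `AsyncMorphik` now take an optional `folder_name`, which may be a single name or a list. A minimal sketch (existing client `db`; document IDs and folder names hypothetical):

```python
# Minimal sketch: folder-scoped batch retrieval.
docs = await db.batch_get_documents(["doc_123", "doc_456"], folder_name="reports-2024")
chunks = await db.batch_get_chunks(
    [{"document_id": "doc_123", "chunk_number": 0}],
    folder_name=["reports-2023", "reports-2024"],  # a list scopes to several folders
)
for chunk in chunks:
    print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}")
```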
 
@@ -2110,8 +2234,10 @@ class AsyncMorphik:
2110
2234
  name: Name of the cache to create
2111
2235
  model: Name of the model to use (e.g. "llama2")
2112
2236
  gguf_file: Name of the GGUF file to use for the model
2113
- filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
2114
- docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.
2237
+ filters: Optional metadata filters to determine which documents to include.
2238
+ These filters will be applied in addition to any specific docs provided.
2239
+ docs: Optional list of specific document IDs to include.
2240
+ These docs will be included in addition to any documents matching the filters.
2115
2241
 
2116
2242
  Returns:
2117
2243
  Dict[str, Any]: Created cache configuration
@@ -2212,11 +2338,11 @@ class AsyncMorphik:
2212
2338
  )
2213
2339
  ```
2214
2340
  """
2215
- request = self._logic._prepare_create_graph_request(
2216
- name, filters, documents, prompt_overrides, None, None
2217
- )
2341
+ request = self._logic._prepare_create_graph_request(name, filters, documents, prompt_overrides, None, None)
2218
2342
  response = await self._request("POST", "graph/create", data=request)
2219
- return self._logic._parse_graph_response(response)
2343
+ graph = self._logic._parse_graph_response(response)
2344
+ graph._client = self # Attach AsyncMorphik client for polling helpers
2345
+ return graph
2220
2346
 
2221
2347
  async def get_graph(self, name: str) -> Graph:
2222
2348
  """
@@ -2236,7 +2362,9 @@ class AsyncMorphik:
2236
2362
  ```
2237
2363
  """
2238
2364
  response = await self._request("GET", f"graph/{name}")
2239
- return self._logic._parse_graph_response(response)
2365
+ graph = self._logic._parse_graph_response(response)
2366
+ graph._client = self
2367
+ return graph
2240
2368
 
2241
2369
  async def list_graphs(self) -> List[Graph]:
2242
2370
  """
@@ -2254,7 +2382,10 @@ class AsyncMorphik:
2254
2382
  ```
2255
2383
  """
2256
2384
  response = await self._request("GET", "graphs")
2257
- return self._logic._parse_graph_list_response(response)
2385
+ graphs = self._logic._parse_graph_list_response(response)
2386
+ for g in graphs:
2387
+ g._client = self
2388
+ return graphs
2258
2389
 
2259
2390
  async def update_graph(
2260
2391
  self,
@@ -2311,7 +2442,9 @@ class AsyncMorphik:
2311
2442
  name, additional_filters, additional_documents, prompt_overrides, None, None
2312
2443
  )
2313
2444
  response = await self._request("POST", f"graph/{name}/update", data=request)
2314
- return self._logic._parse_graph_response(response)
2445
+ graph = self._logic._parse_graph_response(response)
2446
+ graph._client = self
2447
+ return graph
2315
2448
 
2316
2449
  async def delete_document(self, document_id: str) -> Dict[str, str]:
2317
2450
  """
@@ -2373,3 +2506,37 @@ class AsyncMorphik:
2373
2506
 
2374
2507
  async def __aexit__(self, exc_type, exc_val, exc_tb):
2375
2508
  await self.close()
2509
+
2510
+ async def create_app(self, app_id: str, name: str, expiry_days: int = 30) -> Dict[str, str]:
2511
+ """Create a new application in Morphik Cloud and obtain its auth URI (async)."""
2512
+
2513
+ payload = {"app_id": app_id, "name": name, "expiry_days": expiry_days}
2514
+ return await self._request("POST", "ee/create_app", data=payload)
2515
+
2516
+ async def wait_for_graph_completion(
2517
+ self,
2518
+ graph_name: str,
2519
+ timeout_seconds: int = 300,
2520
+ check_interval_seconds: int = 5,
2521
+ ) -> Graph:
2522
+ """Block until the specified graph finishes processing (async).
2523
+
2524
+ Args:
2525
+ graph_name: Name of the graph to monitor.
2526
+ timeout_seconds: Maximum seconds to wait.
2527
+ check_interval_seconds: Seconds between status checks.
2528
+
2529
+ Returns:
2530
+ Graph: The completed graph object.
2531
+ """
2532
+ import asyncio
2533
+
2534
+ start = asyncio.get_event_loop().time()
2535
+ while (asyncio.get_event_loop().time() - start) < timeout_seconds:
2536
+ graph = await self.get_graph(graph_name)
2537
+ if graph.is_completed:
2538
+ return graph
2539
+ if graph.is_failed:
2540
+ raise RuntimeError(graph.error or "Graph processing failed")
2541
+ await asyncio.sleep(check_interval_seconds)
2542
+ raise TimeoutError("Timed out waiting for graph completion")
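Because graphs returned by `create_graph`, `get_graph`, `list_graphs`, and `update_graph` now carry a `_client` reference, the new `wait_for_graph_completion` helper can poll until processing finishes. A minimal sketch (existing client `db`; graph name and filters hypothetical):

```python
# Minimal sketch: build a graph and block until the server finishes processing it.
graph = await db.create_graph(name="research_graph", filters={"category": "research"})
graph = await db.wait_for_graph_completion("research_graph", timeout_seconds=600)
print(graph.is_completed)  # True once processing has completed
```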