morphik 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
morphik/async_.py CHANGED
@@ -2,25 +2,23 @@ import json
  import logging
  from io import BytesIO, IOBase
  from pathlib import Path
- from typing import Dict, Any, List, Optional, Union, BinaryIO
+ from typing import Any, BinaryIO, Dict, List, Optional, Type, Union

  import httpx
- from PIL.Image import Image as PILImage
+ from pydantic import BaseModel

+ from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
  from .models import (
+ ChunkSource,
+ CompletionResponse, # Prompt override models
  Document,
  DocumentResult,
- CompletionResponse,
- IngestTextRequest,
- ChunkSource,
- Graph,
  FolderInfo,
- # Prompt override models
+ Graph,
  GraphPromptOverrides,
+ IngestTextRequest,
  QueryPromptOverrides,
  )
- from .rules import Rule
- from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict

  logger = logging.getLogger(__name__)

@@ -69,16 +67,16 @@ class AsyncFolder:
  def name(self) -> str:
  """Returns the folder name."""
  return self._name
-
+
  @property
  def id(self) -> Optional[str]:
  """Returns the folder ID if available."""
  return self._id
-
+
  async def get_info(self) -> Dict[str, Any]:
  """
  Get detailed information about this folder.
-
+
  Returns:
  Dict[str, Any]: Detailed folder information
  """
@@ -91,9 +89,8 @@ class AsyncFolder:
  break
  if not self._id:
  raise ValueError(f"Folder '{self._name}' not found")
-
+
  return await self._client._request("GET", f"folders/{self._id}")
-

  def signin(self, end_user_id: str) -> "AsyncUserScope":
  """
@@ -166,15 +163,14 @@ class AsyncFolder:
  files = {"file": (filename, file_obj)}

  # Create form data
- form_data = self._client._logic._prepare_ingest_file_form_data(
- metadata, rules, self._name, None
- )
+ form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)

  response = await self._client._request(
  "POST",
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
+ "ingest/file",
  data=form_data,
  files=files,
+ params={"use_colpali": str(use_colpali).lower()},
  )
  doc = self._client._logic._parse_document_response(response)
  doc._client = self._client
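The URL-string and `params` forms above produce the same request; a minimal standalone sketch (placeholder URL, not part of the package) showing that httpx composes `params` into the query string:

```python
import httpx

# Placeholder URL used only for illustration.
url = "https://example.invalid/ingest/file"

# Old style: query string embedded in the URL.
old = httpx.Request("POST", f"{url}?use_colpali=true")

# New style: httpx builds the query string from `params`.
new = httpx.Request("POST", url, params={"use_colpali": "true"})

assert str(old.url) == str(new.url)  # both end in ?use_colpali=true
```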
@@ -215,7 +211,11 @@ class AsyncFolder:
  )

  response = await self._client._request(
- "POST", "ingest/files", data=data, files=file_objects
+ "POST",
+ "ingest/files",
+ data=data,
+ files=file_objects,
+ params={"use_colpali": str(use_colpali).lower()},
  )

  if response.get("errors"):
@@ -223,9 +223,7 @@ class AsyncFolder:
  for error in response["errors"]:
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

- docs = [
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
- ]
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
  for doc in docs:
  doc._client = self._client
  return docs
@@ -348,6 +346,7 @@ class AsyncFolder:
  hop_depth: int = 1,
  include_paths: bool = False,
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
  ) -> CompletionResponse:
  """
  Generate completion using relevant chunks as context within this folder.
@@ -364,9 +363,10 @@ class AsyncFolder:
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
  include_paths: Whether to include relationship paths in the response
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+ schema: Optional schema for structured output

  Returns:
- CompletionResponse: Generated completion
+ CompletionResponse: Generated completion or structured output
  """
  payload = self._client._logic._prepare_query_request(
  query,
@@ -382,6 +382,7 @@ class AsyncFolder:
  prompt_overrides,
  self._name,
  None,
+ schema,
  )
  response = await self._client._request("POST", "query", data=payload)
  return self._client._logic._parse_completion_response(response)
@@ -400,9 +401,7 @@ class AsyncFolder:
  Returns:
  List[Document]: List of documents
  """
- params, data = self._client._logic._prepare_list_documents_request(
- skip, limit, filters, self._name, None
- )
+ params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
  response = await self._client._request("POST", "documents", data=data, params=params)
  docs = self._client._logic._parse_document_list_response(response)
  for doc in docs:
@@ -419,18 +418,17 @@ class AsyncFolder:
  Returns:
  List[Document]: List of document metadata for found documents
  """
- request = self._client._logic._prepare_batch_get_documents_request(
- document_ids, self._name, None
- )
+ # API expects a dict with document_ids key
+ request = {"document_ids": document_ids}
+ if self._name:
+ request["folder_name"] = self._name
  response = await self._client._request("POST", "batch/documents", data=request)
  docs = self._client._logic._parse_document_list_response(response)
  for doc in docs:
  doc._client = self._client
  return docs

- async def batch_get_chunks(
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
- ) -> List[FinalChunkResult]:
+ async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
  """
  Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.

@@ -504,9 +502,6 @@ class AsyncFolder:
  Returns:
  Dict[str, str]: Deletion status
  """
- # Get the document by filename with folder scope
- request = {"filename": filename, "folder_name": self._name}
-
  # First get the document ID
  response = await self._client._request(
  "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
@@ -679,9 +674,7 @@ class AsyncUserScope:
  if rules:
  if all(isinstance(r, list) for r in rules):
  # List of lists - per-file rules
- converted_rules = [
- [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
- ]
+ converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
  else:
  # Flat list - shared rules for all files
  converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -701,7 +694,11 @@ class AsyncUserScope:
  data["folder_name"] = self._folder_name

  response = await self._client._request(
- "POST", "ingest/files", data=data, files=file_objects
+ "POST",
+ "ingest/files",
+ data=data,
+ files=file_objects,
+ params={"use_colpali": str(use_colpali).lower()},
  )

  if response.get("errors"):
@@ -709,9 +706,7 @@ class AsyncUserScope:
  for error in response["errors"]:
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

- docs = [
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
- ]
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
  for doc in docs:
  doc._client = self._client
  return docs
@@ -834,9 +829,10 @@ class AsyncUserScope:
  hop_depth: int = 1,
  include_paths: bool = False,
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
  ) -> CompletionResponse:
  """
- Generate completion using relevant chunks as context as this end user.
+ Generate completion using relevant chunks as context, scoped to the end user.

  Args:
  query: Query text
@@ -850,9 +846,10 @@ class AsyncUserScope:
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
  include_paths: Whether to include relationship paths in the response
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+ schema: Optional schema for structured output

  Returns:
- CompletionResponse: Generated completion
+ CompletionResponse: Generated completion or structured output
  """
  payload = self._client._logic._prepare_query_request(
  query,
@@ -866,8 +863,9 @@ class AsyncUserScope:
  hop_depth,
  include_paths,
  prompt_overrides,
- self._folder_name,
- self._end_user_id,
+ self.folder_name,
+ self.end_user_id,
+ schema,
  )
  response = await self._client._request("POST", "query", data=payload)
  return self._client._logic._parse_completion_response(response)
@@ -905,18 +903,19 @@ class AsyncUserScope:
  Returns:
  List[Document]: List of document metadata for found documents
  """
- request = self._client._logic._prepare_batch_get_documents_request(
- document_ids, self._folder_name, self._end_user_id
- )
+ # API expects a dict with document_ids key
+ request = {"document_ids": document_ids}
+ if self._end_user_id:
+ request["end_user_id"] = self._end_user_id
+ if self._folder_name:
+ request["folder_name"] = self._folder_name
  response = await self._client._request("POST", "batch/documents", data=request)
  docs = self._client._logic._parse_document_list_response(response)
  for doc in docs:
  doc._client = self._client
  return docs

- async def batch_get_chunks(
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
- ) -> List[FinalChunkResult]:
+ async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
  """
  Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.

@@ -926,9 +925,7 @@ class AsyncUserScope:
  Returns:
  List[FinalChunkResult]: List of chunk results
  """
- request = self._client._logic._prepare_batch_get_chunks_request(
- sources, self._folder_name, self._end_user_id
- )
+ request = self._client._logic._prepare_batch_get_chunks_request(sources, self._folder_name, self._end_user_id)
  response = await self._client._request("POST", "batch/chunks", data=request)
  return self._client._logic._parse_chunk_result_list_response(response)

@@ -1005,9 +1002,7 @@ class AsyncUserScope:
  params["folder_name"] = self._folder_name

  # First get the document ID
- response = await self._client._request(
- "GET", f"documents/filename/{filename}", params=params
- )
+ response = await self._client._request("GET", f"documents/filename/{filename}", params=params)
  doc = self._client._logic._parse_document_response(response)

  # Then delete by ID
@@ -1060,9 +1055,15 @@ class AsyncMorphik:

  # Configure request data based on type
  if files:
- # Multipart form data for files
- request_data = {"files": files, "data": data}
- # Don't set Content-Type, let httpx handle it
+ # When uploading files, we need to make sure not to set Content-Type
+ # Remove Content-Type if it exists - httpx will set the correct multipart boundary
+ if "Content-Type" in headers:
+ del headers["Content-Type"]
+
+ # For file uploads with form data, use form data (not json)
+ request_data = {"files": files}
+ if data:
+ request_data["data"] = data
  else:
  # JSON for everything else
  headers["Content-Type"] = "application/json"
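A minimal standalone sketch (placeholder URL, not part of the package) of the multipart behaviour the comments above rely on: when `files=` is supplied, httpx generates the `Content-Type` header itself, including the boundary, which is why any pre-set value is removed first:

```python
import io

import httpx

# Placeholder URL used only for illustration.
files = {"file": ("hello.txt", io.BytesIO(b"hello world"))}
request = httpx.Request("POST", "https://example.invalid/ingest/file", files=files)

# httpx has chosen the multipart boundary; a manually pre-set Content-Type
# header would lack this boundary and the server could not parse the body.
print(request.headers["content-type"])  # multipart/form-data; boundary=...
```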
@@ -1093,18 +1094,16 @@ class AsyncMorphik:
  Returns:
  AsyncFolder: A folder object ready for scoped operations
  """
- payload = {
- "name": name
- }
+ payload = {"name": name}
  if description:
  payload["description"] = description
-
+
  response = await self._request("POST", "folders", data=payload)
  folder_info = FolderInfo(**response)
-
+
  # Return a usable AsyncFolder object with the ID from the response
  return AsyncFolder(self, name, folder_id=folder_info.id)
-
+
  def get_folder_by_name(self, name: str) -> AsyncFolder:
  """
  Get a folder by name to scope operations.
@@ -1116,7 +1115,7 @@ class AsyncMorphik:
  AsyncFolder: A folder object for scoped operations
  """
  return AsyncFolder(self, name)
-
+
  async def get_folder(self, folder_id: str) -> AsyncFolder:
  """
  Get a folder by ID.
@@ -1129,7 +1128,7 @@ class AsyncMorphik:
  """
  response = await self._request("GET", f"folders/{folder_id}")
  return AsyncFolder(self, response["name"], folder_id)
-
+
  async def list_folders(self) -> List[AsyncFolder]:
  """
  List all folders the user has access to as AsyncFolder objects.
@@ -1139,7 +1138,7 @@ class AsyncMorphik:
  """
  response = await self._request("GET", "folders")
  return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
-
+
  async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
  """
  Add a document to a folder.
@@ -1153,7 +1152,7 @@ class AsyncMorphik:
  """
  response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
  return response
-
+
  async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
  """
  Remove a document from a folder.
@@ -1197,7 +1196,8 @@ class AsyncMorphik:
  rules: Optional list of rules to apply during ingestion. Can be:
  - MetadataExtractionRule: Extract metadata using a schema
  - NaturalLanguageRule: Transform content using natural language
- use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images)
+ use_colpali: Whether to use ColPali-style embedding model to ingest the text
+ (slower, but significantly better retrieval accuracy for text and images)
  Returns:
  Document: Metadata of the ingested document

@@ -1253,9 +1253,10 @@ class AsyncMorphik:

  response = await self._request(
  "POST",
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
+ "ingest/file",
  data=form_data,
  files=files,
+ params={"use_colpali": str(use_colpali).lower()},
  )
  doc = self._logic._parse_document_response(response)
  doc._client = self
@@ -1294,11 +1295,15 @@ class AsyncMorphik:

  try:
  # Prepare form data
- data = self._logic._prepare_ingest_files_form_data(
- metadata, rules, use_colpali, parallel, None, None
- )
+ data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)

- response = await self._request("POST", "ingest/files", data=data, files=file_objects)
+ response = await self._request(
+ "POST",
+ "ingest/files",
+ data=data,
+ files=file_objects,
+ params={"use_colpali": str(use_colpali).lower()},
+ )

  if response.get("errors"):
  # Log errors but don't raise exception
@@ -1306,7 +1311,7 @@ class AsyncMorphik:
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

  # Parse the documents from the response
- docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
+ docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
  for doc in docs:
  doc._client = self
  return docs
@@ -1381,7 +1386,8 @@ class AsyncMorphik:
  filters: Optional metadata filters
  k: Number of results (default: 4)
  min_score: Minimum similarity threshold (default: 0.0)
- use_colpali: Whether to use ColPali-style embedding model to retrieve chunks (only works for documents ingested with `use_colpali=True`)
+ use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
+ (only works for documents ingested with `use_colpali=True`)
  Returns:
  List[FinalChunkResult]

@@ -1393,9 +1399,7 @@ class AsyncMorphik:
  )
  ```
  """
- payload = self._logic._prepare_retrieve_chunks_request(
- query, filters, k, min_score, use_colpali, None, None
- )
+ payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
  response = await self._request("POST", "retrieve/chunks", data=payload)
  return self._logic._parse_chunk_result_list_response(response)

@@ -1415,7 +1419,8 @@ class AsyncMorphik:
  filters: Optional metadata filters
  k: Number of results (default: 4)
  min_score: Minimum similarity threshold (default: 0.0)
- use_colpali: Whether to use ColPali-style embedding model to retrieve documents (only works for documents ingested with `use_colpali=True`)
+ use_colpali: Whether to use ColPali-style embedding model to retrieve documents
+ (only works for documents ingested with `use_colpali=True`)
  Returns:
  List[DocumentResult]

@@ -1427,9 +1432,7 @@ class AsyncMorphik:
  )
  ```
  """
- payload = self._logic._prepare_retrieve_docs_request(
- query, filters, k, min_score, use_colpali, None, None
- )
+ payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
  response = await self._request("POST", "retrieve/docs", data=payload)
  return self._logic._parse_document_result_list_response(response)

@@ -1446,6 +1449,7 @@ class AsyncMorphik:
  hop_depth: int = 1,
  include_paths: bool = False,
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
  ) -> CompletionResponse:
  """
  Generate completion using relevant chunks as context.
@@ -1457,12 +1461,14 @@ class AsyncMorphik:
  min_score: Minimum similarity threshold (default: 0.0)
  max_tokens: Maximum tokens in completion
  temperature: Model temperature
- use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
+ use_colpali: Whether to use ColPali-style embedding model to generate the completion
+ (only works for documents ingested with `use_colpali=True`)
  graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
  include_paths: Whether to include relationship paths in the response
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
  Either a QueryPromptOverrides object or a dictionary with the same structure
+ schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
  Returns:
  CompletionResponse

@@ -1510,6 +1516,27 @@ class AsyncMorphik:
  if response.metadata and "graph" in response.metadata:
  for path in response.metadata["graph"]["paths"]:
  print(" -> ".join(path))
+
+ # Using structured output with a Pydantic model
+ from pydantic import BaseModel
+
+ class ResearchFindings(BaseModel):
+ main_finding: str
+ supporting_evidence: List[str]
+ limitations: List[str]
+
+ response = await db.query(
+ "Summarize the key research findings from these documents",
+ schema=ResearchFindings
+ )
+
+ # Access structured output
+ if response.structured_output:
+ findings = response.structured_output
+ print(f"Main finding: {findings.main_finding}")
+ print("Supporting evidence:")
+ for evidence in findings.supporting_evidence:
+ print(f"- {evidence}")
  ```
  """
  payload = self._logic._prepare_query_request(
@@ -1526,7 +1553,20 @@ class AsyncMorphik:
  prompt_overrides,
  None,
  None,
+ schema,
  )
+
+ # Add schema to payload if provided
+ if schema:
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
+ payload["schema"] = schema.model_json_schema()
+ else:
+ payload["schema"] = schema
+
+ # Add a hint to the query to return in JSON format
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
+
  response = await self._request("POST", "query", data=payload)
  return self._logic._parse_completion_response(response)

@@ -1580,17 +1620,17 @@ class AsyncMorphik:
  doc = self._logic._parse_document_response(response)
  doc._client = self
  return doc
-
+
  async def get_document_status(self, document_id: str) -> Dict[str, Any]:
  """
  Get the current processing status of a document.
-
+
  Args:
  document_id: ID of the document to check
-
+
  Returns:
  Dict[str, Any]: Status information including current status, potential errors, and other metadata
-
+
  Example:
  ```python
  status = await db.get_document_status("doc_123")
@@ -1604,23 +1644,25 @@ class AsyncMorphik:
  """
  response = await self._request("GET", f"documents/{document_id}/status")
  return response
-
- async def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
+
+ async def wait_for_document_completion(
+ self, document_id: str, timeout_seconds=300, check_interval_seconds=2
+ ) -> Document:
  """
  Wait for a document's processing to complete.
-
+
  Args:
  document_id: ID of the document to wait for
  timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
  check_interval_seconds: Time between status checks (default: 2 seconds)
-
+
  Returns:
  Document: Updated document with the latest status
-
+
  Raises:
  TimeoutError: If processing doesn't complete within the timeout period
  ValueError: If processing fails with an error
-
+
  Example:
  ```python
  # Upload a file and wait for processing to complete
@@ -1635,20 +1677,21 @@ class AsyncMorphik:
  ```
  """
  import asyncio
+
  start_time = asyncio.get_event_loop().time()
-
+
  while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
  status = await self.get_document_status(document_id)
-
+
  if status["status"] == "completed":
  # Get the full document now that it's complete
  return await self.get_document(document_id)
  elif status["status"] == "failed":
  raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
-
+
  # Wait before checking again
  await asyncio.sleep(check_interval_seconds)
-
+
  raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")

  async def get_document_by_filename(self, filename: str) -> Document:
@@ -1802,9 +1845,7 @@ class AsyncMorphik:
  form_data["use_colpali"] = str(use_colpali).lower()

  # Use the dedicated file update endpoint
- response = await self._request(
- "POST", f"documents/{document_id}/update_file", data=form_data, files=files
- )
+ response = await self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)

  doc = self._logic._parse_document_response(response)
  doc._client = self
@@ -1840,9 +1881,7 @@ class AsyncMorphik:
  ```
  """
  # Use the dedicated metadata update endpoint
- response = await self._request(
- "POST", f"documents/{document_id}/update_metadata", data=metadata
- )
+ response = await self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
  doc = self._logic._parse_document_response(response)
  doc._client = self
  return doc
@@ -2025,16 +2064,15 @@ class AsyncMorphik:
  print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
  ```
  """
- request = self._logic._prepare_batch_get_documents_request(document_ids, None, None)
+ # API expects a dict with document_ids key, not a direct list
+ request = {"document_ids": document_ids}
  response = await self._request("POST", "batch/documents", data=request)
  docs = self._logic._parse_document_list_response(response)
  for doc in docs:
  doc._client = self
  return docs

- async def batch_get_chunks(
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
- ) -> List[FinalChunkResult]:
+ async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
  """
  Retrieve specific chunks by their document ID and chunk number in a single batch operation.

@@ -2083,8 +2121,10 @@ class AsyncMorphik:
  name: Name of the cache to create
  model: Name of the model to use (e.g. "llama2")
  gguf_file: Name of the GGUF file to use for the model
- filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
- docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.
+ filters: Optional metadata filters to determine which documents to include.
+ These filters will be applied in addition to any specific docs provided.
+ docs: Optional list of specific document IDs to include.
+ These docs will be included in addition to any documents matching the filters.

  Returns:
  Dict[str, Any]: Created cache configuration
@@ -2185,9 +2225,7 @@ class AsyncMorphik:
  )
  ```
  """
- request = self._logic._prepare_create_graph_request(
- name, filters, documents, prompt_overrides, None, None
- )
+ request = self._logic._prepare_create_graph_request(name, filters, documents, prompt_overrides, None, None)
  response = await self._request("POST", "graph/create", data=request)
  return self._logic._parse_graph_response(response)
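For reference, a minimal sketch of the schema-serialization step performed by `query()` above: a Pydantic model class is converted into a plain JSON-schema dict with `model_json_schema()` (pydantic v2) before being added to the payload. The model mirrors the docstring example and is used here only for illustration:

```python
from typing import List

from pydantic import BaseModel


class ResearchFindings(BaseModel):
    main_finding: str
    supporting_evidence: List[str]
    limitations: List[str]


# The client sends a plain dict, not the model class itself.
schema_dict = ResearchFindings.model_json_schema()
print(sorted(schema_dict["properties"]))
# ['limitations', 'main_finding', 'supporting_evidence']
```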