morphik 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/sync.py CHANGED
@@ -2,27 +2,23 @@ import json
2
2
  import logging
3
3
  from io import BytesIO, IOBase
4
4
  from pathlib import Path
5
- from typing import Dict, Any, List, Optional, Union, BinaryIO
6
-
7
- from PIL import Image
8
- from PIL.Image import Image as PILImage
5
+ from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
9
6
 
10
7
  import httpx
8
+ from pydantic import BaseModel
11
9
 
10
+ from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
12
11
  from .models import (
12
+ ChunkSource,
13
+ CompletionResponse, # Prompt override models
13
14
  Document,
14
15
  DocumentResult,
15
- CompletionResponse,
16
- IngestTextRequest,
17
- ChunkSource,
18
- Graph,
19
16
  FolderInfo,
20
- # Prompt override models
17
+ Graph,
21
18
  GraphPromptOverrides,
19
+ IngestTextRequest,
22
20
  QueryPromptOverrides,
23
21
  )
24
- from .rules import Rule
25
- from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
26
22
 
27
23
  logger = logging.getLogger(__name__)
28
24
 
@@ -71,16 +67,16 @@ class Folder:
71
67
  def name(self) -> str:
72
68
  """Returns the folder name."""
73
69
  return self._name
74
-
70
+
75
71
  @property
76
72
  def id(self) -> Optional[str]:
77
73
  """Returns the folder ID if available."""
78
74
  return self._id
79
-
75
+
80
76
  def get_info(self) -> Dict[str, Any]:
81
77
  """
82
78
  Get detailed information about this folder.
83
-
79
+
84
80
  Returns:
85
81
  Dict[str, Any]: Detailed folder information
86
82
  """
@@ -93,9 +89,8 @@ class Folder:
93
89
  break
94
90
  if not self._id:
95
91
  raise ValueError(f"Folder '{self._name}' not found")
96
-
92
+
97
93
  return self._client._request("GET", f"folders/{self._id}")
98
-
99
94
 
100
95
  def signin(self, end_user_id: str) -> "UserScope":
101
96
  """
@@ -168,15 +163,15 @@ class Folder:
168
163
  files = {"file": (filename, file_obj)}
169
164
 
170
165
  # Create form data
171
- form_data = self._client._logic._prepare_ingest_file_form_data(
172
- metadata, rules, self._name, None
173
- )
166
+ form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
174
167
 
168
+ # use_colpali should be a query parameter as defined in the API
175
169
  response = self._client._request(
176
170
  "POST",
177
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
171
+ "ingest/file",
178
172
  data=form_data,
179
173
  files=files,
174
+ params={"use_colpali": str(use_colpali).lower()},
180
175
  )
181
176
  doc = self._client._logic._parse_document_response(response)
182
177
  doc._client = self._client
@@ -216,16 +211,20 @@ class Folder:
216
211
  metadata, rules, use_colpali, parallel, self._name, None
217
212
  )
218
213
 
219
- response = self._client._request("POST", "ingest/files", data=data, files=file_objects)
214
+ response = self._client._request(
215
+ "POST",
216
+ "ingest/files",
217
+ data=data,
218
+ files=file_objects,
219
+ params={"use_colpali": str(use_colpali).lower()},
220
+ )
220
221
 
221
222
  if response.get("errors"):
222
223
  # Log errors but don't raise exception
223
224
  for error in response["errors"]:
224
225
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
225
226
 
226
- docs = [
227
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
228
- ]
227
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
229
228
  for doc in docs:
230
229
  doc._client = self._client
231
230
  return docs
@@ -360,6 +359,7 @@ class Folder:
360
359
  hop_depth: int = 1,
361
360
  include_paths: bool = False,
362
361
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
362
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
363
363
  ) -> CompletionResponse:
364
364
  """
365
365
  Generate completion using relevant chunks as context within this folder.
@@ -376,6 +376,7 @@ class Folder:
376
376
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
377
377
  include_paths: Whether to include relationship paths in the response
378
378
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
379
+ schema: Optional schema for structured output
379
380
 
380
381
  Returns:
381
382
  CompletionResponse: Generated completion
@@ -394,7 +395,20 @@ class Folder:
394
395
  prompt_overrides,
395
396
  self._name,
396
397
  None,
398
+ schema,
397
399
  )
400
+
401
+ # Add schema to payload if provided
402
+ if schema:
403
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
404
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
405
+ payload["schema"] = schema.model_json_schema()
406
+ else:
407
+ payload["schema"] = schema
408
+
409
+ # Add a hint to the query to return in JSON format
410
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
411
+
398
412
  response = self._client._request("POST", "query", data=payload)
399
413
  return self._client._logic._parse_completion_response(response)
400
414
 
@@ -412,9 +426,7 @@ class Folder:
412
426
  Returns:
413
427
  List[Document]: List of documents
414
428
  """
415
- params, data = self._client._logic._prepare_list_documents_request(
416
- skip, limit, filters, self._name, None
417
- )
429
+ params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
418
430
  response = self._client._request("POST", "documents", data=data, params=params)
419
431
  docs = self._client._logic._parse_document_list_response(response)
420
432
  for doc in docs:
@@ -439,9 +451,7 @@ class Folder:
439
451
  doc._client = self._client
440
452
  return docs
441
453
 
442
- def batch_get_chunks(
443
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
444
- ) -> List[FinalChunkResult]:
454
+ def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
445
455
  """
446
456
  Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
447
457
 
@@ -542,13 +552,8 @@ class Folder:
542
552
  Returns:
543
553
  Dict[str, str]: Deletion status
544
554
  """
545
- # Get the document by filename with folder scope
546
- request = {"filename": filename, "folder_name": self._name}
547
-
548
555
  # First get the document ID
549
- response = self._client._request(
550
 - "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
551
- )
556
 + response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
552
557
  doc = self._client._logic._parse_document_response(response)
553
558
 
554
559
  # Then delete by ID
@@ -670,11 +675,13 @@ class UserScope:
670
675
  if self._folder_name:
671
676
  form_data["folder_name"] = self._folder_name
672
677
 
678
+ # use_colpali should be a query parameter as defined in the API
673
679
  response = self._client._request(
674
680
  "POST",
675
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
681
+ "ingest/file",
676
682
  data=form_data,
677
683
  files=files,
684
+ params={"use_colpali": str(use_colpali).lower()},
678
685
  )
679
686
  doc = self._client._logic._parse_document_response(response)
680
687
  doc._client = self._client
@@ -722,9 +729,7 @@ class UserScope:
722
729
  if rules:
723
730
  if all(isinstance(r, list) for r in rules):
724
731
  # List of lists - per-file rules
725
- converted_rules = [
726
- [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
727
- ]
732
+ converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
728
733
  else:
729
734
  # Flat list - shared rules for all files
730
735
  converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -734,7 +739,7 @@ class UserScope:
734
739
  data = {
735
740
  "metadata": json.dumps(metadata or {}),
736
741
  "rules": json.dumps(converted_rules),
737
- "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
742
+ # Remove use_colpali from form data - it should be a query param
738
743
  "parallel": str(parallel).lower(),
739
744
  "end_user_id": self._end_user_id, # Add end user ID here
740
745
  }
@@ -743,16 +748,20 @@ class UserScope:
743
748
  if self._folder_name:
744
749
  data["folder_name"] = self._folder_name
745
750
 
746
- response = self._client._request("POST", "ingest/files", data=data, files=file_objects)
751
+ response = self._client._request(
752
+ "POST",
753
+ "ingest/files",
754
+ data=data,
755
+ files=file_objects,
756
+ params={"use_colpali": str(use_colpali).lower()},
757
+ )
747
758
 
748
759
  if response.get("errors"):
749
760
  # Log errors but don't raise exception
750
761
  for error in response["errors"]:
751
762
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
752
763
 
753
- docs = [
754
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
755
- ]
764
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
756
765
  for doc in docs:
757
766
  doc._client = self._client
758
767
  return docs
@@ -895,6 +904,7 @@ class UserScope:
895
904
  hop_depth: int = 1,
896
905
  include_paths: bool = False,
897
906
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
907
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
898
908
  ) -> CompletionResponse:
899
909
  """
900
910
  Generate completion using relevant chunks as context as this end user.
@@ -911,6 +921,7 @@ class UserScope:
911
921
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
912
922
  include_paths: Whether to include relationship paths in the response
913
923
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
924
+ schema: Optional schema for structured output
914
925
 
915
926
  Returns:
916
927
  CompletionResponse: Generated completion
@@ -929,7 +940,20 @@ class UserScope:
929
940
  prompt_overrides,
930
941
  self._folder_name,
931
942
  self._end_user_id,
943
+ schema,
932
944
  )
945
+
946
+ # Add schema to payload if provided
947
+ if schema:
948
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
949
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
950
+ payload["schema"] = schema.model_json_schema()
951
+ else:
952
+ payload["schema"] = schema
953
+
954
+ # Add a hint to the query to return in JSON format
955
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
956
+
933
957
  response = self._client._request("POST", "query", data=payload)
934
958
  return self._client._logic._parse_completion_response(response)
935
959
 
@@ -954,7 +978,7 @@ class UserScope:
954
978
  if self._folder_name:
955
979
  params["folder_name"] = self._folder_name
956
980
 
957
- response = self._client._request("POST", f"documents", data=filters or {}, params=params)
981
+ response = self._client._request("POST", "documents", data=filters or {}, params=params)
958
982
 
959
983
  docs = [self._client._logic._parse_document_response(doc) for doc in response]
960
984
  for doc in docs:
@@ -983,9 +1007,7 @@ class UserScope:
983
1007
  doc._client = self._client
984
1008
  return docs
985
1009
 
986
- def batch_get_chunks(
987
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
988
- ) -> List[FinalChunkResult]:
1010
+ def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
989
1011
  """
990
1012
  Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
991
1013
 
@@ -1153,9 +1175,17 @@ class Morphik:
1153
1175
 
1154
1176
  # Configure request data based on type
1155
1177
  if files:
1156
- # Multipart form data for files
1157
- request_data = {"files": files, "data": data}
1158
- # Don't set Content-Type, let httpx handle it
1178
+ # When uploading files, we need to make sure not to set Content-Type
1179
+ # Remove Content-Type if it exists - httpx will set the correct multipart boundary
1180
+ if "Content-Type" in headers:
1181
+ del headers["Content-Type"]
1182
+
1183
+ # For file uploads with form data, use form data (not json)
1184
+ request_data = {"files": files}
1185
+ if data:
1186
+ request_data["data"] = data
1187
+
1188
+ # Files are now properly handled
1159
1189
  else:
1160
1190
  # JSON for everything else
1161
1191
  headers["Content-Type"] = "application/json"
@@ -1168,8 +1198,13 @@ class Morphik:
1168
1198
  params=params,
1169
1199
  **request_data,
1170
1200
  )
1171
- response.raise_for_status()
1172
- return response.json()
1201
+ try:
1202
+ response.raise_for_status()
1203
+ return response.json()
1204
+ except httpx.HTTPStatusError as e:
1205
+ # Print error response for debugging
1206
+ print(f"Error response: {e.response.status_code} - {e.response.text}")
1207
+ raise
1173
1208
 
1174
1209
  def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
1175
1210
  """Convert a rule to a dictionary format"""
@@ -1186,18 +1221,16 @@ class Morphik:
1186
1221
  Returns:
1187
1222
  Folder: A folder object ready for scoped operations
1188
1223
  """
1189
- payload = {
1190
- "name": name
1191
- }
1224
+ payload = {"name": name}
1192
1225
  if description:
1193
1226
  payload["description"] = description
1194
-
1227
+
1195
1228
  response = self._request("POST", "folders", data=payload)
1196
1229
  folder_info = FolderInfo(**response)
1197
-
1230
+
1198
1231
  # Return a usable Folder object with the ID from the response
1199
1232
  return Folder(self, name, folder_id=folder_info.id)
1200
-
1233
+
1201
1234
  def get_folder_by_name(self, name: str) -> Folder:
1202
1235
  """
1203
1236
  Get a folder by name to scope operations.
@@ -1209,7 +1242,7 @@ class Morphik:
1209
1242
  Folder: A folder object for scoped operations
1210
1243
  """
1211
1244
  return Folder(self, name)
1212
-
1245
+
1213
1246
  def get_folder(self, folder_id: str) -> Folder:
1214
1247
  """
1215
1248
  Get a folder by ID.
@@ -1226,13 +1259,13 @@ class Morphik:
1226
1259
  def list_folders(self) -> List[Folder]:
1227
1260
  """
1228
1261
  List all folders the user has access to as Folder objects.
1229
-
1262
+
1230
1263
  Returns:
1231
1264
  List[Folder]: List of Folder objects ready for operations
1232
1265
  """
1233
1266
  folder_infos = self._request("GET", "folders")
1234
1267
  return [Folder(self, info["name"], info["id"]) for info in folder_infos]
1235
-
1268
+
1236
1269
  def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1237
1270
  """
1238
1271
  Add a document to a folder.
@@ -1246,7 +1279,7 @@ class Morphik:
1246
1279
  """
1247
1280
  response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
1248
1281
  return response
1249
-
1282
+
1250
1283
  def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1251
1284
  """
1252
1285
  Remove a document from a folder.
@@ -1290,7 +1323,8 @@ class Morphik:
1290
1323
  rules: Optional list of rules to apply during ingestion. Can be:
1291
1324
  - MetadataExtractionRule: Extract metadata using a schema
1292
1325
  - NaturalLanguageRule: Transform content using natural language
1293
- use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images)
1326
+ use_colpali: Whether to use ColPali-style embedding model to ingest the text
1327
+ (slower, but significantly better retrieval accuracy for text and images)
1294
1328
  Returns:
1295
1329
  Document: Metadata of the ingested document
1296
1330
 
@@ -1343,7 +1377,8 @@ class Morphik:
1343
1377
  rules: Optional list of rules to apply during ingestion. Can be:
1344
1378
  - MetadataExtractionRule: Extract metadata using a schema
1345
1379
  - NaturalLanguageRule: Transform content using natural language
1346
- use_colpali: Whether to use ColPali-style embedding model to ingest the file (slower, but significantly better retrieval accuracy for images)
1380
+ use_colpali: Whether to use ColPali-style embedding model to ingest the file
1381
+ (slower, but significantly better retrieval accuracy for images)
1347
1382
 
1348
1383
  Returns:
1349
1384
  Document: Metadata of the ingested document
@@ -1380,11 +1415,13 @@ class Morphik:
1380
1415
  # Create form data
1381
1416
  form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)
1382
1417
 
1418
+ # use_colpali should be a query parameter as defined in the API
1383
1419
  response = self._request(
1384
1420
  "POST",
1385
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
1421
+ "ingest/file",
1386
1422
  data=form_data,
1387
1423
  files=files,
1424
+ params={"use_colpali": str(use_colpali).lower()},
1388
1425
  )
1389
1426
  doc = self._logic._parse_document_response(response)
1390
1427
  doc._client = self
@@ -1423,11 +1460,16 @@ class Morphik:
1423
1460
 
1424
1461
  try:
1425
1462
  # Prepare form data
1426
- data = self._logic._prepare_ingest_files_form_data(
1427
- metadata, rules, use_colpali, parallel, None, None
1428
- )
1463
+ # Prepare form data - use_colpali should be a query parameter, not form data
1464
+ data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
1429
1465
 
1430
- response = self._request("POST", "ingest/files", data=data, files=file_objects)
1466
+ response = self._request(
1467
+ "POST",
1468
+ "ingest/files",
1469
+ data=data,
1470
+ files=file_objects,
1471
+ params={"use_colpali": str(use_colpali).lower()},
1472
+ )
1431
1473
 
1432
1474
  if response.get("errors"):
1433
1475
  # Log errors but don't raise exception
@@ -1509,7 +1551,8 @@ class Morphik:
1509
1551
  filters: Optional metadata filters
1510
1552
  k: Number of results (default: 4)
1511
1553
  min_score: Minimum similarity threshold (default: 0.0)
1512
- use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks (only works for documents ingested with `use_colpali=True`)
1554
+ use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
1555
+ (only works for documents ingested with `use_colpali=True`)
1513
1556
  Returns:
1514
1557
  List[ChunkResult]
1515
1558
 
@@ -1521,9 +1564,7 @@ class Morphik:
1521
1564
  )
1522
1565
  ```
1523
1566
  """
1524
- payload = self._logic._prepare_retrieve_chunks_request(
1525
- query, filters, k, min_score, use_colpali, None, None
1526
- )
1567
+ payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
1527
1568
  response = self._request("POST", "retrieve/chunks", data=payload)
1528
1569
  return self._logic._parse_chunk_result_list_response(response)
1529
1570
 
@@ -1543,7 +1584,8 @@ class Morphik:
1543
1584
  filters: Optional metadata filters
1544
1585
  k: Number of results (default: 4)
1545
1586
  min_score: Minimum similarity threshold (default: 0.0)
1546
- use_colpali: Whether to use ColPali-style embedding model to retrieve the documents (only works for documents ingested with `use_colpali=True`)
1587
+ use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
1588
+ (only works for documents ingested with `use_colpali=True`)
1547
1589
  Returns:
1548
1590
  List[DocumentResult]
1549
1591
 
@@ -1555,9 +1597,7 @@ class Morphik:
1555
1597
  )
1556
1598
  ```
1557
1599
  """
1558
- payload = self._logic._prepare_retrieve_docs_request(
1559
- query, filters, k, min_score, use_colpali, None, None
1560
- )
1600
+ payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
1561
1601
  response = self._request("POST", "retrieve/docs", data=payload)
1562
1602
  return self._logic._parse_document_result_list_response(response)
1563
1603
 
@@ -1574,6 +1614,7 @@ class Morphik:
1574
1614
  hop_depth: int = 1,
1575
1615
  include_paths: bool = False,
1576
1616
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
1617
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
1577
1618
  ) -> CompletionResponse:
1578
1619
  """
1579
1620
  Generate completion using relevant chunks as context.
@@ -1585,12 +1626,14 @@ class Morphik:
1585
1626
  min_score: Minimum similarity threshold (default: 0.0)
1586
1627
  max_tokens: Maximum tokens in completion
1587
1628
  temperature: Model temperature
1588
- use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
1629
+ use_colpali: Whether to use ColPali-style embedding model to generate the completion
1630
+ (only works for documents ingested with `use_colpali=True`)
1589
1631
  graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
1590
1632
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
1591
1633
  include_paths: Whether to include relationship paths in the response
1592
1634
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
1593
1635
  Either a QueryPromptOverrides object or a dictionary with the same structure
1636
+ schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
1594
1637
  Returns:
1595
1638
  CompletionResponse
1596
1639
 
@@ -1638,6 +1681,27 @@ class Morphik:
1638
1681
  if response.metadata and "graph" in response.metadata:
1639
1682
  for path in response.metadata["graph"]["paths"]:
1640
1683
  print(" -> ".join(path))
1684
+
1685
+ # Using structured output with a Pydantic model
1686
+ from pydantic import BaseModel
1687
+
1688
+ class ResearchFindings(BaseModel):
1689
+ main_finding: str
1690
+ supporting_evidence: List[str]
1691
+ limitations: List[str]
1692
+
1693
+ response = db.query(
1694
+ "Summarize the key research findings from these documents",
1695
+ schema=ResearchFindings
1696
+ )
1697
+
1698
+ # Access structured output
1699
+ if response.structured_output:
1700
+ findings = response.structured_output
1701
+ print(f"Main finding: {findings.main_finding}")
1702
+ print("Supporting evidence:")
1703
+ for evidence in findings.supporting_evidence:
1704
+ print(f"- {evidence}")
1641
1705
  ```
1642
1706
  """
1643
1707
  payload = self._logic._prepare_query_request(
@@ -1654,7 +1718,20 @@ class Morphik:
1654
1718
  prompt_overrides,
1655
1719
  None,
1656
1720
  None,
1721
+ schema,
1657
1722
  )
1723
+
1724
+ # Add schema to payload if provided
1725
+ if schema:
1726
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
1727
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
1728
+ payload["schema"] = schema.model_json_schema()
1729
+ else:
1730
+ payload["schema"] = schema
1731
+
1732
+ # Add a hint to the query to return in JSON format
1733
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
1734
+
1658
1735
  response = self._request("POST", "query", data=payload)
1659
1736
  return self._logic._parse_completion_response(response)
1660
1737
 
@@ -1708,17 +1785,17 @@ class Morphik:
1708
1785
  doc = self._logic._parse_document_response(response)
1709
1786
  doc._client = self
1710
1787
  return doc
1711
-
1788
+
1712
1789
  def get_document_status(self, document_id: str) -> Dict[str, Any]:
1713
1790
  """
1714
1791
  Get the current processing status of a document.
1715
-
1792
+
1716
1793
  Args:
1717
1794
  document_id: ID of the document to check
1718
-
1795
+
1719
1796
  Returns:
1720
1797
  Dict[str, Any]: Status information including current status, potential errors, and other metadata
1721
-
1798
+
1722
1799
  Example:
1723
1800
  ```python
1724
1801
  status = db.get_document_status("doc_123")
@@ -1732,23 +1809,23 @@ class Morphik:
1732
1809
  """
1733
1810
  response = self._request("GET", f"documents/{document_id}/status")
1734
1811
  return response
1735
-
1812
+
1736
1813
  def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
1737
1814
  """
1738
1815
  Wait for a document's processing to complete.
1739
-
1816
+
1740
1817
  Args:
1741
1818
  document_id: ID of the document to wait for
1742
1819
  timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
1743
1820
  check_interval_seconds: Time between status checks (default: 2 seconds)
1744
-
1821
+
1745
1822
  Returns:
1746
1823
  Document: Updated document with the latest status
1747
-
1824
+
1748
1825
  Raises:
1749
1826
  TimeoutError: If processing doesn't complete within the timeout period
1750
1827
  ValueError: If processing fails with an error
1751
-
1828
+
1752
1829
  Example:
1753
1830
  ```python
1754
1831
  # Upload a file and wait for processing to complete
@@ -1763,20 +1840,21 @@ class Morphik:
1763
1840
  ```
1764
1841
  """
1765
1842
  import time
1843
+
1766
1844
  start_time = time.time()
1767
-
1845
+
1768
1846
  while (time.time() - start_time) < timeout_seconds:
1769
1847
  status = self.get_document_status(document_id)
1770
-
1848
+
1771
1849
  if status["status"] == "completed":
1772
1850
  # Get the full document now that it's complete
1773
1851
  return self.get_document(document_id)
1774
1852
  elif status["status"] == "failed":
1775
1853
  raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
1776
-
1854
+
1777
1855
  # Wait before checking again
1778
1856
  time.sleep(check_interval_seconds)
1779
-
1857
+
1780
1858
  raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
1781
1859
 
1782
1860
  def get_document_by_filename(self, filename: str) -> Document:
@@ -1930,9 +2008,7 @@ class Morphik:
1930
2008
  form_data["use_colpali"] = str(use_colpali).lower()
1931
2009
 
1932
2010
  # Use the dedicated file update endpoint
1933
- response = self._request(
1934
- "POST", f"documents/{document_id}/update_file", data=form_data, files=files
1935
- )
2011
+ response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
1936
2012
 
1937
2013
  doc = self._logic._parse_document_response(response)
1938
2014
  doc._client = self
@@ -2151,15 +2227,14 @@ class Morphik:
2151
2227
  print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
2152
2228
  ```
2153
2229
  """
2154
- response = self._request("POST", "batch/documents", data=document_ids)
2230
+ # API expects a dict with document_ids key, not a direct list
2231
+ response = self._request("POST", "batch/documents", data={"document_ids": document_ids})
2155
2232
  docs = self._logic._parse_document_list_response(response)
2156
2233
  for doc in docs:
2157
2234
  doc._client = self
2158
2235
  return docs
2159
2236
 
2160
- def batch_get_chunks(
2161
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
2162
- ) -> List[FinalChunkResult]:
2237
+ def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
2163
2238
  """
2164
2239
  Retrieve specific chunks by their document ID and chunk number in a single batch operation.
2165
2240
 
@@ -2215,8 +2290,10 @@ class Morphik:
2215
2290
  name: Name of the cache to create
2216
2291
  model: Name of the model to use (e.g. "llama2")
2217
2292
  gguf_file: Name of the GGUF file to use for the model
2218
- filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
2219
- docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.
2293
+ filters: Optional metadata filters to determine which documents to include.
2294
+ These filters will be applied in addition to any specific docs provided.
2295
+ docs: Optional list of specific document IDs to include.
2296
+ These docs will be included in addition to any documents matching the filters.
2220
2297
 
2221
2298
  Returns:
2222
2299
  Dict[str, Any]: Created cache configuration
@@ -2321,12 +2398,16 @@ class Morphik:
2321
2398
  if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
2322
2399
  prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
2323
2400
 
2324
- request = {
2325
- "name": name,
2326
- "filters": filters,
2327
- "documents": documents,
2328
- "prompt_overrides": prompt_overrides,
2329
- }
2401
+ # Initialize request with required fields
2402
+ request = {"name": name}
2403
+
2404
+ # Add optional fields only if they are not None
2405
+ if filters is not None:
2406
+ request["filters"] = filters
2407
+ if documents is not None:
2408
+ request["documents"] = documents
2409
+ if prompt_overrides is not None:
2410
+ request["prompt_overrides"] = prompt_overrides
2330
2411
 
2331
2412
  response = self._request("POST", "graph/create", request)
2332
2413
  return self._logic._parse_graph_response(response)