morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/sync.py CHANGED
@@ -2,27 +2,23 @@ import json
2
2
  import logging
3
3
  from io import BytesIO, IOBase
4
4
  from pathlib import Path
5
- from typing import Dict, Any, List, Optional, Union, BinaryIO
6
-
7
- from PIL import Image
8
- from PIL.Image import Image as PILImage
5
+ from typing import Any, BinaryIO, Dict, List, Optional, Type, Union
9
6
 
10
7
  import httpx
8
+ from pydantic import BaseModel
11
9
 
10
+ from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
12
11
  from .models import (
12
+ ChunkSource,
13
+ CompletionResponse, # Prompt override models
13
14
  Document,
14
15
  DocumentResult,
15
- CompletionResponse,
16
- IngestTextRequest,
17
- ChunkSource,
18
- Graph,
19
16
  FolderInfo,
20
- # Prompt override models
17
+ Graph,
21
18
  GraphPromptOverrides,
19
+ IngestTextRequest,
22
20
  QueryPromptOverrides,
23
21
  )
24
- from .rules import Rule
25
- from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
26
22
 
27
23
  logger = logging.getLogger(__name__)
28
24
 
@@ -71,16 +67,16 @@ class Folder:
71
67
  def name(self) -> str:
72
68
  """Returns the folder name."""
73
69
  return self._name
74
-
70
+
75
71
  @property
76
72
  def id(self) -> Optional[str]:
77
73
  """Returns the folder ID if available."""
78
74
  return self._id
79
-
75
+
80
76
  def get_info(self) -> Dict[str, Any]:
81
77
  """
82
78
  Get detailed information about this folder.
83
-
79
+
84
80
  Returns:
85
81
  Dict[str, Any]: Detailed folder information
86
82
  """
@@ -93,9 +89,8 @@ class Folder:
93
89
  break
94
90
  if not self._id:
95
91
  raise ValueError(f"Folder '{self._name}' not found")
96
-
92
+
97
93
  return self._client._request("GET", f"folders/{self._id}")
98
-
99
94
 
100
95
  def signin(self, end_user_id: str) -> "UserScope":
101
96
  """
@@ -168,9 +163,7 @@ class Folder:
168
163
  files = {"file": (filename, file_obj)}
169
164
 
170
165
  # Create form data
171
- form_data = self._client._logic._prepare_ingest_file_form_data(
172
- metadata, rules, self._name, None
173
- )
166
+ form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)
174
167
 
175
168
  # use_colpali should be a query parameter as defined in the API
176
169
  response = self._client._request(
@@ -219,9 +212,9 @@ class Folder:
219
212
  )
220
213
 
221
214
  response = self._client._request(
222
- "POST",
223
- "ingest/files",
224
- data=data,
215
+ "POST",
216
+ "ingest/files",
217
+ data=data,
225
218
  files=file_objects,
226
219
  params={"use_colpali": str(use_colpali).lower()},
227
220
  )
@@ -231,9 +224,7 @@ class Folder:
231
224
  for error in response["errors"]:
232
225
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
233
226
 
234
- docs = [
235
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
236
- ]
227
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
237
228
  for doc in docs:
238
229
  doc._client = self._client
239
230
  return docs
@@ -368,6 +359,7 @@ class Folder:
368
359
  hop_depth: int = 1,
369
360
  include_paths: bool = False,
370
361
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
362
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
371
363
  ) -> CompletionResponse:
372
364
  """
373
365
  Generate completion using relevant chunks as context within this folder.
@@ -384,6 +376,7 @@ class Folder:
384
376
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
385
377
  include_paths: Whether to include relationship paths in the response
386
378
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
379
+ schema: Optional schema for structured output
387
380
 
388
381
  Returns:
389
382
  CompletionResponse: Generated completion
@@ -402,7 +395,20 @@ class Folder:
402
395
  prompt_overrides,
403
396
  self._name,
404
397
  None,
398
+ schema,
405
399
  )
400
+
401
+ # Add schema to payload if provided
402
+ if schema:
403
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
404
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
405
+ payload["schema"] = schema.model_json_schema()
406
+ else:
407
+ payload["schema"] = schema
408
+
409
+ # Add a hint to the query to return in JSON format
410
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
411
+
406
412
  response = self._client._request("POST", "query", data=payload)
407
413
  return self._client._logic._parse_completion_response(response)
408
414
 
@@ -420,9 +426,7 @@ class Folder:
420
426
  Returns:
421
427
  List[Document]: List of documents
422
428
  """
423
- params, data = self._client._logic._prepare_list_documents_request(
424
- skip, limit, filters, self._name, None
425
- )
429
+ params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
426
430
  response = self._client._request("POST", "documents", data=data, params=params)
427
431
  docs = self._client._logic._parse_document_list_response(response)
428
432
  for doc in docs:
@@ -447,9 +451,7 @@ class Folder:
447
451
  doc._client = self._client
448
452
  return docs
449
453
 
450
- def batch_get_chunks(
451
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
452
- ) -> List[FinalChunkResult]:
454
+ def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
453
455
  """
454
456
  Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
455
457
 
@@ -550,13 +552,8 @@ class Folder:
550
552
  Returns:
551
553
  Dict[str, str]: Deletion status
552
554
  """
553
- # Get the document by filename with folder scope
554
- request = {"filename": filename, "folder_name": self._name}
555
-
556
555
  # First get the document ID
557
- response = self._client._request(
558
- "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
559
- )
556
+ response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
560
557
  doc = self._client._logic._parse_document_response(response)
561
558
 
562
559
  # Then delete by ID
@@ -677,7 +674,7 @@ class UserScope:
677
674
  # Add folder name if scoped to a folder
678
675
  if self._folder_name:
679
676
  form_data["folder_name"] = self._folder_name
680
-
677
+
681
678
  # use_colpali should be a query parameter as defined in the API
682
679
  response = self._client._request(
683
680
  "POST",
@@ -732,9 +729,7 @@ class UserScope:
732
729
  if rules:
733
730
  if all(isinstance(r, list) for r in rules):
734
731
  # List of lists - per-file rules
735
- converted_rules = [
736
- [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
737
- ]
732
+ converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
738
733
  else:
739
734
  # Flat list - shared rules for all files
740
735
  converted_rules = [self._client._convert_rule(r) for r in rules]
@@ -754,9 +749,9 @@ class UserScope:
754
749
  data["folder_name"] = self._folder_name
755
750
 
756
751
  response = self._client._request(
757
- "POST",
758
- "ingest/files",
759
- data=data,
752
+ "POST",
753
+ "ingest/files",
754
+ data=data,
760
755
  files=file_objects,
761
756
  params={"use_colpali": str(use_colpali).lower()},
762
757
  )
@@ -766,9 +761,7 @@ class UserScope:
766
761
  for error in response["errors"]:
767
762
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
768
763
 
769
- docs = [
770
- self._client._logic._parse_document_response(doc) for doc in response["documents"]
771
- ]
764
+ docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
772
765
  for doc in docs:
773
766
  doc._client = self._client
774
767
  return docs
@@ -911,6 +904,7 @@ class UserScope:
911
904
  hop_depth: int = 1,
912
905
  include_paths: bool = False,
913
906
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
907
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
914
908
  ) -> CompletionResponse:
915
909
  """
916
910
  Generate completion using relevant chunks as context as this end user.
@@ -927,6 +921,7 @@ class UserScope:
927
921
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
928
922
  include_paths: Whether to include relationship paths in the response
929
923
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
924
+ schema: Optional schema for structured output
930
925
 
931
926
  Returns:
932
927
  CompletionResponse: Generated completion
@@ -945,7 +940,20 @@ class UserScope:
945
940
  prompt_overrides,
946
941
  self._folder_name,
947
942
  self._end_user_id,
943
+ schema,
948
944
  )
945
+
946
+ # Add schema to payload if provided
947
+ if schema:
948
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
949
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
950
+ payload["schema"] = schema.model_json_schema()
951
+ else:
952
+ payload["schema"] = schema
953
+
954
+ # Add a hint to the query to return in JSON format
955
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
956
+
949
957
  response = self._client._request("POST", "query", data=payload)
950
958
  return self._client._logic._parse_completion_response(response)
951
959
 
@@ -970,7 +978,7 @@ class UserScope:
970
978
  if self._folder_name:
971
979
  params["folder_name"] = self._folder_name
972
980
 
973
- response = self._client._request("POST", f"documents", data=filters or {}, params=params)
981
+ response = self._client._request("POST", "documents", data=filters or {}, params=params)
974
982
 
975
983
  docs = [self._client._logic._parse_document_response(doc) for doc in response]
976
984
  for doc in docs:
@@ -999,9 +1007,7 @@ class UserScope:
999
1007
  doc._client = self._client
1000
1008
  return docs
1001
1009
 
1002
- def batch_get_chunks(
1003
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
1004
- ) -> List[FinalChunkResult]:
1010
+ def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
1005
1011
  """
1006
1012
  Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
1007
1013
 
@@ -1173,12 +1179,12 @@ class Morphik:
1173
1179
  # Remove Content-Type if it exists - httpx will set the correct multipart boundary
1174
1180
  if "Content-Type" in headers:
1175
1181
  del headers["Content-Type"]
1176
-
1182
+
1177
1183
  # For file uploads with form data, use form data (not json)
1178
1184
  request_data = {"files": files}
1179
1185
  if data:
1180
1186
  request_data["data"] = data
1181
-
1187
+
1182
1188
  # Files are now properly handled
1183
1189
  else:
1184
1190
  # JSON for everything else
@@ -1192,8 +1198,13 @@ class Morphik:
1192
1198
  params=params,
1193
1199
  **request_data,
1194
1200
  )
1195
- response.raise_for_status()
1196
- return response.json()
1201
+ try:
1202
+ response.raise_for_status()
1203
+ return response.json()
1204
+ except httpx.HTTPStatusError as e:
1205
+ # Print error response for debugging
1206
+ print(f"Error response: {e.response.status_code} - {e.response.text}")
1207
+ raise
1197
1208
 
1198
1209
  def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
1199
1210
  """Convert a rule to a dictionary format"""
@@ -1210,18 +1221,16 @@ class Morphik:
1210
1221
  Returns:
1211
1222
  Folder: A folder object ready for scoped operations
1212
1223
  """
1213
- payload = {
1214
- "name": name
1215
- }
1224
+ payload = {"name": name}
1216
1225
  if description:
1217
1226
  payload["description"] = description
1218
-
1227
+
1219
1228
  response = self._request("POST", "folders", data=payload)
1220
1229
  folder_info = FolderInfo(**response)
1221
-
1230
+
1222
1231
  # Return a usable Folder object with the ID from the response
1223
1232
  return Folder(self, name, folder_id=folder_info.id)
1224
-
1233
+
1225
1234
  def get_folder_by_name(self, name: str) -> Folder:
1226
1235
  """
1227
1236
  Get a folder by name to scope operations.
@@ -1233,7 +1242,7 @@ class Morphik:
1233
1242
  Folder: A folder object for scoped operations
1234
1243
  """
1235
1244
  return Folder(self, name)
1236
-
1245
+
1237
1246
  def get_folder(self, folder_id: str) -> Folder:
1238
1247
  """
1239
1248
  Get a folder by ID.
@@ -1250,13 +1259,13 @@ class Morphik:
1250
1259
  def list_folders(self) -> List[Folder]:
1251
1260
  """
1252
1261
  List all folders the user has access to as Folder objects.
1253
-
1262
+
1254
1263
  Returns:
1255
1264
  List[Folder]: List of Folder objects ready for operations
1256
1265
  """
1257
1266
  folder_infos = self._request("GET", "folders")
1258
1267
  return [Folder(self, info["name"], info["id"]) for info in folder_infos]
1259
-
1268
+
1260
1269
  def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1261
1270
  """
1262
1271
  Add a document to a folder.
@@ -1270,7 +1279,7 @@ class Morphik:
1270
1279
  """
1271
1280
  response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
1272
1281
  return response
1273
-
1282
+
1274
1283
  def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1275
1284
  """
1276
1285
  Remove a document from a folder.
@@ -1314,7 +1323,8 @@ class Morphik:
1314
1323
  rules: Optional list of rules to apply during ingestion. Can be:
1315
1324
  - MetadataExtractionRule: Extract metadata using a schema
1316
1325
  - NaturalLanguageRule: Transform content using natural language
1317
- use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images)
1326
+ use_colpali: Whether to use ColPali-style embedding model to ingest the text
1327
+ (slower, but significantly better retrieval accuracy for text and images)
1318
1328
  Returns:
1319
1329
  Document: Metadata of the ingested document
1320
1330
 
@@ -1367,7 +1377,8 @@ class Morphik:
1367
1377
  rules: Optional list of rules to apply during ingestion. Can be:
1368
1378
  - MetadataExtractionRule: Extract metadata using a schema
1369
1379
  - NaturalLanguageRule: Transform content using natural language
1370
- use_colpali: Whether to use ColPali-style embedding model to ingest the file (slower, but significantly better retrieval accuracy for images)
1380
+ use_colpali: Whether to use ColPali-style embedding model to ingest the file
1381
+ (slower, but significantly better retrieval accuracy for images)
1371
1382
 
1372
1383
  Returns:
1373
1384
  Document: Metadata of the ingested document
@@ -1450,14 +1461,12 @@ class Morphik:
1450
1461
  try:
1451
1462
  # Prepare form data
1452
1463
  # Prepare form data - use_colpali should be a query parameter, not form data
1453
- data = self._logic._prepare_ingest_files_form_data(
1454
- metadata, rules, use_colpali, parallel, None, None
1455
- )
1464
+ data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)
1456
1465
 
1457
1466
  response = self._request(
1458
- "POST",
1459
- "ingest/files",
1460
- data=data,
1467
+ "POST",
1468
+ "ingest/files",
1469
+ data=data,
1461
1470
  files=file_objects,
1462
1471
  params={"use_colpali": str(use_colpali).lower()},
1463
1472
  )
@@ -1542,7 +1551,8 @@ class Morphik:
1542
1551
  filters: Optional metadata filters
1543
1552
  k: Number of results (default: 4)
1544
1553
  min_score: Minimum similarity threshold (default: 0.0)
1545
- use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks (only works for documents ingested with `use_colpali=True`)
1554
+ use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
1555
+ (only works for documents ingested with `use_colpali=True`)
1546
1556
  Returns:
1547
1557
  List[ChunkResult]
1548
1558
 
@@ -1554,9 +1564,7 @@ class Morphik:
1554
1564
  )
1555
1565
  ```
1556
1566
  """
1557
- payload = self._logic._prepare_retrieve_chunks_request(
1558
- query, filters, k, min_score, use_colpali, None, None
1559
- )
1567
+ payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
1560
1568
  response = self._request("POST", "retrieve/chunks", data=payload)
1561
1569
  return self._logic._parse_chunk_result_list_response(response)
1562
1570
 
@@ -1576,7 +1584,8 @@ class Morphik:
1576
1584
  filters: Optional metadata filters
1577
1585
  k: Number of results (default: 4)
1578
1586
  min_score: Minimum similarity threshold (default: 0.0)
1579
- use_colpali: Whether to use ColPali-style embedding model to retrieve the documents (only works for documents ingested with `use_colpali=True`)
1587
+ use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
1588
+ (only works for documents ingested with `use_colpali=True`)
1580
1589
  Returns:
1581
1590
  List[DocumentResult]
1582
1591
 
@@ -1588,9 +1597,7 @@ class Morphik:
1588
1597
  )
1589
1598
  ```
1590
1599
  """
1591
- payload = self._logic._prepare_retrieve_docs_request(
1592
- query, filters, k, min_score, use_colpali, None, None
1593
- )
1600
+ payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
1594
1601
  response = self._request("POST", "retrieve/docs", data=payload)
1595
1602
  return self._logic._parse_document_result_list_response(response)
1596
1603
 
@@ -1607,6 +1614,7 @@ class Morphik:
1607
1614
  hop_depth: int = 1,
1608
1615
  include_paths: bool = False,
1609
1616
  prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
1617
+ schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
1610
1618
  ) -> CompletionResponse:
1611
1619
  """
1612
1620
  Generate completion using relevant chunks as context.
@@ -1618,12 +1626,14 @@ class Morphik:
1618
1626
  min_score: Minimum similarity threshold (default: 0.0)
1619
1627
  max_tokens: Maximum tokens in completion
1620
1628
  temperature: Model temperature
1621
- use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
1629
+ use_colpali: Whether to use ColPali-style embedding model to generate the completion
1630
+ (only works for documents ingested with `use_colpali=True`)
1622
1631
  graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
1623
1632
  hop_depth: Number of relationship hops to traverse in the graph (1-3)
1624
1633
  include_paths: Whether to include relationship paths in the response
1625
1634
  prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
1626
1635
  Either a QueryPromptOverrides object or a dictionary with the same structure
1636
+ schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
1627
1637
  Returns:
1628
1638
  CompletionResponse
1629
1639
 
@@ -1671,6 +1681,27 @@ class Morphik:
1671
1681
  if response.metadata and "graph" in response.metadata:
1672
1682
  for path in response.metadata["graph"]["paths"]:
1673
1683
  print(" -> ".join(path))
1684
+
1685
+ # Using structured output with a Pydantic model
1686
+ from pydantic import BaseModel
1687
+
1688
+ class ResearchFindings(BaseModel):
1689
+ main_finding: str
1690
+ supporting_evidence: List[str]
1691
+ limitations: List[str]
1692
+
1693
+ response = db.query(
1694
+ "Summarize the key research findings from these documents",
1695
+ schema=ResearchFindings
1696
+ )
1697
+
1698
+ # Access structured output
1699
+ if response.structured_output:
1700
+ findings = response.structured_output
1701
+ print(f"Main finding: {findings.main_finding}")
1702
+ print("Supporting evidence:")
1703
+ for evidence in findings.supporting_evidence:
1704
+ print(f"- {evidence}")
1674
1705
  ```
1675
1706
  """
1676
1707
  payload = self._logic._prepare_query_request(
@@ -1687,7 +1718,20 @@ class Morphik:
1687
1718
  prompt_overrides,
1688
1719
  None,
1689
1720
  None,
1721
+ schema,
1690
1722
  )
1723
+
1724
+ # Add schema to payload if provided
1725
+ if schema:
1726
+ # If schema is a Pydantic model class, we need to serialize it to a schema dict
1727
+ if isinstance(schema, type) and issubclass(schema, BaseModel):
1728
+ payload["schema"] = schema.model_json_schema()
1729
+ else:
1730
+ payload["schema"] = schema
1731
+
1732
+ # Add a hint to the query to return in JSON format
1733
+ payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."
1734
+
1691
1735
  response = self._request("POST", "query", data=payload)
1692
1736
  return self._logic._parse_completion_response(response)
1693
1737
 
@@ -1741,17 +1785,17 @@ class Morphik:
1741
1785
  doc = self._logic._parse_document_response(response)
1742
1786
  doc._client = self
1743
1787
  return doc
1744
-
1788
+
1745
1789
  def get_document_status(self, document_id: str) -> Dict[str, Any]:
1746
1790
  """
1747
1791
  Get the current processing status of a document.
1748
-
1792
+
1749
1793
  Args:
1750
1794
  document_id: ID of the document to check
1751
-
1795
+
1752
1796
  Returns:
1753
1797
  Dict[str, Any]: Status information including current status, potential errors, and other metadata
1754
-
1798
+
1755
1799
  Example:
1756
1800
  ```python
1757
1801
  status = db.get_document_status("doc_123")
@@ -1765,23 +1809,23 @@ class Morphik:
1765
1809
  """
1766
1810
  response = self._request("GET", f"documents/{document_id}/status")
1767
1811
  return response
1768
-
1812
+
1769
1813
  def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
1770
1814
  """
1771
1815
  Wait for a document's processing to complete.
1772
-
1816
+
1773
1817
  Args:
1774
1818
  document_id: ID of the document to wait for
1775
1819
  timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
1776
1820
  check_interval_seconds: Time between status checks (default: 2 seconds)
1777
-
1821
+
1778
1822
  Returns:
1779
1823
  Document: Updated document with the latest status
1780
-
1824
+
1781
1825
  Raises:
1782
1826
  TimeoutError: If processing doesn't complete within the timeout period
1783
1827
  ValueError: If processing fails with an error
1784
-
1828
+
1785
1829
  Example:
1786
1830
  ```python
1787
1831
  # Upload a file and wait for processing to complete
@@ -1796,20 +1840,21 @@ class Morphik:
1796
1840
  ```
1797
1841
  """
1798
1842
  import time
1843
+
1799
1844
  start_time = time.time()
1800
-
1845
+
1801
1846
  while (time.time() - start_time) < timeout_seconds:
1802
1847
  status = self.get_document_status(document_id)
1803
-
1848
+
1804
1849
  if status["status"] == "completed":
1805
1850
  # Get the full document now that it's complete
1806
1851
  return self.get_document(document_id)
1807
1852
  elif status["status"] == "failed":
1808
1853
  raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
1809
-
1854
+
1810
1855
  # Wait before checking again
1811
1856
  time.sleep(check_interval_seconds)
1812
-
1857
+
1813
1858
  raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
1814
1859
 
1815
1860
  def get_document_by_filename(self, filename: str) -> Document:
@@ -1963,9 +2008,7 @@ class Morphik:
1963
2008
  form_data["use_colpali"] = str(use_colpali).lower()
1964
2009
 
1965
2010
  # Use the dedicated file update endpoint
1966
- response = self._request(
1967
- "POST", f"documents/{document_id}/update_file", data=form_data, files=files
1968
- )
2011
+ response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)
1969
2012
 
1970
2013
  doc = self._logic._parse_document_response(response)
1971
2014
  doc._client = self
@@ -2191,9 +2234,7 @@ class Morphik:
2191
2234
  doc._client = self
2192
2235
  return docs
2193
2236
 
2194
- def batch_get_chunks(
2195
- self, sources: List[Union[ChunkSource, Dict[str, Any]]]
2196
- ) -> List[FinalChunkResult]:
2237
+ def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
2197
2238
  """
2198
2239
  Retrieve specific chunks by their document ID and chunk number in a single batch operation.
2199
2240
 
@@ -2249,8 +2290,10 @@ class Morphik:
2249
2290
  name: Name of the cache to create
2250
2291
  model: Name of the model to use (e.g. "llama2")
2251
2292
  gguf_file: Name of the GGUF file to use for the model
2252
- filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
2253
- docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.
2293
+ filters: Optional metadata filters to determine which documents to include.
2294
+ These filters will be applied in addition to any specific docs provided.
2295
+ docs: Optional list of specific document IDs to include.
2296
+ These docs will be included in addition to any documents matching the filters.
2254
2297
 
2255
2298
  Returns:
2256
2299
  Dict[str, Any]: Created cache configuration
@@ -2355,12 +2398,16 @@ class Morphik:
2355
2398
  if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
2356
2399
  prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
2357
2400
 
2358
- request = {
2359
- "name": name,
2360
- "filters": filters,
2361
- "documents": documents,
2362
- "prompt_overrides": prompt_overrides,
2363
- }
2401
+ # Initialize request with required fields
2402
+ request = {"name": name}
2403
+
2404
+ # Add optional fields only if they are not None
2405
+ if filters is not None:
2406
+ request["filters"] = filters
2407
+ if documents is not None:
2408
+ request["documents"] = documents
2409
+ if prompt_overrides is not None:
2410
+ request["prompt_overrides"] = prompt_overrides
2364
2411
 
2365
2412
  response = self._request("POST", "graph/create", request)
2366
2413
  return self._logic._parse_graph_response(response)
morphik/tests/README.md CHANGED
@@ -38,4 +38,4 @@ python example_usage.py --async
38
38
  ## Environment Variables
39
39
 
40
40
  - `MORPHIK_TEST_URL` - The URL of the Morphik server to use for tests (default: http://localhost:8000)
41
- - `SKIP_LIVE_TESTS` - Set to "1" to skip tests that require a running server
41
+ - `SKIP_LIVE_TESTS` - Set to "1" to skip tests that require a running server