morphik 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
morphik/__init__.py CHANGED
@@ -12,4 +12,4 @@ __all__ = [
12
12
  "Document",
13
13
  ]
14
14
 
15
- __version__ = "0.1.2"
15
+ __version__ = "0.1.4"
morphik/_internal.py CHANGED
@@ -211,7 +211,7 @@ class _MorphikClientLogic:
211
211
  data = {
212
212
  "metadata": json.dumps(metadata or {}),
213
213
  "rules": json.dumps(converted_rules),
214
- "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
214
+ # use_colpali is a query parameter, not a form field
215
215
  "parallel": str(parallel).lower(),
216
216
  }
217
217
 
morphik/async_.py CHANGED
@@ -14,6 +14,7 @@ from .models import (
14
14
  IngestTextRequest,
15
15
  ChunkSource,
16
16
  Graph,
17
+ FolderInfo,
17
18
  # Prompt override models
18
19
  GraphPromptOverrides,
19
20
  QueryPromptOverrides,
@@ -56,16 +57,43 @@ class AsyncFolder:
56
57
  Args:
57
58
  client: The AsyncMorphik client instance
58
59
  name: The name of the folder
60
+ folder_id: Optional folder ID (if already known)
59
61
  """
60
62
 
61
- def __init__(self, client: "AsyncMorphik", name: str):
63
+ def __init__(self, client: "AsyncMorphik", name: str, folder_id: Optional[str] = None):
62
64
  self._client = client
63
65
  self._name = name
66
+ self._id = folder_id
64
67
 
65
68
  @property
66
69
  def name(self) -> str:
67
70
  """Returns the folder name."""
68
71
  return self._name
72
+
73
+ @property
74
+ def id(self) -> Optional[str]:
75
+ """Returns the folder ID if available."""
76
+ return self._id
77
+
78
+ async def get_info(self) -> Dict[str, Any]:
79
+ """
80
+ Get detailed information about this folder.
81
+
82
+ Returns:
83
+ Dict[str, Any]: Detailed folder information
84
+ """
85
+ if not self._id:
86
+ # If we don't have the ID, find the folder by name first
87
+ folders = await self._client.list_folders()
88
+ for folder in folders:
89
+ if folder.name == self._name:
90
+ self._id = folder.id
91
+ break
92
+ if not self._id:
93
+ raise ValueError(f"Folder '{self._name}' not found")
94
+
95
+ return await self._client._request("GET", f"folders/{self._id}")
96
+
69
97
 
70
98
  def signin(self, end_user_id: str) -> "AsyncUserScope":
71
99
  """
@@ -144,9 +172,10 @@ class AsyncFolder:
144
172
 
145
173
  response = await self._client._request(
146
174
  "POST",
147
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
175
+ "ingest/file",
148
176
  data=form_data,
149
177
  files=files,
178
+ params={"use_colpali": str(use_colpali).lower()},
150
179
  )
151
180
  doc = self._client._logic._parse_document_response(response)
152
181
  doc._client = self._client
@@ -187,7 +216,11 @@ class AsyncFolder:
187
216
  )
188
217
 
189
218
  response = await self._client._request(
190
- "POST", "ingest/files", data=data, files=file_objects
219
+ "POST",
220
+ "ingest/files",
221
+ data=data,
222
+ files=file_objects,
223
+ params={"use_colpali": str(use_colpali).lower()},
191
224
  )
192
225
 
193
226
  if response.get("errors"):
@@ -391,9 +424,10 @@ class AsyncFolder:
391
424
  Returns:
392
425
  List[Document]: List of document metadata for found documents
393
426
  """
394
- request = self._client._logic._prepare_batch_get_documents_request(
395
- document_ids, self._name, None
396
- )
427
+ # API expects a dict with document_ids key
428
+ request = {"document_ids": document_ids}
429
+ if self._name:
430
+ request["folder_name"] = self._name
397
431
  response = await self._client._request("POST", "batch/documents", data=request)
398
432
  docs = self._client._logic._parse_document_list_response(response)
399
433
  for doc in docs:
@@ -673,7 +707,11 @@ class AsyncUserScope:
673
707
  data["folder_name"] = self._folder_name
674
708
 
675
709
  response = await self._client._request(
676
- "POST", "ingest/files", data=data, files=file_objects
710
+ "POST",
711
+ "ingest/files",
712
+ data=data,
713
+ files=file_objects,
714
+ params={"use_colpali": str(use_colpali).lower()},
677
715
  )
678
716
 
679
717
  if response.get("errors"):
@@ -877,9 +915,12 @@ class AsyncUserScope:
877
915
  Returns:
878
916
  List[Document]: List of document metadata for found documents
879
917
  """
880
- request = self._client._logic._prepare_batch_get_documents_request(
881
- document_ids, self._folder_name, self._end_user_id
882
- )
918
+ # API expects a dict with document_ids key
919
+ request = {"document_ids": document_ids}
920
+ if self._end_user_id:
921
+ request["end_user_id"] = self._end_user_id
922
+ if self._folder_name:
923
+ request["folder_name"] = self._folder_name
883
924
  response = await self._client._request("POST", "batch/documents", data=request)
884
925
  docs = self._client._logic._parse_document_list_response(response)
885
926
  for doc in docs:
@@ -1032,9 +1073,15 @@ class AsyncMorphik:
1032
1073
 
1033
1074
  # Configure request data based on type
1034
1075
  if files:
1035
- # Multipart form data for files
1036
- request_data = {"files": files, "data": data}
1037
- # Don't set Content-Type, let httpx handle it
1076
+ # When uploading files, we need to make sure not to set Content-Type
1077
+ # Remove Content-Type if it exists - httpx will set the correct multipart boundary
1078
+ if "Content-Type" in headers:
1079
+ del headers["Content-Type"]
1080
+
1081
+ # For file uploads with form data, use form data (not json)
1082
+ request_data = {"files": files}
1083
+ if data:
1084
+ request_data["data"] = data
1038
1085
  else:
1039
1086
  # JSON for everything else
1040
1087
  headers["Content-Type"] = "application/json"
@@ -1054,19 +1101,30 @@ class AsyncMorphik:
1054
1101
  """Convert a rule to a dictionary format"""
1055
1102
  return self._logic._convert_rule(rule)
1056
1103
 
1057
- def create_folder(self, name: str) -> AsyncFolder:
1104
+ async def create_folder(self, name: str, description: Optional[str] = None) -> AsyncFolder:
1058
1105
  """
1059
1106
  Create a folder to scope operations.
1060
1107
 
1061
1108
  Args:
1062
1109
  name: The name of the folder
1110
+ description: Optional description for the folder
1063
1111
 
1064
1112
  Returns:
1065
- AsyncFolder: A folder object for scoped operations
1066
- """
1067
- return AsyncFolder(self, name)
1068
-
1069
- def get_folder(self, name: str) -> AsyncFolder:
1113
+ AsyncFolder: A folder object ready for scoped operations
1114
+ """
1115
+ payload = {
1116
+ "name": name
1117
+ }
1118
+ if description:
1119
+ payload["description"] = description
1120
+
1121
+ response = await self._request("POST", "folders", data=payload)
1122
+ folder_info = FolderInfo(**response)
1123
+
1124
+ # Return a usable AsyncFolder object with the ID from the response
1125
+ return AsyncFolder(self, name, folder_id=folder_info.id)
1126
+
1127
+ def get_folder_by_name(self, name: str) -> AsyncFolder:
1070
1128
  """
1071
1129
  Get a folder by name to scope operations.
1072
1130
 
@@ -1077,6 +1135,57 @@ class AsyncMorphik:
1077
1135
  AsyncFolder: A folder object for scoped operations
1078
1136
  """
1079
1137
  return AsyncFolder(self, name)
1138
+
1139
+ async def get_folder(self, folder_id: str) -> AsyncFolder:
1140
+ """
1141
+ Get a folder by ID.
1142
+
1143
+ Args:
1144
+ folder_id: ID of the folder
1145
+
1146
+ Returns:
1147
+ AsyncFolder: A folder object for scoped operations
1148
+ """
1149
+ response = await self._request("GET", f"folders/{folder_id}")
1150
+ return AsyncFolder(self, response["name"], folder_id)
1151
+
1152
+ async def list_folders(self) -> List[AsyncFolder]:
1153
+ """
1154
+ List all folders the user has access to as AsyncFolder objects.
1155
+
1156
+ Returns:
1157
+ List[AsyncFolder]: List of AsyncFolder objects ready for operations
1158
+ """
1159
+ response = await self._request("GET", "folders")
1160
+ return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
1161
+
1162
+ async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1163
+ """
1164
+ Add a document to a folder.
1165
+
1166
+ Args:
1167
+ folder_id: ID of the folder
1168
+ document_id: ID of the document
1169
+
1170
+ Returns:
1171
+ Dict[str, str]: Success status
1172
+ """
1173
+ response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
1174
+ return response
1175
+
1176
+ async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1177
+ """
1178
+ Remove a document from a folder.
1179
+
1180
+ Args:
1181
+ folder_id: ID of the folder
1182
+ document_id: ID of the document
1183
+
1184
+ Returns:
1185
+ Dict[str, str]: Success status
1186
+ """
1187
+ response = await self._request("DELETE", f"folders/{folder_id}/documents/{document_id}")
1188
+ return response
1080
1189
 
1081
1190
  def signin(self, end_user_id: str) -> AsyncUserScope:
1082
1191
  """
@@ -1163,9 +1272,10 @@ class AsyncMorphik:
1163
1272
 
1164
1273
  response = await self._request(
1165
1274
  "POST",
1166
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
1275
+ "ingest/file",
1167
1276
  data=form_data,
1168
1277
  files=files,
1278
+ params={"use_colpali": str(use_colpali).lower()},
1169
1279
  )
1170
1280
  doc = self._logic._parse_document_response(response)
1171
1281
  doc._client = self
@@ -1208,7 +1318,13 @@ class AsyncMorphik:
1208
1318
  metadata, rules, use_colpali, parallel, None, None
1209
1319
  )
1210
1320
 
1211
- response = await self._request("POST", "ingest/files", data=data, files=file_objects)
1321
+ response = await self._request(
1322
+ "POST",
1323
+ "ingest/files",
1324
+ data=data,
1325
+ files=file_objects,
1326
+ params={"use_colpali": str(use_colpali).lower()},
1327
+ )
1212
1328
 
1213
1329
  if response.get("errors"):
1214
1330
  # Log errors but don't raise exception
@@ -1216,7 +1332,7 @@ class AsyncMorphik:
1216
1332
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
1217
1333
 
1218
1334
  # Parse the documents from the response
1219
- docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
1335
+ docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
1220
1336
  for doc in docs:
1221
1337
  doc._client = self
1222
1338
  return docs
@@ -1490,6 +1606,76 @@ class AsyncMorphik:
1490
1606
  doc = self._logic._parse_document_response(response)
1491
1607
  doc._client = self
1492
1608
  return doc
1609
+
1610
+ async def get_document_status(self, document_id: str) -> Dict[str, Any]:
1611
+ """
1612
+ Get the current processing status of a document.
1613
+
1614
+ Args:
1615
+ document_id: ID of the document to check
1616
+
1617
+ Returns:
1618
+ Dict[str, Any]: Status information including current status, potential errors, and other metadata
1619
+
1620
+ Example:
1621
+ ```python
1622
+ status = await db.get_document_status("doc_123")
1623
+ if status["status"] == "completed":
1624
+ print("Document processing complete")
1625
+ elif status["status"] == "failed":
1626
+ print(f"Processing failed: {status['error']}")
1627
+ else:
1628
+ print("Document still processing...")
1629
+ ```
1630
+ """
1631
+ response = await self._request("GET", f"documents/{document_id}/status")
1632
+ return response
1633
+
1634
+ async def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
1635
+ """
1636
+ Wait for a document's processing to complete.
1637
+
1638
+ Args:
1639
+ document_id: ID of the document to wait for
1640
+ timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
1641
+ check_interval_seconds: Time between status checks (default: 2 seconds)
1642
+
1643
+ Returns:
1644
+ Document: Updated document with the latest status
1645
+
1646
+ Raises:
1647
+ TimeoutError: If processing doesn't complete within the timeout period
1648
+ ValueError: If processing fails with an error
1649
+
1650
+ Example:
1651
+ ```python
1652
+ # Upload a file and wait for processing to complete
1653
+ doc = await db.ingest_file("large_document.pdf")
1654
+ try:
1655
+ completed_doc = await db.wait_for_document_completion(doc.external_id)
1656
+ print(f"Processing complete! Document has {len(completed_doc.chunk_ids)} chunks")
1657
+ except TimeoutError:
1658
+ print("Processing is taking too long")
1659
+ except ValueError as e:
1660
+ print(f"Processing failed: {e}")
1661
+ ```
1662
+ """
1663
+ import asyncio
1664
+ start_time = asyncio.get_event_loop().time()
1665
+
1666
+ while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
1667
+ status = await self.get_document_status(document_id)
1668
+
1669
+ if status["status"] == "completed":
1670
+ # Get the full document now that it's complete
1671
+ return await self.get_document(document_id)
1672
+ elif status["status"] == "failed":
1673
+ raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
1674
+
1675
+ # Wait before checking again
1676
+ await asyncio.sleep(check_interval_seconds)
1677
+
1678
+ raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
1493
1679
 
1494
1680
  async def get_document_by_filename(self, filename: str) -> Document:
1495
1681
  """
@@ -1865,7 +2051,8 @@ class AsyncMorphik:
1865
2051
  print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
1866
2052
  ```
1867
2053
  """
1868
- request = self._logic._prepare_batch_get_documents_request(document_ids, None, None)
2054
+ # API expects a dict with document_ids key, not a direct list
2055
+ request = {"document_ids": document_ids}
1869
2056
  response = await self._request("POST", "batch/documents", data=request)
1870
2057
  docs = self._logic._parse_document_list_response(response)
1871
2058
  for doc in docs:
morphik/models.py CHANGED
@@ -24,6 +24,60 @@ class Document(BaseModel):
24
24
 
25
25
  # Client reference for update methods
26
26
  _client = None
27
+
28
+ @property
29
+ def status(self) -> Dict[str, Any]:
30
+ """Get the latest processing status of the document from the API.
31
+
32
+ Returns:
33
+ Dict[str, Any]: Status information including current status, potential errors, and other metadata
34
+ """
35
+ if self._client is None:
36
+ raise ValueError(
37
+ "Document instance not connected to a client. Use a document returned from a Morphik client method."
38
+ )
39
+ return self._client.get_document_status(self.external_id)
40
+
41
+ @property
42
+ def is_processing(self) -> bool:
43
+ """Check if the document is still being processed."""
44
+ return self.status.get("status") == "processing"
45
+
46
+ @property
47
+ def is_ingested(self) -> bool:
48
+ """Check if the document has completed processing."""
49
+ return self.status.get("status") == "completed"
50
+
51
+ @property
52
+ def is_failed(self) -> bool:
53
+ """Check if document processing has failed."""
54
+ return self.status.get("status") == "failed"
55
+
56
+ @property
57
+ def error(self) -> Optional[str]:
58
+ """Get the error message if processing failed."""
59
+ status_info = self.status
60
+ return status_info.get("error") if status_info.get("status") == "failed" else None
61
+
62
+ def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
63
+ """Wait for document processing to complete.
64
+
65
+ Args:
66
+ timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
67
+ check_interval_seconds: Time between status checks (default: 2 seconds)
68
+
69
+ Returns:
70
+ Document: Updated document with the latest status
71
+
72
+ Raises:
73
+ TimeoutError: If processing doesn't complete within the timeout period
74
+ ValueError: If processing fails with an error
75
+ """
76
+ if self._client is None:
77
+ raise ValueError(
78
+ "Document instance not connected to a client. Use a document returned from a Morphik client method."
79
+ )
80
+ return self._client.wait_for_document_completion(self.external_id, timeout_seconds, check_interval_seconds)
27
81
 
28
82
  def update_with_text(
29
83
  self,
@@ -411,3 +465,19 @@ class QueryPromptOverrides(BaseModel):
411
465
  None,
412
466
  description="Overrides for query prompts - controls response generation style, format, and tone",
413
467
  )
468
+
469
+
470
+ class FolderInfo(BaseModel):
471
+ """Folder metadata model"""
472
+
473
+ id: str = Field(..., description="Unique folder identifier")
474
+ name: str = Field(..., description="Folder name")
475
+ description: Optional[str] = Field(None, description="Folder description")
476
+ owner: Dict[str, str] = Field(..., description="Owner information")
477
+ document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
478
+ system_metadata: Dict[str, Any] = Field(
479
+ default_factory=dict, description="System-managed metadata"
480
+ )
481
+ access_control: Dict[str, List[str]] = Field(
482
+ default_factory=dict, description="Access control information"
483
+ )