morphik 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +1 -1
- morphik/async_.py +210 -23
- morphik/models.py +70 -0
- morphik/sync.py +212 -18
- morphik/tests/README.md +41 -0
- morphik/tests/__init__.py +0 -0
- morphik/tests/example_usage.py +280 -0
- morphik/tests/test_async.py +300 -0
- morphik/tests/test_docs/sample1.txt +11 -0
- morphik/tests/test_docs/sample2.txt +15 -0
- morphik/tests/test_docs/sample3.txt +17 -0
- morphik/tests/test_sync.py +293 -0
- morphik-0.1.4.dist-info/METADATA +153 -0
- morphik-0.1.4.dist-info/RECORD +18 -0
- morphik-0.1.2.dist-info/METADATA +0 -47
- morphik-0.1.2.dist-info/RECORD +0 -10
- {morphik-0.1.2.dist-info → morphik-0.1.4.dist-info}/WHEEL +0 -0
morphik/__init__.py
CHANGED
morphik/_internal.py
CHANGED
@@ -211,7 +211,7 @@ class _MorphikClientLogic:
|
|
211
211
|
data = {
|
212
212
|
"metadata": json.dumps(metadata or {}),
|
213
213
|
"rules": json.dumps(converted_rules),
|
214
|
-
|
214
|
+
# use_colpali is a query parameter, not a form field
|
215
215
|
"parallel": str(parallel).lower(),
|
216
216
|
}
|
217
217
|
|
morphik/async_.py
CHANGED
@@ -14,6 +14,7 @@ from .models import (
|
|
14
14
|
IngestTextRequest,
|
15
15
|
ChunkSource,
|
16
16
|
Graph,
|
17
|
+
FolderInfo,
|
17
18
|
# Prompt override models
|
18
19
|
GraphPromptOverrides,
|
19
20
|
QueryPromptOverrides,
|
@@ -56,16 +57,43 @@ class AsyncFolder:
|
|
56
57
|
Args:
|
57
58
|
client: The AsyncMorphik client instance
|
58
59
|
name: The name of the folder
|
60
|
+
folder_id: Optional folder ID (if already known)
|
59
61
|
"""
|
60
62
|
|
61
|
-
def __init__(self, client: "AsyncMorphik", name: str):
|
63
|
+
def __init__(self, client: "AsyncMorphik", name: str, folder_id: Optional[str] = None):
|
62
64
|
self._client = client
|
63
65
|
self._name = name
|
66
|
+
self._id = folder_id
|
64
67
|
|
65
68
|
@property
|
66
69
|
def name(self) -> str:
|
67
70
|
"""Returns the folder name."""
|
68
71
|
return self._name
|
72
|
+
|
73
|
+
@property
|
74
|
+
def id(self) -> Optional[str]:
|
75
|
+
"""Returns the folder ID if available."""
|
76
|
+
return self._id
|
77
|
+
|
78
|
+
async def get_info(self) -> Dict[str, Any]:
|
79
|
+
"""
|
80
|
+
Get detailed information about this folder.
|
81
|
+
|
82
|
+
Returns:
|
83
|
+
Dict[str, Any]: Detailed folder information
|
84
|
+
"""
|
85
|
+
if not self._id:
|
86
|
+
# If we don't have the ID, find the folder by name first
|
87
|
+
folders = await self._client.list_folders()
|
88
|
+
for folder in folders:
|
89
|
+
if folder.name == self._name:
|
90
|
+
self._id = folder.id
|
91
|
+
break
|
92
|
+
if not self._id:
|
93
|
+
raise ValueError(f"Folder '{self._name}' not found")
|
94
|
+
|
95
|
+
return await self._client._request("GET", f"folders/{self._id}")
|
96
|
+
|
69
97
|
|
70
98
|
def signin(self, end_user_id: str) -> "AsyncUserScope":
|
71
99
|
"""
|
@@ -144,9 +172,10 @@ class AsyncFolder:
|
|
144
172
|
|
145
173
|
response = await self._client._request(
|
146
174
|
"POST",
|
147
|
-
|
175
|
+
"ingest/file",
|
148
176
|
data=form_data,
|
149
177
|
files=files,
|
178
|
+
params={"use_colpali": str(use_colpali).lower()},
|
150
179
|
)
|
151
180
|
doc = self._client._logic._parse_document_response(response)
|
152
181
|
doc._client = self._client
|
@@ -187,7 +216,11 @@ class AsyncFolder:
|
|
187
216
|
)
|
188
217
|
|
189
218
|
response = await self._client._request(
|
190
|
-
"POST",
|
219
|
+
"POST",
|
220
|
+
"ingest/files",
|
221
|
+
data=data,
|
222
|
+
files=file_objects,
|
223
|
+
params={"use_colpali": str(use_colpali).lower()},
|
191
224
|
)
|
192
225
|
|
193
226
|
if response.get("errors"):
|
@@ -391,9 +424,10 @@ class AsyncFolder:
|
|
391
424
|
Returns:
|
392
425
|
List[Document]: List of document metadata for found documents
|
393
426
|
"""
|
394
|
-
|
395
|
-
|
396
|
-
|
427
|
+
# API expects a dict with document_ids key
|
428
|
+
request = {"document_ids": document_ids}
|
429
|
+
if self._name:
|
430
|
+
request["folder_name"] = self._name
|
397
431
|
response = await self._client._request("POST", "batch/documents", data=request)
|
398
432
|
docs = self._client._logic._parse_document_list_response(response)
|
399
433
|
for doc in docs:
|
@@ -673,7 +707,11 @@ class AsyncUserScope:
|
|
673
707
|
data["folder_name"] = self._folder_name
|
674
708
|
|
675
709
|
response = await self._client._request(
|
676
|
-
"POST",
|
710
|
+
"POST",
|
711
|
+
"ingest/files",
|
712
|
+
data=data,
|
713
|
+
files=file_objects,
|
714
|
+
params={"use_colpali": str(use_colpali).lower()},
|
677
715
|
)
|
678
716
|
|
679
717
|
if response.get("errors"):
|
@@ -877,9 +915,12 @@ class AsyncUserScope:
|
|
877
915
|
Returns:
|
878
916
|
List[Document]: List of document metadata for found documents
|
879
917
|
"""
|
880
|
-
|
881
|
-
|
882
|
-
|
918
|
+
# API expects a dict with document_ids key
|
919
|
+
request = {"document_ids": document_ids}
|
920
|
+
if self._end_user_id:
|
921
|
+
request["end_user_id"] = self._end_user_id
|
922
|
+
if self._folder_name:
|
923
|
+
request["folder_name"] = self._folder_name
|
883
924
|
response = await self._client._request("POST", "batch/documents", data=request)
|
884
925
|
docs = self._client._logic._parse_document_list_response(response)
|
885
926
|
for doc in docs:
|
@@ -1032,9 +1073,15 @@ class AsyncMorphik:
|
|
1032
1073
|
|
1033
1074
|
# Configure request data based on type
|
1034
1075
|
if files:
|
1035
|
-
#
|
1036
|
-
|
1037
|
-
|
1076
|
+
# When uploading files, we need to make sure not to set Content-Type
|
1077
|
+
# Remove Content-Type if it exists - httpx will set the correct multipart boundary
|
1078
|
+
if "Content-Type" in headers:
|
1079
|
+
del headers["Content-Type"]
|
1080
|
+
|
1081
|
+
# For file uploads with form data, use form data (not json)
|
1082
|
+
request_data = {"files": files}
|
1083
|
+
if data:
|
1084
|
+
request_data["data"] = data
|
1038
1085
|
else:
|
1039
1086
|
# JSON for everything else
|
1040
1087
|
headers["Content-Type"] = "application/json"
|
@@ -1054,19 +1101,30 @@ class AsyncMorphik:
|
|
1054
1101
|
"""Convert a rule to a dictionary format"""
|
1055
1102
|
return self._logic._convert_rule(rule)
|
1056
1103
|
|
1057
|
-
def create_folder(self, name: str) -> AsyncFolder:
|
1104
|
+
async def create_folder(self, name: str, description: Optional[str] = None) -> AsyncFolder:
|
1058
1105
|
"""
|
1059
1106
|
Create a folder to scope operations.
|
1060
1107
|
|
1061
1108
|
Args:
|
1062
1109
|
name: The name of the folder
|
1110
|
+
description: Optional description for the folder
|
1063
1111
|
|
1064
1112
|
Returns:
|
1065
|
-
AsyncFolder: A folder object for scoped operations
|
1066
|
-
"""
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1113
|
+
AsyncFolder: A folder object ready for scoped operations
|
1114
|
+
"""
|
1115
|
+
payload = {
|
1116
|
+
"name": name
|
1117
|
+
}
|
1118
|
+
if description:
|
1119
|
+
payload["description"] = description
|
1120
|
+
|
1121
|
+
response = await self._request("POST", "folders", data=payload)
|
1122
|
+
folder_info = FolderInfo(**response)
|
1123
|
+
|
1124
|
+
# Return a usable AsyncFolder object with the ID from the response
|
1125
|
+
return AsyncFolder(self, name, folder_id=folder_info.id)
|
1126
|
+
|
1127
|
+
def get_folder_by_name(self, name: str) -> AsyncFolder:
|
1070
1128
|
"""
|
1071
1129
|
Get a folder by name to scope operations.
|
1072
1130
|
|
@@ -1077,6 +1135,57 @@ class AsyncMorphik:
|
|
1077
1135
|
AsyncFolder: A folder object for scoped operations
|
1078
1136
|
"""
|
1079
1137
|
return AsyncFolder(self, name)
|
1138
|
+
|
1139
|
+
async def get_folder(self, folder_id: str) -> AsyncFolder:
|
1140
|
+
"""
|
1141
|
+
Get a folder by ID.
|
1142
|
+
|
1143
|
+
Args:
|
1144
|
+
folder_id: ID of the folder
|
1145
|
+
|
1146
|
+
Returns:
|
1147
|
+
AsyncFolder: A folder object for scoped operations
|
1148
|
+
"""
|
1149
|
+
response = await self._request("GET", f"folders/{folder_id}")
|
1150
|
+
return AsyncFolder(self, response["name"], folder_id)
|
1151
|
+
|
1152
|
+
async def list_folders(self) -> List[AsyncFolder]:
|
1153
|
+
"""
|
1154
|
+
List all folders the user has access to as AsyncFolder objects.
|
1155
|
+
|
1156
|
+
Returns:
|
1157
|
+
List[AsyncFolder]: List of AsyncFolder objects ready for operations
|
1158
|
+
"""
|
1159
|
+
response = await self._request("GET", "folders")
|
1160
|
+
return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
|
1161
|
+
|
1162
|
+
async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1163
|
+
"""
|
1164
|
+
Add a document to a folder.
|
1165
|
+
|
1166
|
+
Args:
|
1167
|
+
folder_id: ID of the folder
|
1168
|
+
document_id: ID of the document
|
1169
|
+
|
1170
|
+
Returns:
|
1171
|
+
Dict[str, str]: Success status
|
1172
|
+
"""
|
1173
|
+
response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
|
1174
|
+
return response
|
1175
|
+
|
1176
|
+
async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
|
1177
|
+
"""
|
1178
|
+
Remove a document from a folder.
|
1179
|
+
|
1180
|
+
Args:
|
1181
|
+
folder_id: ID of the folder
|
1182
|
+
document_id: ID of the document
|
1183
|
+
|
1184
|
+
Returns:
|
1185
|
+
Dict[str, str]: Success status
|
1186
|
+
"""
|
1187
|
+
response = await self._request("DELETE", f"folders/{folder_id}/documents/{document_id}")
|
1188
|
+
return response
|
1080
1189
|
|
1081
1190
|
def signin(self, end_user_id: str) -> AsyncUserScope:
|
1082
1191
|
"""
|
@@ -1163,9 +1272,10 @@ class AsyncMorphik:
|
|
1163
1272
|
|
1164
1273
|
response = await self._request(
|
1165
1274
|
"POST",
|
1166
|
-
|
1275
|
+
"ingest/file",
|
1167
1276
|
data=form_data,
|
1168
1277
|
files=files,
|
1278
|
+
params={"use_colpali": str(use_colpali).lower()},
|
1169
1279
|
)
|
1170
1280
|
doc = self._logic._parse_document_response(response)
|
1171
1281
|
doc._client = self
|
@@ -1208,7 +1318,13 @@ class AsyncMorphik:
|
|
1208
1318
|
metadata, rules, use_colpali, parallel, None, None
|
1209
1319
|
)
|
1210
1320
|
|
1211
|
-
response = await self._request(
|
1321
|
+
response = await self._request(
|
1322
|
+
"POST",
|
1323
|
+
"ingest/files",
|
1324
|
+
data=data,
|
1325
|
+
files=file_objects,
|
1326
|
+
params={"use_colpali": str(use_colpali).lower()},
|
1327
|
+
)
|
1212
1328
|
|
1213
1329
|
if response.get("errors"):
|
1214
1330
|
# Log errors but don't raise exception
|
@@ -1216,7 +1332,7 @@ class AsyncMorphik:
|
|
1216
1332
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
1217
1333
|
|
1218
1334
|
# Parse the documents from the response
|
1219
|
-
docs = [self.
|
1335
|
+
docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
|
1220
1336
|
for doc in docs:
|
1221
1337
|
doc._client = self
|
1222
1338
|
return docs
|
@@ -1490,6 +1606,76 @@ class AsyncMorphik:
|
|
1490
1606
|
doc = self._logic._parse_document_response(response)
|
1491
1607
|
doc._client = self
|
1492
1608
|
return doc
|
1609
|
+
|
1610
|
+
async def get_document_status(self, document_id: str) -> Dict[str, Any]:
|
1611
|
+
"""
|
1612
|
+
Get the current processing status of a document.
|
1613
|
+
|
1614
|
+
Args:
|
1615
|
+
document_id: ID of the document to check
|
1616
|
+
|
1617
|
+
Returns:
|
1618
|
+
Dict[str, Any]: Status information including current status, potential errors, and other metadata
|
1619
|
+
|
1620
|
+
Example:
|
1621
|
+
```python
|
1622
|
+
status = await db.get_document_status("doc_123")
|
1623
|
+
if status["status"] == "completed":
|
1624
|
+
print("Document processing complete")
|
1625
|
+
elif status["status"] == "failed":
|
1626
|
+
print(f"Processing failed: {status['error']}")
|
1627
|
+
else:
|
1628
|
+
print("Document still processing...")
|
1629
|
+
```
|
1630
|
+
"""
|
1631
|
+
response = await self._request("GET", f"documents/{document_id}/status")
|
1632
|
+
return response
|
1633
|
+
|
1634
|
+
async def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
|
1635
|
+
"""
|
1636
|
+
Wait for a document's processing to complete.
|
1637
|
+
|
1638
|
+
Args:
|
1639
|
+
document_id: ID of the document to wait for
|
1640
|
+
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
1641
|
+
check_interval_seconds: Time between status checks (default: 2 seconds)
|
1642
|
+
|
1643
|
+
Returns:
|
1644
|
+
Document: Updated document with the latest status
|
1645
|
+
|
1646
|
+
Raises:
|
1647
|
+
TimeoutError: If processing doesn't complete within the timeout period
|
1648
|
+
ValueError: If processing fails with an error
|
1649
|
+
|
1650
|
+
Example:
|
1651
|
+
```python
|
1652
|
+
# Upload a file and wait for processing to complete
|
1653
|
+
doc = await db.ingest_file("large_document.pdf")
|
1654
|
+
try:
|
1655
|
+
completed_doc = await db.wait_for_document_completion(doc.external_id)
|
1656
|
+
print(f"Processing complete! Document has {len(completed_doc.chunk_ids)} chunks")
|
1657
|
+
except TimeoutError:
|
1658
|
+
print("Processing is taking too long")
|
1659
|
+
except ValueError as e:
|
1660
|
+
print(f"Processing failed: {e}")
|
1661
|
+
```
|
1662
|
+
"""
|
1663
|
+
import asyncio
|
1664
|
+
start_time = asyncio.get_event_loop().time()
|
1665
|
+
|
1666
|
+
while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
|
1667
|
+
status = await self.get_document_status(document_id)
|
1668
|
+
|
1669
|
+
if status["status"] == "completed":
|
1670
|
+
# Get the full document now that it's complete
|
1671
|
+
return await self.get_document(document_id)
|
1672
|
+
elif status["status"] == "failed":
|
1673
|
+
raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
|
1674
|
+
|
1675
|
+
# Wait before checking again
|
1676
|
+
await asyncio.sleep(check_interval_seconds)
|
1677
|
+
|
1678
|
+
raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
|
1493
1679
|
|
1494
1680
|
async def get_document_by_filename(self, filename: str) -> Document:
|
1495
1681
|
"""
|
@@ -1865,7 +2051,8 @@ class AsyncMorphik:
|
|
1865
2051
|
print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
|
1866
2052
|
```
|
1867
2053
|
"""
|
1868
|
-
|
2054
|
+
# API expects a dict with document_ids key, not a direct list
|
2055
|
+
request = {"document_ids": document_ids}
|
1869
2056
|
response = await self._request("POST", "batch/documents", data=request)
|
1870
2057
|
docs = self._logic._parse_document_list_response(response)
|
1871
2058
|
for doc in docs:
|
morphik/models.py
CHANGED
@@ -24,6 +24,60 @@ class Document(BaseModel):
|
|
24
24
|
|
25
25
|
# Client reference for update methods
|
26
26
|
_client = None
|
27
|
+
|
28
|
+
@property
|
29
|
+
def status(self) -> Dict[str, Any]:
|
30
|
+
"""Get the latest processing status of the document from the API.
|
31
|
+
|
32
|
+
Returns:
|
33
|
+
Dict[str, Any]: Status information including current status, potential errors, and other metadata
|
34
|
+
"""
|
35
|
+
if self._client is None:
|
36
|
+
raise ValueError(
|
37
|
+
"Document instance not connected to a client. Use a document returned from a Morphik client method."
|
38
|
+
)
|
39
|
+
return self._client.get_document_status(self.external_id)
|
40
|
+
|
41
|
+
@property
|
42
|
+
def is_processing(self) -> bool:
|
43
|
+
"""Check if the document is still being processed."""
|
44
|
+
return self.status.get("status") == "processing"
|
45
|
+
|
46
|
+
@property
|
47
|
+
def is_ingested(self) -> bool:
|
48
|
+
"""Check if the document has completed processing."""
|
49
|
+
return self.status.get("status") == "completed"
|
50
|
+
|
51
|
+
@property
|
52
|
+
def is_failed(self) -> bool:
|
53
|
+
"""Check if document processing has failed."""
|
54
|
+
return self.status.get("status") == "failed"
|
55
|
+
|
56
|
+
@property
|
57
|
+
def error(self) -> Optional[str]:
|
58
|
+
"""Get the error message if processing failed."""
|
59
|
+
status_info = self.status
|
60
|
+
return status_info.get("error") if status_info.get("status") == "failed" else None
|
61
|
+
|
62
|
+
def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
|
63
|
+
"""Wait for document processing to complete.
|
64
|
+
|
65
|
+
Args:
|
66
|
+
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
67
|
+
check_interval_seconds: Time between status checks (default: 2 seconds)
|
68
|
+
|
69
|
+
Returns:
|
70
|
+
Document: Updated document with the latest status
|
71
|
+
|
72
|
+
Raises:
|
73
|
+
TimeoutError: If processing doesn't complete within the timeout period
|
74
|
+
ValueError: If processing fails with an error
|
75
|
+
"""
|
76
|
+
if self._client is None:
|
77
|
+
raise ValueError(
|
78
|
+
"Document instance not connected to a client. Use a document returned from a Morphik client method."
|
79
|
+
)
|
80
|
+
return self._client.wait_for_document_completion(self.external_id, timeout_seconds, check_interval_seconds)
|
27
81
|
|
28
82
|
def update_with_text(
|
29
83
|
self,
|
@@ -411,3 +465,19 @@ class QueryPromptOverrides(BaseModel):
|
|
411
465
|
None,
|
412
466
|
description="Overrides for query prompts - controls response generation style, format, and tone",
|
413
467
|
)
|
468
|
+
|
469
|
+
|
470
|
+
class FolderInfo(BaseModel):
|
471
|
+
"""Folder metadata model"""
|
472
|
+
|
473
|
+
id: str = Field(..., description="Unique folder identifier")
|
474
|
+
name: str = Field(..., description="Folder name")
|
475
|
+
description: Optional[str] = Field(None, description="Folder description")
|
476
|
+
owner: Dict[str, str] = Field(..., description="Owner information")
|
477
|
+
document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
|
478
|
+
system_metadata: Dict[str, Any] = Field(
|
479
|
+
default_factory=dict, description="System-managed metadata"
|
480
|
+
)
|
481
|
+
access_control: Dict[str, List[str]] = Field(
|
482
|
+
default_factory=dict, description="Access control information"
|
483
|
+
)
|