morphik 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik-0.1.4/PKG-INFO ADDED
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.4
2
+ Name: morphik
3
+ Version: 0.1.4
4
+ Summary: Morphik Python Client
5
+ Author-email: Morphik <founders@morphik.ai>
6
+ Requires-Python: >=3.8
7
+ Requires-Dist: httpx>=0.24.0
8
+ Requires-Dist: pillow==10.4.0
9
+ Requires-Dist: pydantic==2.10.3
10
+ Requires-Dist: pyjwt>=2.0.0
11
+ Requires-Dist: requests>=2.32.3
12
+ Description-Content-Type: text/markdown
13
+
14
+ # Morphik
15
+
16
+ A Python client for the Morphik API that enables document ingestion, semantic search, and retrieval-augmented generation (RAG).
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install morphik
22
+ ```
23
+
24
+ ## Usage
25
+
26
+ The SDK provides both synchronous and asynchronous clients:
27
+
28
+ ### Synchronous Usage
29
+
30
+ ```python
31
+ from morphik import Morphik
32
+
33
+ # Initialize client - connects to localhost:8000 by default
34
+ db = Morphik()
35
+
36
+ # Or with authentication URI (for production)
37
+ # db = Morphik("morphik://owner_id:token@api.morphik.ai")
38
+
39
+ # Ingest a text document
40
+ doc = db.ingest_text(
41
+ content="Your document content",
42
+ metadata={"title": "Example Document"}
43
+ )
44
+
45
+ # Ingest a file
46
+ doc = db.ingest_file(
47
+ file="path/to/document.pdf",
48
+ metadata={"category": "reports"}
49
+ )
50
+
51
+ # Retrieve relevant chunks
52
+ chunks = db.retrieve_chunks(
53
+ query="Your search query",
54
+ filters={"category": "reports"}
55
+ )
56
+
57
+ # Query with RAG
58
+ response = db.query(
59
+ query="Summarize the key points in the document",
60
+ filters={"category": "reports"}
61
+ )
62
+
63
+ print(response.completion)
64
+ ```
65
+
66
+ ### Asynchronous Usage
67
+
68
+ ```python
69
+ import asyncio
70
+ from morphik.async_ import AsyncMorphik
71
+
72
+ async def main():
73
+ # Initialize async client - connects to localhost:8000 by default
74
+ async with AsyncMorphik() as db:
75
+
76
+ # Or with authentication URI (for production)
77
+ # async with AsyncMorphik("morphik://owner_id:token@api.morphik.ai") as db:
78
+ # Ingest a text document
79
+ doc = await db.ingest_text(
80
+ content="Your document content",
81
+ metadata={"title": "Example Document"}
82
+ )
83
+
84
+ # Query with RAG
85
+ response = await db.query(
86
+ query="Summarize the key points in the document",
87
+ )
88
+
89
+ print(response.completion)
90
+
91
+ # Run the async function
92
+ asyncio.run(main())
93
+ ```
94
+
95
+ ## Features
96
+
97
+ - Document ingestion (text, files, directories)
98
+ - Semantic search and retrieval
99
+ - Retrieval-augmented generation (RAG)
100
+ - Knowledge graph creation and querying
101
+ - Multi-user and multi-folder scoping
102
+ - Metadata filtering
103
+ - Document management
104
+
105
+ ## Development
106
+
107
+ ### Running Tests
108
+
109
+ To run the tests, first install the development dependencies:
110
+
111
+ ```bash
112
+ pip install -r test_requirements.txt
113
+ ```
114
+
115
+ Then run the tests:
116
+
117
+ ```bash
118
+ # Run all tests (requires a running Morphik server)
119
+ pytest morphik/tests/ -v
120
+
121
+ # Run specific test modules
122
+ pytest morphik/tests/test_sync.py -v
123
+ pytest morphik/tests/test_async.py -v
124
+
125
+ # Skip tests if you don't have a running server
126
+ SKIP_LIVE_TESTS=1 pytest morphik/tests/ -v
127
+
128
+ # Specify a custom server URL for tests
129
+ MORPHIK_TEST_URL=http://custom-server:8000 pytest morphik/tests/ -v
130
+ ```
131
+
132
+ ### Example Usage Script
133
+
134
+ The SDK comes with an example script that demonstrates basic usage:
135
+
136
+ ```bash
137
+ # Run synchronous example
138
+ python -m morphik.tests.example_usage
139
+
140
+ # Run asynchronous example
141
+ python -m morphik.tests.example_usage --async
142
+ ```
143
+
144
+ The example script demonstrates:
145
+ - Text and file ingestion
146
+ - Creating folders and user scopes
147
+ - Retrieving chunks and documents
148
+ - Generating completions using RAG
149
+ - Batch operations and cleanup
150
+
151
+ ## License
152
+
153
+ [License information]
@@ -0,0 +1,140 @@
1
+ # Morphik
2
+
3
+ A Python client for the Morphik API that enables document ingestion, semantic search, and retrieval-augmented generation (RAG).
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install morphik
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ The SDK provides both synchronous and asynchronous clients:
14
+
15
+ ### Synchronous Usage
16
+
17
+ ```python
18
+ from morphik import Morphik
19
+
20
+ # Initialize client - connects to localhost:8000 by default
21
+ db = Morphik()
22
+
23
+ # Or with authentication URI (for production)
24
+ # db = Morphik("morphik://owner_id:token@api.morphik.ai")
25
+
26
+ # Ingest a text document
27
+ doc = db.ingest_text(
28
+ content="Your document content",
29
+ metadata={"title": "Example Document"}
30
+ )
31
+
32
+ # Ingest a file
33
+ doc = db.ingest_file(
34
+ file="path/to/document.pdf",
35
+ metadata={"category": "reports"}
36
+ )
37
+
38
+ # Retrieve relevant chunks
39
+ chunks = db.retrieve_chunks(
40
+ query="Your search query",
41
+ filters={"category": "reports"}
42
+ )
43
+
44
+ # Query with RAG
45
+ response = db.query(
46
+ query="Summarize the key points in the document",
47
+ filters={"category": "reports"}
48
+ )
49
+
50
+ print(response.completion)
51
+ ```
52
+
53
+ ### Asynchronous Usage
54
+
55
+ ```python
56
+ import asyncio
57
+ from morphik.async_ import AsyncMorphik
58
+
59
+ async def main():
60
+ # Initialize async client - connects to localhost:8000 by default
61
+ async with AsyncMorphik() as db:
62
+
63
+ # Or with authentication URI (for production)
64
+ # async with AsyncMorphik("morphik://owner_id:token@api.morphik.ai") as db:
65
+ # Ingest a text document
66
+ doc = await db.ingest_text(
67
+ content="Your document content",
68
+ metadata={"title": "Example Document"}
69
+ )
70
+
71
+ # Query with RAG
72
+ response = await db.query(
73
+ query="Summarize the key points in the document",
74
+ )
75
+
76
+ print(response.completion)
77
+
78
+ # Run the async function
79
+ asyncio.run(main())
80
+ ```
81
+
82
+ ## Features
83
+
84
+ - Document ingestion (text, files, directories)
85
+ - Semantic search and retrieval
86
+ - Retrieval-augmented generation (RAG)
87
+ - Knowledge graph creation and querying
88
+ - Multi-user and multi-folder scoping
89
+ - Metadata filtering
90
+ - Document management
91
+
92
+ ## Development
93
+
94
+ ### Running Tests
95
+
96
+ To run the tests, first install the development dependencies:
97
+
98
+ ```bash
99
+ pip install -r test_requirements.txt
100
+ ```
101
+
102
+ Then run the tests:
103
+
104
+ ```bash
105
+ # Run all tests (requires a running Morphik server)
106
+ pytest morphik/tests/ -v
107
+
108
+ # Run specific test modules
109
+ pytest morphik/tests/test_sync.py -v
110
+ pytest morphik/tests/test_async.py -v
111
+
112
+ # Skip tests if you don't have a running server
113
+ SKIP_LIVE_TESTS=1 pytest morphik/tests/ -v
114
+
115
+ # Specify a custom server URL for tests
116
+ MORPHIK_TEST_URL=http://custom-server:8000 pytest morphik/tests/ -v
117
+ ```
118
+
119
+ ### Example Usage Script
120
+
121
+ The SDK comes with an example script that demonstrates basic usage:
122
+
123
+ ```bash
124
+ # Run synchronous example
125
+ python -m morphik.tests.example_usage
126
+
127
+ # Run asynchronous example
128
+ python -m morphik.tests.example_usage --async
129
+ ```
130
+
131
+ The example script demonstrates:
132
+ - Text and file ingestion
133
+ - Creating folders and user scopes
134
+ - Retrieving chunks and documents
135
+ - Generating completions using RAG
136
+ - Batch operations and cleanup
137
+
138
+ ## License
139
+
140
+ [License information]
@@ -12,4 +12,4 @@ __all__ = [
12
12
  "Document",
13
13
  ]
14
14
 
15
- __version__ = "0.1.2"
15
+ __version__ = "0.1.4"
@@ -211,7 +211,7 @@ class _MorphikClientLogic:
211
211
  data = {
212
212
  "metadata": json.dumps(metadata or {}),
213
213
  "rules": json.dumps(converted_rules),
214
- "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
214
+ # use_colpali is a query parameter, not a form field
215
215
  "parallel": str(parallel).lower(),
216
216
  }
217
217
 
@@ -14,6 +14,7 @@ from .models import (
14
14
  IngestTextRequest,
15
15
  ChunkSource,
16
16
  Graph,
17
+ FolderInfo,
17
18
  # Prompt override models
18
19
  GraphPromptOverrides,
19
20
  QueryPromptOverrides,
@@ -56,16 +57,43 @@ class AsyncFolder:
56
57
  Args:
57
58
  client: The AsyncMorphik client instance
58
59
  name: The name of the folder
60
+ folder_id: Optional folder ID (if already known)
59
61
  """
60
62
 
61
- def __init__(self, client: "AsyncMorphik", name: str):
63
+ def __init__(self, client: "AsyncMorphik", name: str, folder_id: Optional[str] = None):
62
64
  self._client = client
63
65
  self._name = name
66
+ self._id = folder_id
64
67
 
65
68
  @property
66
69
  def name(self) -> str:
67
70
  """Returns the folder name."""
68
71
  return self._name
72
+
73
+ @property
74
+ def id(self) -> Optional[str]:
75
+ """Returns the folder ID if available."""
76
+ return self._id
77
+
78
+ async def get_info(self) -> Dict[str, Any]:
79
+ """
80
+ Get detailed information about this folder.
81
+
82
+ Returns:
83
+ Dict[str, Any]: Detailed folder information
84
+ """
85
+ if not self._id:
86
+ # If we don't have the ID, find the folder by name first
87
+ folders = await self._client.list_folders()
88
+ for folder in folders:
89
+ if folder.name == self._name:
90
+ self._id = folder.id
91
+ break
92
+ if not self._id:
93
+ raise ValueError(f"Folder '{self._name}' not found")
94
+
95
+ return await self._client._request("GET", f"folders/{self._id}")
96
+
69
97
 
70
98
  def signin(self, end_user_id: str) -> "AsyncUserScope":
71
99
  """
@@ -144,9 +172,10 @@ class AsyncFolder:
144
172
 
145
173
  response = await self._client._request(
146
174
  "POST",
147
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
175
+ "ingest/file",
148
176
  data=form_data,
149
177
  files=files,
178
+ params={"use_colpali": str(use_colpali).lower()},
150
179
  )
151
180
  doc = self._client._logic._parse_document_response(response)
152
181
  doc._client = self._client
@@ -187,7 +216,11 @@ class AsyncFolder:
187
216
  )
188
217
 
189
218
  response = await self._client._request(
190
- "POST", "ingest/files", data=data, files=file_objects
219
+ "POST",
220
+ "ingest/files",
221
+ data=data,
222
+ files=file_objects,
223
+ params={"use_colpali": str(use_colpali).lower()},
191
224
  )
192
225
 
193
226
  if response.get("errors"):
@@ -391,9 +424,10 @@ class AsyncFolder:
391
424
  Returns:
392
425
  List[Document]: List of document metadata for found documents
393
426
  """
394
- request = self._client._logic._prepare_batch_get_documents_request(
395
- document_ids, self._name, None
396
- )
427
+ # API expects a dict with document_ids key
428
+ request = {"document_ids": document_ids}
429
+ if self._name:
430
+ request["folder_name"] = self._name
397
431
  response = await self._client._request("POST", "batch/documents", data=request)
398
432
  docs = self._client._logic._parse_document_list_response(response)
399
433
  for doc in docs:
@@ -673,7 +707,11 @@ class AsyncUserScope:
673
707
  data["folder_name"] = self._folder_name
674
708
 
675
709
  response = await self._client._request(
676
- "POST", "ingest/files", data=data, files=file_objects
710
+ "POST",
711
+ "ingest/files",
712
+ data=data,
713
+ files=file_objects,
714
+ params={"use_colpali": str(use_colpali).lower()},
677
715
  )
678
716
 
679
717
  if response.get("errors"):
@@ -877,9 +915,12 @@ class AsyncUserScope:
877
915
  Returns:
878
916
  List[Document]: List of document metadata for found documents
879
917
  """
880
- request = self._client._logic._prepare_batch_get_documents_request(
881
- document_ids, self._folder_name, self._end_user_id
882
- )
918
+ # API expects a dict with document_ids key
919
+ request = {"document_ids": document_ids}
920
+ if self._end_user_id:
921
+ request["end_user_id"] = self._end_user_id
922
+ if self._folder_name:
923
+ request["folder_name"] = self._folder_name
883
924
  response = await self._client._request("POST", "batch/documents", data=request)
884
925
  docs = self._client._logic._parse_document_list_response(response)
885
926
  for doc in docs:
@@ -1032,9 +1073,15 @@ class AsyncMorphik:
1032
1073
 
1033
1074
  # Configure request data based on type
1034
1075
  if files:
1035
- # Multipart form data for files
1036
- request_data = {"files": files, "data": data}
1037
- # Don't set Content-Type, let httpx handle it
1076
+ # When uploading files, we need to make sure not to set Content-Type
1077
+ # Remove Content-Type if it exists - httpx will set the correct multipart boundary
1078
+ if "Content-Type" in headers:
1079
+ del headers["Content-Type"]
1080
+
1081
+ # For file uploads with form data, use form data (not json)
1082
+ request_data = {"files": files}
1083
+ if data:
1084
+ request_data["data"] = data
1038
1085
  else:
1039
1086
  # JSON for everything else
1040
1087
  headers["Content-Type"] = "application/json"
@@ -1054,19 +1101,30 @@ class AsyncMorphik:
1054
1101
  """Convert a rule to a dictionary format"""
1055
1102
  return self._logic._convert_rule(rule)
1056
1103
 
1057
- def create_folder(self, name: str) -> AsyncFolder:
1104
+ async def create_folder(self, name: str, description: Optional[str] = None) -> AsyncFolder:
1058
1105
  """
1059
1106
  Create a folder to scope operations.
1060
1107
 
1061
1108
  Args:
1062
1109
  name: The name of the folder
1110
+ description: Optional description for the folder
1063
1111
 
1064
1112
  Returns:
1065
- AsyncFolder: A folder object for scoped operations
1066
- """
1067
- return AsyncFolder(self, name)
1068
-
1069
- def get_folder(self, name: str) -> AsyncFolder:
1113
+ AsyncFolder: A folder object ready for scoped operations
1114
+ """
1115
+ payload = {
1116
+ "name": name
1117
+ }
1118
+ if description:
1119
+ payload["description"] = description
1120
+
1121
+ response = await self._request("POST", "folders", data=payload)
1122
+ folder_info = FolderInfo(**response)
1123
+
1124
+ # Return a usable AsyncFolder object with the ID from the response
1125
+ return AsyncFolder(self, name, folder_id=folder_info.id)
1126
+
1127
+ def get_folder_by_name(self, name: str) -> AsyncFolder:
1070
1128
  """
1071
1129
  Get a folder by name to scope operations.
1072
1130
 
@@ -1077,6 +1135,57 @@ class AsyncMorphik:
1077
1135
  AsyncFolder: A folder object for scoped operations
1078
1136
  """
1079
1137
  return AsyncFolder(self, name)
1138
+
1139
+ async def get_folder(self, folder_id: str) -> AsyncFolder:
1140
+ """
1141
+ Get a folder by ID.
1142
+
1143
+ Args:
1144
+ folder_id: ID of the folder
1145
+
1146
+ Returns:
1147
+ AsyncFolder: A folder object for scoped operations
1148
+ """
1149
+ response = await self._request("GET", f"folders/{folder_id}")
1150
+ return AsyncFolder(self, response["name"], folder_id)
1151
+
1152
+ async def list_folders(self) -> List[AsyncFolder]:
1153
+ """
1154
+ List all folders the user has access to as AsyncFolder objects.
1155
+
1156
+ Returns:
1157
+ List[AsyncFolder]: List of AsyncFolder objects ready for operations
1158
+ """
1159
+ response = await self._request("GET", "folders")
1160
+ return [AsyncFolder(self, folder["name"], folder["id"]) for folder in response]
1161
+
1162
+ async def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1163
+ """
1164
+ Add a document to a folder.
1165
+
1166
+ Args:
1167
+ folder_id: ID of the folder
1168
+ document_id: ID of the document
1169
+
1170
+ Returns:
1171
+ Dict[str, str]: Success status
1172
+ """
1173
+ response = await self._request("POST", f"folders/{folder_id}/documents/{document_id}")
1174
+ return response
1175
+
1176
+ async def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
1177
+ """
1178
+ Remove a document from a folder.
1179
+
1180
+ Args:
1181
+ folder_id: ID of the folder
1182
+ document_id: ID of the document
1183
+
1184
+ Returns:
1185
+ Dict[str, str]: Success status
1186
+ """
1187
+ response = await self._request("DELETE", f"folders/{folder_id}/documents/{document_id}")
1188
+ return response
1080
1189
 
1081
1190
  def signin(self, end_user_id: str) -> AsyncUserScope:
1082
1191
  """
@@ -1163,9 +1272,10 @@ class AsyncMorphik:
1163
1272
 
1164
1273
  response = await self._request(
1165
1274
  "POST",
1166
- f"ingest/file?use_colpali={str(use_colpali).lower()}",
1275
+ "ingest/file",
1167
1276
  data=form_data,
1168
1277
  files=files,
1278
+ params={"use_colpali": str(use_colpali).lower()},
1169
1279
  )
1170
1280
  doc = self._logic._parse_document_response(response)
1171
1281
  doc._client = self
@@ -1208,7 +1318,13 @@ class AsyncMorphik:
1208
1318
  metadata, rules, use_colpali, parallel, None, None
1209
1319
  )
1210
1320
 
1211
- response = await self._request("POST", "ingest/files", data=data, files=file_objects)
1321
+ response = await self._request(
1322
+ "POST",
1323
+ "ingest/files",
1324
+ data=data,
1325
+ files=file_objects,
1326
+ params={"use_colpali": str(use_colpali).lower()},
1327
+ )
1212
1328
 
1213
1329
  if response.get("errors"):
1214
1330
  # Log errors but don't raise exception
@@ -1216,7 +1332,7 @@ class AsyncMorphik:
1216
1332
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
1217
1333
 
1218
1334
  # Parse the documents from the response
1219
- docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
1335
+ docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
1220
1336
  for doc in docs:
1221
1337
  doc._client = self
1222
1338
  return docs
@@ -1490,6 +1606,76 @@ class AsyncMorphik:
1490
1606
  doc = self._logic._parse_document_response(response)
1491
1607
  doc._client = self
1492
1608
  return doc
1609
+
1610
+ async def get_document_status(self, document_id: str) -> Dict[str, Any]:
1611
+ """
1612
+ Get the current processing status of a document.
1613
+
1614
+ Args:
1615
+ document_id: ID of the document to check
1616
+
1617
+ Returns:
1618
+ Dict[str, Any]: Status information including current status, potential errors, and other metadata
1619
+
1620
+ Example:
1621
+ ```python
1622
+ status = await db.get_document_status("doc_123")
1623
+ if status["status"] == "completed":
1624
+ print("Document processing complete")
1625
+ elif status["status"] == "failed":
1626
+ print(f"Processing failed: {status['error']}")
1627
+ else:
1628
+ print("Document still processing...")
1629
+ ```
1630
+ """
1631
+ response = await self._request("GET", f"documents/{document_id}/status")
1632
+ return response
1633
+
1634
+ async def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
1635
+ """
1636
+ Wait for a document's processing to complete.
1637
+
1638
+ Args:
1639
+ document_id: ID of the document to wait for
1640
+ timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
1641
+ check_interval_seconds: Time between status checks (default: 2 seconds)
1642
+
1643
+ Returns:
1644
+ Document: Updated document with the latest status
1645
+
1646
+ Raises:
1647
+ TimeoutError: If processing doesn't complete within the timeout period
1648
+ ValueError: If processing fails with an error
1649
+
1650
+ Example:
1651
+ ```python
1652
+ # Upload a file and wait for processing to complete
1653
+ doc = await db.ingest_file("large_document.pdf")
1654
+ try:
1655
+ completed_doc = await db.wait_for_document_completion(doc.external_id)
1656
+ print(f"Processing complete! Document has {len(completed_doc.chunk_ids)} chunks")
1657
+ except TimeoutError:
1658
+ print("Processing is taking too long")
1659
+ except ValueError as e:
1660
+ print(f"Processing failed: {e}")
1661
+ ```
1662
+ """
1663
+ import asyncio
1664
+ start_time = asyncio.get_event_loop().time()
1665
+
1666
+ while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
1667
+ status = await self.get_document_status(document_id)
1668
+
1669
+ if status["status"] == "completed":
1670
+ # Get the full document now that it's complete
1671
+ return await self.get_document(document_id)
1672
+ elif status["status"] == "failed":
1673
+ raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
1674
+
1675
+ # Wait before checking again
1676
+ await asyncio.sleep(check_interval_seconds)
1677
+
1678
+ raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
1493
1679
 
1494
1680
  async def get_document_by_filename(self, filename: str) -> Document:
1495
1681
  """
@@ -1865,7 +2051,8 @@ class AsyncMorphik:
1865
2051
  print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
1866
2052
  ```
1867
2053
  """
1868
- request = self._logic._prepare_batch_get_documents_request(document_ids, None, None)
2054
+ # API expects a dict with document_ids key, not a direct list
2055
+ request = {"document_ids": document_ids}
1869
2056
  response = await self._request("POST", "batch/documents", data=request)
1870
2057
  docs = self._logic._parse_document_list_response(response)
1871
2058
  for doc in docs: