morphik 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +507 -0
- morphik/async_.py +1174 -402
- morphik/models.py +38 -25
- morphik/sync.py +1259 -371
- {morphik-0.1.0.dist-info → morphik-0.1.2.dist-info}/METADATA +1 -1
- morphik-0.1.2.dist-info/RECORD +10 -0
- morphik-0.1.0.dist-info/RECORD +0 -9
- {morphik-0.1.0.dist-info → morphik-0.1.2.dist-info}/WHEEL +0 -0
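The largest change in this release is the new scoping API in `morphik/async_.py` and `morphik/sync.py`: `AsyncFolder` and `AsyncUserScope` objects returned by `create_folder`/`get_folder` and `signin`, with request building and response parsing delegated to the new `morphik/_internal.py` helper (`_MorphikClientLogic`). A minimal usage sketch based on the method signatures visible in this diff; the connection URI, folder name, user ID, and metadata fields below are placeholders, not values from the package documentation:

```python
import asyncio

from morphik.async_ import AsyncMorphik


async def main():
    # Placeholder URI of the form scheme://owner:token@host, per the client's auth parsing.
    db = AsyncMorphik("morphik://owner:token@localhost:8000", is_local=True)

    # Scope operations to a folder, then to an end user within that folder.
    folder = db.create_folder("research")
    scope = folder.signin("user-123")

    # Ingest and query within that scope.
    doc = await scope.ingest_text(
        "Example content",
        filename="note.txt",
        metadata={"department": "research"},  # placeholder metadata
    )
    chunks = await scope.retrieve_chunks("example query", k=4)
    answer = await scope.query("What does the note say?", k=4)
    print(doc.external_id, len(chunks), answer.completion)


asyncio.run(main())
```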
morphik/async_.py
CHANGED
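The later hunks of this file also touch the document-update methods (`update_document_with_text`, `update_document_with_file`, `update_document_metadata`), which now parse responses through `_MorphikClientLogic`. A short sketch consistent with those signatures; the document ID, content, and metadata values are placeholders:

```python
async def review_document(db, document_id: str):
    # Append new text to an existing document (only the "add" strategy is documented).
    doc = await db.update_document_with_text(
        document_id=document_id,
        content="New appendix text",  # placeholder content
        update_strategy="add",
    )
    # Then update only its metadata.
    doc = await db.update_document_metadata(
        document_id=document_id,
        metadata={"status": "reviewed"},  # placeholder fields
    )
    return doc
```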
@@ -1,77 +1,989 @@
-from io import BytesIO, IOBase
 import json
 import logging
+from io import BytesIO, IOBase
 from pathlib import Path
 from typing import Dict, Any, List, Optional, Union, BinaryIO
-from urllib.parse import urlparse

-import httpx
-import
-
-from
+import httpx
+from PIL.Image import Image as PILImage
+
+from .models import (
+    Document,
+    DocumentResult,
+    CompletionResponse,
+    IngestTextRequest,
+    ChunkSource,
+    Graph,
+    # Prompt override models
+    GraphPromptOverrides,
+    QueryPromptOverrides,
+)
+from .rules import Rule
+from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
+
+logger = logging.getLogger(__name__)
+
+
+class AsyncCache:
+    def __init__(self, db: "AsyncMorphik", name: str):
+        self._db = db
+        self._name = name
+
+    async def update(self) -> bool:
+        response = await self._db._request("POST", f"cache/{self._name}/update")
+        return response.get("success", False)
+
+    async def add_docs(self, docs: List[str]) -> bool:
+        response = await self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
+        return response.get("success", False)
+
+    async def query(
+        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
+    ) -> CompletionResponse:
+        response = await self._db._request(
+            "POST",
+            f"cache/{self._name}/query",
+            params={"query": query, "max_tokens": max_tokens, "temperature": temperature},
+            data="",
+        )
+        return CompletionResponse(**response)
+
+
+class AsyncFolder:
+    """
+    A folder that allows operations to be scoped to a specific folder.
+
+    Args:
+        client: The AsyncMorphik client instance
+        name: The name of the folder
+    """
+
+    def __init__(self, client: "AsyncMorphik", name: str):
+        self._client = client
+        self._name = name
+
+    @property
+    def name(self) -> str:
+        """Returns the folder name."""
+        return self._name
+
+    def signin(self, end_user_id: str) -> "AsyncUserScope":
+        """
+        Returns an AsyncUserScope object scoped to this folder and the end user.
+
+        Args:
+            end_user_id: The ID of the end user
+
+        Returns:
+            AsyncUserScope: A user scope scoped to this folder and the end user
+        """
+        return AsyncUserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name)
+
+    async def ingest_text(
+        self,
+        content: str,
+        filename: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+    ) -> Document:
+        """
+        Ingest a text document into Morphik within this folder.
+
+        Args:
+            content: Text content to ingest
+            filename: Optional file name
+            metadata: Optional metadata dictionary
+            rules: Optional list of rules to apply during ingestion
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            Document: Metadata of the ingested document
+        """
+        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
+        payload = self._client._logic._prepare_ingest_text_request(
+            content, filename, metadata, rules_list, use_colpali, self._name, None
+        )
+        response = await self._client._request("POST", "ingest/text", data=payload)
+        doc = self._client._logic._parse_document_response(response)
+        doc._client = self._client
+        return doc
+
+    async def ingest_file(
+        self,
+        file: Union[str, bytes, BinaryIO, Path],
+        filename: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+    ) -> Document:
+        """
+        Ingest a file document into Morphik within this folder.
+
+        Args:
+            file: File to ingest (path string, bytes, file object, or Path)
+            filename: Name of the file
+            metadata: Optional metadata dictionary
+            rules: Optional list of rules to apply during ingestion
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            Document: Metadata of the ingested document
+        """
+        # Process file input
+        file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename)
+
+        try:
+            # Prepare multipart form data
+            files = {"file": (filename, file_obj)}
+
+            # Create form data
+            form_data = self._client._logic._prepare_ingest_file_form_data(
+                metadata, rules, self._name, None
+            )
+
+            response = await self._client._request(
+                "POST",
+                f"ingest/file?use_colpali={str(use_colpali).lower()}",
+                data=form_data,
+                files=files,
+            )
+            doc = self._client._logic._parse_document_response(response)
+            doc._client = self._client
+            return doc
+        finally:
+            # Close file if we opened it
+            if isinstance(file, (str, Path)):
+                file_obj.close()
+
+    async def ingest_files(
+        self,
+        files: List[Union[str, bytes, BinaryIO, Path]],
+        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest multiple files into Morphik within this folder.
+
+        Args:
+            files: List of files to ingest
+            metadata: Optional metadata
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of ingested documents
+        """
+        # Convert files to format expected by API
+        file_objects = self._client._logic._prepare_files_for_upload(files)
+
+        try:
+            # Prepare form data
+            data = self._client._logic._prepare_ingest_files_form_data(
+                metadata, rules, use_colpali, parallel, self._name, None
+            )
+
+            response = await self._client._request(
+                "POST", "ingest/files", data=data, files=file_objects
+            )
+
+            if response.get("errors"):
+                # Log errors but don't raise exception
+                for error in response["errors"]:
+                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
+
+            docs = [
+                self._client._logic._parse_document_response(doc) for doc in response["documents"]
+            ]
+            for doc in docs:
+                doc._client = self._client
+            return docs
+        finally:
+            # Clean up file objects
+            for _, (_, file_obj) in file_objects:
+                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
+                    file_obj.close()
+
+    async def ingest_directory(
+        self,
+        directory: Union[str, Path],
+        recursive: bool = False,
+        pattern: str = "*",
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest all files in a directory into Morphik within this folder.
+
+        Args:
+            directory: Path to directory containing files to ingest
+            recursive: Whether to recursively process subdirectories
+            pattern: Optional glob pattern to filter files
+            metadata: Optional metadata dictionary to apply to all files
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of ingested documents
+        """
+        directory = Path(directory)
+        if not directory.is_dir():
+            raise ValueError(f"Directory not found: {directory}")
+
+        # Collect all files matching pattern
+        if recursive:
+            files = list(directory.rglob(pattern))
+        else:
+            files = list(directory.glob(pattern))
+
+        # Filter out directories
+        files = [f for f in files if f.is_file()]
+
+        if not files:
+            return []
+
+        # Use ingest_files with collected paths
+        return await self.ingest_files(
+            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
+        )
+
+    async def retrieve_chunks(
+        self,
+        query: str,
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0,
+        use_colpali: bool = True,
+    ) -> List[FinalChunkResult]:
+        """
+        Retrieve relevant chunks within this folder.
+
+        Args:
+            query: Search query text
+            filters: Optional metadata filters
+            k: Number of results (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            List[FinalChunkResult]: List of relevant chunks
+        """
+        payload = self._client._logic._prepare_retrieve_chunks_request(
+            query, filters, k, min_score, use_colpali, self._name, None
+        )
+        response = await self._client._request("POST", "retrieve/chunks", data=payload)
+        return self._client._logic._parse_chunk_result_list_response(response)
+
+    async def retrieve_docs(
+        self,
+        query: str,
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0,
+        use_colpali: bool = True,
+    ) -> List[DocumentResult]:
+        """
+        Retrieve relevant documents within this folder.
+
+        Args:
+            query: Search query text
+            filters: Optional metadata filters
+            k: Number of results (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            List[DocumentResult]: List of relevant documents
+        """
+        payload = self._client._logic._prepare_retrieve_docs_request(
+            query, filters, k, min_score, use_colpali, self._name, None
+        )
+        response = await self._client._request("POST", "retrieve/docs", data=payload)
+        return self._client._logic._parse_document_result_list_response(response)
+
+    async def query(
+        self,
+        query: str,
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        use_colpali: bool = True,
+        graph_name: Optional[str] = None,
+        hop_depth: int = 1,
+        include_paths: bool = False,
+        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+    ) -> CompletionResponse:
+        """
+        Generate completion using relevant chunks as context within this folder.
+
+        Args:
+            query: Query text
+            filters: Optional metadata filters
+            k: Number of chunks to use as context (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+            max_tokens: Maximum tokens in completion
+            temperature: Model temperature
+            use_colpali: Whether to use ColPali-style embedding model
+            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
+            hop_depth: Number of relationship hops to traverse in the graph (1-3)
+            include_paths: Whether to include relationship paths in the response
+            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+
+        Returns:
+            CompletionResponse: Generated completion
+        """
+        payload = self._client._logic._prepare_query_request(
+            query,
+            filters,
+            k,
+            min_score,
+            max_tokens,
+            temperature,
+            use_colpali,
+            graph_name,
+            hop_depth,
+            include_paths,
+            prompt_overrides,
+            self._name,
+            None,
+        )
+        response = await self._client._request("POST", "query", data=payload)
+        return self._client._logic._parse_completion_response(response)
+
+    async def list_documents(
+        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        List accessible documents within this folder.
+
+        Args:
+            skip: Number of documents to skip
+            limit: Maximum number of documents to return
+            filters: Optional filters
+
+        Returns:
+            List[Document]: List of documents
+        """
+        params, data = self._client._logic._prepare_list_documents_request(
+            skip, limit, filters, self._name, None
+        )
+        response = await self._client._request("POST", "documents", data=data, params=params)
+        docs = self._client._logic._parse_document_list_response(response)
+        for doc in docs:
+            doc._client = self._client
+        return docs
+
+    async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
+        """
+        Retrieve multiple documents by their IDs in a single batch operation within this folder.
+
+        Args:
+            document_ids: List of document IDs to retrieve
+
+        Returns:
+            List[Document]: List of document metadata for found documents
+        """
+        request = self._client._logic._prepare_batch_get_documents_request(
+            document_ids, self._name, None
+        )
+        response = await self._client._request("POST", "batch/documents", data=request)
+        docs = self._client._logic._parse_document_list_response(response)
+        for doc in docs:
+            doc._client = self._client
+        return docs
+
+    async def batch_get_chunks(
+        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
+    ) -> List[FinalChunkResult]:
+        """
+        Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.
+
+        Args:
+            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
+
+        Returns:
+            List[FinalChunkResult]: List of chunk results
+        """
+        request = self._client._logic._prepare_batch_get_chunks_request(sources, self._name, None)
+        response = await self._client._request("POST", "batch/chunks", data=request)
+        return self._client._logic._parse_chunk_result_list_response(response)
+
+    async def create_graph(
+        self,
+        name: str,
+        filters: Optional[Dict[str, Any]] = None,
+        documents: Optional[List[str]] = None,
+        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
+    ) -> Graph:
+        """
+        Create a graph from documents within this folder.
+
+        Args:
+            name: Name of the graph to create
+            filters: Optional metadata filters to determine which documents to include
+            documents: Optional list of specific document IDs to include
+            prompt_overrides: Optional customizations for entity extraction and resolution prompts
+
+        Returns:
+            Graph: The created graph object
+        """
+        request = self._client._logic._prepare_create_graph_request(
+            name, filters, documents, prompt_overrides, self._name, None
+        )
+        response = await self._client._request("POST", "graph/create", data=request)
+        return self._client._logic._parse_graph_response(response)
+
+    async def update_graph(
+        self,
+        name: str,
+        additional_filters: Optional[Dict[str, Any]] = None,
+        additional_documents: Optional[List[str]] = None,
+        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
+    ) -> Graph:
+        """
+        Update an existing graph with new documents from this folder.
+
+        Args:
+            name: Name of the graph to update
+            additional_filters: Optional additional metadata filters to determine which new documents to include
+            additional_documents: Optional list of additional document IDs to include
+            prompt_overrides: Optional customizations for entity extraction and resolution prompts
+
+        Returns:
+            Graph: The updated graph
+        """
+        request = self._client._logic._prepare_update_graph_request(
+            name, additional_filters, additional_documents, prompt_overrides, self._name, None
+        )
+        response = await self._client._request("POST", f"graph/{name}/update", data=request)
+        return self._client._logic._parse_graph_response(response)
+
+    async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
+        """
+        Delete a document by its filename within this folder.
+
+        Args:
+            filename: Filename of the document to delete
+
+        Returns:
+            Dict[str, str]: Deletion status
+        """
+        # Get the document by filename with folder scope
+        request = {"filename": filename, "folder_name": self._name}
+
+        # First get the document ID
+        response = await self._client._request(
+            "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
+        )
+        doc = self._client._logic._parse_document_response(response)
+
+        # Then delete by ID
+        return await self._client.delete_document(doc.external_id)
+
+
+class AsyncUserScope:
+    """
+    A user scope that allows operations to be scoped to a specific end user and optionally a folder.
+
+    Args:
+        client: The AsyncMorphik client instance
+        end_user_id: The ID of the end user
+        folder_name: Optional folder name to further scope operations
+    """
+
+    def __init__(self, client: "AsyncMorphik", end_user_id: str, folder_name: Optional[str] = None):
+        self._client = client
+        self._end_user_id = end_user_id
+        self._folder_name = folder_name
+
+    @property
+    def end_user_id(self) -> str:
+        """Returns the end user ID."""
+        return self._end_user_id
+
+    @property
+    def folder_name(self) -> Optional[str]:
+        """Returns the folder name if any."""
+        return self._folder_name
+
+    async def ingest_text(
+        self,
+        content: str,
+        filename: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+    ) -> Document:
+        """
+        Ingest a text document into Morphik as this end user.
+
+        Args:
+            content: Text content to ingest
+            filename: Optional file name
+            metadata: Optional metadata dictionary
+            rules: Optional list of rules to apply during ingestion
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            Document: Metadata of the ingested document
+        """
+        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
+        payload = self._client._logic._prepare_ingest_text_request(
+            content,
+            filename,
+            metadata,
+            rules_list,
+            use_colpali,
+            self._folder_name,
+            self._end_user_id,
+        )
+        response = await self._client._request("POST", "ingest/text", data=payload)
+        doc = self._client._logic._parse_document_response(response)
+        doc._client = self._client
+        return doc
+
+    async def ingest_file(
+        self,
+        file: Union[str, bytes, BinaryIO, Path],
+        filename: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+    ) -> Document:
+        """
+        Ingest a file document into Morphik as this end user.
+
+        Args:
+            file: File to ingest (path string, bytes, file object, or Path)
+            filename: Name of the file
+            metadata: Optional metadata dictionary
+            rules: Optional list of rules to apply during ingestion
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            Document: Metadata of the ingested document
+        """
+        # Handle different file input types
+        if isinstance(file, (str, Path)):
+            file_path = Path(file)
+            if not file_path.exists():
+                raise ValueError(f"File not found: {file}")
+            filename = file_path.name if filename is None else filename
+            with open(file_path, "rb") as f:
+                content = f.read()
+                file_obj = BytesIO(content)
+        elif isinstance(file, bytes):
+            if filename is None:
+                raise ValueError("filename is required when ingesting bytes")
+            file_obj = BytesIO(file)
+        else:
+            if filename is None:
+                raise ValueError("filename is required when ingesting file object")
+            file_obj = file
+
+        try:
+            # Prepare multipart form data
+            files = {"file": (filename, file_obj)}
+
+            # Add metadata and rules
+            data = {
+                "metadata": json.dumps(metadata or {}),
+                "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]),
+                "end_user_id": self._end_user_id,  # Add end user ID here
+            }
+
+            # Add folder name if scoped to a folder
+            if self._folder_name:
+                data["folder_name"] = self._folder_name
+
+            response = await self._client._request("POST", "ingest/file", data=data, files=files)
+            doc = self._client._logic._parse_document_response(response)
+            doc._client = self._client
+            return doc
+        finally:
+            # Close file if we opened it
+            if isinstance(file, (str, Path)):
+                file_obj.close()
+
+    async def ingest_files(
+        self,
+        files: List[Union[str, bytes, BinaryIO, Path]],
+        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest multiple files into Morphik as this end user.
+
+        Args:
+            files: List of files to ingest
+            metadata: Optional metadata
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of ingested documents
+        """
+        # Convert files to format expected by API
+        file_objects = []
+        for file in files:
+            if isinstance(file, (str, Path)):
+                path = Path(file)
+                file_objects.append(("files", (path.name, open(path, "rb"))))
+            elif isinstance(file, bytes):
+                file_objects.append(("files", ("file.bin", file)))
+            else:
+                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
+
+        try:
+            # Prepare request data
+            # Convert rules appropriately
+            if rules:
+                if all(isinstance(r, list) for r in rules):
+                    # List of lists - per-file rules
+                    converted_rules = [
+                        [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
+                    ]
+                else:
+                    # Flat list - shared rules for all files
+                    converted_rules = [self._client._convert_rule(r) for r in rules]
+            else:
+                converted_rules = []
+
+            data = {
+                "metadata": json.dumps(metadata or {}),
+                "rules": json.dumps(converted_rules),
+                "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
+                "parallel": str(parallel).lower(),
+                "end_user_id": self._end_user_id,  # Add end user ID here
+            }
+
+            # Add folder name if scoped to a folder
+            if self._folder_name:
+                data["folder_name"] = self._folder_name
+
+            response = await self._client._request(
+                "POST", "ingest/files", data=data, files=file_objects
+            )
+
+            if response.get("errors"):
+                # Log errors but don't raise exception
+                for error in response["errors"]:
+                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
+
+            docs = [
+                self._client._logic._parse_document_response(doc) for doc in response["documents"]
+            ]
+            for doc in docs:
+                doc._client = self._client
+            return docs
+        finally:
+            # Clean up file objects
+            for _, (_, file_obj) in file_objects:
+                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
+                    file_obj.close()
+
+    async def ingest_directory(
+        self,
+        directory: Union[str, Path],
+        recursive: bool = False,
+        pattern: str = "*",
+        metadata: Optional[Dict[str, Any]] = None,
+        rules: Optional[List[RuleOrDict]] = None,
+        use_colpali: bool = True,
+        parallel: bool = True,
+    ) -> List[Document]:
+        """
+        Ingest all files in a directory into Morphik as this end user.
+
+        Args:
+            directory: Path to directory containing files to ingest
+            recursive: Whether to recursively process subdirectories
+            pattern: Optional glob pattern to filter files
+            metadata: Optional metadata dictionary to apply to all files
+            rules: Optional list of rules to apply
+            use_colpali: Whether to use ColPali-style embedding
+            parallel: Whether to process files in parallel
+
+        Returns:
+            List[Document]: List of ingested documents
+        """
+        directory = Path(directory)
+        if not directory.is_dir():
+            raise ValueError(f"Directory not found: {directory}")
+
+        # Collect all files matching pattern
+        if recursive:
+            files = list(directory.rglob(pattern))
+        else:
+            files = list(directory.glob(pattern))
+
+        # Filter out directories
+        files = [f for f in files if f.is_file()]
+
+        if not files:
+            return []
+
+        # Use ingest_files with collected paths
+        return await self.ingest_files(
+            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
+        )
+
+    async def retrieve_chunks(
+        self,
+        query: str,
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0,
+        use_colpali: bool = True,
+    ) -> List[FinalChunkResult]:
+        """
+        Retrieve relevant chunks as this end user.
+
+        Args:
+            query: Search query text
+            filters: Optional metadata filters
+            k: Number of results (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            List[FinalChunkResult]: List of relevant chunks
+        """
+        payload = self._client._logic._prepare_retrieve_chunks_request(
+            query, filters, k, min_score, use_colpali, self._folder_name, self._end_user_id
+        )
+        response = await self._client._request("POST", "retrieve/chunks", data=payload)
+        return self._client._logic._parse_chunk_result_list_response(response)
+
+    async def retrieve_docs(
+        self,
+        query: str,
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0,
+        use_colpali: bool = True,
+    ) -> List[DocumentResult]:
+        """
+        Retrieve relevant documents as this end user.
+
+        Args:
+            query: Search query text
+            filters: Optional metadata filters
+            k: Number of results (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+            use_colpali: Whether to use ColPali-style embedding model
+
+        Returns:
+            List[DocumentResult]: List of relevant documents
+        """
+        payload = self._client._logic._prepare_retrieve_docs_request(
+            query, filters, k, min_score, use_colpali, self._folder_name, self._end_user_id
+        )
+        response = await self._client._request("POST", "retrieve/docs", data=payload)
+        return self._client._logic._parse_document_result_list_response(response)
+
+    async def query(
+        self,
+        query: str,
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        use_colpali: bool = True,
+        graph_name: Optional[str] = None,
+        hop_depth: int = 1,
+        include_paths: bool = False,
+        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
+    ) -> CompletionResponse:
+        """
+        Generate completion using relevant chunks as context as this end user.
+
+        Args:
+            query: Query text
+            filters: Optional metadata filters
+            k: Number of chunks to use as context (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+            max_tokens: Maximum tokens in completion
+            temperature: Model temperature
+            use_colpali: Whether to use ColPali-style embedding model
+            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
+            hop_depth: Number of relationship hops to traverse in the graph (1-3)
+            include_paths: Whether to include relationship paths in the response
+            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
+
+        Returns:
+            CompletionResponse: Generated completion
+        """
+        payload = self._client._logic._prepare_query_request(
+            query,
+            filters,
+            k,
+            min_score,
+            max_tokens,
+            temperature,
+            use_colpali,
+            graph_name,
+            hop_depth,
+            include_paths,
+            prompt_overrides,
+            self._folder_name,
+            self._end_user_id,
+        )
+        response = await self._client._request("POST", "query", data=payload)
+        return self._client._logic._parse_completion_response(response)
+
+    async def list_documents(
+        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        List accessible documents for this end user.
+
+        Args:
+            skip: Number of documents to skip
+            limit: Maximum number of documents to return
+            filters: Optional filters
+
+        Returns:
+            List[Document]: List of documents
+        """
+        params, data = self._client._logic._prepare_list_documents_request(
+            skip, limit, filters, self._folder_name, self._end_user_id
+        )
+        response = await self._client._request("POST", "documents", data=data, params=params)
+        docs = self._client._logic._parse_document_list_response(response)
+        for doc in docs:
+            doc._client = self._client
+        return docs
+
+    async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
+        """
+        Retrieve multiple documents by their IDs in a single batch operation for this end user.
+
+        Args:
+            document_ids: List of document IDs to retrieve
+
+        Returns:
+            List[Document]: List of document metadata for found documents
+        """
+        request = self._client._logic._prepare_batch_get_documents_request(
+            document_ids, self._folder_name, self._end_user_id
+        )
+        response = await self._client._request("POST", "batch/documents", data=request)
+        docs = self._client._logic._parse_document_list_response(response)
+        for doc in docs:
+            doc._client = self._client
+        return docs
+
+    async def batch_get_chunks(
+        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
+    ) -> List[FinalChunkResult]:
+        """
+        Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.

-
-
-    ChunkResult,
-    DocumentResult,
-    CompletionResponse,
-    IngestTextRequest,
-    ChunkSource,
-    Graph,
-    # Prompt override models
-    EntityExtractionExample,
-    EntityResolutionExample,
-    EntityExtractionPromptOverride,
-    EntityResolutionPromptOverride,
-    QueryPromptOverride,
-    GraphPromptOverrides,
-    QueryPromptOverrides
-)
-from .rules import Rule
+        Args:
+            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

-
+        Returns:
+            List[FinalChunkResult]: List of chunk results
+        """
+        request = self._client._logic._prepare_batch_get_chunks_request(
+            sources, self._folder_name, self._end_user_id
+        )
+        response = await self._client._request("POST", "batch/chunks", data=request)
+        return self._client._logic._parse_chunk_result_list_response(response)

-
-
+    async def create_graph(
+        self,
+        name: str,
+        filters: Optional[Dict[str, Any]] = None,
+        documents: Optional[List[str]] = None,
+        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
+    ) -> Graph:
+        """
+        Create a graph from documents for this end user.

+        Args:
+            name: Name of the graph to create
+            filters: Optional metadata filters to determine which documents to include
+            documents: Optional list of specific document IDs to include
+            prompt_overrides: Optional customizations for entity extraction and resolution prompts

-
-
-
-
+        Returns:
+            Graph: The created graph object
+        """
+        request = self._client._logic._prepare_create_graph_request(
+            name, filters, documents, prompt_overrides, self._folder_name, self._end_user_id
+        )
+        response = await self._client._request("POST", "graph/create", data=request)
+        return self._client._logic._parse_graph_response(response)

-    async def
-
-
+    async def update_graph(
+        self,
+        name: str,
+        additional_filters: Optional[Dict[str, Any]] = None,
+        additional_documents: Optional[List[str]] = None,
+        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
+    ) -> Graph:
+        """
+        Update an existing graph with new documents for this end user.

-
-
-
+        Args:
+            name: Name of the graph to update
+            additional_filters: Optional additional metadata filters to determine which new documents to include
+            additional_documents: Optional list of additional document IDs to include
+            prompt_overrides: Optional customizations for entity extraction and resolution prompts

-
-
-
-
-
-
-
-
+        Returns:
+            Graph: The updated graph
+        """
+        request = self._client._logic._prepare_update_graph_request(
+            name,
+            additional_filters,
+            additional_documents,
+            prompt_overrides,
+            self._folder_name,
+            self._end_user_id,
         )
-
+        response = await self._client._request("POST", f"graph/{name}/update", data=request)
+        return self._client._logic._parse_graph_response(response)
+
+    async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
+        """
+        Delete a document by its filename for this end user.
+
+        Args:
+            filename: Filename of the document to delete
+
+        Returns:
+            Dict[str, str]: Deletion status
+        """
+        # Build parameters for the filename lookup
+        params = {"end_user_id": self._end_user_id}

+        # Add folder name if scoped to a folder
+        if self._folder_name:
+            params["folder_name"] = self._folder_name

-
-
-
-
-
-    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
-    content_type: str = Field(..., description="Content type")
-    filename: Optional[str] = Field(None, description="Original filename")
-    download_url: Optional[str] = Field(None, description="URL to download full document")
+        # First get the document ID
+        response = await self._client._request(
+            "GET", f"documents/filename/{filename}", params=params
+        )
+        doc = self._client._logic._parse_document_response(response)

-
-
+        # Then delete by ID
+        return await self._client.delete_document(doc.external_id)


 class AsyncMorphik:
@@ -97,39 +1009,12 @@ class AsyncMorphik:
     """

     def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
-        self.
-        self._client = (
-
-
-            else
-            timeout=timeout,
-            verify=False,  # Disable SSL for localhost
-            http2=False,  # Force HTTP/1.1
-        )
+        self._logic = _MorphikClientLogic(uri, timeout, is_local)
+        self._client = httpx.AsyncClient(
+            timeout=self._logic._timeout,
+            verify=not self._logic._is_local,
+            http2=False if self._logic._is_local else True,
         )
-        self._is_local = is_local
-
-        if uri:
-            self._setup_auth(uri)
-        else:
-            self._base_url = "http://localhost:8000"
-            self._auth_token = None
-
-    def _setup_auth(self, uri: str) -> None:
-        """Setup authentication from URI"""
-        parsed = urlparse(uri)
-        if not parsed.netloc:
-            raise ValueError("Invalid URI format")
-
-        # Split host and auth parts
-        auth, host = parsed.netloc.split("@")
-        _, self._auth_token = auth.split(":")
-
-        # Set base URL
-        self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
-
-        # Basic token validation
-        jwt.decode(self._auth_token, options={"verify_signature": False})

     async def _request(
         self,
@@ -140,9 +1025,10 @@ class AsyncMorphik:
         params: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, Any]:
         """Make HTTP request"""
-
-
-
+        url = self._logic._get_url(endpoint)
+        headers = self._logic._get_headers()
+        if self._logic._auth_token:  # Only add auth header if we have a token
+            headers["Authorization"] = f"Bearer {self._logic._auth_token}"

         # Configure request data based on type
         if files:
@@ -156,7 +1042,7 @@ class AsyncMorphik:

         response = await self._client.request(
             method,
-
+            url,
             headers=headers,
             params=params,
             **request_data,
@@ -166,9 +1052,43 @@ class AsyncMorphik:

     def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
         """Convert a rule to a dictionary format"""
-
-
-
+        return self._logic._convert_rule(rule)
+
+    def create_folder(self, name: str) -> AsyncFolder:
+        """
+        Create a folder to scope operations.
+
+        Args:
+            name: The name of the folder
+
+        Returns:
+            AsyncFolder: A folder object for scoped operations
+        """
+        return AsyncFolder(self, name)
+
+    def get_folder(self, name: str) -> AsyncFolder:
+        """
+        Get a folder by name to scope operations.
+
+        Args:
+            name: The name of the folder
+
+        Returns:
+            AsyncFolder: A folder object for scoped operations
+        """
+        return AsyncFolder(self, name)
+
+    def signin(self, end_user_id: str) -> AsyncUserScope:
+        """
+        Sign in as an end user to scope operations.
+
+        Args:
+            end_user_id: The ID of the end user
+
+        Returns:
+            AsyncUserScope: A user scope object for scoped operations
+        """
+        return AsyncUserScope(self, end_user_id)

     async def ingest_text(
         self,
@@ -213,53 +1133,41 @@ class AsyncMorphik:
            )
            ```
        """
-
-
-            filename
-            metadata=metadata or {},
-            rules=[self._convert_rule(r) for r in (rules or [])],
-            use_colpali=use_colpali,
+        rules_list = [self._convert_rule(r) for r in (rules or [])]
+        payload = self._logic._prepare_ingest_text_request(
+            content, filename, metadata, rules_list, use_colpali, None, None
         )
-        response = await self._request("POST", "ingest/text", data=
-        doc =
+        response = await self._request("POST", "ingest/text", data=payload)
+        doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc

     async def ingest_file(
         self,
         file: Union[str, bytes, BinaryIO, Path],
-        filename: str,
+        filename: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         rules: Optional[List[RuleOrDict]] = None,
         use_colpali: bool = True,
     ) -> Document:
         """Ingest a file document into Morphik."""
-        #
-
-            file_path = Path(file)
-            if not file_path.exists():
-                raise ValueError(f"File not found: {file}")
-            with open(file_path, "rb") as f:
-                content = f.read()
-                file_obj = BytesIO(content)
-        elif isinstance(file, bytes):
-            file_obj = BytesIO(file)
-        else:
-            file_obj = file
+        # Process file input
+        file_obj, filename = self._logic._prepare_file_for_upload(file, filename)

         try:
             # Prepare multipart form data
             files = {"file": (filename, file_obj)}

-            #
-
-                "metadata": json.dumps(metadata or {}),
-                "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
-                "use_colpali": json.dumps(use_colpali),
-            }
+            # Create form data
+            form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)

-            response = await self._request(
-
+            response = await self._request(
+                "POST",
+                f"ingest/file?use_colpali={str(use_colpali).lower()}",
+                data=form_data,
+                files=files,
+            )
+            doc = self._logic._parse_document_response(response)
             doc._client = self
             return doc
         finally:
@@ -292,44 +1200,23 @@ class AsyncMorphik:
            ValueError: If metadata list length doesn't match files length
        """
         # Convert files to format expected by API
-        file_objects =
-        for file in files:
-            if isinstance(file, (str, Path)):
-                path = Path(file)
-                file_objects.append(("files", (path.name, open(path, "rb"))))
-            elif isinstance(file, bytes):
-                file_objects.append(("files", ("file.bin", file)))
-            else:
-                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
+        file_objects = self._logic._prepare_files_for_upload(files)

         try:
-            # Prepare
-
-
-
-                    # List of lists - per-file rules
-                    converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
-                else:
-                    # Flat list - shared rules for all files
-                    converted_rules = [self._convert_rule(r) for r in rules]
-            else:
-                converted_rules = []
-
-            data = {
-                "metadata": json.dumps(metadata or {}),
-                "rules": json.dumps(converted_rules),
-                "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
-                "parallel": str(parallel).lower(),
-            }
+            # Prepare form data
+            data = self._logic._prepare_ingest_files_form_data(
+                metadata, rules, use_colpali, parallel, None, None
+            )

             response = await self._request("POST", "ingest/files", data=data, files=file_objects)
-
+
             if response.get("errors"):
                 # Log errors but don't raise exception
                 for error in response["errors"]:
                     logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
-
-
+
+            # Parse the documents from the response
+            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
             for doc in docs:
                 doc._client = self
             return docs
@@ -379,17 +1266,13 @@ class AsyncMorphik:

         # Filter out directories
         files = [f for f in files if f.is_file()]
-
+
         if not files:
             return []

         # Use ingest_files with collected paths
         return await self.ingest_files(
-            files=files,
-            metadata=metadata,
-            rules=rules,
-            use_colpali=use_colpali,
-            parallel=parallel
+            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
         )

     async def retrieve_chunks(
@@ -420,54 +1303,11 @@ class AsyncMorphik:
            )
            ```
        """
-
-
-
-
-
-            "use_colpali": use_colpali,
-        }
-
-        response = await self._request("POST", "retrieve/chunks", data=request)
-        chunks = [ChunkResult(**r) for r in response]
-
-        final_chunks = []
-        for chunk in chunks:
-            if chunk.metadata.get("is_image"):
-                try:
-                    # Handle data URI format "data:image/png;base64,..."
-                    content = chunk.content
-                    if content.startswith("data:"):
-                        # Extract the base64 part after the comma
-                        content = content.split(",", 1)[1]
-
-                    # Now decode the base64 string
-                    import base64
-                    import io
-                    from PIL import Image
-                    image_bytes = base64.b64decode(content)
-                    content = Image.open(io.BytesIO(image_bytes))
-                except Exception as e:
-                    print(f"Error processing image: {str(e)}")
-                    # Fall back to using the content as text
-                    content = chunk.content
-            else:
-                content = chunk.content
-
-            final_chunks.append(
-                FinalChunkResult(
-                    content=content,
-                    score=chunk.score,
-                    document_id=chunk.document_id,
-                    chunk_number=chunk.chunk_number,
-                    metadata=chunk.metadata,
-                    content_type=chunk.content_type,
-                    filename=chunk.filename,
-                    download_url=chunk.download_url,
-                )
-            )
-
-        return final_chunks
+        payload = self._logic._prepare_retrieve_chunks_request(
+            query, filters, k, min_score, use_colpali, None, None
+        )
+        response = await self._request("POST", "retrieve/chunks", data=payload)
+        return self._logic._parse_chunk_result_list_response(response)

     async def retrieve_docs(
         self,
@@ -497,16 +1337,11 @@ class AsyncMorphik:
            )
            ```
        """
-
-
-
-
-
-            "use_colpali": use_colpali,
-        }
-
-        response = await self._request("POST", "retrieve/docs", data=request)
-        return [DocumentResult(**r) for r in response]
+        payload = self._logic._prepare_retrieve_docs_request(
+            query, filters, k, min_score, use_colpali, None, None
+        )
+        response = await self._request("POST", "retrieve/docs", data=payload)
+        return self._logic._parse_document_result_list_response(response)

     async def query(
         self,
@@ -549,7 +1384,7 @@ class AsyncMorphik:
                filters={"department": "research"},
                temperature=0.7
            )
-
+
            # Knowledge graph enhanced query
            response = await db.query(
                "How does product X relate to customer segment Y?",
@@ -557,7 +1392,7 @@ class AsyncMorphik:
                hop_depth=2,
                include_paths=True
            )
-
+
            # With prompt customization
            from morphik.models import QueryPromptOverride, QueryPromptOverrides
            response = await db.query(
@@ -568,7 +1403,7 @@ class AsyncMorphik:
                    )
                )
            )
-
+
            # Or using a dictionary
            response = await db.query(
                "What are the key findings?",
@@ -578,35 +1413,32 @@ class AsyncMorphik:
                    }
                }
            )
-
+
            print(response.completion)
-
+
            # If include_paths=True, you can inspect the graph paths
            if response.metadata and "graph" in response.metadata:
                for path in response.metadata["graph"]["paths"]:
                    print(" -> ".join(path))
            ```
        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        response = await self._request("POST", "query", data=request)
-        return CompletionResponse(**response)
+        payload = self._logic._prepare_query_request(
+            query,
+            filters,
+            k,
+            min_score,
+            max_tokens,
+            temperature,
+            use_colpali,
+            graph_name,
+            hop_depth,
+            include_paths,
+            prompt_overrides,
+            None,
+            None,
+        )
+        response = await self._request("POST", "query", data=payload)
+        return self._logic._parse_completion_response(response)

     async def list_documents(
         self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
@@ -631,11 +1463,9 @@ class AsyncMorphik:
            next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
            ```
        """
-
-        response = await self._request(
-
-        )
-        docs = [Document(**doc) for doc in response]
+        params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
+        response = await self._request("POST", "documents", data=data, params=params)
+        docs = self._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self
         return docs
@@ -657,10 +1487,10 @@ class AsyncMorphik:
            ```
        """
         response = await self._request("GET", f"documents/{document_id}")
-        doc =
+        doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
-
+
     async def get_document_by_filename(self, filename: str) -> Document:
         """
         Get document metadata by filename.
@@ -679,10 +1509,10 @@ class AsyncMorphik:
            ```
        """
         response = await self._request("GET", f"documents/filename/{filename}")
-        doc =
+        doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
-
+
     async def update_document_with_text(
         self,
         document_id: str,
@@ -695,7 +1525,7 @@ class AsyncMorphik:
     ) -> Document:
         """
         Update a document with new text content using the specified strategy.
-
+
         Args:
             document_id: ID of the document to update
             content: The new content to add
@@ -704,10 +1534,10 @@ class AsyncMorphik:
             rules: Optional list of rules to apply to the content
             update_strategy: Strategy for updating the document (currently only 'add' is supported)
             use_colpali: Whether to use multi-vector embedding
-
+
         Returns:
             Document: Updated document metadata
-
+
         Example:
            ```python
            # Add new content to an existing document
@@ -729,22 +1559,19 @@ class AsyncMorphik:
|
|
729
1559
|
rules=[self._convert_rule(r) for r in (rules or [])],
|
730
1560
|
use_colpali=use_colpali if use_colpali is not None else True,
|
731
1561
|
)
|
732
|
-
|
1562
|
+
|
733
1563
|
params = {}
|
734
1564
|
if update_strategy != "add":
|
735
1565
|
params["update_strategy"] = update_strategy
|
736
|
-
|
1566
|
+
|
737
1567
|
response = await self._request(
|
738
|
-
"POST",
|
739
|
-
f"documents/{document_id}/update_text",
|
740
|
-
data=request.model_dump(),
|
741
|
-
params=params
|
1568
|
+
"POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params
|
742
1569
|
)
|
743
|
-
|
744
|
-
doc =
|
1570
|
+
|
1571
|
+
doc = self._logic._parse_document_response(response)
|
745
1572
|
doc._client = self
|
746
1573
|
return doc
|
747
|
-
|
1574
|
+
|
748
1575
|
async def update_document_with_file(
|
749
1576
|
self,
|
750
1577
|
document_id: str,
|
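
A minimal usage sketch of `update_document_with_text` as updated above (placeholder ID and content; `db` assumed to be an existing client):

```python
async def append_text(db):
    # 'add' is currently the only documented update strategy.
    updated = await db.update_document_with_text(
        document_id="doc_123",               # placeholder ID
        content="New section added in Q3.",
        metadata={"revised": True},
        update_strategy="add",
    )
    print(updated.external_id)
```
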
@@ -757,7 +1584,7 @@ class AsyncMorphik:
     ) -> Document:
         """
         Update a document with content from a file using the specified strategy.
-
+
         Args:
             document_id: ID of the document to update
             file: File to add (path string, bytes, file object, or Path)
@@ -766,10 +1593,10 @@ class AsyncMorphik:
             rules: Optional list of rules to apply to the content
             update_strategy: Strategy for updating the document (currently only 'add' is supported)
             use_colpali: Whether to use multi-vector embedding
-
+
         Returns:
             Document: Updated document metadata
-
+
         Example:
             ```python
             # Add content from a file to an existing document
@@ -799,34 +1626,34 @@ class AsyncMorphik:
             if filename is None:
                 raise ValueError("filename is required when updating with file object")
             file_obj = file
-
+
         try:
             # Prepare multipart form data
             files = {"file": (filename, file_obj)}
-
+
             # Convert metadata and rules to JSON strings
             form_data = {
                 "metadata": json.dumps(metadata or {}),
                 "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
                 "update_strategy": update_strategy,
             }
-
+
             if use_colpali is not None:
                 form_data["use_colpali"] = str(use_colpali).lower()
-
+
             # Use the dedicated file update endpoint
             response = await self._request(
                 "POST", f"documents/{document_id}/update_file", data=form_data, files=files
             )
-
-            doc =
+
+            doc = self._logic._parse_document_response(response)
             doc._client = self
             return doc
         finally:
             # Close file if we opened it
             if isinstance(file, (str, Path)):
                 file_obj.close()
-
+
     async def update_document_metadata(
         self,
         document_id: str,
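
A minimal usage sketch of `update_document_with_file` (placeholder path and ID; `db` assumed; as shown above, an explicit filename is only required when passing a raw file object or bytes):

```python
async def add_from_file(db):
    # A path input carries its own name; bytes or file objects need filename=...
    updated = await db.update_document_with_file(
        document_id="doc_123",          # placeholder ID
        file="reports/q3_update.pdf",   # placeholder path
        metadata={"quarter": "Q3"},
        update_strategy="add",
    )
    print(updated.external_id)
```
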
@@ -834,14 +1661,14 @@ class AsyncMorphik:
     ) -> Document:
         """
         Update a document's metadata only.
-
+
         Args:
             document_id: ID of the document to update
             metadata: Metadata to update
-
+
         Returns:
             Document: Updated document metadata
-
+
         Example:
             ```python
             # Update just the metadata of a document
@@ -853,11 +1680,13 @@ class AsyncMorphik:
             ```
         """
         # Use the dedicated metadata update endpoint
-        response = await self._request(
-
+        response = await self._request(
+            "POST", f"documents/{document_id}/update_metadata", data=metadata
+        )
+        doc = self._logic._parse_document_response(response)
         doc._client = self
         return doc
-
+
     async def update_document_by_filename_with_text(
         self,
         filename: str,
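
A small sketch of the metadata-only update path (placeholder values; `db` assumed):

```python
async def tag_document(db):
    # Metadata-only changes go through the dedicated update_metadata endpoint.
    doc = await db.update_document_metadata(
        document_id="doc_123",  # placeholder ID
        metadata={"status": "reviewed", "owner": "alice"},
    )
    print(doc.metadata)
```
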
@@ -898,7 +1727,7 @@ class AsyncMorphik:
         """
         # First get the document by filename to obtain its ID
         doc = await self.get_document_by_filename(filename)
-
+
         # Then use the regular update_document_with_text endpoint with the document ID
         return await self.update_document_with_text(
             document_id=doc.external_id,
@@ -907,9 +1736,9 @@ class AsyncMorphik:
             metadata=metadata,
             rules=rules,
             update_strategy=update_strategy,
-            use_colpali=use_colpali
+            use_colpali=use_colpali,
         )
-
+
     async def update_document_by_filename_with_file(
         self,
         filename: str,
@@ -949,7 +1778,7 @@ class AsyncMorphik:
         """
         # First get the document by filename to obtain its ID
         doc = await self.get_document_by_filename(filename)
-
+
         # Then use the regular update_document_with_file endpoint with the document ID
         return await self.update_document_with_file(
             document_id=doc.external_id,
@@ -958,9 +1787,9 @@ class AsyncMorphik:
             metadata=metadata,
             rules=rules,
             update_strategy=update_strategy,
-            use_colpali=use_colpali
+            use_colpali=use_colpali,
         )
-
+
     async def update_document_by_filename_metadata(
         self,
         filename: str,
@@ -969,15 +1798,15 @@ class AsyncMorphik:
     ) -> Document:
         """
         Update a document's metadata using filename to identify the document.
-
+
         Args:
             filename: Filename of the document to update
             metadata: Metadata to update
             new_filename: Optional new filename to assign to the document
-
+
         Returns:
             Document: Updated document metadata
-
+
         Example:
             ```python
             # Update just the metadata of a document identified by filename
@@ -991,44 +1820,44 @@ class AsyncMorphik:
         """
         # First get the document by filename to obtain its ID
         doc = await self.get_document_by_filename(filename)
-
+
         # Update the metadata
         result = await self.update_document_metadata(
             document_id=doc.external_id,
             metadata=metadata,
         )
-
+
         # If new_filename is provided, update the filename as well
         if new_filename:
             # Create a request that retains the just-updated metadata but also changes filename
            combined_metadata = result.metadata.copy()
-
+
             # Update the document again with filename change and the same metadata
             response = await self._request(
-                "POST",
-                f"documents/{doc.external_id}/update_text",
+                "POST",
+                f"documents/{doc.external_id}/update_text",
                 data={
-                    "content": "",
+                    "content": "",
                     "filename": new_filename,
                     "metadata": combined_metadata,
-                    "rules": []
-                }
+                    "rules": [],
+                },
             )
-            result =
+            result = self._logic._parse_document_response(response)
             result._client = self
-
+
         return result
-
+
     async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
         """
         Retrieve multiple documents by their IDs in a single batch operation.
-
+
         Args:
             document_ids: List of document IDs to retrieve
-
+
         Returns:
             List[Document]: List of document metadata for found documents
-
+
         Example:
             ```python
             docs = await db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
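
A small sketch of the filename-scoped metadata update with the optional rename handled above (placeholder names; `db` assumed):

```python
async def rename_and_tag(db):
    # Updates metadata first, then re-posts with the new filename when one is given.
    doc = await db.update_document_by_filename_metadata(
        filename="old_name.pdf",       # placeholder
        metadata={"reviewed": True},
        new_filename="new_name.pdf",   # optional rename
    )
    print(doc.external_id, doc.metadata)
```
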
@@ -1036,22 +1865,25 @@ class AsyncMorphik:
                 print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
             ```
         """
-
-
+        request = self._logic._prepare_batch_get_documents_request(document_ids, None, None)
+        response = await self._request("POST", "batch/documents", data=request)
+        docs = self._logic._parse_document_list_response(response)
         for doc in docs:
             doc._client = self
         return docs
-
-    async def batch_get_chunks(
+
+    async def batch_get_chunks(
+        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
+    ) -> List[FinalChunkResult]:
         """
         Retrieve specific chunks by their document ID and chunk number in a single batch operation.
-
+
         Args:
             sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
-
+
         Returns:
             List[FinalChunkResult]: List of chunk results
-
+
         Example:
             ```python
             # Using dictionaries
@@ -1059,67 +1891,22 @@ class AsyncMorphik:
                 {"document_id": "doc_123", "chunk_number": 0},
                 {"document_id": "doc_456", "chunk_number": 2}
             ]
-
+
             # Or using ChunkSource objects
             from morphik.models import ChunkSource
             sources = [
                 ChunkSource(document_id="doc_123", chunk_number=0),
                 ChunkSource(document_id="doc_456", chunk_number=2)
             ]
-
+
             chunks = await db.batch_get_chunks(sources)
             for chunk in chunks:
                 print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
             ```
         """
-
-
-
-            if isinstance(source, dict):
-                source_dicts.append(source)
-            else:
-                source_dicts.append(source.model_dump())
-
-        response = await self._request("POST", "batch/chunks", data=source_dicts)
-        chunks = [ChunkResult(**r) for r in response]
-
-        final_chunks = []
-        for chunk in chunks:
-            if chunk.metadata.get("is_image"):
-                try:
-                    # Handle data URI format "data:image/png;base64,..."
-                    content = chunk.content
-                    if content.startswith("data:"):
-                        # Extract the base64 part after the comma
-                        content = content.split(",", 1)[1]
-
-                    # Now decode the base64 string
-                    import base64
-                    import io
-                    from PIL import Image
-                    image_bytes = base64.b64decode(content)
-                    content = Image.open(io.BytesIO(image_bytes))
-                except Exception as e:
-                    print(f"Error processing image: {str(e)}")
-                    # Fall back to using the content as text
-                    content = chunk.content
-            else:
-                content = chunk.content
-
-            final_chunks.append(
-                FinalChunkResult(
-                    content=content,
-                    score=chunk.score,
-                    document_id=chunk.document_id,
-                    chunk_number=chunk.chunk_number,
-                    metadata=chunk.metadata,
-                    content_type=chunk.content_type,
-                    filename=chunk.filename,
-                    download_url=chunk.download_url,
-                )
-            )
-
-        return final_chunks
+        request = self._logic._prepare_batch_get_chunks_request(sources, None, None)
+        response = await self._request("POST", "batch/chunks", data=request)
+        return self._logic._parse_chunk_result_list_response(response)
 
     async def create_cache(
         self,
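
The image handling removed above now sits behind `_parse_chunk_result_list_response`; assuming that parser preserves the old behaviour, callers still receive either text or a PIL image in `chunk.content`. A small handling sketch (placeholder source; `db` assumed):

```python
from PIL.Image import Image as PILImage

async def inspect_chunks(db):
    chunks = await db.batch_get_chunks(
        [{"document_id": "doc_123", "chunk_number": 0}]  # placeholder source
    )
    for chunk in chunks:
        if isinstance(chunk.content, PILImage):
            # Image chunks are decoded to PIL images on the client side.
            print(chunk.document_id, chunk.content.size)
        else:
            print(chunk.document_id, chunk.content[:80])
```
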
@@ -1221,11 +2008,11 @@ class AsyncMorphik:
                 name="custom_graph",
                 documents=["doc1", "doc2", "doc3"]
             )
-
+
             # With custom entity extraction examples
             from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
             graph = await db.create_graph(
-                name="medical_graph",
+                name="medical_graph",
                 filters={"category": "medical"},
                 prompt_overrides=GraphPromptOverrides(
                     entity_extraction=EntityExtractionPromptOverride(
@@ -1238,19 +2025,11 @@ class AsyncMorphik:
             )
             ```
         """
-
-
-
-
-
-            "name": name,
-            "filters": filters,
-            "documents": documents,
-            "prompt_overrides": prompt_overrides,
-        }
-
-        response = await self._request("POST", "graph/create", request)
-        return Graph(**response)
+        request = self._logic._prepare_create_graph_request(
+            name, filters, documents, prompt_overrides, None, None
+        )
+        response = await self._request("POST", "graph/create", data=request)
+        return self._logic._parse_graph_response(response)
 
     async def get_graph(self, name: str) -> Graph:
         """
@@ -1270,7 +2049,7 @@ class AsyncMorphik:
             ```
         """
         response = await self._request("GET", f"graph/{name}")
-        return
+        return self._logic._parse_graph_response(response)
 
     async def list_graphs(self) -> List[Graph]:
         """
@@ -1288,7 +2067,7 @@ class AsyncMorphik:
             ```
         """
         response = await self._request("GET", "graphs")
-        return
+        return self._logic._parse_graph_list_response(response)
 
     async def update_graph(
         self,
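
A brief usage sketch for the graph read paths changed above (placeholder graph name; `db` assumed; attribute access is illustrative):

```python
async def show_graphs(db):
    # List every accessible graph, then fetch one by name.
    graphs = await db.list_graphs()
    for g in graphs:
        print(g.name)  # field access is illustrative
    graph = await db.get_graph("research_graph")  # placeholder name
    print(graph)
```
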
@@ -1332,7 +2111,7 @@ class AsyncMorphik:
                     entity_resolution=EntityResolutionPromptOverride(
                         examples=[
                             EntityResolutionExample(
-                                canonical="Machine Learning",
+                                canonical="Machine Learning",
                                 variants=["ML", "machine learning", "AI/ML"]
                             )
                         ]
@@ -1341,34 +2120,27 @@ class AsyncMorphik:
             )
             ```
         """
-
-
-
-
-
-
-            "additional_documents": additional_documents,
-            "prompt_overrides": prompt_overrides,
-        }
-
-        response = await self._request("POST", f"graph/{name}/update", request)
-        return Graph(**response)
-
+        request = self._logic._prepare_update_graph_request(
+            name, additional_filters, additional_documents, prompt_overrides, None, None
+        )
+        response = await self._request("POST", f"graph/{name}/update", data=request)
+        return self._logic._parse_graph_response(response)
+
     async def delete_document(self, document_id: str) -> Dict[str, str]:
         """
         Delete a document and all its associated data.
-
+
         This method deletes a document and all its associated data, including:
         - Document metadata
         - Document content in storage
         - Document chunks and embeddings in vector store
-
+
         Args:
             document_id: ID of the document to delete
-
+
         Returns:
             Dict[str, str]: Deletion status
-
+
         Example:
             ```python
             # Delete a document
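
A minimal sketch of `update_graph` as rewritten above (placeholder name, filters, and document IDs; `db` assumed):

```python
async def grow_graph(db):
    # Extend an existing graph with additional documents and/or filters.
    graph = await db.update_graph(
        name="research_graph",                   # placeholder graph name
        additional_filters={"category": "new"},
        additional_documents=["doc_456"],
    )
    print(graph)
```
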
@@ -1378,20 +2150,20 @@ class AsyncMorphik:
         """
         response = await self._request("DELETE", f"documents/{document_id}")
         return response
-
+
     async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
         """
         Delete a document by its filename.
-
+
         This is a convenience method that first retrieves the document ID by filename
         and then deletes the document by ID.
-
+
         Args:
             filename: Filename of the document to delete
-
+
         Returns:
             Dict[str, str]: Deletion status
-
+
         Example:
             ```python
             # Delete a document by filename
@@ -1401,7 +2173,7 @@ class AsyncMorphik:
         """
         # First get the document by filename to obtain its ID
         doc = await self.get_document_by_filename(filename)
-
+
         # Then delete the document by ID
         return await self.delete_document(doc.external_id)