morphik 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +507 -0
- morphik/async_.py +1159 -381
- morphik/models.py +38 -25
- morphik/sync.py +1247 -358
- {morphik-0.1.0.dist-info → morphik-0.1.1.dist-info}/METADATA +1 -1
- morphik-0.1.1.dist-info/RECORD +10 -0
- morphik-0.1.0.dist-info/RECORD +0 -9
- {morphik-0.1.0.dist-info → morphik-0.1.1.dist-info}/WHEEL +0 -0
morphik/async_.py
CHANGED
@@ -3,12 +3,9 @@ import json
|
|
3
3
|
import logging
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
6
|
-
from urllib.parse import urlparse
|
7
6
|
|
8
7
|
import httpx
|
9
|
-
import jwt
|
10
8
|
from PIL.Image import Image as PILImage
|
11
|
-
from pydantic import BaseModel, Field
|
12
9
|
|
13
10
|
from .models import (
|
14
11
|
Document,
|
@@ -25,53 +22,974 @@ from .models import (
|
|
25
22
|
EntityResolutionPromptOverride,
|
26
23
|
QueryPromptOverride,
|
27
24
|
GraphPromptOverrides,
|
28
|
-
QueryPromptOverrides
|
25
|
+
QueryPromptOverrides,
|
29
26
|
)
|
30
27
|
from .rules import Rule
|
28
|
+
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
31
29
|
|
32
30
|
logger = logging.getLogger(__name__)
|
33
31
|
|
34
|
-
# Type alias for rules
|
35
|
-
RuleOrDict = Union[Rule, Dict[str, Any]]
|
36
32
|
|
33
|
+
class AsyncCache:
    """
    Handle for a named cache, bound to an AsyncMorphik client.

    Every method issues a POST through the client's ``_request`` helper
    against a ``cache/<name>/...`` endpoint.

    Args:
        db: The AsyncMorphik client used to send requests
        name: The name of the cache
    """

    def __init__(self, db: "AsyncMorphik", name: str):
        self._db = db
        self._name = name

    async def update(self) -> bool:
        """
        POST to the cache's ``update`` endpoint.

        Returns:
            bool: The "success" field of the response, False when it is absent
        """
        response = await self._db._request("POST", f"cache/{self._name}/update")
        return response.get("success", False)

    async def add_docs(self, docs: List[str]) -> bool:
        """
        POST a list of documents to the cache's ``add_docs`` endpoint.

        Args:
            docs: List of strings identifying the documents to add

        Returns:
            bool: The "success" field of the response, False when it is absent
        """
        response = await self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
        return response.get("success", False)

    async def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """
        Run a query against the cache.

        Args:
            query: Query text
            max_tokens: Optional maximum tokens in the completion
            temperature: Optional model temperature

        Returns:
            CompletionResponse: Built from the response payload
        """
        candidate_params = {"query": query, "max_tokens": max_tokens, "temperature": temperature}
        # Drop unset options instead of sending them: httpx serialises a None
        # query value as an empty string ("max_tokens="), which is not the same
        # as omitting it (assumes _request forwards params to httpx unchanged).
        params = {key: value for key, value in candidate_params.items() if value is not None}
        response = await self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params=params,
            data="",
        )
        return CompletionResponse(**response)
|
56
|
+
|
57
|
+
|
58
|
+
class AsyncFolder:
    """
    A folder that allows operations to be scoped to a specific folder.

    Each method forwards to the wrapped client, passing this folder's name
    as the folder scope and ``None`` as the end-user scope.

    Args:
        client: The AsyncMorphik client instance
        name: The name of the folder
    """

    def __init__(self, client: "AsyncMorphik", name: str):
        self._client = client
        self._name = name

    @property
    def name(self) -> str:
        """Returns the folder name."""
        return self._name

    def signin(self, end_user_id: str) -> "AsyncUserScope":
        """
        Returns an AsyncUserScope object scoped to this folder and the end user.

        Args:
            end_user_id: The ID of the end user

        Returns:
            AsyncUserScope: A user scope scoped to this folder and the end user
        """
        return AsyncUserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name)

    def _attach_client(self, docs: List[Document]) -> List[Document]:
        """Set ``_client`` on each parsed document and return the same list."""
        for doc in docs:
            doc._client = self._client
        return docs

    async def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a text document into Morphik within this folder.

        Args:
            content: Text content to ingest
            filename: Optional file name
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
        payload = self._client._logic._prepare_ingest_text_request(
            content, filename, metadata, rules_list, use_colpali, self._name, None
        )
        response = await self._client._request("POST", "ingest/text", data=payload)
        doc = self._client._logic._parse_document_response(response)
        doc._client = self._client
        return doc

    async def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a file document into Morphik within this folder.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        # The helper normalises the input into an uploadable object plus name.
        file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename)

        try:
            files = {"file": (filename, file_obj)}
            form_data = self._client._logic._prepare_ingest_file_form_data(
                metadata, rules, self._name, None
            )

            # use_colpali travels in the query string, not in the form data.
            response = await self._client._request(
                "POST",
                f"ingest/file?use_colpali={str(use_colpali).lower()}",
                data=form_data,
                files=files,
            )
            doc = self._client._logic._parse_document_response(response)
            doc._client = self._client
            return doc
        finally:
            # Only close objects created from a path; caller-supplied streams
            # stay open for the caller to manage.
            if isinstance(file, (str, Path)):
                file_obj.close()

    async def ingest_files(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest multiple files into Morphik within this folder.

        Per-file failures reported by the server in "errors" are logged and
        do not raise.

        Args:
            files: List of files to ingest
            metadata: Optional metadata
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        file_objects = self._client._logic._prepare_files_for_upload(files)

        try:
            data = self._client._logic._prepare_ingest_files_form_data(
                metadata, rules, use_colpali, parallel, self._name, None
            )

            response = await self._client._request(
                "POST", "ingest/files", data=data, files=file_objects
            )

            if response.get("errors"):
                # Log errors but don't raise exception
                for error in response["errors"]:
                    logger.error("Failed to ingest %s: %s", error["filename"], error["error"])

            docs = [
                self._client._logic._parse_document_response(doc) for doc in response["documents"]
            ]
            return self._attach_client(docs)
        finally:
            # Close every stream-like upload object that is still open.
            for _, (_, file_obj) in file_objects:
                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                    file_obj.close()

    async def ingest_directory(
        self,
        directory: Union[str, Path],
        recursive: bool = False,
        pattern: str = "*",
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest all files in a directory into Morphik within this folder.

        Args:
            directory: Path to directory containing files to ingest
            recursive: Whether to recursively process subdirectories
            pattern: Optional glob pattern to filter files
            metadata: Optional metadata dictionary to apply to all files
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents (empty if nothing matched)

        Raises:
            ValueError: If ``directory`` is not an existing directory
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory}")

        # Collect matches, then keep regular files only.
        matches = directory.rglob(pattern) if recursive else directory.glob(pattern)
        files = [f for f in matches if f.is_file()]

        if not files:
            return []

        return await self.ingest_files(
            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
        )

    async def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[FinalChunkResult]:
        """
        Retrieve relevant chunks within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[FinalChunkResult]: List of relevant chunks
        """
        payload = self._client._logic._prepare_retrieve_chunks_request(
            query, filters, k, min_score, use_colpali, self._name, None
        )
        response = await self._client._request("POST", "retrieve/chunks", data=payload)
        return self._client._logic._parse_chunk_result_list_response(response)

    async def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[DocumentResult]: List of relevant documents
        """
        payload = self._client._logic._prepare_retrieve_docs_request(
            query, filters, k, min_score, use_colpali, self._name, None
        )
        response = await self._client._request("POST", "retrieve/docs", data=payload)
        return self._client._logic._parse_document_result_list_response(response)

    async def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context within this folder.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts

        Returns:
            CompletionResponse: Generated completion
        """
        payload = self._client._logic._prepare_query_request(
            query,
            filters,
            k,
            min_score,
            max_tokens,
            temperature,
            use_colpali,
            graph_name,
            hop_depth,
            include_paths,
            prompt_overrides,
            self._name,
            None,
        )
        response = await self._client._request("POST", "query", data=payload)
        return self._client._logic._parse_completion_response(response)

    async def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents within this folder.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of documents
        """
        params, data = self._client._logic._prepare_list_documents_request(
            skip, limit, filters, self._name, None
        )
        response = await self._client._request("POST", "documents", data=data, params=params)
        docs = self._client._logic._parse_document_list_response(response)
        return self._attach_client(docs)

    async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation within this folder.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List[Document]: List of document metadata for found documents
        """
        request = self._client._logic._prepare_batch_get_documents_request(
            document_ids, self._name, None
        )
        response = await self._client._request("POST", "batch/documents", data=request)
        docs = self._client._logic._parse_document_list_response(response)
        return self._attach_client(docs)

    async def batch_get_chunks(
        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
    ) -> List[FinalChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.

        Args:
            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

        Returns:
            List[FinalChunkResult]: List of chunk results
        """
        request = self._client._logic._prepare_batch_get_chunks_request(sources, self._name, None)
        response = await self._client._request("POST", "batch/chunks", data=request)
        return self._client._logic._parse_chunk_result_list_response(response)

    async def create_graph(
        self,
        name: str,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Create a graph from documents within this folder.

        Args:
            name: Name of the graph to create
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The created graph object
        """
        request = self._client._logic._prepare_create_graph_request(
            name, filters, documents, prompt_overrides, self._name, None
        )
        response = await self._client._request("POST", "graph/create", data=request)
        return self._client._logic._parse_graph_response(response)

    async def update_graph(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Update an existing graph with new documents from this folder.

        Args:
            name: Name of the graph to update
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The updated graph
        """
        request = self._client._logic._prepare_update_graph_request(
            name, additional_filters, additional_documents, prompt_overrides, self._name, None
        )
        response = await self._client._request("POST", f"graph/{name}/update", data=request)
        return self._client._logic._parse_graph_response(response)

    async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
        """
        Delete a document by its filename within this folder.

        The document is first looked up by filename (scoped to this folder),
        then deleted by its external ID through the client.

        Args:
            filename: Filename of the document to delete

        Returns:
            Dict[str, str]: Deletion status
        """
        # NOTE(review): filename is interpolated into the URL path unescaped;
        # names containing "/", "?" or "#" may not resolve as intended — confirm.
        response = await self._client._request(
            "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
        )
        doc = self._client._logic._parse_document_response(response)

        # Then delete by ID
        return await self._client.delete_document(doc.external_id)
|
496
|
+
|
497
|
+
|
498
|
+
class AsyncUserScope:
|
499
|
+
"""
|
500
|
+
A user scope that allows operations to be scoped to a specific end user and optionally a folder.
|
501
|
+
|
502
|
+
Args:
|
503
|
+
client: The AsyncMorphik client instance
|
504
|
+
end_user_id: The ID of the end user
|
505
|
+
folder_name: Optional folder name to further scope operations
|
506
|
+
"""
|
507
|
+
|
508
|
+
def __init__(self, client: "AsyncMorphik", end_user_id: str, folder_name: Optional[str] = None):
|
509
|
+
self._client = client
|
510
|
+
self._end_user_id = end_user_id
|
511
|
+
self._folder_name = folder_name
|
512
|
+
|
513
|
+
@property
|
514
|
+
def end_user_id(self) -> str:
|
515
|
+
"""Returns the end user ID."""
|
516
|
+
return self._end_user_id
|
517
|
+
|
518
|
+
@property
|
519
|
+
def folder_name(self) -> Optional[str]:
|
520
|
+
"""Returns the folder name if any."""
|
521
|
+
return self._folder_name
|
522
|
+
|
523
|
+
async def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document into Morphik as this end user.

    Args:
        content: Text content to ingest
        filename: Optional file name
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document
    """
    client = self._client
    logic = client._logic

    # Rules are converted through the client before the payload is built.
    converted = []
    for rule in rules or []:
        converted.append(client._convert_rule(rule))

    request_args = (
        content,
        filename,
        metadata,
        converted,
        use_colpali,
        self._folder_name,
        self._end_user_id,
    )
    body = logic._prepare_ingest_text_request(*request_args)
    raw = await client._request("POST", "ingest/text", data=body)

    document = logic._parse_document_response(raw)
    document._client = client
    return document
|
558
|
+
|
559
|
+
async def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a file document into Morphik as this end user.

    Args:
        file: File to ingest (path string, bytes, file object, or Path)
        filename: Name of the file; required for bytes and file objects,
            defaults to the path's base name for path inputs
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document

    Raises:
        ValueError: If a path does not exist, or if ``filename`` is missing
            for bytes / file-object input
    """
    # Handle different file input types
    if isinstance(file, (str, Path)):
        file_path = Path(file)
        if not file_path.exists():
            raise ValueError(f"File not found: {file}")
        filename = file_path.name if filename is None else filename
        # Read the file fully so no OS handle is held during the request.
        file_obj = BytesIO(file_path.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when ingesting bytes")
        file_obj = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when ingesting file object")
        file_obj = file

    try:
        # Prepare multipart form data
        files = {"file": (filename, file_obj)}

        # Add metadata, rules and the end-user scope
        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]),
            "end_user_id": self._end_user_id,
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            data["folder_name"] = self._folder_name

        # use_colpali was previously accepted but never sent; pass it in the
        # query string exactly as AsyncFolder.ingest_file does.
        response = await self._client._request(
            "POST",
            f"ingest/file?use_colpali={str(use_colpali).lower()}",
            data=data,
            files=files,
        )
        doc = self._client._logic._parse_document_response(response)
        doc._client = self._client
        return doc
    finally:
        # Close the buffer only when it was created from a path; buffers for
        # caller-supplied bytes or streams are left as they are.
        if isinstance(file, (str, Path)):
            file_obj.close()
|
621
|
+
|
622
|
+
async def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik as this end user.

    Per-file failures reported by the server in "errors" are logged and do
    not raise.

    Args:
        files: List of files to ingest
        metadata: Optional metadata
        rules: Optional list of rules to apply; either a flat list shared by
            all files or a list of per-file rule lists
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents
    """
    file_objects = []

    try:
        # Open inputs inside the try block so that, if a later open() fails,
        # the handles opened so far are still closed by the finally clause.
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        # Convert rules appropriately
        if rules:
            if all(isinstance(r, list) for r in rules):
                # List of lists - per-file rules
                converted_rules = [
                    [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
                ]
            else:
                # Flat list - shared rules for all files
                converted_rules = [self._client._convert_rule(r) for r in rules]
        else:
            converted_rules = []

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
            "end_user_id": self._end_user_id,
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            data["folder_name"] = self._folder_name

        response = await self._client._request(
            "POST", "ingest/files", data=data, files=file_objects
        )

        if response.get("errors"):
            # Log errors but don't raise exception
            for error in response["errors"]:
                logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

        docs = [
            self._client._logic._parse_document_response(doc) for doc in response["documents"]
        ]
        for doc in docs:
            doc._client = self._client
        return docs
    finally:
        # Close every stream-like upload object that is still open; raw bytes
        # entries are skipped by the isinstance check.
        for _, (_, file_obj) in file_objects:
            if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                file_obj.close()
|
701
|
+
|
702
|
+
async def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest all files in a directory into Morphik as this end user.

    Args:
        directory: Path to directory containing files to ingest
        recursive: Whether to recursively process subdirectories
        pattern: Optional glob pattern to filter files
        metadata: Optional metadata dictionary to apply to all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents (empty if nothing matched)

    Raises:
        ValueError: If ``directory`` is not an existing directory
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"Directory not found: {root}")

    # Pick the glob flavour once, then keep regular files only.
    finder = root.rglob if recursive else root.glob
    matched = [entry for entry in list(finder(pattern)) if entry.is_file()]

    if len(matched) == 0:
        return []

    # Delegate the upload to ingest_files with the collected paths.
    return await self.ingest_files(
        files=matched,
        metadata=metadata,
        rules=rules,
        use_colpali=use_colpali,
        parallel=parallel,
    )
|
747
|
+
|
748
|
+
async def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Retrieve relevant chunks as this end user.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[FinalChunkResult]: List of relevant chunks
    """
    logic = self._client._logic
    # Search options first, then the folder / end-user scope.
    request_args = (query, filters, k, min_score, use_colpali)
    scope_args = (self._folder_name, self._end_user_id)
    body = logic._prepare_retrieve_chunks_request(*request_args, *scope_args)
    raw = await self._client._request("POST", "retrieve/chunks", data=body)
    return logic._parse_chunk_result_list_response(raw)
|
774
|
+
|
775
|
+
async def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """
    Retrieve relevant documents as this end user.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[DocumentResult]: List of relevant documents
    """
    logic = self._client._logic
    # Search options first, then the folder / end-user scope.
    request_args = (query, filters, k, min_score, use_colpali)
    scope_args = (self._folder_name, self._end_user_id)
    body = logic._prepare_retrieve_docs_request(*request_args, *scope_args)
    raw = await self._client._request("POST", "retrieve/docs", data=body)
    return logic._parse_document_result_list_response(raw)
|
801
|
+
|
802
|
+
async def query(
|
803
|
+
self,
|
804
|
+
query: str,
|
805
|
+
filters: Optional[Dict[str, Any]] = None,
|
806
|
+
k: int = 4,
|
807
|
+
min_score: float = 0.0,
|
808
|
+
max_tokens: Optional[int] = None,
|
809
|
+
temperature: Optional[float] = None,
|
810
|
+
use_colpali: bool = True,
|
811
|
+
graph_name: Optional[str] = None,
|
812
|
+
hop_depth: int = 1,
|
813
|
+
include_paths: bool = False,
|
814
|
+
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
815
|
+
) -> CompletionResponse:
|
816
|
+
"""
|
817
|
+
Generate completion using relevant chunks as context as this end user.
|
818
|
+
|
819
|
+
Args:
|
820
|
+
query: Query text
|
821
|
+
filters: Optional metadata filters
|
822
|
+
k: Number of chunks to use as context (default: 4)
|
823
|
+
min_score: Minimum similarity threshold (default: 0.0)
|
824
|
+
max_tokens: Maximum tokens in completion
|
825
|
+
temperature: Model temperature
|
826
|
+
use_colpali: Whether to use ColPali-style embedding model
|
827
|
+
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
828
|
+
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
829
|
+
include_paths: Whether to include relationship paths in the response
|
830
|
+
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
831
|
+
|
832
|
+
Returns:
|
833
|
+
CompletionResponse: Generated completion
|
834
|
+
"""
|
835
|
+
payload = self._client._logic._prepare_query_request(
|
836
|
+
query,
|
837
|
+
filters,
|
838
|
+
k,
|
839
|
+
min_score,
|
840
|
+
max_tokens,
|
841
|
+
temperature,
|
842
|
+
use_colpali,
|
843
|
+
graph_name,
|
844
|
+
hop_depth,
|
845
|
+
include_paths,
|
846
|
+
prompt_overrides,
|
847
|
+
self._folder_name,
|
848
|
+
self._end_user_id,
|
849
|
+
)
|
850
|
+
response = await self._client._request("POST", "query", data=payload)
|
851
|
+
return self._client._logic._parse_completion_response(response)
|
852
|
+
|
853
|
+
async def list_documents(
|
854
|
+
self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
|
855
|
+
) -> List[Document]:
|
856
|
+
"""
|
857
|
+
List accessible documents for this end user.
|
858
|
+
|
859
|
+
Args:
|
860
|
+
skip: Number of documents to skip
|
861
|
+
limit: Maximum number of documents to return
|
862
|
+
filters: Optional filters
|
863
|
+
|
864
|
+
Returns:
|
865
|
+
List[Document]: List of documents
|
866
|
+
"""
|
867
|
+
params, data = self._client._logic._prepare_list_documents_request(
|
868
|
+
skip, limit, filters, self._folder_name, self._end_user_id
|
869
|
+
)
|
870
|
+
response = await self._client._request("POST", "documents", data=data, params=params)
|
871
|
+
docs = self._client._logic._parse_document_list_response(response)
|
872
|
+
for doc in docs:
|
873
|
+
doc._client = self._client
|
874
|
+
return docs
|
875
|
+
|
876
|
+
async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
|
877
|
+
"""
|
878
|
+
Retrieve multiple documents by their IDs in a single batch operation for this end user.
|
37
879
|
|
38
|
-
|
39
|
-
|
40
|
-
self._db = db
|
41
|
-
self._name = name
|
880
|
+
Args:
|
881
|
+
document_ids: List of document IDs to retrieve
|
42
882
|
|
43
|
-
|
44
|
-
|
45
|
-
|
883
|
+
Returns:
|
884
|
+
List[Document]: List of document metadata for found documents
|
885
|
+
"""
|
886
|
+
request = self._client._logic._prepare_batch_get_documents_request(
|
887
|
+
document_ids, self._folder_name, self._end_user_id
|
888
|
+
)
|
889
|
+
response = await self._client._request("POST", "batch/documents", data=request)
|
890
|
+
docs = self._client._logic._parse_document_list_response(response)
|
891
|
+
for doc in docs:
|
892
|
+
doc._client = self._client
|
893
|
+
return docs
|
46
894
|
|
47
|
-
async def
|
48
|
-
|
49
|
-
|
895
|
+
async def batch_get_chunks(
|
896
|
+
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
897
|
+
) -> List[FinalChunkResult]:
|
898
|
+
"""
|
899
|
+
Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
|
50
900
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
901
|
+
Args:
|
902
|
+
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
903
|
+
|
904
|
+
Returns:
|
905
|
+
List[FinalChunkResult]: List of chunk results
|
906
|
+
"""
|
907
|
+
request = self._client._logic._prepare_batch_get_chunks_request(
|
908
|
+
sources, self._folder_name, self._end_user_id
|
59
909
|
)
|
60
|
-
|
910
|
+
response = await self._client._request("POST", "batch/chunks", data=request)
|
911
|
+
return self._client._logic._parse_chunk_result_list_response(response)
|
912
|
+
|
913
|
+
async def create_graph(
|
914
|
+
self,
|
915
|
+
name: str,
|
916
|
+
filters: Optional[Dict[str, Any]] = None,
|
917
|
+
documents: Optional[List[str]] = None,
|
918
|
+
prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
|
919
|
+
) -> Graph:
|
920
|
+
"""
|
921
|
+
Create a graph from documents for this end user.
|
922
|
+
|
923
|
+
Args:
|
924
|
+
name: Name of the graph to create
|
925
|
+
filters: Optional metadata filters to determine which documents to include
|
926
|
+
documents: Optional list of specific document IDs to include
|
927
|
+
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
928
|
+
|
929
|
+
Returns:
|
930
|
+
Graph: The created graph object
|
931
|
+
"""
|
932
|
+
request = self._client._logic._prepare_create_graph_request(
|
933
|
+
name, filters, documents, prompt_overrides, self._folder_name, self._end_user_id
|
934
|
+
)
|
935
|
+
response = await self._client._request("POST", "graph/create", data=request)
|
936
|
+
return self._client._logic._parse_graph_response(response)
|
937
|
+
|
938
|
+
async def update_graph(
|
939
|
+
self,
|
940
|
+
name: str,
|
941
|
+
additional_filters: Optional[Dict[str, Any]] = None,
|
942
|
+
additional_documents: Optional[List[str]] = None,
|
943
|
+
prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
|
944
|
+
) -> Graph:
|
945
|
+
"""
|
946
|
+
Update an existing graph with new documents for this end user.
|
947
|
+
|
948
|
+
Args:
|
949
|
+
name: Name of the graph to update
|
950
|
+
additional_filters: Optional additional metadata filters to determine which new documents to include
|
951
|
+
additional_documents: Optional list of additional document IDs to include
|
952
|
+
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
953
|
+
|
954
|
+
Returns:
|
955
|
+
Graph: The updated graph
|
956
|
+
"""
|
957
|
+
request = self._client._logic._prepare_update_graph_request(
|
958
|
+
name,
|
959
|
+
additional_filters,
|
960
|
+
additional_documents,
|
961
|
+
prompt_overrides,
|
962
|
+
self._folder_name,
|
963
|
+
self._end_user_id,
|
964
|
+
)
|
965
|
+
response = await self._client._request("POST", f"graph/{name}/update", data=request)
|
966
|
+
return self._client._logic._parse_graph_response(response)
|
967
|
+
|
968
|
+
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
969
|
+
"""
|
970
|
+
Delete a document by its filename for this end user.
|
971
|
+
|
972
|
+
Args:
|
973
|
+
filename: Filename of the document to delete
|
974
|
+
|
975
|
+
Returns:
|
976
|
+
Dict[str, str]: Deletion status
|
977
|
+
"""
|
978
|
+
# Build parameters for the filename lookup
|
979
|
+
params = {"end_user_id": self._end_user_id}
|
61
980
|
|
981
|
+
# Add folder name if scoped to a folder
|
982
|
+
if self._folder_name:
|
983
|
+
params["folder_name"] = self._folder_name
|
62
984
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
|
69
|
-
content_type: str = Field(..., description="Content type")
|
70
|
-
filename: Optional[str] = Field(None, description="Original filename")
|
71
|
-
download_url: Optional[str] = Field(None, description="URL to download full document")
|
985
|
+
# First get the document ID
|
986
|
+
response = await self._client._request(
|
987
|
+
"GET", f"documents/filename/{filename}", params=params
|
988
|
+
)
|
989
|
+
doc = self._client._logic._parse_document_response(response)
|
72
990
|
|
73
|
-
|
74
|
-
|
991
|
+
# Then delete by ID
|
992
|
+
return await self._client.delete_document(doc.external_id)
|
75
993
|
|
76
994
|
|
77
995
|
class AsyncMorphik:
|
@@ -97,39 +1015,12 @@ class AsyncMorphik:
|
|
97
1015
|
"""
|
98
1016
|
|
99
1017
|
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
|
100
|
-
self.
|
101
|
-
self._client = (
|
102
|
-
|
103
|
-
|
104
|
-
else
|
105
|
-
timeout=timeout,
|
106
|
-
verify=False, # Disable SSL for localhost
|
107
|
-
http2=False, # Force HTTP/1.1
|
108
|
-
)
|
1018
|
+
self._logic = _MorphikClientLogic(uri, timeout, is_local)
|
1019
|
+
self._client = httpx.AsyncClient(
|
1020
|
+
timeout=self._logic._timeout,
|
1021
|
+
verify=not self._logic._is_local,
|
1022
|
+
http2=False if self._logic._is_local else True,
|
109
1023
|
)
|
110
|
-
self._is_local = is_local
|
111
|
-
|
112
|
-
if uri:
|
113
|
-
self._setup_auth(uri)
|
114
|
-
else:
|
115
|
-
self._base_url = "http://localhost:8000"
|
116
|
-
self._auth_token = None
|
117
|
-
|
118
|
-
def _setup_auth(self, uri: str) -> None:
|
119
|
-
"""Setup authentication from URI"""
|
120
|
-
parsed = urlparse(uri)
|
121
|
-
if not parsed.netloc:
|
122
|
-
raise ValueError("Invalid URI format")
|
123
|
-
|
124
|
-
# Split host and auth parts
|
125
|
-
auth, host = parsed.netloc.split("@")
|
126
|
-
_, self._auth_token = auth.split(":")
|
127
|
-
|
128
|
-
# Set base URL
|
129
|
-
self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
|
130
|
-
|
131
|
-
# Basic token validation
|
132
|
-
jwt.decode(self._auth_token, options={"verify_signature": False})
|
133
1024
|
|
134
1025
|
async def _request(
|
135
1026
|
self,
|
@@ -140,9 +1031,10 @@ class AsyncMorphik:
|
|
140
1031
|
params: Optional[Dict[str, Any]] = None,
|
141
1032
|
) -> Dict[str, Any]:
|
142
1033
|
"""Make HTTP request"""
|
143
|
-
|
144
|
-
|
145
|
-
|
1034
|
+
url = self._logic._get_url(endpoint)
|
1035
|
+
headers = self._logic._get_headers()
|
1036
|
+
if self._logic._auth_token: # Only add auth header if we have a token
|
1037
|
+
headers["Authorization"] = f"Bearer {self._logic._auth_token}"
|
146
1038
|
|
147
1039
|
# Configure request data based on type
|
148
1040
|
if files:
|
@@ -156,7 +1048,7 @@ class AsyncMorphik:
|
|
156
1048
|
|
157
1049
|
response = await self._client.request(
|
158
1050
|
method,
|
159
|
-
|
1051
|
+
url,
|
160
1052
|
headers=headers,
|
161
1053
|
params=params,
|
162
1054
|
**request_data,
|
@@ -166,9 +1058,43 @@ class AsyncMorphik:
|
|
166
1058
|
|
167
1059
|
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
|
168
1060
|
"""Convert a rule to a dictionary format"""
|
169
|
-
|
170
|
-
|
171
|
-
|
1061
|
+
return self._logic._convert_rule(rule)
|
1062
|
+
|
1063
|
+
def create_folder(self, name: str) -> AsyncFolder:
|
1064
|
+
"""
|
1065
|
+
Create a folder to scope operations.
|
1066
|
+
|
1067
|
+
Args:
|
1068
|
+
name: The name of the folder
|
1069
|
+
|
1070
|
+
Returns:
|
1071
|
+
AsyncFolder: A folder object for scoped operations
|
1072
|
+
"""
|
1073
|
+
return AsyncFolder(self, name)
|
1074
|
+
|
1075
|
+
def get_folder(self, name: str) -> AsyncFolder:
|
1076
|
+
"""
|
1077
|
+
Get a folder by name to scope operations.
|
1078
|
+
|
1079
|
+
Args:
|
1080
|
+
name: The name of the folder
|
1081
|
+
|
1082
|
+
Returns:
|
1083
|
+
AsyncFolder: A folder object for scoped operations
|
1084
|
+
"""
|
1085
|
+
return AsyncFolder(self, name)
|
1086
|
+
|
1087
|
+
def signin(self, end_user_id: str) -> AsyncUserScope:
|
1088
|
+
"""
|
1089
|
+
Sign in as an end user to scope operations.
|
1090
|
+
|
1091
|
+
Args:
|
1092
|
+
end_user_id: The ID of the end user
|
1093
|
+
|
1094
|
+
Returns:
|
1095
|
+
AsyncUserScope: A user scope object for scoped operations
|
1096
|
+
"""
|
1097
|
+
return AsyncUserScope(self, end_user_id)
|
172
1098
|
|
173
1099
|
async def ingest_text(
|
174
1100
|
self,
|
@@ -213,53 +1139,41 @@ class AsyncMorphik:
|
|
213
1139
|
)
|
214
1140
|
```
|
215
1141
|
"""
|
216
|
-
|
217
|
-
|
218
|
-
filename
|
219
|
-
metadata=metadata or {},
|
220
|
-
rules=[self._convert_rule(r) for r in (rules or [])],
|
221
|
-
use_colpali=use_colpali,
|
1142
|
+
rules_list = [self._convert_rule(r) for r in (rules or [])]
|
1143
|
+
payload = self._logic._prepare_ingest_text_request(
|
1144
|
+
content, filename, metadata, rules_list, use_colpali, None, None
|
222
1145
|
)
|
223
|
-
response = await self._request("POST", "ingest/text", data=
|
224
|
-
doc =
|
1146
|
+
response = await self._request("POST", "ingest/text", data=payload)
|
1147
|
+
doc = self._logic._parse_document_response(response)
|
225
1148
|
doc._client = self
|
226
1149
|
return doc
|
227
1150
|
|
228
1151
|
async def ingest_file(
|
229
1152
|
self,
|
230
1153
|
file: Union[str, bytes, BinaryIO, Path],
|
231
|
-
filename: str,
|
1154
|
+
filename: Optional[str] = None,
|
232
1155
|
metadata: Optional[Dict[str, Any]] = None,
|
233
1156
|
rules: Optional[List[RuleOrDict]] = None,
|
234
1157
|
use_colpali: bool = True,
|
235
1158
|
) -> Document:
|
236
1159
|
"""Ingest a file document into Morphik."""
|
237
|
-
#
|
238
|
-
|
239
|
-
file_path = Path(file)
|
240
|
-
if not file_path.exists():
|
241
|
-
raise ValueError(f"File not found: {file}")
|
242
|
-
with open(file_path, "rb") as f:
|
243
|
-
content = f.read()
|
244
|
-
file_obj = BytesIO(content)
|
245
|
-
elif isinstance(file, bytes):
|
246
|
-
file_obj = BytesIO(file)
|
247
|
-
else:
|
248
|
-
file_obj = file
|
1160
|
+
# Process file input
|
1161
|
+
file_obj, filename = self._logic._prepare_file_for_upload(file, filename)
|
249
1162
|
|
250
1163
|
try:
|
251
1164
|
# Prepare multipart form data
|
252
1165
|
files = {"file": (filename, file_obj)}
|
253
1166
|
|
254
|
-
#
|
255
|
-
|
256
|
-
"metadata": json.dumps(metadata or {}),
|
257
|
-
"rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
|
258
|
-
"use_colpali": json.dumps(use_colpali),
|
259
|
-
}
|
1167
|
+
# Create form data
|
1168
|
+
form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)
|
260
1169
|
|
261
|
-
response = await self._request(
|
262
|
-
|
1170
|
+
response = await self._request(
|
1171
|
+
"POST",
|
1172
|
+
f"ingest/file?use_colpali={str(use_colpali).lower()}",
|
1173
|
+
data=form_data,
|
1174
|
+
files=files,
|
1175
|
+
)
|
1176
|
+
doc = self._logic._parse_document_response(response)
|
263
1177
|
doc._client = self
|
264
1178
|
return doc
|
265
1179
|
finally:
|
@@ -292,44 +1206,23 @@ class AsyncMorphik:
|
|
292
1206
|
ValueError: If metadata list length doesn't match files length
|
293
1207
|
"""
|
294
1208
|
# Convert files to format expected by API
|
295
|
-
file_objects =
|
296
|
-
for file in files:
|
297
|
-
if isinstance(file, (str, Path)):
|
298
|
-
path = Path(file)
|
299
|
-
file_objects.append(("files", (path.name, open(path, "rb"))))
|
300
|
-
elif isinstance(file, bytes):
|
301
|
-
file_objects.append(("files", ("file.bin", file)))
|
302
|
-
else:
|
303
|
-
file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
|
1209
|
+
file_objects = self._logic._prepare_files_for_upload(files)
|
304
1210
|
|
305
1211
|
try:
|
306
|
-
# Prepare
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
# List of lists - per-file rules
|
311
|
-
converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
|
312
|
-
else:
|
313
|
-
# Flat list - shared rules for all files
|
314
|
-
converted_rules = [self._convert_rule(r) for r in rules]
|
315
|
-
else:
|
316
|
-
converted_rules = []
|
317
|
-
|
318
|
-
data = {
|
319
|
-
"metadata": json.dumps(metadata or {}),
|
320
|
-
"rules": json.dumps(converted_rules),
|
321
|
-
"use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
|
322
|
-
"parallel": str(parallel).lower(),
|
323
|
-
}
|
1212
|
+
# Prepare form data
|
1213
|
+
data = self._logic._prepare_ingest_files_form_data(
|
1214
|
+
metadata, rules, use_colpali, parallel, None, None
|
1215
|
+
)
|
324
1216
|
|
325
1217
|
response = await self._request("POST", "ingest/files", data=data, files=file_objects)
|
326
|
-
|
1218
|
+
|
327
1219
|
if response.get("errors"):
|
328
1220
|
# Log errors but don't raise exception
|
329
1221
|
for error in response["errors"]:
|
330
1222
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
331
|
-
|
332
|
-
|
1223
|
+
|
1224
|
+
# Parse the documents from the response
|
1225
|
+
docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
|
333
1226
|
for doc in docs:
|
334
1227
|
doc._client = self
|
335
1228
|
return docs
|
@@ -379,17 +1272,13 @@ class AsyncMorphik:
|
|
379
1272
|
|
380
1273
|
# Filter out directories
|
381
1274
|
files = [f for f in files if f.is_file()]
|
382
|
-
|
1275
|
+
|
383
1276
|
if not files:
|
384
1277
|
return []
|
385
1278
|
|
386
1279
|
# Use ingest_files with collected paths
|
387
1280
|
return await self.ingest_files(
|
388
|
-
files=files,
|
389
|
-
metadata=metadata,
|
390
|
-
rules=rules,
|
391
|
-
use_colpali=use_colpali,
|
392
|
-
parallel=parallel
|
1281
|
+
files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
|
393
1282
|
)
|
394
1283
|
|
395
1284
|
async def retrieve_chunks(
|
@@ -420,54 +1309,11 @@ class AsyncMorphik:
|
|
420
1309
|
)
|
421
1310
|
```
|
422
1311
|
"""
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
"use_colpali": use_colpali,
|
429
|
-
}
|
430
|
-
|
431
|
-
response = await self._request("POST", "retrieve/chunks", data=request)
|
432
|
-
chunks = [ChunkResult(**r) for r in response]
|
433
|
-
|
434
|
-
final_chunks = []
|
435
|
-
for chunk in chunks:
|
436
|
-
if chunk.metadata.get("is_image"):
|
437
|
-
try:
|
438
|
-
# Handle data URI format "data:image/png;base64,..."
|
439
|
-
content = chunk.content
|
440
|
-
if content.startswith("data:"):
|
441
|
-
# Extract the base64 part after the comma
|
442
|
-
content = content.split(",", 1)[1]
|
443
|
-
|
444
|
-
# Now decode the base64 string
|
445
|
-
import base64
|
446
|
-
import io
|
447
|
-
from PIL import Image
|
448
|
-
image_bytes = base64.b64decode(content)
|
449
|
-
content = Image.open(io.BytesIO(image_bytes))
|
450
|
-
except Exception as e:
|
451
|
-
print(f"Error processing image: {str(e)}")
|
452
|
-
# Fall back to using the content as text
|
453
|
-
content = chunk.content
|
454
|
-
else:
|
455
|
-
content = chunk.content
|
456
|
-
|
457
|
-
final_chunks.append(
|
458
|
-
FinalChunkResult(
|
459
|
-
content=content,
|
460
|
-
score=chunk.score,
|
461
|
-
document_id=chunk.document_id,
|
462
|
-
chunk_number=chunk.chunk_number,
|
463
|
-
metadata=chunk.metadata,
|
464
|
-
content_type=chunk.content_type,
|
465
|
-
filename=chunk.filename,
|
466
|
-
download_url=chunk.download_url,
|
467
|
-
)
|
468
|
-
)
|
469
|
-
|
470
|
-
return final_chunks
|
1312
|
+
payload = self._logic._prepare_retrieve_chunks_request(
|
1313
|
+
query, filters, k, min_score, use_colpali, None, None
|
1314
|
+
)
|
1315
|
+
response = await self._request("POST", "retrieve/chunks", data=payload)
|
1316
|
+
return self._logic._parse_chunk_result_list_response(response)
|
471
1317
|
|
472
1318
|
async def retrieve_docs(
|
473
1319
|
self,
|
@@ -497,16 +1343,11 @@ class AsyncMorphik:
|
|
497
1343
|
)
|
498
1344
|
```
|
499
1345
|
"""
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
"use_colpali": use_colpali,
|
506
|
-
}
|
507
|
-
|
508
|
-
response = await self._request("POST", "retrieve/docs", data=request)
|
509
|
-
return [DocumentResult(**r) for r in response]
|
1346
|
+
payload = self._logic._prepare_retrieve_docs_request(
|
1347
|
+
query, filters, k, min_score, use_colpali, None, None
|
1348
|
+
)
|
1349
|
+
response = await self._request("POST", "retrieve/docs", data=payload)
|
1350
|
+
return self._logic._parse_document_result_list_response(response)
|
510
1351
|
|
511
1352
|
async def query(
|
512
1353
|
self,
|
@@ -549,7 +1390,7 @@ class AsyncMorphik:
|
|
549
1390
|
filters={"department": "research"},
|
550
1391
|
temperature=0.7
|
551
1392
|
)
|
552
|
-
|
1393
|
+
|
553
1394
|
# Knowledge graph enhanced query
|
554
1395
|
response = await db.query(
|
555
1396
|
"How does product X relate to customer segment Y?",
|
@@ -557,7 +1398,7 @@ class AsyncMorphik:
|
|
557
1398
|
hop_depth=2,
|
558
1399
|
include_paths=True
|
559
1400
|
)
|
560
|
-
|
1401
|
+
|
561
1402
|
# With prompt customization
|
562
1403
|
from morphik.models import QueryPromptOverride, QueryPromptOverrides
|
563
1404
|
response = await db.query(
|
@@ -568,7 +1409,7 @@ class AsyncMorphik:
|
|
568
1409
|
)
|
569
1410
|
)
|
570
1411
|
)
|
571
|
-
|
1412
|
+
|
572
1413
|
# Or using a dictionary
|
573
1414
|
response = await db.query(
|
574
1415
|
"What are the key findings?",
|
@@ -578,35 +1419,32 @@ class AsyncMorphik:
|
|
578
1419
|
}
|
579
1420
|
}
|
580
1421
|
)
|
581
|
-
|
1422
|
+
|
582
1423
|
print(response.completion)
|
583
|
-
|
1424
|
+
|
584
1425
|
# If include_paths=True, you can inspect the graph paths
|
585
1426
|
if response.metadata and "graph" in response.metadata:
|
586
1427
|
for path in response.metadata["graph"]["paths"]:
|
587
1428
|
print(" -> ".join(path))
|
588
1429
|
```
|
589
1430
|
"""
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
response = await self._request("POST", "query", data=request)
|
609
|
-
return CompletionResponse(**response)
|
1431
|
+
payload = self._logic._prepare_query_request(
|
1432
|
+
query,
|
1433
|
+
filters,
|
1434
|
+
k,
|
1435
|
+
min_score,
|
1436
|
+
max_tokens,
|
1437
|
+
temperature,
|
1438
|
+
use_colpali,
|
1439
|
+
graph_name,
|
1440
|
+
hop_depth,
|
1441
|
+
include_paths,
|
1442
|
+
prompt_overrides,
|
1443
|
+
None,
|
1444
|
+
None,
|
1445
|
+
)
|
1446
|
+
response = await self._request("POST", "query", data=payload)
|
1447
|
+
return self._logic._parse_completion_response(response)
|
610
1448
|
|
611
1449
|
async def list_documents(
|
612
1450
|
self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
|
@@ -631,11 +1469,9 @@ class AsyncMorphik:
|
|
631
1469
|
next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
|
632
1470
|
```
|
633
1471
|
"""
|
634
|
-
|
635
|
-
response = await self._request(
|
636
|
-
|
637
|
-
)
|
638
|
-
docs = [Document(**doc) for doc in response]
|
1472
|
+
params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
|
1473
|
+
response = await self._request("POST", "documents", data=data, params=params)
|
1474
|
+
docs = self._logic._parse_document_list_response(response)
|
639
1475
|
for doc in docs:
|
640
1476
|
doc._client = self
|
641
1477
|
return docs
|
@@ -657,10 +1493,10 @@ class AsyncMorphik:
|
|
657
1493
|
```
|
658
1494
|
"""
|
659
1495
|
response = await self._request("GET", f"documents/{document_id}")
|
660
|
-
doc =
|
1496
|
+
doc = self._logic._parse_document_response(response)
|
661
1497
|
doc._client = self
|
662
1498
|
return doc
|
663
|
-
|
1499
|
+
|
664
1500
|
async def get_document_by_filename(self, filename: str) -> Document:
|
665
1501
|
"""
|
666
1502
|
Get document metadata by filename.
|
@@ -679,10 +1515,10 @@ class AsyncMorphik:
|
|
679
1515
|
```
|
680
1516
|
"""
|
681
1517
|
response = await self._request("GET", f"documents/filename/{filename}")
|
682
|
-
doc =
|
1518
|
+
doc = self._logic._parse_document_response(response)
|
683
1519
|
doc._client = self
|
684
1520
|
return doc
|
685
|
-
|
1521
|
+
|
686
1522
|
async def update_document_with_text(
|
687
1523
|
self,
|
688
1524
|
document_id: str,
|
@@ -695,7 +1531,7 @@ class AsyncMorphik:
|
|
695
1531
|
) -> Document:
|
696
1532
|
"""
|
697
1533
|
Update a document with new text content using the specified strategy.
|
698
|
-
|
1534
|
+
|
699
1535
|
Args:
|
700
1536
|
document_id: ID of the document to update
|
701
1537
|
content: The new content to add
|
@@ -704,10 +1540,10 @@ class AsyncMorphik:
|
|
704
1540
|
rules: Optional list of rules to apply to the content
|
705
1541
|
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
706
1542
|
use_colpali: Whether to use multi-vector embedding
|
707
|
-
|
1543
|
+
|
708
1544
|
Returns:
|
709
1545
|
Document: Updated document metadata
|
710
|
-
|
1546
|
+
|
711
1547
|
Example:
|
712
1548
|
```python
|
713
1549
|
# Add new content to an existing document
|
@@ -729,22 +1565,19 @@ class AsyncMorphik:
|
|
729
1565
|
rules=[self._convert_rule(r) for r in (rules or [])],
|
730
1566
|
use_colpali=use_colpali if use_colpali is not None else True,
|
731
1567
|
)
|
732
|
-
|
1568
|
+
|
733
1569
|
params = {}
|
734
1570
|
if update_strategy != "add":
|
735
1571
|
params["update_strategy"] = update_strategy
|
736
|
-
|
1572
|
+
|
737
1573
|
response = await self._request(
|
738
|
-
"POST",
|
739
|
-
f"documents/{document_id}/update_text",
|
740
|
-
data=request.model_dump(),
|
741
|
-
params=params
|
1574
|
+
"POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params
|
742
1575
|
)
|
743
|
-
|
744
|
-
doc =
|
1576
|
+
|
1577
|
+
doc = self._logic._parse_document_response(response)
|
745
1578
|
doc._client = self
|
746
1579
|
return doc
|
747
|
-
|
1580
|
+
|
748
1581
|
async def update_document_with_file(
|
749
1582
|
self,
|
750
1583
|
document_id: str,
|
@@ -757,7 +1590,7 @@ class AsyncMorphik:
|
|
757
1590
|
) -> Document:
|
758
1591
|
"""
|
759
1592
|
Update a document with content from a file using the specified strategy.
|
760
|
-
|
1593
|
+
|
761
1594
|
Args:
|
762
1595
|
document_id: ID of the document to update
|
763
1596
|
file: File to add (path string, bytes, file object, or Path)
|
@@ -766,10 +1599,10 @@ class AsyncMorphik:
|
|
766
1599
|
rules: Optional list of rules to apply to the content
|
767
1600
|
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
768
1601
|
use_colpali: Whether to use multi-vector embedding
|
769
|
-
|
1602
|
+
|
770
1603
|
Returns:
|
771
1604
|
Document: Updated document metadata
|
772
|
-
|
1605
|
+
|
773
1606
|
Example:
|
774
1607
|
```python
|
775
1608
|
# Add content from a file to an existing document
|
@@ -799,34 +1632,34 @@ class AsyncMorphik:
|
|
799
1632
|
if filename is None:
|
800
1633
|
raise ValueError("filename is required when updating with file object")
|
801
1634
|
file_obj = file
|
802
|
-
|
1635
|
+
|
803
1636
|
try:
|
804
1637
|
# Prepare multipart form data
|
805
1638
|
files = {"file": (filename, file_obj)}
|
806
|
-
|
1639
|
+
|
807
1640
|
# Convert metadata and rules to JSON strings
|
808
1641
|
form_data = {
|
809
1642
|
"metadata": json.dumps(metadata or {}),
|
810
1643
|
"rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
|
811
1644
|
"update_strategy": update_strategy,
|
812
1645
|
}
|
813
|
-
|
1646
|
+
|
814
1647
|
if use_colpali is not None:
|
815
1648
|
form_data["use_colpali"] = str(use_colpali).lower()
|
816
|
-
|
1649
|
+
|
817
1650
|
# Use the dedicated file update endpoint
|
818
1651
|
response = await self._request(
|
819
1652
|
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
820
1653
|
)
|
821
|
-
|
822
|
-
doc =
|
1654
|
+
|
1655
|
+
doc = self._logic._parse_document_response(response)
|
823
1656
|
doc._client = self
|
824
1657
|
return doc
|
825
1658
|
finally:
|
826
1659
|
# Close file if we opened it
|
827
1660
|
if isinstance(file, (str, Path)):
|
828
1661
|
file_obj.close()
|
829
|
-
|
1662
|
+
|
830
1663
|
async def update_document_metadata(
|
831
1664
|
self,
|
832
1665
|
document_id: str,
|
@@ -834,14 +1667,14 @@ class AsyncMorphik:
|
|
834
1667
|
) -> Document:
|
835
1668
|
"""
|
836
1669
|
Update a document's metadata only.
|
837
|
-
|
1670
|
+
|
838
1671
|
Args:
|
839
1672
|
document_id: ID of the document to update
|
840
1673
|
metadata: Metadata to update
|
841
|
-
|
1674
|
+
|
842
1675
|
Returns:
|
843
1676
|
Document: Updated document metadata
|
844
|
-
|
1677
|
+
|
845
1678
|
Example:
|
846
1679
|
```python
|
847
1680
|
# Update just the metadata of a document
|
@@ -853,11 +1686,13 @@ class AsyncMorphik:
|
|
853
1686
|
```
|
854
1687
|
"""
|
855
1688
|
# Use the dedicated metadata update endpoint
|
856
|
-
response = await self._request(
|
857
|
-
|
1689
|
+
response = await self._request(
|
1690
|
+
"POST", f"documents/{document_id}/update_metadata", data=metadata
|
1691
|
+
)
|
1692
|
+
doc = self._logic._parse_document_response(response)
|
858
1693
|
doc._client = self
|
859
1694
|
return doc
|
860
|
-
|
1695
|
+
|
861
1696
|
async def update_document_by_filename_with_text(
|
862
1697
|
self,
|
863
1698
|
filename: str,
|
@@ -898,7 +1733,7 @@ class AsyncMorphik:
|
|
898
1733
|
"""
|
899
1734
|
# First get the document by filename to obtain its ID
|
900
1735
|
doc = await self.get_document_by_filename(filename)
|
901
|
-
|
1736
|
+
|
902
1737
|
# Then use the regular update_document_with_text endpoint with the document ID
|
903
1738
|
return await self.update_document_with_text(
|
904
1739
|
document_id=doc.external_id,
|
@@ -907,9 +1742,9 @@ class AsyncMorphik:
|
|
907
1742
|
metadata=metadata,
|
908
1743
|
rules=rules,
|
909
1744
|
update_strategy=update_strategy,
|
910
|
-
use_colpali=use_colpali
|
1745
|
+
use_colpali=use_colpali,
|
911
1746
|
)
|
912
|
-
|
1747
|
+
|
913
1748
|
async def update_document_by_filename_with_file(
|
914
1749
|
self,
|
915
1750
|
filename: str,
|
@@ -949,7 +1784,7 @@ class AsyncMorphik:
|
|
949
1784
|
"""
|
950
1785
|
# First get the document by filename to obtain its ID
|
951
1786
|
doc = await self.get_document_by_filename(filename)
|
952
|
-
|
1787
|
+
|
953
1788
|
# Then use the regular update_document_with_file endpoint with the document ID
|
954
1789
|
return await self.update_document_with_file(
|
955
1790
|
document_id=doc.external_id,
|
@@ -958,9 +1793,9 @@ class AsyncMorphik:
|
|
958
1793
|
metadata=metadata,
|
959
1794
|
rules=rules,
|
960
1795
|
update_strategy=update_strategy,
|
961
|
-
use_colpali=use_colpali
|
1796
|
+
use_colpali=use_colpali,
|
962
1797
|
)
|
963
|
-
|
1798
|
+
|
964
1799
|
async def update_document_by_filename_metadata(
|
965
1800
|
self,
|
966
1801
|
filename: str,
|
@@ -969,15 +1804,15 @@ class AsyncMorphik:
|
|
969
1804
|
) -> Document:
|
970
1805
|
"""
|
971
1806
|
Update a document's metadata using filename to identify the document.
|
972
|
-
|
1807
|
+
|
973
1808
|
Args:
|
974
1809
|
filename: Filename of the document to update
|
975
1810
|
metadata: Metadata to update
|
976
1811
|
new_filename: Optional new filename to assign to the document
|
977
|
-
|
1812
|
+
|
978
1813
|
Returns:
|
979
1814
|
Document: Updated document metadata
|
980
|
-
|
1815
|
+
|
981
1816
|
Example:
|
982
1817
|
```python
|
983
1818
|
# Update just the metadata of a document identified by filename
|
@@ -991,44 +1826,44 @@ class AsyncMorphik:
|
|
991
1826
|
"""
|
992
1827
|
# First get the document by filename to obtain its ID
|
993
1828
|
doc = await self.get_document_by_filename(filename)
|
994
|
-
|
1829
|
+
|
995
1830
|
# Update the metadata
|
996
1831
|
result = await self.update_document_metadata(
|
997
1832
|
document_id=doc.external_id,
|
998
1833
|
metadata=metadata,
|
999
1834
|
)
|
1000
|
-
|
1835
|
+
|
1001
1836
|
# If new_filename is provided, update the filename as well
|
1002
1837
|
if new_filename:
|
1003
1838
|
# Create a request that retains the just-updated metadata but also changes filename
|
1004
1839
|
combined_metadata = result.metadata.copy()
|
1005
|
-
|
1840
|
+
|
1006
1841
|
# Update the document again with filename change and the same metadata
|
1007
1842
|
response = await self._request(
|
1008
|
-
"POST",
|
1009
|
-
f"documents/{doc.external_id}/update_text",
|
1843
|
+
"POST",
|
1844
|
+
f"documents/{doc.external_id}/update_text",
|
1010
1845
|
data={
|
1011
|
-
"content": "",
|
1846
|
+
"content": "",
|
1012
1847
|
"filename": new_filename,
|
1013
1848
|
"metadata": combined_metadata,
|
1014
|
-
"rules": []
|
1015
|
-
}
|
1849
|
+
"rules": [],
|
1850
|
+
},
|
1016
1851
|
)
|
1017
|
-
result =
|
1852
|
+
result = self._logic._parse_document_response(response)
|
1018
1853
|
result._client = self
|
1019
|
-
|
1854
|
+
|
1020
1855
|
return result
|
1021
|
-
|
1856
|
+
|
1022
1857
|
async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
|
1023
1858
|
"""
|
1024
1859
|
Retrieve multiple documents by their IDs in a single batch operation.
|
1025
|
-
|
1860
|
+
|
1026
1861
|
Args:
|
1027
1862
|
document_ids: List of document IDs to retrieve
|
1028
|
-
|
1863
|
+
|
1029
1864
|
Returns:
|
1030
1865
|
List[Document]: List of document metadata for found documents
|
1031
|
-
|
1866
|
+
|
1032
1867
|
Example:
|
1033
1868
|
```python
|
1034
1869
|
docs = await db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
|
@@ -1036,22 +1871,25 @@ class AsyncMorphik:
|
|
1036
1871
|
print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
|
1037
1872
|
```
|
1038
1873
|
"""
|
1039
|
-
|
1040
|
-
|
1874
|
+
request = self._logic._prepare_batch_get_documents_request(document_ids, None, None)
|
1875
|
+
response = await self._request("POST", "batch/documents", data=request)
|
1876
|
+
docs = self._logic._parse_document_list_response(response)
|
1041
1877
|
for doc in docs:
|
1042
1878
|
doc._client = self
|
1043
1879
|
return docs
|
1044
|
-
|
1045
|
-
async def batch_get_chunks(
|
1880
|
+
|
1881
|
+
async def batch_get_chunks(
|
1882
|
+
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
1883
|
+
) -> List[FinalChunkResult]:
|
1046
1884
|
"""
|
1047
1885
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
1048
|
-
|
1886
|
+
|
1049
1887
|
Args:
|
1050
1888
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
1051
|
-
|
1889
|
+
|
1052
1890
|
Returns:
|
1053
1891
|
List[FinalChunkResult]: List of chunk results
|
1054
|
-
|
1892
|
+
|
1055
1893
|
Example:
|
1056
1894
|
```python
|
1057
1895
|
# Using dictionaries
|
@@ -1059,67 +1897,22 @@ class AsyncMorphik:
|
|
1059
1897
|
{"document_id": "doc_123", "chunk_number": 0},
|
1060
1898
|
{"document_id": "doc_456", "chunk_number": 2}
|
1061
1899
|
]
|
1062
|
-
|
1900
|
+
|
1063
1901
|
# Or using ChunkSource objects
|
1064
1902
|
from morphik.models import ChunkSource
|
1065
1903
|
sources = [
|
1066
1904
|
ChunkSource(document_id="doc_123", chunk_number=0),
|
1067
1905
|
ChunkSource(document_id="doc_456", chunk_number=2)
|
1068
1906
|
]
|
1069
|
-
|
1907
|
+
|
1070
1908
|
chunks = await db.batch_get_chunks(sources)
|
1071
1909
|
for chunk in chunks:
|
1072
1910
|
print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
|
1073
1911
|
```
|
1074
1912
|
"""
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
if isinstance(source, dict):
|
1079
|
-
source_dicts.append(source)
|
1080
|
-
else:
|
1081
|
-
source_dicts.append(source.model_dump())
|
1082
|
-
|
1083
|
-
response = await self._request("POST", "batch/chunks", data=source_dicts)
|
1084
|
-
chunks = [ChunkResult(**r) for r in response]
|
1085
|
-
|
1086
|
-
final_chunks = []
|
1087
|
-
for chunk in chunks:
|
1088
|
-
if chunk.metadata.get("is_image"):
|
1089
|
-
try:
|
1090
|
-
# Handle data URI format "data:image/png;base64,..."
|
1091
|
-
content = chunk.content
|
1092
|
-
if content.startswith("data:"):
|
1093
|
-
# Extract the base64 part after the comma
|
1094
|
-
content = content.split(",", 1)[1]
|
1095
|
-
|
1096
|
-
# Now decode the base64 string
|
1097
|
-
import base64
|
1098
|
-
import io
|
1099
|
-
from PIL import Image
|
1100
|
-
image_bytes = base64.b64decode(content)
|
1101
|
-
content = Image.open(io.BytesIO(image_bytes))
|
1102
|
-
except Exception as e:
|
1103
|
-
print(f"Error processing image: {str(e)}")
|
1104
|
-
# Fall back to using the content as text
|
1105
|
-
content = chunk.content
|
1106
|
-
else:
|
1107
|
-
content = chunk.content
|
1108
|
-
|
1109
|
-
final_chunks.append(
|
1110
|
-
FinalChunkResult(
|
1111
|
-
content=content,
|
1112
|
-
score=chunk.score,
|
1113
|
-
document_id=chunk.document_id,
|
1114
|
-
chunk_number=chunk.chunk_number,
|
1115
|
-
metadata=chunk.metadata,
|
1116
|
-
content_type=chunk.content_type,
|
1117
|
-
filename=chunk.filename,
|
1118
|
-
download_url=chunk.download_url,
|
1119
|
-
)
|
1120
|
-
)
|
1121
|
-
|
1122
|
-
return final_chunks
|
1913
|
+
request = self._logic._prepare_batch_get_chunks_request(sources, None, None)
|
1914
|
+
response = await self._request("POST", "batch/chunks", data=request)
|
1915
|
+
return self._logic._parse_chunk_result_list_response(response)
|
1123
1916
|
|
1124
1917
|
async def create_cache(
|
1125
1918
|
self,
|
@@ -1221,11 +2014,11 @@ class AsyncMorphik:
|
|
1221
2014
|
name="custom_graph",
|
1222
2015
|
documents=["doc1", "doc2", "doc3"]
|
1223
2016
|
)
|
1224
|
-
|
2017
|
+
|
1225
2018
|
# With custom entity extraction examples
|
1226
2019
|
from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
|
1227
2020
|
graph = await db.create_graph(
|
1228
|
-
name="medical_graph",
|
2021
|
+
name="medical_graph",
|
1229
2022
|
filters={"category": "medical"},
|
1230
2023
|
prompt_overrides=GraphPromptOverrides(
|
1231
2024
|
entity_extraction=EntityExtractionPromptOverride(
|
@@ -1238,19 +2031,11 @@ class AsyncMorphik:
|
|
1238
2031
|
)
|
1239
2032
|
```
|
1240
2033
|
"""
|
1241
|
-
|
1242
|
-
|
1243
|
-
|
1244
|
-
|
1245
|
-
|
1246
|
-
"name": name,
|
1247
|
-
"filters": filters,
|
1248
|
-
"documents": documents,
|
1249
|
-
"prompt_overrides": prompt_overrides,
|
1250
|
-
}
|
1251
|
-
|
1252
|
-
response = await self._request("POST", "graph/create", request)
|
1253
|
-
return Graph(**response)
|
2034
|
+
request = self._logic._prepare_create_graph_request(
|
2035
|
+
name, filters, documents, prompt_overrides, None, None
|
2036
|
+
)
|
2037
|
+
response = await self._request("POST", "graph/create", data=request)
|
2038
|
+
return self._logic._parse_graph_response(response)
|
1254
2039
|
|
1255
2040
|
async def get_graph(self, name: str) -> Graph:
|
1256
2041
|
"""
|
@@ -1270,7 +2055,7 @@ class AsyncMorphik:
|
|
1270
2055
|
```
|
1271
2056
|
"""
|
1272
2057
|
response = await self._request("GET", f"graph/{name}")
|
1273
|
-
return
|
2058
|
+
return self._logic._parse_graph_response(response)
|
1274
2059
|
|
1275
2060
|
async def list_graphs(self) -> List[Graph]:
|
1276
2061
|
"""
|
@@ -1288,7 +2073,7 @@ class AsyncMorphik:
|
|
1288
2073
|
```
|
1289
2074
|
"""
|
1290
2075
|
response = await self._request("GET", "graphs")
|
1291
|
-
return
|
2076
|
+
return self._logic._parse_graph_list_response(response)
|
1292
2077
|
|
1293
2078
|
async def update_graph(
|
1294
2079
|
self,
|
@@ -1332,7 +2117,7 @@ class AsyncMorphik:
|
|
1332
2117
|
entity_resolution=EntityResolutionPromptOverride(
|
1333
2118
|
examples=[
|
1334
2119
|
EntityResolutionExample(
|
1335
|
-
canonical="Machine Learning",
|
2120
|
+
canonical="Machine Learning",
|
1336
2121
|
variants=["ML", "machine learning", "AI/ML"]
|
1337
2122
|
)
|
1338
2123
|
]
|
@@ -1341,34 +2126,27 @@ class AsyncMorphik:
|
|
1341
2126
|
)
|
1342
2127
|
```
|
1343
2128
|
"""
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1348
|
-
|
1349
|
-
|
1350
|
-
"additional_documents": additional_documents,
|
1351
|
-
"prompt_overrides": prompt_overrides,
|
1352
|
-
}
|
1353
|
-
|
1354
|
-
response = await self._request("POST", f"graph/{name}/update", request)
|
1355
|
-
return Graph(**response)
|
1356
|
-
|
2129
|
+
request = self._logic._prepare_update_graph_request(
|
2130
|
+
name, additional_filters, additional_documents, prompt_overrides, None, None
|
2131
|
+
)
|
2132
|
+
response = await self._request("POST", f"graph/{name}/update", data=request)
|
2133
|
+
return self._logic._parse_graph_response(response)
|
2134
|
+
|
1357
2135
|
async def delete_document(self, document_id: str) -> Dict[str, str]:
|
1358
2136
|
"""
|
1359
2137
|
Delete a document and all its associated data.
|
1360
|
-
|
2138
|
+
|
1361
2139
|
This method deletes a document and all its associated data, including:
|
1362
2140
|
- Document metadata
|
1363
2141
|
- Document content in storage
|
1364
2142
|
- Document chunks and embeddings in vector store
|
1365
|
-
|
2143
|
+
|
1366
2144
|
Args:
|
1367
2145
|
document_id: ID of the document to delete
|
1368
|
-
|
2146
|
+
|
1369
2147
|
Returns:
|
1370
2148
|
Dict[str, str]: Deletion status
|
1371
|
-
|
2149
|
+
|
1372
2150
|
Example:
|
1373
2151
|
```python
|
1374
2152
|
# Delete a document
|
@@ -1378,20 +2156,20 @@ class AsyncMorphik:
|
|
1378
2156
|
"""
|
1379
2157
|
response = await self._request("DELETE", f"documents/{document_id}")
|
1380
2158
|
return response
|
1381
|
-
|
2159
|
+
|
1382
2160
|
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
1383
2161
|
"""
|
1384
2162
|
Delete a document by its filename.
|
1385
|
-
|
2163
|
+
|
1386
2164
|
This is a convenience method that first retrieves the document ID by filename
|
1387
2165
|
and then deletes the document by ID.
|
1388
|
-
|
2166
|
+
|
1389
2167
|
Args:
|
1390
2168
|
filename: Filename of the document to delete
|
1391
|
-
|
2169
|
+
|
1392
2170
|
Returns:
|
1393
2171
|
Dict[str, str]: Deletion status
|
1394
|
-
|
2172
|
+
|
1395
2173
|
Example:
|
1396
2174
|
```python
|
1397
2175
|
# Delete a document by filename
|
@@ -1401,7 +2179,7 @@ class AsyncMorphik:
|
|
1401
2179
|
"""
|
1402
2180
|
# First get the document by filename to obtain its ID
|
1403
2181
|
doc = await self.get_document_by_filename(filename)
|
1404
|
-
|
2182
|
+
|
1405
2183
|
# Then delete the document by ID
|
1406
2184
|
return await self.delete_document(doc.external_id)
|
1407
2185
|
|