morphik 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +15 -0
- morphik/async_.py +1416 -0
- morphik/exceptions.py +16 -0
- morphik/models.py +400 -0
- morphik/rules.py +47 -0
- morphik/sync.py +1447 -0
- morphik-0.1.0.dist-info/METADATA +47 -0
- morphik-0.1.0.dist-info/RECORD +9 -0
- morphik-0.1.0.dist-info/WHEEL +4 -0
morphik/async_.py
ADDED
@@ -0,0 +1,1416 @@
|
|
1
|
+
from io import BytesIO, IOBase
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
6
|
+
from urllib.parse import urlparse
|
7
|
+
|
8
|
+
import httpx
|
9
|
+
import jwt
|
10
|
+
from PIL.Image import Image as PILImage
|
11
|
+
from pydantic import BaseModel, Field
|
12
|
+
|
13
|
+
from .models import (
|
14
|
+
Document,
|
15
|
+
ChunkResult,
|
16
|
+
DocumentResult,
|
17
|
+
CompletionResponse,
|
18
|
+
IngestTextRequest,
|
19
|
+
ChunkSource,
|
20
|
+
Graph,
|
21
|
+
# Prompt override models
|
22
|
+
EntityExtractionExample,
|
23
|
+
EntityResolutionExample,
|
24
|
+
EntityExtractionPromptOverride,
|
25
|
+
EntityResolutionPromptOverride,
|
26
|
+
QueryPromptOverride,
|
27
|
+
GraphPromptOverrides,
|
28
|
+
QueryPromptOverrides
|
29
|
+
)
|
30
|
+
from .rules import Rule
|
31
|
+
|
32
|
+
logger = logging.getLogger(__name__)
|
33
|
+
|
34
|
+
# Type alias for rules
|
35
|
+
RuleOrDict = Union[Rule, Dict[str, Any]]
|
36
|
+
|
37
|
+
|
38
|
+
class AsyncCache:
    """Async handle to a named cache on the server.

    Every method sends a POST request through the owning client's
    ``_request`` coroutine to an endpoint under ``cache/<name>/``.

    Args:
        db: Client whose ``_request`` coroutine performs the HTTP calls.
        name: Cache name, interpolated into the endpoint paths.
    """

    def __init__(self, db: "AsyncMorphik", name: str):
        self._db = db
        self._name = name

    async def update(self) -> bool:
        """POST to ``cache/<name>/update``.

        Returns:
            bool: The ``success`` value of the response, ``False`` when the
            key is absent.
        """
        response = await self._db._request("POST", f"cache/{self._name}/update")
        return response.get("success", False)

    async def add_docs(self, docs: List[str]) -> bool:
        """POST ``{"docs": docs}`` to ``cache/<name>/add_docs``.

        Args:
            docs: Values sent under the ``docs`` key
                (presumably document IDs; confirm against the server API).

        Returns:
            bool: The ``success`` value of the response, ``False`` when the
            key is absent.
        """
        response = await self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
        return response.get("success", False)

    async def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """POST a query to ``cache/<name>/query`` and wrap the response.

        Args:
            query: Query text, sent as the ``query`` URL parameter.
            max_tokens: Optional value for the ``max_tokens`` URL parameter.
            temperature: Optional value for the ``temperature`` URL parameter.

        Returns:
            CompletionResponse: Built from the JSON response.
        """
        # Leave unset options out of the query string: httpx serialises a
        # ``None`` parameter as an empty string ("max_tokens="), which is not
        # a valid number for the receiving endpoint.
        params: Dict[str, Any] = {"query": query}
        if max_tokens is not None:
            params["max_tokens"] = max_tokens
        if temperature is not None:
            params["temperature"] = temperature

        response = await self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params=params,
            data="",
        )
        return CompletionResponse(**response)
|
61
|
+
|
62
|
+
|
63
|
+
class FinalChunkResult(BaseModel):
    """Chunk record handed back to callers of ``retrieve_chunks``.

    ``content`` is either the chunk text or, for image chunks that decoded
    successfully, a PIL image object.
    """

    # PIL's image class is not a pydantic-native type, so arbitrary types must
    # be permitted. ``model_config`` is the pydantic v2 spelling of the
    # deprecated class-based ``Config`` (this module already relies on v2's
    # ``model_dump``).
    model_config = {"arbitrary_types_allowed": True}

    # ``Union`` rather than ``str | PILImage``: the annotation is evaluated
    # when the class is created, and ``Union`` matches the rest of the module.
    content: Union[str, PILImage] = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score")
    document_id: str = Field(..., description="Parent document ID")
    chunk_number: int = Field(..., description="Chunk sequence number")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
    content_type: str = Field(..., description="Content type")
    filename: Optional[str] = Field(None, description="Original filename")
    download_url: Optional[str] = Field(None, description="URL to download full document")
|
75
|
+
|
76
|
+
|
77
|
+
class AsyncMorphik:
|
78
|
+
"""
|
79
|
+
Morphik client for document operations.
|
80
|
+
|
81
|
+
Args:
|
82
|
+
uri (str, optional): Morphik URI in format "morphik://<owner_id>:<token>@<host>".
|
83
|
+
If not provided, connects to http://localhost:8000 without authentication.
|
84
|
+
timeout (int, optional): Request timeout in seconds. Defaults to 30.
|
85
|
+
is_local (bool, optional): Whether to connect to a local server. Defaults to False.
|
86
|
+
|
87
|
+
Examples:
|
88
|
+
```python
|
89
|
+
# Without authentication
|
90
|
+
async with AsyncMorphik() as db:
|
91
|
+
doc = await db.ingest_text("Sample content")
|
92
|
+
|
93
|
+
# With authentication
|
94
|
+
async with AsyncMorphik("morphik://owner_id:token@api.morphik.ai") as db:
|
95
|
+
doc = await db.ingest_text("Sample content")
|
96
|
+
```
|
97
|
+
"""
|
98
|
+
|
99
|
+
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
    """Create the HTTP client and resolve the server address.

    Args:
        uri: Optional Morphik URI. When given it is passed to
            ``_setup_auth``, which derives the base URL and token. When
            omitted the client targets ``http://localhost:8000`` with no
            token.
        timeout: Timeout handed to the underlying ``httpx.AsyncClient``.
        is_local: When true, the HTTP client is created with certificate
            verification disabled and HTTP/2 turned off.
    """
    self._timeout = timeout
    self._is_local = is_local

    # Local servers get a relaxed client: no SSL verification, HTTP/1.1 only.
    client_options: Dict[str, Any] = {"timeout": timeout}
    if is_local:
        client_options.update(verify=False, http2=False)
    self._client = httpx.AsyncClient(**client_options)

    if not uri:
        # No URI: unauthenticated connection to the default local address.
        self._base_url = "http://localhost:8000"
        self._auth_token = None
        return

    self._setup_auth(uri)
|
117
|
+
|
118
|
+
def _setup_auth(self, uri: str) -> None:
    """Derive the base URL and bearer token from a Morphik URI.

    Args:
        uri: Connection string shaped like
            ``morphik://<owner_id>:<token>@<host>``.

    Raises:
        ValueError: If the URI has no network location, or the network
            location lacks the ``<owner_id>:<token>@<host>`` structure.

    Any exception raised by ``jwt.decode`` for a token that does not parse
    propagates unchanged.
    """
    parsed = urlparse(uri)
    if not parsed.netloc:
        raise ValueError("Invalid URI format")

    # Split on the LAST "@" and ":" so malformed input produces a clear error
    # instead of a tuple-unpacking failure. The host may still carry a port
    # ("host:8000") because only the credentials part is split on ":".
    auth, at_sign, host = parsed.netloc.rpartition("@")
    _, colon, token = auth.rpartition(":")
    if not at_sign or not colon or not host:
        raise ValueError("Invalid URI format: expected morphik://<owner_id>:<token>@<host>")

    self._auth_token = token

    # Local servers are addressed over plain HTTP, everything else over HTTPS.
    self._base_url = f"{'http' if self._is_local else 'https'}://{host}"

    # Basic token validation: the token must parse as a JWT. The signature is
    # deliberately not verified here.
    jwt.decode(self._auth_token, options={"verify_signature": False})
|
133
|
+
|
134
|
+
async def _request(
    self,
    method: str,
    endpoint: str,
    data: Optional[Dict[str, Any]] = None,
    files: Optional[Dict[str, Any]] = None,
    params: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Send one HTTP request to the server and return the decoded JSON body.

    Args:
        method: HTTP method name.
        endpoint: Path relative to the base URL; leading slashes are stripped.
        data: Request payload. Sent as JSON, or as form fields when ``files``
            is truthy.
        files: Optional upload payload; when truthy the request is sent as
            multipart form data.
        params: Optional URL query parameters.

    Returns:
        The response body parsed with ``response.json()``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status.
    """
    headers: Dict[str, str] = {}
    if self._auth_token:
        # Only authenticated clients send a bearer token.
        headers["Authorization"] = f"Bearer {self._auth_token}"

    if not files:
        # Plain requests carry the payload as JSON.
        headers["Content-Type"] = "application/json"
        body_kwargs = {"json": data}
    else:
        # Uploads go out as multipart form data; Content-Type is left for
        # httpx to set.
        body_kwargs = {"files": files, "data": data}

    url = f"{self._base_url}/{endpoint.lstrip('/')}"
    response = await self._client.request(
        method,
        url,
        headers=headers,
        params=params,
        **body_kwargs,
    )
    response.raise_for_status()
    return response.json()
|
166
|
+
|
167
|
+
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
    """Return ``rule`` in dictionary form.

    Objects exposing a ``to_dict`` attribute are converted by calling it;
    anything else is returned unchanged.
    """
    return rule.to_dict() if hasattr(rule, "to_dict") else rule
|
172
|
+
|
173
|
+
async def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document into Morphik.

    Args:
        content: Text content to ingest
        filename: Optional filename sent along with the content
        metadata: Optional metadata dictionary (an empty dict is sent when omitted)
        rules: Optional list of rules to apply during ingestion; rule objects
            are converted with `_convert_rule`, dictionaries are sent as-is
        use_colpali: Value of the `use_colpali` flag sent with the request

    Returns:
        Document: Built from the server response, with this client attached

    Example:
        ```python
        from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
        from pydantic import BaseModel

        class DocumentInfo(BaseModel):
            title: str
            author: str
            date: str

        doc = await db.ingest_text(
            "Machine learning is fascinating...",
            metadata={"category": "tech"},
            rules=[
                MetadataExtractionRule(schema=DocumentInfo),
                NaturalLanguageRule(prompt="Shorten the content, use keywords"),
            ],
        )
        ```
    """
    rule_dicts = [self._convert_rule(rule) for rule in rules] if rules else []
    payload = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata if metadata else {},
        rules=rule_dicts,
        use_colpali=use_colpali,
    ).model_dump()

    document = Document(**await self._request("POST", "ingest/text", data=payload))
    # Attach the client to the returned document.
    document._client = self
    return document
|
227
|
+
|
228
|
+
async def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: str,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a file document into Morphik.

    Args:
        file: File to ingest (path string, bytes, file object, or Path)
        filename: Name under which the file is uploaded
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Value of the `use_colpali` flag sent with the request

    Returns:
        Document: Built from the server response, with this client attached

    Raises:
        ValueError: If `file` is a path that does not exist
    """
    given_as_path = isinstance(file, (str, Path))
    if given_as_path:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        # Read the whole file into an in-memory buffer for the upload.
        file_obj = BytesIO(source.read_bytes())
    elif isinstance(file, bytes):
        file_obj = BytesIO(file)
    else:
        file_obj = file

    try:
        # Metadata, rules and the flag travel as JSON-encoded form fields
        # next to the multipart file part.
        form_fields = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(rule) for rule in (rules or [])]),
            "use_colpali": json.dumps(use_colpali),
        }
        upload = {"file": (filename, file_obj)}

        document = Document(**await self._request("POST", "ingest/file", data=form_fields, files=upload))
        document._client = self
        return document
    finally:
        # Only the buffer created here from a path is closed; buffers built
        # from bytes and caller-supplied objects are left alone.
        if given_as_path:
            file_obj.close()
|
269
|
+
|
270
|
+
async def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik.

    Args:
        files: List of files to ingest (path strings, bytes, file objects, or Paths)
        metadata: Optional metadata (single dict for all files or list of dicts)
        rules: Optional list of rules to apply. A flat list is shared by all
            files; a list whose items are all lists is converted item by item
            as per-file rules
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: Documents listed under `documents` in the response.
        Entries under `errors` in the response are logged, not raised.

    Raises:
        ValueError: If metadata list length doesn't match files length

    Note:
        Every file object in `files` that derives from `io.IOBase`,
        including ones supplied by the caller, is closed before this
        method returns.
    """
    # Enforce the documented contract before any file is opened.
    if isinstance(metadata, list) and len(metadata) != len(files):
        raise ValueError(
            f"metadata list length ({len(metadata)}) must match files length ({len(files)})"
        )

    file_objects = []
    try:
        # Files are opened inside the ``try`` so that handles opened so far
        # are released even when a later path cannot be opened.
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        # Convert rules based on whether it's a flat list or a list of lists.
        if rules and all(isinstance(r, list) for r in rules):
            # List of lists - per-file rules
            converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
        else:
            # Flat list (or nothing) - shared rules for all files
            converted_rules = [self._convert_rule(r) for r in (rules or [])]

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
        }

        response = await self._request("POST", "ingest/files", data=data, files=file_objects)

        # Log per-file errors but don't raise an exception.
        for error in response.get("errors") or []:
            logger.error("Failed to ingest %s: %s", error["filename"], error["error"])

        docs = [Document(**doc) for doc in response["documents"]]
        for doc in docs:
            doc._client = self
        return docs
    finally:
        # Clean up file objects (raw bytes entries have nothing to close).
        for _, (_, file_obj) in file_objects:
            if isinstance(file_obj, IOBase) and not file_obj.closed:
                file_obj.close()
|
341
|
+
|
342
|
+
async def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest all files in a directory into Morphik.

    Args:
        directory: Path to the directory whose files are ingested
        recursive: Search subdirectories as well (uses `rglob` instead of `glob`)
        pattern: Glob pattern selecting the files (e.g. "*.pdf")
        metadata: Optional metadata dictionary passed on for all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: Result of `ingest_files` for the matched files, or an
        empty list when nothing matches

    Raises:
        ValueError: If directory not found
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"Directory not found: {root}")

    # Choose the globbing method once, then keep regular files only.
    find = root.rglob if recursive else root.glob
    matches = [entry for entry in find(pattern) if entry.is_file()]
    if not matches:
        return []

    # Delegate the upload to ingest_files with the collected paths.
    return await self.ingest_files(
        files=matches,
        metadata=metadata,
        rules=rules,
        use_colpali=use_colpali,
        parallel=parallel,
    )
|
394
|
+
|
395
|
+
async def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Search for relevant chunks.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model to retrieve chunks (only works for documents ingested with `use_colpali=True`)
    Returns:
        List[FinalChunkResult]: One entry per returned chunk. For chunks whose
        metadata has a truthy `is_image`, `content` is a decoded PIL image;
        if decoding fails the original string content is kept.

    Example:
        ```python
        chunks = await db.retrieve_chunks(
            "What are the key findings?",
            filters={"department": "research"}
        )
        ```
    """
    # Local import: base64 is only needed here and is not among the
    # module-level imports visible for this file.
    import base64

    request = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "use_colpali": use_colpali,
    }

    response = await self._request("POST", "retrieve/chunks", data=request)
    chunks = [ChunkResult(**r) for r in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                from PIL import Image

                encoded = chunk.content
                # Handle data URI format "data:image/png;base64,...":
                # keep only the base64 part after the comma.
                if encoded.startswith("data:"):
                    encoded = encoded.split(",", 1)[1]
                content = Image.open(BytesIO(base64.b64decode(encoded)))
            except Exception as e:
                # Best effort: report through the module logger (not stdout)
                # and fall back to using the content as text.
                logger.warning("Error processing image: %s", e)
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
|
471
|
+
|
472
|
+
async def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """
    Retrieve relevant documents.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Value of the `use_colpali` flag sent with the request

    Returns:
        List[DocumentResult]: One entry per item in the server response

    Example:
        ```python
        docs = await db.retrieve_docs("machine learning", k=5)
        ```
    """
    payload = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
    )
    hits = await self._request("POST", "retrieve/docs", data=payload)
    return [DocumentResult(**hit) for hit in hits]
|
510
|
+
|
511
|
+
async def query(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_colpali: bool = True,
    graph_name: Optional[str] = None,
    hop_depth: int = 1,
    include_paths: bool = False,
    prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
) -> CompletionResponse:
    """
    Generate completion using relevant chunks as context.

    Args:
        query: Query text
        filters: Optional metadata filters
        k: Number of chunks to use as context (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        max_tokens: Maximum tokens in completion
        temperature: Model temperature
        use_colpali: Value of the `use_colpali` flag sent with the request
        graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
        hop_depth: Number of relationship hops to traverse in the graph
        include_paths: Whether to include relationship paths in the response
        prompt_overrides: Optional prompt customizations, either a
            QueryPromptOverrides object (dumped to a dict without `None`
            values before sending) or a dictionary with the same structure

    Returns:
        CompletionResponse: Built from the server response

    Example:
        ```python
        # Standard query
        response = await db.query(
            "What are the key findings about customer satisfaction?",
            filters={"department": "research"},
            temperature=0.7
        )

        # Knowledge graph enhanced query
        response = await db.query(
            "How does product X relate to customer segment Y?",
            graph_name="market_graph",
            hop_depth=2,
            include_paths=True
        )

        # With prompt customization (model object or plain dictionary)
        from morphik.models import QueryPromptOverride, QueryPromptOverrides
        response = await db.query(
            "What are the key findings?",
            prompt_overrides=QueryPromptOverrides(
                query=QueryPromptOverride(
                    prompt_template="Answer the question in a formal, academic tone: {question}"
                )
            )
        )

        print(response.completion)
        ```
    """
    overrides = prompt_overrides
    if overrides and isinstance(overrides, QueryPromptOverrides):
        # Model objects are serialised to a plain dictionary for the request body.
        overrides = overrides.model_dump(exclude_none=True)

    payload = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        max_tokens=max_tokens,
        temperature=temperature,
        use_colpali=use_colpali,
        graph_name=graph_name,
        hop_depth=hop_depth,
        include_paths=include_paths,
        prompt_overrides=overrides,
    )

    result = await self._request("POST", "query", data=payload)
    return CompletionResponse(**result)
|
610
|
+
|
611
|
+
async def list_documents(
    self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
    """
    List accessible documents.

    Args:
        skip: Number of documents to skip
        limit: Maximum number of documents to return
        filters: Optional filters, sent as the request body (an empty dict
            when omitted)

    Returns:
        List[Document]: One document per item in the response, each with this
        client attached

    Example:
        ```python
        # Get first page
        docs = await db.list_documents(limit=10)

        # Get next page
        next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
        ```
    """
    # Pagination goes in the query string, filters in the POST body.
    endpoint = f"documents?skip={skip}&limit={limit}"
    items = await self._request("POST", endpoint, data=filters or {})

    documents = []
    for item in items:
        document = Document(**item)
        document._client = self
        documents.append(document)
    return documents
|
642
|
+
|
643
|
+
async def get_document(self, document_id: str) -> Document:
    """
    Get document metadata by ID.

    Args:
        document_id: ID of the document, used in the `documents/<id>` path

    Returns:
        Document: Built from the server response, with this client attached

    Example:
        ```python
        doc = await db.get_document("doc_123")
        print(f"Title: {doc.metadata.get('title')}")
        ```
    """
    payload = await self._request("GET", f"documents/{document_id}")
    document = Document(**payload)
    document._client = self
    return document
|
663
|
+
|
664
|
+
async def get_document_by_filename(self, filename: str) -> Document:
    """
    Get document metadata by filename.
    If multiple documents have the same filename, returns the most recently updated one.

    Args:
        filename: Filename of the document to retrieve. It is percent-encoded
            before being placed in the request path.

    Returns:
        Document: Built from the server response, with this client attached

    Example:
        ```python
        doc = await db.get_document_by_filename("report.pdf")
        print(f"Document ID: {doc.external_id}")
        ```
    """
    # Local import: only ``urlparse`` is imported from urllib.parse at module level.
    from urllib.parse import quote

    # Escape the filename so characters such as "?", "#", "%" or spaces stay
    # part of the path segment instead of being read as URL syntax.
    encoded = quote(filename, safe="")
    response = await self._request("GET", f"documents/filename/{encoded}")
    doc = Document(**response)
    doc._client = self
    return doc
|
685
|
+
|
686
|
+
async def update_document_with_text(
    self,
    document_id: str,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document with new text content using the specified strategy.

    Args:
        document_id: ID of the document to update
        content: The new content to add
        filename: Optional new filename for the document
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document; sent as a query
            parameter only when it differs from "add"
        use_colpali: Value of the `use_colpali` flag; `None` is sent as `True`

    Returns:
        Document: Updated document metadata, with this client attached

    Example:
        ```python
        updated_doc = await db.update_document_with_text(
            document_id="doc_123",
            content="This is additional content that will be appended to the document.",
            filename="updated_document.txt",
            metadata={"category": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    # The text update endpoint takes the same body as text ingestion.
    payload = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=[self._convert_rule(rule) for rule in (rules or [])],
        use_colpali=True if use_colpali is None else use_colpali,
    ).model_dump()

    # The default strategy is not sent as a query parameter.
    query_params = {} if update_strategy == "add" else {"update_strategy": update_strategy}

    result = await self._request(
        "POST",
        f"documents/{document_id}/update_text",
        data=payload,
        params=query_params,
    )

    document = Document(**result)
    document._client = self
    return document
|
747
|
+
|
748
|
+
async def update_document_with_file(
    self,
    document_id: str,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document with content from a file using the specified strategy.

    Args:
        document_id: ID of the document to update
        file: File to add (path string, bytes, file object, or Path)
        filename: Name of the file; defaults to the path's file name when
            `file` is a path, required otherwise
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document, sent as a form field
        use_colpali: Optional `use_colpali` flag; omitted from the form when `None`

    Returns:
        Document: Updated document metadata, with this client attached

    Raises:
        ValueError: If `file` is a path that does not exist, or if `filename`
            is missing for bytes or file-object input

    Example:
        ```python
        updated_doc = await db.update_document_with_file(
            document_id="doc_123",
            file="path/to/update.pdf",
            metadata={"status": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    given_as_path = isinstance(file, (str, Path))
    if given_as_path:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        # Read the whole file into an in-memory buffer for the upload.
        file_obj = BytesIO(source.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when updating with bytes")
        file_obj = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when updating with file object")
        file_obj = file

    try:
        # Metadata and rules travel as JSON strings in the multipart form.
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(rule) for rule in (rules or [])]),
            "update_strategy": update_strategy,
        }
        if use_colpali is not None:
            form_data["use_colpali"] = str(use_colpali).lower()

        upload = {"file": (filename, file_obj)}
        result = await self._request(
            "POST", f"documents/{document_id}/update_file", data=form_data, files=upload
        )

        document = Document(**result)
        document._client = self
        return document
    finally:
        # Only the buffer created here from a path is closed.
        if given_as_path:
            file_obj.close()
|
829
|
+
|
830
|
+
async def update_document_metadata(
    self,
    document_id: str,
    metadata: Dict[str, Any],
) -> Document:
    """
    Update a document's metadata only.

    Args:
        document_id: ID of the document to update
        metadata: Metadata to update, sent as the request body

    Returns:
        Document: Updated document metadata, with this client attached

    Example:
        ```python
        updated_doc = await db.update_document_metadata(
            document_id="doc_123",
            metadata={"status": "reviewed", "reviewer": "Jane Smith"}
        )
        print(f"Updated metadata: {updated_doc.metadata}")
        ```
    """
    # The metadata-only endpoint takes the metadata dict as its JSON body.
    endpoint = f"documents/{document_id}/update_metadata"
    document = Document(**await self._request("POST", endpoint, data=metadata))
    document._client = self
    return document
|
860
|
+
|
861
|
+
async def update_document_by_filename_with_text(
    self,
    filename: str,
    content: str,
    new_filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Look a document up by filename, then update it with new text content.

    This is a two-step convenience wrapper: it resolves ``filename`` through
    ``get_document_by_filename`` and forwards everything else to
    ``update_document_with_text`` using the resolved ``external_id``.

    Args:
        filename: Filename of the document to update
        content: The new text content to send
        new_filename: Optional new filename, forwarded as ``filename``
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Update strategy name, forwarded unchanged (default "add")
        use_colpali: Whether to use multi-vector embedding

    Returns:
        Document: Whatever ``update_document_with_text`` returns for the document

    Example:
        ```python
        # Add new content to an existing document identified by filename
        updated_doc = await db.update_document_by_filename_with_text(
            filename="report.pdf",
            content="This is additional content that will be appended to the document.",
            new_filename="updated_report.pdf",
            metadata={"category": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    # Step 1: resolve the filename to a document so we know its ID.
    target = await self.get_document_by_filename(filename)

    # Step 2: delegate to the ID-based text update with the same options.
    forwarded = dict(
        document_id=target.external_id,
        content=content,
        filename=new_filename,
        metadata=metadata,
        rules=rules,
        update_strategy=update_strategy,
        use_colpali=use_colpali,
    )
    return await self.update_document_with_text(**forwarded)
|
912
|
+
|
913
|
+
async def update_document_by_filename_with_file(
    self,
    filename: str,
    file: Union[str, bytes, BinaryIO, Path],
    new_filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Look a document up by filename, then update it with content from a file.

    This is a two-step convenience wrapper: it resolves ``filename`` through
    ``get_document_by_filename`` and forwards everything else to
    ``update_document_with_file`` using the resolved ``external_id``.

    Args:
        filename: Filename of the document to update
        file: File to add (path string, bytes, file object, or Path)
        new_filename: Optional new filename, forwarded as ``filename`` to
            ``update_document_with_file``
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Update strategy name, forwarded unchanged (default "add")
        use_colpali: Whether to use multi-vector embedding

    Returns:
        Document: Whatever ``update_document_with_file`` returns for the document

    Example:
        ```python
        # Add content from a file to an existing document identified by filename
        updated_doc = await db.update_document_by_filename_with_file(
            filename="report.pdf",
            file="path/to/update.pdf",
            metadata={"status": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    # Step 1: resolve the filename to a document so we know its ID.
    target = await self.get_document_by_filename(filename)

    # Step 2: delegate to the ID-based file update with the same options.
    forwarded = dict(
        document_id=target.external_id,
        file=file,
        filename=new_filename,
        metadata=metadata,
        rules=rules,
        update_strategy=update_strategy,
        use_colpali=use_colpali,
    )
    return await self.update_document_with_file(**forwarded)
|
963
|
+
|
964
|
+
async def update_document_by_filename_metadata(
    self,
    filename: str,
    metadata: Dict[str, Any],
    new_filename: Optional[str] = None,
) -> Document:
    """
    Update a document's metadata, identifying the document by filename.

    The document is first resolved with ``get_document_by_filename`` and its
    metadata is updated through ``update_document_metadata``. When a truthy
    ``new_filename`` is given, a second request is sent to the document's
    ``update_text`` endpoint with empty content, the new filename, a copy of
    the just-updated metadata and no rules.

    Args:
        filename: Filename of the document to update
        metadata: Metadata to update
        new_filename: Optional new filename to assign to the document

    Returns:
        Document: Result of the metadata update, or the document built from
            the rename response when ``new_filename`` was provided

    Example:
        ```python
        # Update just the metadata of a document identified by filename
        updated_doc = await db.update_document_by_filename_metadata(
            filename="report.pdf",
            metadata={"status": "reviewed", "reviewer": "Jane Smith"},
            new_filename="reviewed_report.pdf"  # Optional: rename the file
        )
        print(f"Updated metadata: {updated_doc.metadata}")
        ```
    """
    # Resolve the filename to a document so we know its ID.
    existing = await self.get_document_by_filename(filename)

    result = await self.update_document_metadata(
        document_id=existing.external_id,
        metadata=metadata,
    )

    # No rename requested: the metadata update result is the answer.
    if not new_filename:
        return result

    # Rename via the text-update endpoint: empty content plus a copy of the
    # freshly updated metadata, so only the filename differs in this request.
    rename_payload = {
        "content": "",
        "filename": new_filename,
        "metadata": result.metadata.copy(),
        "rules": []
    }
    response = await self._request(
        "POST",
        f"documents/{existing.external_id}/update_text",
        data=rename_payload,
    )

    renamed = Document(**response)
    renamed._client = self
    return renamed
|
1021
|
+
|
1022
|
+
async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
    """
    Fetch several documents by ID with one request.

    The ID list is posted to the ``batch/documents`` endpoint; every entry of
    the response is turned into a ``Document`` bound to this client.

    Args:
        document_ids: List of document IDs to retrieve

    Returns:
        List[Document]: One ``Document`` per entry in the server response

    Example:
        ```python
        docs = await db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
        for doc in docs:
            print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
        ```
    """
    payload = await self._request("POST", "batch/documents", data=document_ids)

    documents = []
    for entry in payload:
        document = Document(**entry)
        document._client = self
        documents.append(document)
    return documents
|
1044
|
+
|
1045
|
+
async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
    """
    Retrieve specific chunks by their document ID and chunk number in a single batch operation.

    Each source is sent as a plain dictionary to the ``batch/chunks``
    endpoint. Chunks whose metadata has a truthy ``is_image`` entry are
    decoded from base64 (optionally wrapped in a ``data:`` URI) into a PIL
    image; if decoding fails for any reason, a warning is logged and the raw
    chunk content is used instead.

    Args:
        sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

    Returns:
        List[FinalChunkResult]: List of chunk results, in response order

    Example:
        ```python
        # Using dictionaries
        sources = [
            {"document_id": "doc_123", "chunk_number": 0},
            {"document_id": "doc_456", "chunk_number": 2}
        ]

        # Or using ChunkSource objects
        from morphik.models import ChunkSource
        sources = [
            ChunkSource(document_id="doc_123", chunk_number=0),
            ChunkSource(document_id="doc_456", chunk_number=2)
        ]

        chunks = await db.batch_get_chunks(sources)
        for chunk in chunks:
            print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
        ```
    """
    # Stdlib helpers are imported once per call instead of once per chunk.
    import base64
    import io

    # The endpoint takes plain dictionaries; model instances are dumped.
    source_dicts = [
        source if isinstance(source, dict) else source.model_dump()
        for source in sources
    ]

    response = await self._request("POST", "batch/chunks", data=source_dicts)
    chunks = [ChunkResult(**r) for r in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                from PIL import Image

                encoded = chunk.content
                # Handle data URI format "data:image/png;base64,..." by
                # keeping only the base64 part after the first comma.
                if encoded.startswith("data:"):
                    encoded = encoded.split(",", 1)[1]

                content = Image.open(io.BytesIO(base64.b64decode(encoded)))
            except Exception as e:
                # Best effort: a chunk that cannot be decoded is still
                # returned, with its original content. Report through the
                # module logger rather than printing to stdout from library code.
                logger.warning("Error processing image: %s", e)
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
|
1123
|
+
|
1124
|
+
async def create_cache(
    self,
    name: str,
    model: str,
    gguf_file: str,
    filters: Optional[Dict[str, Any]] = None,
    docs: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Ask the server to create a new cache.

    ``name``, ``model`` and ``gguf_file`` are sent as query parameters, while
    ``filters`` and ``docs`` travel in the request payload of the
    ``cache/create`` endpoint.

    Args:
        name: Name of the cache to create
        model: Name of the model to use (e.g. "llama2")
        gguf_file: Name of the GGUF file to use for the model
        filters: Optional metadata filters selecting documents to include,
            in addition to any specific docs provided
        docs: Optional list of specific document IDs to include, in addition
            to any documents matching the filters

    Returns:
        Dict[str, Any]: The server response, returned unchanged

    Example:
        ```python
        # This will include both:
        # 1. Any documents with category="programming"
        # 2. The specific documents "doc1" and "doc2" (regardless of their category)
        cache = await db.create_cache(
            name="programming_cache",
            model="llama2",
            gguf_file="llama-2-7b-chat.Q4_K_M.gguf",
            filters={"category": "programming"},
            docs=["doc1", "doc2"]
        )
        ```
    """
    # Identification of the cache and its model goes in the query string.
    query = dict(name=name, model=model, gguf_file=gguf_file)

    # Document selection goes in the payload.
    body = dict(filters=filters, docs=docs)

    return await self._request("POST", "cache/create", body, params=query)
|
1167
|
+
|
1168
|
+
async def get_cache(self, name: str) -> AsyncCache:
    """
    Return a handle for an existing cache.

    Queries ``cache/{name}`` and checks the ``exists`` flag of the response.

    Args:
        name: Name of the cache to retrieve

    Returns:
        cache: An ``AsyncCache`` object used to interact with the cache

    Raises:
        ValueError: If the response does not report the cache as existing

    Example:
        ```python
        cache = await db.get_cache("programming_cache")
        ```
    """
    status = await self._request("GET", f"cache/{name}")

    # A missing or falsy "exists" flag is treated as "no such cache".
    if not status.get("exists", False):
        raise ValueError(f"Cache '{name}' not found")

    return AsyncCache(self, name)
|
1187
|
+
|
1188
|
+
async def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Request creation of a graph from a set of documents.

    The graph name, the document selection (filters and/or explicit IDs) and
    any prompt overrides are posted to the ``graph/create`` endpoint. A
    ``GraphPromptOverrides`` model is converted to a dictionary (dropping
    ``None`` fields) before sending; a dictionary is sent as given.

    Args:
        name: Name of the graph to create
        filters: Optional metadata filters to determine which documents to include
        documents: Optional list of specific document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: Graph object built from the server response

    Example:
        ```python
        # Create a graph from documents with category="research"
        graph = await db.create_graph(
            name="research_graph",
            filters={"category": "research"}
        )

        # Create a graph from specific documents
        graph = await db.create_graph(
            name="custom_graph",
            documents=["doc1", "doc2", "doc3"]
        )

        # With custom entity extraction examples
        from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
        graph = await db.create_graph(
            name="medical_graph",
            filters={"category": "medical"},
            prompt_overrides=GraphPromptOverrides(
                entity_extraction=EntityExtractionPromptOverride(
                    examples=[
                        EntityExtractionExample(label="Insulin", type="MEDICATION"),
                        EntityExtractionExample(label="Diabetes", type="CONDITION")
                    ]
                )
            )
        )
        ```
    """
    # Models must be serialised to plain data; dictionaries pass through.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = dict(
        name=name,
        filters=filters,
        documents=documents,
        prompt_overrides=overrides,
    )

    created = await self._request("POST", "graph/create", payload)
    return Graph(**created)
|
1254
|
+
|
1255
|
+
async def get_graph(self, name: str) -> Graph:
    """
    Fetch a single graph by its name.

    Issues a GET request to ``graph/{name}`` and wraps the response in a
    ``Graph`` model.

    Args:
        name: Name of the graph to retrieve

    Returns:
        Graph: Graph object built from the server response

    Example:
        ```python
        # Get a graph by name
        graph = await db.get_graph("finance_graph")
        print(f"Graph has {len(graph.entities)} entities and {len(graph.relationships)} relationships")
        ```
    """
    endpoint = f"graph/{name}"
    payload = await self._request("GET", endpoint)
    return Graph(**payload)
|
1274
|
+
|
1275
|
+
async def list_graphs(self) -> List[Graph]:
    """
    Fetch every graph returned by the ``graphs`` endpoint.

    Returns:
        List[Graph]: One ``Graph`` per entry in the server response

    Example:
        ```python
        # List all accessible graphs
        graphs = await db.list_graphs()
        for graph in graphs:
            print(f"Graph: {graph.name}, Entities: {len(graph.entities)}")
        ```
    """
    entries = await self._request("GET", "graphs")

    graphs = []
    for entry in entries:
        graphs.append(Graph(**entry))
    return graphs
|
1292
|
+
|
1293
|
+
async def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Request an update of an existing graph with further documents.

    The additional selection (filters and/or explicit IDs) and any prompt
    overrides are posted to the ``graph/{name}/update`` endpoint. A
    ``GraphPromptOverrides`` model is converted to a dictionary (dropping
    ``None`` fields) before sending; a dictionary is sent as given.

    Args:
        name: Name of the graph to update
        additional_filters: Optional additional metadata filters to determine which new documents to include
        additional_documents: Optional list of additional document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: Graph object built from the server response

    Example:
        ```python
        # Update a graph with new documents
        updated_graph = await db.update_graph(
            name="research_graph",
            additional_filters={"category": "new_research"},
            additional_documents=["doc4", "doc5"]
        )
        print(f"Graph now has {len(updated_graph.entities)} entities")

        # With entity resolution examples
        from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
        updated_graph = await db.update_graph(
            name="research_graph",
            additional_documents=["doc4"],
            prompt_overrides=GraphPromptOverrides(
                entity_resolution=EntityResolutionPromptOverride(
                    examples=[
                        EntityResolutionExample(
                            canonical="Machine Learning",
                            variants=["ML", "machine learning", "AI/ML"]
                        )
                    ]
                )
            )
        )
        ```
    """
    # Models must be serialised to plain data; dictionaries pass through.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = dict(
        additional_filters=additional_filters,
        additional_documents=additional_documents,
        prompt_overrides=overrides,
    )

    updated = await self._request("POST", f"graph/{name}/update", payload)
    return Graph(**updated)
|
1356
|
+
|
1357
|
+
async def delete_document(self, document_id: str) -> Dict[str, str]:
    """
    Delete a document by ID.

    Issues a DELETE request to ``documents/{document_id}`` and returns the
    server's reply unchanged. What is removed along with the document
    (metadata, stored content, chunks and embeddings) is decided by the
    server, not by this client.

    Args:
        document_id: ID of the document to delete

    Returns:
        Dict[str, str]: Deletion status as reported by the server

    Example:
        ```python
        # Delete a document
        result = await db.delete_document("doc_123")
        print(result["message"])  # Document doc_123 deleted successfully
        ```
    """
    endpoint = f"documents/{document_id}"
    return await self._request("DELETE", endpoint)
|
1381
|
+
|
1382
|
+
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Delete a document, identifying it by filename.

    Convenience wrapper: the filename is resolved to a document with
    ``get_document_by_filename`` and that document's ``external_id`` is
    passed to ``delete_document``.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status returned by ``delete_document``

    Example:
        ```python
        # Delete a document by filename
        result = await db.delete_document_by_filename("report.pdf")
        print(result["message"])
        ```
    """
    # Resolve the filename to an ID, then reuse the ID-based deletion.
    target = await self.get_document_by_filename(filename)
    target_id = target.external_id
    return await self.delete_document(target_id)
|
1407
|
+
|
1408
|
+
async def close(self):
    """Shut down the underlying HTTP client by awaiting its ``aclose()``."""
    http_client = self._client
    await http_client.aclose()
|
1411
|
+
|
1412
|
+
async def __aenter__(self):
    """Enter ``async with``: the client itself is the context value."""
    client = self
    return client
|
1414
|
+
|
1415
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """
    Leave ``async with`` by closing the client.

    The exception details are ignored and nothing is returned, so an
    exception raised inside the ``async with`` body is not suppressed.
    """
    closing = self.close()
    await closing
|