morphik 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +15 -0
- morphik/async_.py +1416 -0
- morphik/exceptions.py +16 -0
- morphik/models.py +400 -0
- morphik/rules.py +47 -0
- morphik/sync.py +1447 -0
- morphik-0.1.0.dist-info/METADATA +47 -0
- morphik-0.1.0.dist-info/RECORD +9 -0
- morphik-0.1.0.dist-info/WHEEL +4 -0
morphik/sync.py
ADDED
@@ -0,0 +1,1447 @@
|
|
1
|
+
import base64
|
2
|
+
from io import BytesIO, IOBase
|
3
|
+
import io
|
4
|
+
from PIL.Image import Image as PILImage
|
5
|
+
from PIL import Image
|
6
|
+
import json
|
7
|
+
import logging
|
8
|
+
from pathlib import Path
|
9
|
+
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
10
|
+
from urllib.parse import urlparse
|
11
|
+
|
12
|
+
import jwt
|
13
|
+
from pydantic import BaseModel, Field
|
14
|
+
import requests
|
15
|
+
|
16
|
+
from .models import (
|
17
|
+
Document,
|
18
|
+
ChunkResult,
|
19
|
+
DocumentResult,
|
20
|
+
CompletionResponse,
|
21
|
+
IngestTextRequest,
|
22
|
+
ChunkSource,
|
23
|
+
Graph,
|
24
|
+
# Prompt override models
|
25
|
+
EntityExtractionExample,
|
26
|
+
EntityResolutionExample,
|
27
|
+
EntityExtractionPromptOverride,
|
28
|
+
EntityResolutionPromptOverride,
|
29
|
+
QueryPromptOverride,
|
30
|
+
GraphPromptOverrides,
|
31
|
+
QueryPromptOverrides
|
32
|
+
)
|
33
|
+
from .rules import Rule
|
34
|
+
|
35
|
+
logger = logging.getLogger(__name__)
|
36
|
+
|
37
|
+
# Type alias for rules
|
38
|
+
RuleOrDict = Union[Rule, Dict[str, Any]]
|
39
|
+
|
40
|
+
|
41
|
+
class Cache:
    """Client-side handle for a named server-side cache owned by a Morphik client."""

    def __init__(self, db: "Morphik", name: str):
        self._db = db
        self._name = name

    def update(self) -> bool:
        """Refresh the cache on the server; returns True on success."""
        result = self._db._request("POST", f"cache/{self._name}/update")
        return result.get("success", False)

    def add_docs(self, docs: List[str]) -> bool:
        """Register additional document IDs with the cache; returns True on success."""
        result = self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
        return result.get("success", False)

    def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """Run a completion query against the cache and return the parsed response."""
        result = self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params={"query": query, "max_tokens": max_tokens, "temperature": temperature},
            data="",
        )
        return CompletionResponse(**result)
|
64
|
+
|
65
|
+
|
66
|
+
class FinalChunkResult(BaseModel):
    """A retrieved chunk as returned to callers of `Morphik.retrieve_chunks`.

    Unlike the raw API `ChunkResult`, `content` may be a decoded PIL image
    when the chunk's metadata marks it as an image.
    """

    content: str | PILImage = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score")
    document_id: str = Field(..., description="Parent document ID")
    chunk_number: int = Field(..., description="Chunk sequence number")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
    content_type: str = Field(..., description="Content type")
    filename: Optional[str] = Field(None, description="Original filename")
    download_url: Optional[str] = Field(None, description="URL to download full document")

    class Config:
        # Required so pydantic accepts non-pydantic types (PIL images) in `content`.
        arbitrary_types_allowed = True
|
78
|
+
|
79
|
+
|
80
|
+
class Morphik:
|
81
|
+
"""
|
82
|
+
Morphik client for document operations.
|
83
|
+
|
84
|
+
Args:
|
85
|
+
uri (str, optional): Morphik URI in format "morphik://<owner_id>:<token>@<host>".
|
86
|
+
If not provided, connects to http://localhost:8000 without authentication.
|
87
|
+
timeout (int, optional): Request timeout in seconds. Defaults to 30.
|
88
|
+
is_local (bool, optional): Whether connecting to local development server. Defaults to False.
|
89
|
+
|
90
|
+
Examples:
|
91
|
+
```python
|
92
|
+
# Without authentication
|
93
|
+
db = Morphik()
|
94
|
+
|
95
|
+
# With authentication
|
96
|
+
db = Morphik("morphik://owner_id:token@api.morphik.ai")
|
97
|
+
```
|
98
|
+
"""
|
99
|
+
|
100
|
+
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
    """Create a client, optionally authenticating from a morphik:// URI."""
    self._timeout = timeout
    self._session = requests.Session()
    self._is_local = is_local
    if is_local:
        # Local dev servers typically run without valid TLS certificates.
        self._session.verify = False  # Disable SSL for localhost
    if uri:
        self._setup_auth(uri)
    else:
        # No URI given: talk to a local, unauthenticated server.
        self._base_url = "http://localhost:8000"
        self._auth_token = None
|
112
|
+
|
113
|
+
def _setup_auth(self, uri: str) -> None:
    """Setup authentication from URI.

    Parses "morphik://<owner_id>:<token>@<host>", stores the bearer token,
    and derives the base URL (http for local servers, https otherwise).

    Raises:
        ValueError: If the URI has no network location or no credentials part.
        jwt.DecodeError: If the token is not a well-formed JWT.
    """
    parsed = urlparse(uri)
    if not parsed.netloc:
        raise ValueError("Invalid URI format")

    # Split host and auth parts. An explicit check gives a clearer error
    # than the bare unpacking ValueError, and rsplit/maxsplit guard
    # against '@' or ':' characters appearing inside the token itself.
    if "@" not in parsed.netloc:
        raise ValueError("Invalid URI format: expected <owner_id>:<token>@<host>")
    auth, host = parsed.netloc.rsplit("@", 1)
    _, self._auth_token = auth.split(":", 1)

    # Set base URL
    self._base_url = f"{'http' if self._is_local else 'https'}://{host}"

    # Basic token validation (signature is NOT verified client-side)
    jwt.decode(self._auth_token, options={"verify_signature": False})
|
128
|
+
|
129
|
+
def _request(
    self,
    method: str,
    endpoint: str,
    data: Optional[Dict[str, Any]] = None,
    files: Optional[Dict[str, Any]] = None,
    params: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Issue an HTTP request to the API and return the decoded JSON body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    headers = {}
    if self._auth_token:  # anonymous local mode carries no token
        headers["Authorization"] = f"Bearer {self._auth_token}"

    if files:
        # Multipart upload: requests must set the Content-Type boundary itself.
        body_kwargs = {"files": files, "data": data}
    else:
        # Plain JSON payload for everything else.
        headers["Content-Type"] = "application/json"
        body_kwargs = {"json": data}

    response = self._session.request(
        method,
        f"{self._base_url}/{endpoint.lstrip('/')}",
        headers=headers,
        timeout=self._timeout,
        params=params,
        **body_kwargs,
    )
    response.raise_for_status()
    return response.json()
|
162
|
+
|
163
|
+
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
    """Normalize a Rule instance or plain dict into a dict payload."""
    serializer = getattr(rule, "to_dict", None)
    if serializer is not None:
        return serializer()
    return rule
|
168
|
+
|
169
|
+
def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """Ingest a text document into Morphik.

    Args:
        content: Text content to ingest.
        filename: Optional filename to associate with the document.
        metadata: Optional metadata dictionary.
        rules: Optional ingestion rules (e.g. MetadataExtractionRule,
            NaturalLanguageRule), given as Rule objects or plain dicts.
        use_colpali: Use the ColPali-style embedding model (slower, but
            significantly better retrieval accuracy for text and images).

    Returns:
        Document: Metadata of the ingested document.

    Example:
        ```python
        doc = db.ingest_text(
            "Machine learning is fascinating...",
            metadata={"category": "tech"},
            rules=[MetadataExtractionRule(schema=DocumentInfo)],
        )
        ```
    """
    payload = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=[self._convert_rule(r) for r in (rules or [])],
        use_colpali=use_colpali,
    )
    document = Document(**self._request("POST", "ingest/text", data=payload.model_dump()))
    # Attach the client so the returned Document can make follow-up calls.
    document._client = self
    return document
|
223
|
+
|
224
|
+
def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """Ingest a file document into Morphik.

    Args:
        file: Path string, Path, raw bytes, or a binary file object.
        filename: Name of the file; required for bytes/file-object input,
            defaults to the path's basename for path input.
        metadata: Optional metadata dictionary.
        rules: Optional ingestion rules (Rule objects or dicts).
        use_colpali: Use the ColPali-style embedding model (slower, but
            significantly better retrieval accuracy for images).

    Returns:
        Document: Metadata of the ingested document.

    Raises:
        ValueError: If a path does not exist, or filename is missing for
            bytes/file-object input.
    """
    # Normalize the input into a (filename, binary stream) pair.
    opened_here = isinstance(file, (str, Path))
    if opened_here:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        file_obj = BytesIO(source.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when ingesting bytes")
        file_obj = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when ingesting file object")
        file_obj = file

    try:
        files = {"file": (filename, file_obj)}
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
        }
        response = self._request(
            "POST", f"ingest/file?use_colpali={str(use_colpali).lower()}", data=form_data, files=files
        )
        document = Document(**response)
        document._client = self
        return document
    finally:
        # Only close streams we created ourselves; caller-owned file
        # objects remain open.
        if opened_here:
            file_obj.close()
|
307
|
+
|
308
|
+
def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik.

    Args:
        files: List of files to ingest (path strings, bytes, file objects, or Paths)
        metadata: Optional metadata (single dict for all files or list of dicts)
        rules: Optional list of rules to apply. Can be a flat list (shared
            by all files) or a list of lists (one list per file).
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of successfully ingested documents

    Raises:
        ValueError: If metadata list length doesn't match files length
    """
    file_objects = []
    try:
        # Build multipart entries INSIDE the try block so that any file
        # handles already opened are closed by the finally clause if a
        # later open() raises (the original built them before the try,
        # leaking handles on partial failure).
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        # Convert rules appropriately based on whether it's a flat list or list of lists
        if rules:
            if all(isinstance(r, list) for r in rules):
                # List of lists - per-file rules
                converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
            else:
                # Flat list - shared rules for all files
                converted_rules = [self._convert_rule(r) for r in rules]
        else:
            converted_rules = []

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
        }

        response = self._request("POST", "ingest/files", data=data, files=file_objects)

        if response.get("errors"):
            # Log errors but don't raise exception
            for error in response["errors"]:
                logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

        docs = [Document(**doc) for doc in response["documents"]]
        for doc in docs:
            doc._client = self
        return docs
    finally:
        # Clean up any file handles we opened (bytes entries are not streams)
        for _, (_, file_obj) in file_objects:
            if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                file_obj.close()
|
379
|
+
|
380
|
+
def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """Ingest every file under a directory that matches a glob pattern.

    Args:
        directory: Directory containing the files to ingest.
        recursive: Also descend into subdirectories.
        pattern: Glob pattern used to select files (e.g. "*.pdf").
        metadata: Optional metadata dictionary applied to every file.
        rules: Optional ingestion rules.
        use_colpali: Whether to use ColPali-style embedding.
        parallel: Whether to process files in parallel.

    Returns:
        List[Document]: Metadata for the ingested documents.

    Raises:
        ValueError: If the directory does not exist.
    """
    directory = Path(directory)
    if not directory.is_dir():
        raise ValueError(f"Directory not found: {directory}")

    # Choose the matcher once, then keep only regular files.
    finder = directory.rglob if recursive else directory.glob
    candidates = [entry for entry in finder(pattern) if entry.is_file()]
    if not candidates:
        return []

    # Delegate the actual upload to the batch ingestion endpoint.
    return self.ingest_files(
        files=candidates,
        metadata=metadata,
        rules=rules,
        use_colpali=use_colpali,
        parallel=parallel,
    )
|
432
|
+
|
433
|
+
def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Retrieve relevant chunks.

    Image chunks (metadata flag "is_image") are decoded from base64 /
    data-URI form into PIL images; on decode failure the raw text content
    is returned instead.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model to retrieve
            the chunks (only works for documents ingested with `use_colpali=True`)

    Returns:
        List[FinalChunkResult]

    Example:
        ```python
        chunks = db.retrieve_chunks(
            "What are the key findings?",
            filters={"department": "research"}
        )
        ```
    """
    request = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "use_colpali": use_colpali,
    }

    response = self._request("POST", "retrieve/chunks", request)
    chunks = [ChunkResult(**r) for r in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                # Handle data URI format "data:image/png;base64,..."
                if content.startswith("data:"):
                    # Extract the base64 part after the comma
                    content = content.split(",", 1)[1]
                content = Image.open(io.BytesIO(base64.b64decode(content)))
            except Exception:
                # Fixed: library code must not print() to stdout — log via
                # the module logger instead, and fall back to the ORIGINAL
                # content (the previous code left the stripped base64
                # fragment in `content` on failure).
                logger.exception(
                    "Error processing image chunk %s/%s; falling back to text content",
                    chunk.document_id,
                    chunk.chunk_number,
                )
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
|
507
|
+
|
508
|
+
def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """Retrieve the most relevant documents for a query.

    Args:
        query: Search query text.
        filters: Optional metadata filters.
        k: Number of results (default: 4).
        min_score: Minimum similarity threshold (default: 0.0).
        use_colpali: Use the ColPali-style model (only effective for
            documents ingested with use_colpali=True).

    Returns:
        List[DocumentResult]: Ranked document results.
    """
    payload = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "use_colpali": use_colpali,
    }
    results = self._request("POST", "retrieve/docs", payload)
    return [DocumentResult(**item) for item in results]
|
546
|
+
|
547
|
+
def query(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_colpali: bool = True,
    graph_name: Optional[str] = None,
    hop_depth: int = 1,
    include_paths: bool = False,
    prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
) -> CompletionResponse:
    """Generate a completion using relevant chunks as context.

    Args:
        query: Query text.
        filters: Optional metadata filters.
        k: Number of chunks to use as context (default: 4).
        min_score: Minimum similarity threshold (default: 0.0).
        max_tokens: Maximum tokens in the completion.
        temperature: Model temperature.
        use_colpali: Use the ColPali-style model (only effective for
            documents ingested with use_colpali=True).
        graph_name: Optional knowledge graph for graph-enhanced retrieval.
        hop_depth: Relationship hops to traverse in the graph (1-3).
        include_paths: Include relationship paths in the response metadata
            (under `response.metadata["graph"]["paths"]`).
        prompt_overrides: QueryPromptOverrides instance or equivalent dict
            customizing entity extraction/resolution and query prompts.

    Returns:
        CompletionResponse: The generated completion.

    Example:
        ```python
        response = db.query(
            "How does product X relate to customer segment Y?",
            graph_name="market_graph",
            hop_depth=2,
            include_paths=True,
        )
        print(response.completion)
        ```
    """
    # Pydantic override models are serialized before entering the JSON body.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, QueryPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "use_colpali": use_colpali,
        "graph_name": graph_name,
        "hop_depth": hop_depth,
        "include_paths": include_paths,
        "prompt_overrides": overrides,
    }
    return CompletionResponse(**self._request("POST", "query", payload))
|
646
|
+
|
647
|
+
def list_documents(
    self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
    """List accessible documents with pagination.

    Args:
        skip: Number of documents to skip.
        limit: Maximum number of documents to return.
        filters: Optional metadata filters.

    Returns:
        List[Document]: Accessible documents for this page.

    Example:
        ```python
        next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
        ```
    """
    # Pagination rides in the query string; filters travel in the POST body.
    response = self._request("POST", f"documents?skip={skip}&limit={limit}", data=filters or {})
    documents = [Document(**item) for item in response]
    for document in documents:
        document._client = self
    return documents
|
676
|
+
|
677
|
+
def get_document(self, document_id: str) -> Document:
    """Fetch a single document's metadata by its ID.

    Args:
        document_id: ID of the document.

    Returns:
        Document: Document metadata, bound to this client.
    """
    payload = self._request("GET", f"documents/{document_id}")
    document = Document(**payload)
    document._client = self
    return document
|
697
|
+
|
698
|
+
def get_document_by_filename(self, filename: str) -> Document:
    """
    Get document metadata by filename.
    If multiple documents have the same filename, returns the most recently updated one.

    Args:
        filename: Filename of the document to retrieve

    Returns:
        Document: Document metadata

    Example:
        ```python
        doc = db.get_document_by_filename("report.pdf")
        print(f"Document ID: {doc.external_id}")
        ```
    """
    # Bug fix: the endpoint previously hard-coded "(unknown)" instead of
    # interpolating the requested filename, so the lookup could never
    # return the document asked for.
    response = self._request("GET", f"documents/filename/{filename}")
    doc = Document(**response)
    doc._client = self
    return doc
|
719
|
+
|
720
|
+
def update_document_with_text(
    self,
    document_id: str,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """Add new text content to an existing document.

    Args:
        document_id: ID of the document to update.
        content: The new content to add.
        filename: Optional new filename for the document.
        metadata: Additional metadata to update.
        rules: Optional rules applied to the new content.
        update_strategy: Update strategy (currently only 'add' is supported).
        use_colpali: Whether to use multi-vector embedding.

    Returns:
        Document: Updated document metadata.

    Example:
        ```python
        updated_doc = db.update_document_with_text(
            document_id="doc_123",
            content="Additional content appended to the document.",
        )
        ```
    """
    body = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=[self._convert_rule(r) for r in (rules or [])],
        use_colpali=True if use_colpali is None else use_colpali,
    )

    # The default strategy is implied server-side; only send non-defaults.
    query_params = {}
    if update_strategy != "add":
        query_params["update_strategy"] = update_strategy

    payload = self._request(
        "POST",
        f"documents/{document_id}/update_text",
        data=body.model_dump(),
        params=query_params,
    )
    document = Document(**payload)
    document._client = self
    return document
|
781
|
+
|
782
|
+
def update_document_with_file(
    self,
    document_id: str,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """Update an existing document with content taken from a file.

    Args:
        document_id: ID of the document to update.
        file: Path string, Path, raw bytes, or a binary file object.
        filename: Name of the file; required for bytes/file-object input,
            defaults to the path's basename for path input.
        metadata: Additional metadata to update.
        rules: Optional rules applied to the new content.
        update_strategy: Update strategy (currently only 'add' is supported).
        use_colpali: Whether to use multi-vector embedding.

    Returns:
        Document: Updated document metadata.

    Raises:
        ValueError: If a path does not exist, or filename is missing for
            bytes/file-object input.
    """
    # Normalize the input into a (filename, binary stream) pair.
    opened_here = isinstance(file, (str, Path))
    if opened_here:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        file_obj = BytesIO(source.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when updating with bytes")
        file_obj = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when updating with file object")
        file_obj = file

    try:
        files = {"file": (filename, file_obj)}
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
            "update_strategy": update_strategy,
        }
        if use_colpali is not None:
            form_data["use_colpali"] = str(use_colpali).lower()

        payload = self._request(
            "POST", f"documents/{document_id}/update_file", data=form_data, files=files
        )
        document = Document(**payload)
        document._client = self
        return document
    finally:
        # Only close streams we created ourselves.
        if opened_here:
            file_obj.close()
|
863
|
+
|
864
|
+
def update_document_metadata(
    self,
    document_id: str,
    metadata: Dict[str, Any],
) -> Document:
    """
    Update only the metadata of an existing document.

    Args:
        document_id: ID of the document to update
        metadata: Metadata fields to update

    Returns:
        Document: The updated document metadata

    Example:
        ```python
        updated_doc = db.update_document_metadata(
            document_id="doc_123",
            metadata={"status": "reviewed", "reviewer": "Jane Smith"}
        )
        print(f"Updated metadata: {updated_doc.metadata}")
        ```
    """
    # The server exposes a dedicated endpoint for metadata-only updates.
    payload = self._request(
        "POST", f"documents/{document_id}/update_metadata", data=metadata
    )
    updated = Document(**payload)
    # Attach this client so the returned document can make follow-up calls.
    updated._client = self
    return updated
|
894
|
+
|
895
|
+
def update_document_by_filename_with_text(
|
896
|
+
self,
|
897
|
+
filename: str,
|
898
|
+
content: str,
|
899
|
+
new_filename: Optional[str] = None,
|
900
|
+
metadata: Optional[Dict[str, Any]] = None,
|
901
|
+
rules: Optional[List] = None,
|
902
|
+
update_strategy: str = "add",
|
903
|
+
use_colpali: Optional[bool] = None,
|
904
|
+
) -> Document:
|
905
|
+
"""
|
906
|
+
Update a document identified by filename with new text content using the specified strategy.
|
907
|
+
|
908
|
+
Args:
|
909
|
+
filename: Filename of the document to update
|
910
|
+
content: The new content to add
|
911
|
+
new_filename: Optional new filename for the document
|
912
|
+
metadata: Additional metadata to update (optional)
|
913
|
+
rules: Optional list of rules to apply to the content
|
914
|
+
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
915
|
+
use_colpali: Whether to use multi-vector embedding
|
916
|
+
|
917
|
+
Returns:
|
918
|
+
Document: Updated document metadata
|
919
|
+
|
920
|
+
Example:
|
921
|
+
```python
|
922
|
+
# Add new content to an existing document identified by filename
|
923
|
+
updated_doc = db.update_document_by_filename_with_text(
|
924
|
+
filename="report.pdf",
|
925
|
+
content="This is additional content that will be appended to the document.",
|
926
|
+
new_filename="updated_report.pdf",
|
927
|
+
metadata={"category": "updated"},
|
928
|
+
update_strategy="add"
|
929
|
+
)
|
930
|
+
print(f"Document version: {updated_doc.system_metadata.get('version')}")
|
931
|
+
```
|
932
|
+
"""
|
933
|
+
# First get the document by filename to obtain its ID
|
934
|
+
doc = self.get_document_by_filename(filename)
|
935
|
+
|
936
|
+
# Then use the regular update_document_with_text endpoint with the document ID
|
937
|
+
return self.update_document_with_text(
|
938
|
+
document_id=doc.external_id,
|
939
|
+
content=content,
|
940
|
+
filename=new_filename,
|
941
|
+
metadata=metadata,
|
942
|
+
rules=rules,
|
943
|
+
update_strategy=update_strategy,
|
944
|
+
use_colpali=use_colpali
|
945
|
+
)
|
946
|
+
|
947
|
+
def update_document_by_filename_with_file(
|
948
|
+
self,
|
949
|
+
filename: str,
|
950
|
+
file: Union[str, bytes, BinaryIO, Path],
|
951
|
+
new_filename: Optional[str] = None,
|
952
|
+
metadata: Optional[Dict[str, Any]] = None,
|
953
|
+
rules: Optional[List] = None,
|
954
|
+
update_strategy: str = "add",
|
955
|
+
use_colpali: Optional[bool] = None,
|
956
|
+
) -> Document:
|
957
|
+
"""
|
958
|
+
Update a document identified by filename with content from a file using the specified strategy.
|
959
|
+
|
960
|
+
Args:
|
961
|
+
filename: Filename of the document to update
|
962
|
+
file: File to add (path string, bytes, file object, or Path)
|
963
|
+
new_filename: Optional new filename for the document (defaults to the filename of the file)
|
964
|
+
metadata: Additional metadata to update (optional)
|
965
|
+
rules: Optional list of rules to apply to the content
|
966
|
+
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
967
|
+
use_colpali: Whether to use multi-vector embedding
|
968
|
+
|
969
|
+
Returns:
|
970
|
+
Document: Updated document metadata
|
971
|
+
|
972
|
+
Example:
|
973
|
+
```python
|
974
|
+
# Add content from a file to an existing document identified by filename
|
975
|
+
updated_doc = db.update_document_by_filename_with_file(
|
976
|
+
filename="report.pdf",
|
977
|
+
file="path/to/update.pdf",
|
978
|
+
metadata={"status": "updated"},
|
979
|
+
update_strategy="add"
|
980
|
+
)
|
981
|
+
print(f"Document version: {updated_doc.system_metadata.get('version')}")
|
982
|
+
```
|
983
|
+
"""
|
984
|
+
# First get the document by filename to obtain its ID
|
985
|
+
doc = self.get_document_by_filename(filename)
|
986
|
+
|
987
|
+
# Then use the regular update_document_with_file endpoint with the document ID
|
988
|
+
return self.update_document_with_file(
|
989
|
+
document_id=doc.external_id,
|
990
|
+
file=file,
|
991
|
+
filename=new_filename,
|
992
|
+
metadata=metadata,
|
993
|
+
rules=rules,
|
994
|
+
update_strategy=update_strategy,
|
995
|
+
use_colpali=use_colpali
|
996
|
+
)
|
997
|
+
|
998
|
+
def update_document_by_filename_metadata(
|
999
|
+
self,
|
1000
|
+
filename: str,
|
1001
|
+
metadata: Dict[str, Any],
|
1002
|
+
new_filename: Optional[str] = None,
|
1003
|
+
) -> Document:
|
1004
|
+
"""
|
1005
|
+
Update a document's metadata using filename to identify the document.
|
1006
|
+
|
1007
|
+
Args:
|
1008
|
+
filename: Filename of the document to update
|
1009
|
+
metadata: Metadata to update
|
1010
|
+
new_filename: Optional new filename to assign to the document
|
1011
|
+
|
1012
|
+
Returns:
|
1013
|
+
Document: Updated document metadata
|
1014
|
+
|
1015
|
+
Example:
|
1016
|
+
```python
|
1017
|
+
# Update just the metadata of a document identified by filename
|
1018
|
+
updated_doc = db.update_document_by_filename_metadata(
|
1019
|
+
filename="report.pdf",
|
1020
|
+
metadata={"status": "reviewed", "reviewer": "Jane Smith"},
|
1021
|
+
new_filename="reviewed_report.pdf" # Optional: rename the file
|
1022
|
+
)
|
1023
|
+
print(f"Updated metadata: {updated_doc.metadata}")
|
1024
|
+
```
|
1025
|
+
"""
|
1026
|
+
# First get the document by filename to obtain its ID
|
1027
|
+
doc = self.get_document_by_filename(filename)
|
1028
|
+
|
1029
|
+
# Update the metadata
|
1030
|
+
result = self.update_document_metadata(
|
1031
|
+
document_id=doc.external_id,
|
1032
|
+
metadata=metadata,
|
1033
|
+
)
|
1034
|
+
|
1035
|
+
# If new_filename is provided, update the filename as well
|
1036
|
+
if new_filename:
|
1037
|
+
# Create a request that retains the just-updated metadata but also changes filename
|
1038
|
+
combined_metadata = result.metadata.copy()
|
1039
|
+
|
1040
|
+
# Update the document again with filename change and the same metadata
|
1041
|
+
response = self._request(
|
1042
|
+
"POST",
|
1043
|
+
f"documents/{doc.external_id}/update_text",
|
1044
|
+
data={
|
1045
|
+
"content": "",
|
1046
|
+
"filename": new_filename,
|
1047
|
+
"metadata": combined_metadata,
|
1048
|
+
"rules": []
|
1049
|
+
}
|
1050
|
+
)
|
1051
|
+
result = Document(**response)
|
1052
|
+
result._client = self
|
1053
|
+
|
1054
|
+
return result
|
1055
|
+
|
1056
|
+
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
|
1057
|
+
"""
|
1058
|
+
Retrieve multiple documents by their IDs in a single batch operation.
|
1059
|
+
|
1060
|
+
Args:
|
1061
|
+
document_ids: List of document IDs to retrieve
|
1062
|
+
|
1063
|
+
Returns:
|
1064
|
+
List[Document]: List of document metadata for found documents
|
1065
|
+
|
1066
|
+
Example:
|
1067
|
+
```python
|
1068
|
+
docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
|
1069
|
+
for doc in docs:
|
1070
|
+
print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
|
1071
|
+
```
|
1072
|
+
"""
|
1073
|
+
response = self._request("POST", "batch/documents", data=document_ids)
|
1074
|
+
docs = [Document(**doc) for doc in response]
|
1075
|
+
for doc in docs:
|
1076
|
+
doc._client = self
|
1077
|
+
return docs
|
1078
|
+
|
1079
|
+
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
|
1080
|
+
"""
|
1081
|
+
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
1082
|
+
|
1083
|
+
Args:
|
1084
|
+
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
1085
|
+
|
1086
|
+
Returns:
|
1087
|
+
List[FinalChunkResult]: List of chunk results
|
1088
|
+
|
1089
|
+
Example:
|
1090
|
+
```python
|
1091
|
+
# Using dictionaries
|
1092
|
+
sources = [
|
1093
|
+
{"document_id": "doc_123", "chunk_number": 0},
|
1094
|
+
{"document_id": "doc_456", "chunk_number": 2}
|
1095
|
+
]
|
1096
|
+
|
1097
|
+
# Or using ChunkSource objects
|
1098
|
+
from morphik.models import ChunkSource
|
1099
|
+
sources = [
|
1100
|
+
ChunkSource(document_id="doc_123", chunk_number=0),
|
1101
|
+
ChunkSource(document_id="doc_456", chunk_number=2)
|
1102
|
+
]
|
1103
|
+
|
1104
|
+
chunks = db.batch_get_chunks(sources)
|
1105
|
+
for chunk in chunks:
|
1106
|
+
print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
|
1107
|
+
```
|
1108
|
+
"""
|
1109
|
+
# Convert to list of dictionaries if needed
|
1110
|
+
source_dicts = []
|
1111
|
+
for source in sources:
|
1112
|
+
if isinstance(source, dict):
|
1113
|
+
source_dicts.append(source)
|
1114
|
+
else:
|
1115
|
+
source_dicts.append(source.model_dump())
|
1116
|
+
|
1117
|
+
response = self._request("POST", "batch/chunks", data=source_dicts)
|
1118
|
+
chunks = [ChunkResult(**r) for r in response]
|
1119
|
+
|
1120
|
+
final_chunks = []
|
1121
|
+
for chunk in chunks:
|
1122
|
+
if chunk.metadata.get("is_image"):
|
1123
|
+
try:
|
1124
|
+
# Handle data URI format "data:image/png;base64,..."
|
1125
|
+
content = chunk.content
|
1126
|
+
if content.startswith("data:"):
|
1127
|
+
# Extract the base64 part after the comma
|
1128
|
+
content = content.split(",", 1)[1]
|
1129
|
+
|
1130
|
+
# Now decode the base64 string
|
1131
|
+
image_bytes = base64.b64decode(content)
|
1132
|
+
content = Image.open(io.BytesIO(image_bytes))
|
1133
|
+
except Exception as e:
|
1134
|
+
print(f"Error processing image: {str(e)}")
|
1135
|
+
# Fall back to using the content as text
|
1136
|
+
content = chunk.content
|
1137
|
+
else:
|
1138
|
+
content = chunk.content
|
1139
|
+
|
1140
|
+
final_chunks.append(
|
1141
|
+
FinalChunkResult(
|
1142
|
+
content=content,
|
1143
|
+
score=chunk.score,
|
1144
|
+
document_id=chunk.document_id,
|
1145
|
+
chunk_number=chunk.chunk_number,
|
1146
|
+
metadata=chunk.metadata,
|
1147
|
+
content_type=chunk.content_type,
|
1148
|
+
filename=chunk.filename,
|
1149
|
+
download_url=chunk.download_url,
|
1150
|
+
)
|
1151
|
+
)
|
1152
|
+
|
1153
|
+
return final_chunks
|
1154
|
+
|
1155
|
+
def create_cache(
    self,
    name: str,
    model: str,
    gguf_file: str,
    filters: Optional[Dict[str, Any]] = None,
    docs: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Create a new cache with the given configuration.

    Args:
        name: Name of the cache to create
        model: Name of the model to use (e.g. "llama2")
        gguf_file: Name of the GGUF file to use for the model
        filters: Optional metadata filters to determine which documents to include.
            These filters will be applied in addition to any specific docs provided.
        docs: Optional list of specific document IDs to include. These docs will
            be included in addition to any documents matching the filters.

    Returns:
        Dict[str, Any]: Created cache configuration

    Example:
        ```python
        # This will include both:
        # 1. Any documents with category="programming"
        # 2. The specific documents "doc1" and "doc2" (regardless of their category)
        cache = db.create_cache(
            name="programming_cache",
            model="llama2",
            gguf_file="llama-2-7b-chat.Q4_K_M.gguf",
            filters={"category": "programming"},
            docs=["doc1", "doc2"]
        )
        ```
    """
    # Identity of the cache travels as query parameters...
    query_params = {"name": name, "model": model, "gguf_file": gguf_file}
    # ...while document selection travels in the JSON body.
    body = {"filters": filters, "docs": docs}
    return self._request("POST", "cache/create", body, params=query_params)
|
1198
|
+
|
1199
|
+
def get_cache(self, name: str) -> Cache:
|
1200
|
+
"""
|
1201
|
+
Get a cache by name.
|
1202
|
+
|
1203
|
+
Args:
|
1204
|
+
name: Name of the cache to retrieve
|
1205
|
+
|
1206
|
+
Returns:
|
1207
|
+
cache: A cache object that is used to interact with the cache.
|
1208
|
+
|
1209
|
+
Example:
|
1210
|
+
```python
|
1211
|
+
cache = db.get_cache("programming_cache")
|
1212
|
+
```
|
1213
|
+
"""
|
1214
|
+
response = self._request("GET", f"cache/{name}")
|
1215
|
+
if response.get("exists", False):
|
1216
|
+
return Cache(self, name)
|
1217
|
+
raise ValueError(f"Cache '{name}' not found")
|
1218
|
+
|
1219
|
+
def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Create a knowledge graph from documents.

    Entities and relationships are extracted from the documents matching the
    given filters and/or explicit document IDs, and assembled into a graph.

    Args:
        name: Name of the graph to create
        filters: Optional metadata filters to determine which documents to include
        documents: Optional list of specific document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: The created graph object

    Example:
        ```python
        # Create a graph from documents with category="research"
        graph = db.create_graph(
            name="research_graph",
            filters={"category": "research"}
        )

        # Create a graph from specific documents
        graph = db.create_graph(
            name="custom_graph",
            documents=["doc1", "doc2", "doc3"]
        )

        # With custom entity extraction examples
        from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
        graph = db.create_graph(
            name="medical_graph",
            filters={"category": "medical"},
            prompt_overrides=GraphPromptOverrides(
                entity_extraction=EntityExtractionPromptOverride(
                    examples=[
                        EntityExtractionExample(label="Insulin", type="MEDICATION"),
                        EntityExtractionExample(label="Diabetes", type="CONDITION")
                    ]
                )
            )
        )
        ```
    """
    # Serialize a pydantic override model to a plain dict for the JSON body.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = {
        "name": name,
        "filters": filters,
        "documents": documents,
        "prompt_overrides": overrides,
    }
    return Graph(**self._request("POST", "graph/create", payload))
|
1285
|
+
|
1286
|
+
def get_graph(self, name: str) -> Graph:
    """
    Retrieve a graph by name.

    Args:
        name: Name of the graph to retrieve

    Returns:
        Graph: The requested graph object

    Example:
        ```python
        # Get a graph by name
        graph = db.get_graph("finance_graph")
        print(f"Graph has {len(graph.entities)} entities and {len(graph.relationships)} relationships")
        ```
    """
    return Graph(**self._request("GET", f"graph/{name}"))
|
1305
|
+
|
1306
|
+
def list_graphs(self) -> List[Graph]:
|
1307
|
+
"""
|
1308
|
+
List all graphs the user has access to.
|
1309
|
+
|
1310
|
+
Returns:
|
1311
|
+
List[Graph]: List of graph objects
|
1312
|
+
|
1313
|
+
Example:
|
1314
|
+
```python
|
1315
|
+
# List all accessible graphs
|
1316
|
+
graphs = db.list_graphs()
|
1317
|
+
for graph in graphs:
|
1318
|
+
print(f"Graph: {graph.name}, Entities: {len(graph.entities)}")
|
1319
|
+
```
|
1320
|
+
"""
|
1321
|
+
response = self._request("GET", "graphs")
|
1322
|
+
return [Graph(**graph) for graph in response]
|
1323
|
+
|
1324
|
+
def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Extend an existing graph with additional documents.

    Documents matching the extra filters and/or explicit IDs are processed for
    entities and relationships, and the named graph is updated with the results.

    Args:
        name: Name of the graph to update
        additional_filters: Optional additional metadata filters to determine which new documents to include
        additional_documents: Optional list of additional document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: The updated graph

    Example:
        ```python
        # Update a graph with new documents
        updated_graph = db.update_graph(
            name="research_graph",
            additional_filters={"category": "new_research"},
            additional_documents=["doc4", "doc5"]
        )
        print(f"Graph now has {len(updated_graph.entities)} entities")

        # With entity resolution examples
        from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
        updated_graph = db.update_graph(
            name="research_graph",
            additional_documents=["doc4"],
            prompt_overrides=GraphPromptOverrides(
                entity_resolution=EntityResolutionPromptOverride(
                    examples=[
                        EntityResolutionExample(
                            canonical="Machine Learning",
                            variants=["ML", "machine learning", "AI/ML"]
                        )
                    ]
                )
            )
        )
        ```
    """
    # Serialize a pydantic override model to a plain dict for the JSON body.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = {
        "additional_filters": additional_filters,
        "additional_documents": additional_documents,
        "prompt_overrides": overrides,
    }
    return Graph(**self._request("POST", f"graph/{name}/update", payload))
|
1387
|
+
|
1388
|
+
def delete_document(self, document_id: str) -> Dict[str, str]:
    """
    Delete a document and everything associated with it.

    The deletion covers:
    - Document metadata
    - Document content in storage
    - Document chunks and embeddings in vector store

    Args:
        document_id: ID of the document to delete

    Returns:
        Dict[str, str]: Deletion status

    Example:
        ```python
        # Delete a document
        result = db.delete_document("doc_123")
        print(result["message"])  # Document doc_123 deleted successfully
        ```
    """
    return self._request("DELETE", f"documents/{document_id}")
|
1412
|
+
|
1413
|
+
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Delete a document identified by filename.

    Convenience wrapper: resolves the filename to a document ID, then deletes
    by ID.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status

    Example:
        ```python
        # Delete a document by filename
        result = db.delete_document_by_filename("report.pdf")
        print(result["message"])
        ```
    """
    target = self.get_document_by_filename(filename)
    return self.delete_document(target.external_id)
|
1438
|
+
|
1439
|
+
def close(self):
    """Release network resources by closing the underlying HTTP session."""
    session = self._session
    session.close()
|
1442
|
+
|
1443
|
+
def __enter__(self):
    """Enter a ``with`` block, yielding this client instance."""
    return self
|
1445
|
+
|
1446
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave a ``with`` block, closing the HTTP session unconditionally."""
    self.close()
|