morphik 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/sync.py ADDED
@@ -0,0 +1,1447 @@
1
+ import base64
2
+ from io import BytesIO, IOBase
3
+ import io
4
+ from PIL.Image import Image as PILImage
5
+ from PIL import Image
6
+ import json
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Dict, Any, List, Optional, Union, BinaryIO
10
+ from urllib.parse import urlparse
11
+
12
+ import jwt
13
+ from pydantic import BaseModel, Field
14
+ import requests
15
+
16
+ from .models import (
17
+ Document,
18
+ ChunkResult,
19
+ DocumentResult,
20
+ CompletionResponse,
21
+ IngestTextRequest,
22
+ ChunkSource,
23
+ Graph,
24
+ # Prompt override models
25
+ EntityExtractionExample,
26
+ EntityResolutionExample,
27
+ EntityExtractionPromptOverride,
28
+ EntityResolutionPromptOverride,
29
+ QueryPromptOverride,
30
+ GraphPromptOverrides,
31
+ QueryPromptOverrides
32
+ )
33
+ from .rules import Rule
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # Type alias for rules
38
+ RuleOrDict = Union[Rule, Dict[str, Any]]
39
+
40
+
41
class Cache:
    """Handle to a named server-side cache, bound to a Morphik client.

    Every operation is forwarded through the owning client's ``_request``
    method against the ``cache/<name>/...`` endpoints.
    """

    def __init__(self, db: "Morphik", name: str):
        """Remember the client (``db``) and the cache ``name`` to operate on."""
        self._name = name
        self._db = db

    def _endpoint(self, action: str) -> str:
        """Build the relative endpoint for ``action`` on this cache."""
        return f"cache/{self._name}/{action}"

    def update(self) -> bool:
        """POST to the cache's ``update`` endpoint.

        Returns:
            bool: The ``success`` field of the response, ``False`` if absent.
        """
        reply = self._db._request("POST", self._endpoint("update"))
        return reply.get("success", False)

    def add_docs(self, docs: List[str]) -> bool:
        """POST ``docs`` to the cache's ``add_docs`` endpoint.

        Args:
            docs: Values sent under the ``docs`` key of the request body.

        Returns:
            bool: The ``success`` field of the response, ``False`` if absent.
        """
        body = {"docs": docs}
        reply = self._db._request("POST", self._endpoint("add_docs"), body)
        return reply.get("success", False)

    def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """Run a completion against this cache.

        The query and sampling options travel as URL parameters; the request
        body is an empty string.

        Args:
            query: Query text.
            max_tokens: Optional completion length limit.
            temperature: Optional model temperature.

        Returns:
            CompletionResponse: Parsed from the JSON response.
        """
        query_params = {
            "query": query,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        reply = self._db._request(
            "POST",
            self._endpoint("query"),
            params=query_params,
            data="",
        )
        return CompletionResponse(**reply)
64
+
65
+
66
class FinalChunkResult(BaseModel):
    """A retrieved chunk as handed to SDK users.

    Mirrors the fields copied from ``ChunkResult`` in ``retrieve_chunks``,
    except that ``content`` may be a decoded PIL image instead of text.
    """

    # Union[...] instead of ``str | PILImage``: this annotation is evaluated
    # when the class is created, and the ``|`` operator on types only exists
    # on Python 3.10+. The rest of this module already uses typing.Union.
    content: Union[str, PILImage] = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score")
    document_id: str = Field(..., description="Parent document ID")
    chunk_number: int = Field(..., description="Chunk sequence number")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
    content_type: str = Field(..., description="Content type")
    filename: Optional[str] = Field(None, description="Original filename")
    download_url: Optional[str] = Field(None, description="URL to download full document")

    class Config:
        # PIL images are not a pydantic-native type; allow them as field values.
        arbitrary_types_allowed = True
78
+
79
+
80
+ class Morphik:
81
+ """
82
+ Morphik client for document operations.
83
+
84
+ Args:
85
+ uri (str, optional): Morphik URI in format "morphik://<owner_id>:<token>@<host>".
86
+ If not provided, connects to http://localhost:8000 without authentication.
87
+ timeout (int, optional): Request timeout in seconds. Defaults to 30.
88
+ is_local (bool, optional): Whether connecting to local development server. Defaults to False.
89
+
90
+ Examples:
91
+ ```python
92
+ # Without authentication
93
+ db = Morphik()
94
+
95
+ # With authentication
96
+ db = Morphik("morphik://owner_id:token@api.morphik.ai")
97
+ ```
98
+ """
99
+
100
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
    """Create the HTTP session and configure the target server.

    Args:
        uri: Optional connection URI. When falsy, the client targets
            http://localhost:8000 and sends no auth token.
        timeout: Timeout in seconds passed to every request.
        is_local: When True, TLS verification is turned off on the session.
    """
    session = requests.Session()
    if is_local:
        session.verify = False  # Disable SSL for localhost

    self._timeout = timeout
    self._session = session
    self._is_local = is_local

    if not uri:
        # No URI: unauthenticated local default.
        self._base_url = "http://localhost:8000"
        self._auth_token = None
        return

    self._setup_auth(uri)
112
+
113
+ def _setup_auth(self, uri: str) -> None:
114
+ """Setup authentication from URI"""
115
+ parsed = urlparse(uri)
116
+ if not parsed.netloc:
117
+ raise ValueError("Invalid URI format")
118
+
119
+ # Split host and auth parts
120
+ auth, host = parsed.netloc.split("@")
121
+ _, self._auth_token = auth.split(":")
122
+
123
+ # Set base URL
124
+ self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
125
+
126
+ # Basic token validation
127
+ jwt.decode(self._auth_token, options={"verify_signature": False})
128
+
129
+ def _request(
130
+ self,
131
+ method: str,
132
+ endpoint: str,
133
+ data: Optional[Dict[str, Any]] = None,
134
+ files: Optional[Dict[str, Any]] = None,
135
+ params: Optional[Dict[str, Any]] = None,
136
+ ) -> Dict[str, Any]:
137
+ """Make HTTP request"""
138
+ headers = {}
139
+ if self._auth_token: # Only add auth header if we have a token
140
+ headers["Authorization"] = f"Bearer {self._auth_token}"
141
+
142
+ # Configure request data based on type
143
+ if files:
144
+ # Multipart form data for files
145
+ request_data = {"files": files, "data": data}
146
+ # Don't set Content-Type, let requests handle it
147
+ else:
148
+ # JSON for everything else
149
+ headers["Content-Type"] = "application/json"
150
+ request_data = {"json": data}
151
+
152
+ response = self._session.request(
153
+ method,
154
+ f"{self._base_url}/{endpoint.lstrip('/')}",
155
+ headers=headers,
156
+ timeout=self._timeout,
157
+ params=params,
158
+ **request_data,
159
+ )
160
+ response.raise_for_status()
161
+ return response.json()
162
+
163
+ def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
164
+ """Convert a rule to a dictionary format"""
165
+ if hasattr(rule, "to_dict"):
166
+ return rule.to_dict()
167
+ return rule
168
+
169
def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document into Morphik.

    Args:
        content: Text content to ingest
        filename: Optional filename to associate with the text
        metadata: Optional metadata dictionary (defaults to an empty dict)
        rules: Optional list of rules to apply during ingestion. Each entry is
            either a rule object (e.g. MetadataExtractionRule,
            NaturalLanguageRule) or an already-serialized dict
        use_colpali: Whether to use ColPali-style embedding model to ingest the text

    Returns:
        Document: Metadata of the ingested document, bound to this client

    Example:
        ```python
        from morphik.rules import NaturalLanguageRule

        doc = db.ingest_text(
            "Machine learning is fascinating...",
            metadata={"category": "tech"},
            rules=[NaturalLanguageRule(prompt="Shorten the content, use keywords")],
        )
        ```
    """
    serialized_rules = list(map(self._convert_rule, rules or []))
    payload = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=serialized_rules,
        use_colpali=use_colpali,
    ).model_dump()

    document = Document(**self._request("POST", "ingest/text", data=payload))
    # Attach the client so the returned document can issue follow-up calls.
    document._client = self
    return document
223
+
224
def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a file document into Morphik.

    Args:
        file: File to ingest (path string, bytes, file object, or Path)
        filename: Name of the file. Optional for paths (defaults to the path's
            basename); required for bytes and file objects
        metadata: Optional metadata dictionary
        rules: Optional list of rules (rule objects or dicts) to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model to ingest the file

    Returns:
        Document: Metadata of the ingested document, bound to this client

    Raises:
        ValueError: If a path does not exist, or ``filename`` is missing for
            bytes / file-object input

    Example:
        ```python
        doc = db.ingest_file(
            "document.pdf",
            metadata={"category": "research"},
            use_colpali=True,
        )
        ```
    """
    # Normalize the three accepted input kinds to (filename, binary stream).
    from_path = isinstance(file, (str, Path))
    if from_path:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        stream = BytesIO(source.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when ingesting bytes")
        stream = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when ingesting file object")
        stream = file

    try:
        # Metadata and rules travel as JSON-encoded form fields next to the file part.
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(rule) for rule in (rules or [])]),
        }
        endpoint = f"ingest/file?use_colpali={str(use_colpali).lower()}"
        response = self._request(
            "POST", endpoint, data=form_data, files={"file": (filename, stream)}
        )
        document = Document(**response)
        document._client = self
        return document
    finally:
        # Only the buffer created here for a path input is closed;
        # caller-supplied file objects are left open.
        if from_path:
            stream.close()
307
+
308
def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik.

    Args:
        files: List of files to ingest (path strings, bytes, file objects, or Paths)
        metadata: Optional metadata (single dict for all files or list of dicts,
            one per file)
        rules: Optional rules. Either a flat list shared by all files, or a
            list of lists giving per-file rules
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: Successfully ingested documents, bound to this client.
        Per-file errors reported by the server are logged, not raised.

    Raises:
        ValueError: If metadata list length doesn't match files length
    """
    # Enforce the documented contract before any file is opened.
    if isinstance(metadata, list) and len(metadata) != len(files):
        raise ValueError(
            f"metadata list length ({len(metadata)}) must match number of files ({len(files)})"
        )

    # Handles opened by this call. Only these are closed afterwards, so
    # caller-supplied file objects stay open (as in ingest_file).
    opened = []
    try:
        # Opening happens inside the try so a failure on a later path does
        # not leak the handles already opened.
        file_objects = []
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                handle = open(path, "rb")
                opened.append(handle)
                file_objects.append(("files", (path.name, handle)))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        if not rules:
            converted_rules = []
        elif all(isinstance(r, list) for r in rules):
            # List of lists - per-file rules
            converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
        else:
            # Flat list - shared rules for all files
            converted_rules = [self._convert_rule(r) for r in rules]

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
        }

        response = self._request("POST", "ingest/files", data=data, files=file_objects)

        # Log per-file errors but don't raise: partial success is still returned.
        for error in response.get("errors") or []:
            logger.error("Failed to ingest %s: %s", error["filename"], error["error"])

        docs = [Document(**doc) for doc in response["documents"]]
        for doc in docs:
            doc._client = self
        return docs
    finally:
        for handle in opened:
            handle.close()
379
+
380
def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest all files in a directory into Morphik.

    Args:
        directory: Path to directory containing files to ingest
        recursive: Whether to descend into subdirectories
        pattern: Glob pattern used to select files (e.g. "*.pdf")
        metadata: Optional metadata dictionary to apply to all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: Ingested documents; an empty list when nothing matches

    Raises:
        ValueError: If directory not found
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"Directory not found: {root}")

    # rglob descends into subdirectories, glob stays at the top level.
    find = root.rglob if recursive else root.glob
    # Glob matches may include directories; keep regular files only.
    matches = [entry for entry in find(pattern) if entry.is_file()]
    if not matches:
        return []

    # Delegate the upload to ingest_files with the collected paths.
    return self.ingest_files(
        files=matches,
        metadata=metadata,
        rules=rules,
        use_colpali=use_colpali,
        parallel=parallel,
    )
432
+
433
def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Retrieve relevant chunks.

    Chunks whose metadata has a truthy ``is_image`` flag are decoded from
    base64 (optionally wrapped in a ``data:`` URI) into PIL images. If decoding
    fails, the chunk's original string content is returned unchanged.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
            (only works for documents ingested with `use_colpali=True`)

    Returns:
        List[FinalChunkResult]

    Example:
        ```python
        chunks = db.retrieve_chunks(
            "What are the key findings?",
            filters={"department": "research"}
        )
        ```
    """
    request = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "use_colpali": use_colpali,
    }

    response = self._request("POST", "retrieve/chunks", request)
    chunks = [ChunkResult(**r) for r in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                encoded = chunk.content
                # Handle data URI format "data:image/png;base64,..."
                if encoded.startswith("data:"):
                    encoded = encoded.split(",", 1)[1]
                content = Image.open(io.BytesIO(base64.b64decode(encoded)))
            except Exception as e:
                # Deliberately broad: malformed base64, a data URI without a
                # comma and unreadable image data all take the same
                # best-effort path. Logged rather than printed, and without
                # dumping the (potentially huge) payload.
                logger.warning("Error processing image: %s", e)
                # Fall back to the ORIGINAL content, not the stripped
                # base64 fragment.
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
507
+
508
def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """
    Retrieve relevant documents.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
            (only works for documents ingested with `use_colpali=True`)

    Returns:
        List[DocumentResult]

    Example:
        ```python
        docs = db.retrieve_docs("machine learning", k=5)
        ```
    """
    payload = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
    )
    results = []
    for item in self._request("POST", "retrieve/docs", payload):
        results.append(DocumentResult(**item))
    return results
546
+
547
+ def query(
548
+ self,
549
+ query: str,
550
+ filters: Optional[Dict[str, Any]] = None,
551
+ k: int = 4,
552
+ min_score: float = 0.0,
553
+ max_tokens: Optional[int] = None,
554
+ temperature: Optional[float] = None,
555
+ use_colpali: bool = True,
556
+ graph_name: Optional[str] = None,
557
+ hop_depth: int = 1,
558
+ include_paths: bool = False,
559
+ prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
560
+ ) -> CompletionResponse:
561
+ """
562
+ Generate completion using relevant chunks as context.
563
+
564
+ Args:
565
+ query: Query text
566
+ filters: Optional metadata filters
567
+ k: Number of chunks to use as context (default: 4)
568
+ min_score: Minimum similarity threshold (default: 0.0)
569
+ max_tokens: Maximum tokens in completion
570
+ temperature: Model temperature
571
+ use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`)
572
+ graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
573
+ hop_depth: Number of relationship hops to traverse in the graph (1-3)
574
+ include_paths: Whether to include relationship paths in the response
575
+ prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
576
+ Either a QueryPromptOverrides object or a dictionary with the same structure
577
+ Returns:
578
+ CompletionResponse
579
+
580
+ Example:
581
+ ```python
582
+ # Standard query
583
+ response = db.query(
584
+ "What are the key findings about customer satisfaction?",
585
+ filters={"department": "research"},
586
+ temperature=0.7
587
+ )
588
+
589
+ # Knowledge graph enhanced query
590
+ response = db.query(
591
+ "How does product X relate to customer segment Y?",
592
+ graph_name="market_graph",
593
+ hop_depth=2,
594
+ include_paths=True
595
+ )
596
+
597
+ # With prompt customization
598
+ from morphik.models import QueryPromptOverride, QueryPromptOverrides
599
+ response = db.query(
600
+ "What are the key findings?",
601
+ prompt_overrides=QueryPromptOverrides(
602
+ query=QueryPromptOverride(
603
+ prompt_template="Answer the question in a formal, academic tone: {question}"
604
+ )
605
+ )
606
+ )
607
+
608
+ # Or using a dictionary
609
+ response = db.query(
610
+ "What are the key findings?",
611
+ prompt_overrides={
612
+ "query": {
613
+ "prompt_template": "Answer the question in a formal, academic tone: {question}"
614
+ }
615
+ }
616
+ )
617
+
618
+ print(response.completion)
619
+
620
+ # If include_paths=True, you can inspect the graph paths
621
+ if response.metadata and "graph" in response.metadata:
622
+ for path in response.metadata["graph"]["paths"]:
623
+ print(" -> ".join(path))
624
+ ```
625
+ """
626
+ # Convert prompt_overrides to dict if it's a model
627
+ if prompt_overrides and isinstance(prompt_overrides, QueryPromptOverrides):
628
+ prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
629
+
630
+ request = {
631
+ "query": query,
632
+ "filters": filters,
633
+ "k": k,
634
+ "min_score": min_score,
635
+ "max_tokens": max_tokens,
636
+ "temperature": temperature,
637
+ "use_colpali": use_colpali,
638
+ "graph_name": graph_name,
639
+ "hop_depth": hop_depth,
640
+ "include_paths": include_paths,
641
+ "prompt_overrides": prompt_overrides,
642
+ }
643
+
644
+ response = self._request("POST", "query", request)
645
+ return CompletionResponse(**response)
646
+
647
def list_documents(
    self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
    """
    List accessible documents.

    Pagination is sent in the query string; ``filters`` is sent as the POST
    body (an empty object when omitted).

    Args:
        skip: Number of documents to skip
        limit: Maximum number of documents to return
        filters: Optional filters

    Returns:
        List[Document]: Documents bound to this client

    Example:
        ```python
        first_page = db.list_documents(limit=10)
        next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
        ```
    """
    endpoint = f"documents?skip={skip}&limit={limit}"
    response = self._request("POST", endpoint, data=filters or {})

    documents = []
    for item in response:
        document = Document(**item)
        document._client = self
        documents.append(document)
    return documents
676
+
677
+ def get_document(self, document_id: str) -> Document:
678
+ """
679
+ Get document metadata by ID.
680
+
681
+ Args:
682
+ document_id: ID of the document
683
+
684
+ Returns:
685
+ Document: Document metadata
686
+
687
+ Example:
688
+ ```python
689
+ doc = db.get_document("doc_123")
690
+ print(f"Title: {doc.metadata.get('title')}")
691
+ ```
692
+ """
693
+ response = self._request("GET", f"documents/{document_id}")
694
+ doc = Document(**response)
695
+ doc._client = self
696
+ return doc
697
+
698
def get_document_by_filename(self, filename: str) -> Document:
    """
    Get document metadata by filename.
    If multiple documents have the same filename, returns the most recently updated one.

    Args:
        filename: Filename of the document to retrieve

    Returns:
        Document: Document metadata, bound to this client

    Example:
        ```python
        doc = db.get_document_by_filename("report.pdf")
        print(f"Document ID: {doc.external_id}")
        ```
    """
    # Local import: this method is the only user of quote().
    from urllib.parse import quote

    # The filename becomes a URL path segment, so it must be percent-encoded:
    # an unescaped "#" or "?" would cut the path short, and "/" would add
    # extra path segments.
    encoded_name = quote(filename, safe="")
    response = self._request("GET", f"documents/filename/{encoded_name}")
    doc = Document(**response)
    doc._client = self
    return doc
719
+
720
def update_document_with_text(
    self,
    document_id: str,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document with new text content using the specified strategy.

    Args:
        document_id: ID of the document to update
        content: The new content to add
        filename: Optional new filename for the document
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported).
            Only sent to the server when it differs from "add"
        use_colpali: Whether to use multi-vector embedding (treated as True when None)

    Returns:
        Document: Updated document metadata, bound to this client

    Example:
        ```python
        updated_doc = db.update_document_with_text(
            document_id="doc_123",
            content="This is additional content that will be appended to the document.",
            metadata={"category": "updated"},
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    body = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=list(map(self._convert_rule, rules or [])),
        use_colpali=True if use_colpali is None else use_colpali,
    ).model_dump()

    # The default strategy is omitted from the query string.
    params = {} if update_strategy == "add" else {"update_strategy": update_strategy}

    response = self._request(
        "POST",
        f"documents/{document_id}/update_text",
        data=body,
        params=params,
    )
    document = Document(**response)
    document._client = self
    return document
781
+
782
def update_document_with_file(
    self,
    document_id: str,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document with content from a file using the specified strategy.

    Args:
        document_id: ID of the document to update
        file: File to add (path string, bytes, file object, or Path)
        filename: Name of the file. Optional for paths (defaults to the path's
            basename); required for bytes and file objects
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported)
        use_colpali: Whether to use multi-vector embedding (omitted from the
            request when None)

    Returns:
        Document: Updated document metadata, bound to this client

    Raises:
        ValueError: If a path does not exist, or ``filename`` is missing for
            bytes / file-object input

    Example:
        ```python
        updated_doc = db.update_document_with_file(
            document_id="doc_123",
            file="path/to/update.pdf",
            metadata={"status": "updated"},
        )
        ```
    """
    # Normalize the three accepted input kinds to (filename, binary stream).
    from_path = isinstance(file, (str, Path))
    if from_path:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        stream = BytesIO(source.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when updating with bytes")
        stream = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when updating with file object")
        stream = file

    try:
        # Metadata and rules are sent as JSON strings inside the multipart form.
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(rule) for rule in (rules or [])]),
            "update_strategy": update_strategy,
        }
        if use_colpali is not None:
            form_data["use_colpali"] = str(use_colpali).lower()

        response = self._request(
            "POST",
            f"documents/{document_id}/update_file",
            data=form_data,
            files={"file": (filename, stream)},
        )
        document = Document(**response)
        document._client = self
        return document
    finally:
        # Only the buffer created here for a path input is closed.
        if from_path:
            stream.close()
863
+
864
+ def update_document_metadata(
865
+ self,
866
+ document_id: str,
867
+ metadata: Dict[str, Any],
868
+ ) -> Document:
869
+ """
870
+ Update a document's metadata only.
871
+
872
+ Args:
873
+ document_id: ID of the document to update
874
+ metadata: Metadata to update
875
+
876
+ Returns:
877
+ Document: Updated document metadata
878
+
879
+ Example:
880
+ ```python
881
+ # Update just the metadata of a document
882
+ updated_doc = db.update_document_metadata(
883
+ document_id="doc_123",
884
+ metadata={"status": "reviewed", "reviewer": "Jane Smith"}
885
+ )
886
+ print(f"Updated metadata: {updated_doc.metadata}")
887
+ ```
888
+ """
889
+ # Use the dedicated metadata update endpoint
890
+ response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
891
+ doc = Document(**response)
892
+ doc._client = self
893
+ return doc
894
+
895
def update_document_by_filename_with_text(
    self,
    filename: str,
    content: str,
    new_filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document identified by filename with new text content using the specified strategy.

    The document is first looked up by ``filename``; the update is then issued
    against its ``external_id`` via ``update_document_with_text``.

    Args:
        filename: Filename of the document to update
        content: The new content to add
        new_filename: Optional new filename for the document
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported)
        use_colpali: Whether to use multi-vector embedding

    Returns:
        Document: Updated document metadata

    Example:
        ```python
        updated_doc = db.update_document_by_filename_with_text(
            filename="report.pdf",
            content="This is additional content that will be appended to the document.",
            new_filename="updated_report.pdf",
        )
        ```
    """
    # Resolve filename -> document ID, then reuse the ID-based update.
    target_id = self.get_document_by_filename(filename).external_id
    update_kwargs = dict(
        document_id=target_id,
        content=content,
        filename=new_filename,
        metadata=metadata,
        rules=rules,
        update_strategy=update_strategy,
        use_colpali=use_colpali,
    )
    return self.update_document_with_text(**update_kwargs)
946
+
947
def update_document_by_filename_with_file(
    self,
    filename: str,
    file: Union[str, bytes, BinaryIO, Path],
    new_filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document identified by filename with content from a file using the specified strategy.

    The document is first looked up by ``filename``; the update is then issued
    against its ``external_id`` via ``update_document_with_file``.

    Args:
        filename: Filename of the document to update
        file: File to add (path string, bytes, file object, or Path)
        new_filename: Optional new filename for the document, forwarded as the
            ``filename`` argument of ``update_document_with_file``
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported)
        use_colpali: Whether to use multi-vector embedding

    Returns:
        Document: Updated document metadata

    Example:
        ```python
        updated_doc = db.update_document_by_filename_with_file(
            filename="report.pdf",
            file="path/to/update.pdf",
            metadata={"status": "updated"},
        )
        ```
    """
    # Resolve filename -> document ID, then reuse the ID-based update.
    target_id = self.get_document_by_filename(filename).external_id
    update_kwargs = dict(
        document_id=target_id,
        file=file,
        filename=new_filename,
        metadata=metadata,
        rules=rules,
        update_strategy=update_strategy,
        use_colpali=use_colpali,
    )
    return self.update_document_with_file(**update_kwargs)
997
+
998
def update_document_by_filename_metadata(
    self,
    filename: str,
    metadata: Dict[str, Any],
    new_filename: Optional[str] = None,
) -> Document:
    """
    Update the metadata of a document that is identified by its filename.

    The document is resolved with `get_document_by_filename` and its metadata is
    updated through `update_document_metadata`. When `new_filename` is truthy, a
    second request is sent to the `update_text` endpoint with empty content to
    rename the document while re-sending the freshly updated metadata.

    Args:
        filename: Filename of the document to update
        metadata: Metadata to update
        new_filename: Optional new filename to assign to the document

    Returns:
        Document: Updated document metadata

    Example:
        ```python
        # Update just the metadata of a document identified by filename
        updated_doc = db.update_document_by_filename_metadata(
            filename="report.pdf",
            metadata={"status": "reviewed", "reviewer": "Jane Smith"},
            new_filename="reviewed_report.pdf"  # Optional: rename the file
        )
        print(f"Updated metadata: {updated_doc.metadata}")
        ```
    """
    target = self.get_document_by_filename(filename)
    updated = self.update_document_metadata(
        document_id=target.external_id,
        metadata=metadata,
    )

    # No rename requested: the metadata update result is the final answer.
    if not new_filename:
        return updated

    # Rename through the text-update endpoint; a copy of the just-updated
    # metadata is sent along so the request carries the same metadata.
    rename_payload = {
        "content": "",
        "filename": new_filename,
        "metadata": updated.metadata.copy(),
        "rules": [],
    }
    raw = self._request(
        "POST",
        f"documents/{target.external_id}/update_text",
        data=rename_payload,
    )
    renamed = Document(**raw)
    renamed._client = self
    return renamed
1055
+
1056
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
    """
    Fetch several documents by ID with one request.

    Args:
        document_ids: List of document IDs to retrieve

    Returns:
        List[Document]: One `Document` per entry in the server response, each
            with its `_client` set to this client

    Example:
        ```python
        docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
        for doc in docs:
            print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
        ```
    """
    payload = self._request("POST", "batch/documents", data=document_ids)

    documents = []
    for raw in payload:
        document = Document(**raw)
        # Attach this client to the returned document.
        document._client = self
        documents.append(document)
    return documents
1078
+
1079
def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
    """
    Retrieve specific chunks by their document ID and chunk number in a single batch operation.

    Chunks whose metadata has a truthy ``is_image`` entry are decoded from base64
    (optionally wrapped in a ``data:...;base64,`` URI) into a PIL image. If
    decoding fails, a warning is logged and the chunk's raw content is returned
    instead.

    Args:
        sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

    Returns:
        List[FinalChunkResult]: List of chunk results

    Example:
        ```python
        # Using dictionaries
        sources = [
            {"document_id": "doc_123", "chunk_number": 0},
            {"document_id": "doc_456", "chunk_number": 2}
        ]

        # Or using ChunkSource objects
        from morphik.models import ChunkSource
        sources = [
            ChunkSource(document_id="doc_123", chunk_number=0),
            ChunkSource(document_id="doc_456", chunk_number=2)
        ]

        chunks = db.batch_get_chunks(sources)
        for chunk in chunks:
            print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
        ```
    """
    # Dictionaries are sent as-is; model objects are serialized with model_dump().
    source_dicts = [
        source if isinstance(source, dict) else source.model_dump()
        for source in sources
    ]

    response = self._request("POST", "batch/chunks", data=source_dicts)
    chunks = [ChunkResult(**r) for r in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                encoded = chunk.content
                # Handle data URI format "data:image/png;base64,..." by keeping
                # only the base64 part after the first comma.
                if encoded.startswith("data:"):
                    encoded = encoded.split(",", 1)[1]

                image_bytes = base64.b64decode(encoded)
                content = Image.open(io.BytesIO(image_bytes))
            except Exception as e:
                # Best-effort decode: report through the module logger instead
                # of writing to stdout, then fall back to the raw content.
                logger.warning("Error processing image: %s", e)
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
1154
+
1155
def create_cache(
    self,
    name: str,
    model: str,
    gguf_file: str,
    filters: Optional[Dict[str, Any]] = None,
    docs: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Create a new cache with the given configuration.

    `name`, `model` and `gguf_file` are sent as query parameters, while
    `filters` and `docs` make up the request body.

    Args:
        name: Name of the cache to create
        model: Name of the model to use (e.g. "llama2")
        gguf_file: Name of the GGUF file to use for the model
        filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
        docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.

    Returns:
        Dict[str, Any]: Created cache configuration

    Example:
        ```python
        # This will include both:
        # 1. Any documents with category="programming"
        # 2. The specific documents "doc1" and "doc2" (regardless of their category)
        cache = db.create_cache(
            name="programming_cache",
            model="llama2",
            gguf_file="llama-2-7b-chat.Q4_K_M.gguf",
            filters={"category": "programming"},
            docs=["doc1", "doc2"]
        )
        ```
    """
    return self._request(
        "POST",
        "cache/create",
        # Request body: which documents the cache should cover.
        {"filters": filters, "docs": docs},
        # Query parameters: identity of the cache and the model behind it.
        params={"name": name, "model": model, "gguf_file": gguf_file},
    )
1198
+
1199
def get_cache(self, name: str) -> Cache:
    """
    Look up a cache by name and return a handle to it.

    Args:
        name: Name of the cache to retrieve

    Returns:
        cache: A cache object that is used to interact with the cache.

    Raises:
        ValueError: If the server response does not report the cache as existing.

    Example:
        ```python
        cache = db.get_cache("programming_cache")
        ```
    """
    status = self._request("GET", f"cache/{name}")
    # A missing "exists" key is treated the same as an explicit False.
    if not status.get("exists", False):
        raise ValueError(f"Cache '{name}' not found")
    return Cache(self, name)
1218
+
1219
def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Create a graph from documents.

    Sends the graph name, the document selection (filters and/or explicit
    document IDs) and any prompt overrides to the `graph/create` endpoint and
    wraps the response in a `Graph`.

    Args:
        name: Name of the graph to create
        filters: Optional metadata filters to determine which documents to include
        documents: Optional list of specific document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: The created graph object

    Example:
        ```python
        # Create a graph from documents with category="research"
        graph = db.create_graph(
            name="research_graph",
            filters={"category": "research"}
        )

        # Create a graph from specific documents
        graph = db.create_graph(
            name="custom_graph",
            documents=["doc1", "doc2", "doc3"]
        )

        # With custom entity extraction examples
        from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
        graph = db.create_graph(
            name="medical_graph",
            filters={"category": "medical"},
            prompt_overrides=GraphPromptOverrides(
                entity_extraction=EntityExtractionPromptOverride(
                    examples=[
                        EntityExtractionExample(label="Insulin", type="MEDICATION"),
                        EntityExtractionExample(label="Diabetes", type="CONDITION")
                    ]
                )
            )
        )
        ```
    """
    # Model instances are serialized (dropping None fields); dictionaries and
    # None are forwarded untouched.
    overrides_payload = prompt_overrides
    if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
        overrides_payload = prompt_overrides.model_dump(exclude_none=True)

    body = {
        "name": name,
        "filters": filters,
        "documents": documents,
        "prompt_overrides": overrides_payload,
    }
    return Graph(**self._request("POST", "graph/create", body))
1285
+
1286
def get_graph(self, name: str) -> Graph:
    """
    Fetch a single graph by its name.

    Args:
        name: Name of the graph to retrieve

    Returns:
        Graph: The requested graph object

    Example:
        ```python
        # Get a graph by name
        graph = db.get_graph("finance_graph")
        print(f"Graph has {len(graph.entities)} entities and {len(graph.relationships)} relationships")
        ```
    """
    graph_data = self._request("GET", f"graph/{name}")
    graph = Graph(**graph_data)
    return graph
1305
+
1306
def list_graphs(self) -> List[Graph]:
    """
    List the graphs returned by the `graphs` endpoint.

    Returns:
        List[Graph]: List of graph objects

    Example:
        ```python
        # List all accessible graphs
        graphs = db.list_graphs()
        for graph in graphs:
            print(f"Graph: {graph.name}, Entities: {len(graph.entities)}")
        ```
    """
    graphs = []
    for entry in self._request("GET", "graphs"):
        graphs.append(Graph(**entry))
    return graphs
1323
+
1324
def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Update an existing graph with new documents.

    Sends the additional filters, additional document IDs and any prompt
    overrides to the graph's `update` endpoint and wraps the response in a
    `Graph`.

    Args:
        name: Name of the graph to update
        additional_filters: Optional additional metadata filters to determine which new documents to include
        additional_documents: Optional list of additional document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: The updated graph

    Example:
        ```python
        # Update a graph with new documents
        updated_graph = db.update_graph(
            name="research_graph",
            additional_filters={"category": "new_research"},
            additional_documents=["doc4", "doc5"]
        )
        print(f"Graph now has {len(updated_graph.entities)} entities")

        # With entity resolution examples
        from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
        updated_graph = db.update_graph(
            name="research_graph",
            additional_documents=["doc4"],
            prompt_overrides=GraphPromptOverrides(
                entity_resolution=EntityResolutionPromptOverride(
                    examples=[
                        EntityResolutionExample(
                            canonical="Machine Learning",
                            variants=["ML", "machine learning", "AI/ML"]
                        )
                    ]
                )
            )
        )
        ```
    """
    # Model instances are serialized (dropping None fields); dictionaries and
    # None are forwarded untouched.
    overrides_payload = prompt_overrides
    if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
        overrides_payload = prompt_overrides.model_dump(exclude_none=True)

    body = {
        "additional_filters": additional_filters,
        "additional_documents": additional_documents,
        "prompt_overrides": overrides_payload,
    }
    return Graph(**self._request("POST", f"graph/{name}/update", body))
1387
+
1388
def delete_document(self, document_id: str) -> Dict[str, str]:
    """
    Delete a document and all its associated data.

    Issues a DELETE request for the document. According to the API description
    this removes:
    - Document metadata
    - Document content in storage
    - Document chunks and embeddings in vector store

    Args:
        document_id: ID of the document to delete

    Returns:
        Dict[str, str]: Deletion status

    Example:
        ```python
        # Delete a document
        result = db.delete_document("doc_123")
        print(result["message"])  # Document doc_123 deleted successfully
        ```
    """
    endpoint = f"documents/{document_id}"
    return self._request("DELETE", endpoint)
1412
+
1413
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Delete a document by its filename.

    Convenience wrapper: the filename is resolved to a document with
    `get_document_by_filename`, and that document's `external_id` is passed to
    `delete_document`.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status

    Example:
        ```python
        # Delete a document by filename
        result = db.delete_document_by_filename("report.pdf")
        print(result["message"])
        ```
    """
    document_id = self.get_document_by_filename(filename).external_id
    return self.delete_document(document_id)
1438
+
1439
def close(self):
    """Close the HTTP session held in `self._session`."""
    session = self._session
    session.close()
1442
+
1443
def __enter__(self):
    """Enter a `with` block; the client itself is the context value."""
    return self
1445
+
1446
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave a `with` block by calling `close()`; exceptions are not suppressed."""
    self.close()