morphik 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/async_.py ADDED
@@ -0,0 +1,1416 @@
1
+ from io import BytesIO, IOBase
2
+ import json
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Dict, Any, List, Optional, Union, BinaryIO
6
+ from urllib.parse import urlparse
7
+
8
+ import httpx
9
+ import jwt
10
+ from PIL.Image import Image as PILImage
11
+ from pydantic import BaseModel, Field
12
+
13
+ from .models import (
14
+ Document,
15
+ ChunkResult,
16
+ DocumentResult,
17
+ CompletionResponse,
18
+ IngestTextRequest,
19
+ ChunkSource,
20
+ Graph,
21
+ # Prompt override models
22
+ EntityExtractionExample,
23
+ EntityResolutionExample,
24
+ EntityExtractionPromptOverride,
25
+ EntityResolutionPromptOverride,
26
+ QueryPromptOverride,
27
+ GraphPromptOverrides,
28
+ QueryPromptOverrides
29
+ )
30
+ from .rules import Rule
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Type alias for rules
35
+ RuleOrDict = Union[Rule, Dict[str, Any]]
36
+
37
+
38
class AsyncCache:
    """Async handle to a named cache on the server.

    Every operation is forwarded to a ``cache/<name>/...`` endpoint through
    the owning client's ``_request`` coroutine.

    Args:
        db: Client whose ``_request`` coroutine performs the HTTP calls.
        name: Cache name, interpolated into the endpoint paths.
    """

    def __init__(self, db: "AsyncMorphik", name: str):
        self._db = db
        self._name = name

    async def update(self) -> bool:
        """POST to ``cache/<name>/update``.

        Returns:
            The ``success`` value of the response, or False when absent.
        """
        response = await self._db._request("POST", f"cache/{self._name}/update")
        return response.get("success", False)

    async def add_docs(self, docs: List[str]) -> bool:
        """POST ``{"docs": docs}`` to ``cache/<name>/add_docs``.

        Args:
            docs: Values sent under the ``docs`` key (presumably document IDs
                — confirm against the server API).

        Returns:
            The ``success`` value of the response, or False when absent.
        """
        response = await self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
        return response.get("success", False)

    async def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """Run a completion against the cache via ``cache/<name>/query``.

        Args:
            query: Query text, sent as a URL query parameter.
            max_tokens: Optional completion length limit; omitted from the
                request when None.
            temperature: Optional sampling temperature; omitted from the
                request when None.

        Returns:
            CompletionResponse built from the JSON response.
        """
        params: Dict[str, Any] = {"query": query}
        # Only forward options the caller actually set: a None value would be
        # serialised as an empty string ("max_tokens=") rather than left out.
        if max_tokens is not None:
            params["max_tokens"] = max_tokens
        if temperature is not None:
            params["temperature"] = temperature

        response = await self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params=params,
            data="",  # kept from the original call; sent as the JSON body
        )
        return CompletionResponse(**response)
61
+
62
+
63
class FinalChunkResult(BaseModel):
    """Chunk search result whose content is either text or a decoded PIL image."""

    # typing.Union instead of ``str | PILImage``: the annotation is evaluated
    # when the class is created, and the ``|`` form fails on Python < 3.10.
    # It also matches the Optional/Union style used elsewhere in this module.
    content: Union[str, PILImage] = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score")
    document_id: str = Field(..., description="Parent document ID")
    chunk_number: int = Field(..., description="Chunk sequence number")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
    content_type: str = Field(..., description="Content type")
    filename: Optional[str] = Field(None, description="Original filename")
    download_url: Optional[str] = Field(None, description="URL to download full document")

    class Config:
        # PIL images are not a pydantic-native type, so allow arbitrary types.
        arbitrary_types_allowed = True
75
+
76
+
77
+ class AsyncMorphik:
78
+ """
79
+ Morphik client for document operations.
80
+
81
+ Args:
82
+ uri (str, optional): Morphik URI in format "morphik://<owner_id>:<token>@<host>".
83
+ If not provided, connects to http://localhost:8000 without authentication.
84
+ timeout (int, optional): Request timeout in seconds. Defaults to 30.
85
+ is_local (bool, optional): Whether to connect to a local server. Defaults to False.
86
+
87
+ Examples:
88
+ ```python
89
+ # Without authentication
90
+ async with AsyncMorphik() as db:
91
+ doc = await db.ingest_text("Sample content")
92
+
93
+ # With authentication
94
+ async with AsyncMorphik("morphik://owner_id:token@api.morphik.ai") as db:
95
+ doc = await db.ingest_text("Sample content")
96
+ ```
97
+ """
98
+
99
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
    """Create the underlying HTTP client and resolve base URL and auth token.

    Args:
        uri: Optional connection URI; when given it is handed to
            ``_setup_auth``. Without it the client targets
            ``http://localhost:8000`` and sends no auth token.
        timeout: Timeout passed to the httpx client.
        is_local: When True the httpx client is created with TLS
            verification disabled and HTTP/2 turned off.
    """
    self._timeout = timeout

    if is_local:
        # Local server: no certificate verification, plain HTTP/1.1.
        self._client = httpx.AsyncClient(timeout=timeout, verify=False, http2=False)
    else:
        self._client = httpx.AsyncClient(timeout=timeout)
    self._is_local = is_local

    if not uri:
        self._base_url = "http://localhost:8000"
        self._auth_token = None
        return

    self._setup_auth(uri)
117
+
118
def _setup_auth(self, uri: str) -> None:
    """Extract the auth token and base URL from a connection URI.

    Sets ``self._auth_token`` and ``self._base_url``. The scheme of the base
    URL is ``http`` when ``self._is_local`` is true, ``https`` otherwise.

    Args:
        uri: URI of the form ``morphik://<owner_id>:<token>@<host>``.

    Raises:
        ValueError: If the URI has no network location, or is missing the
            ``<owner_id>:<token>@<host>`` parts.
        Exception: Whatever ``jwt.decode`` raises when the token cannot be
            decoded as a JWT (signature verification is disabled).
    """
    expected = "expected morphik://<owner_id>:<token>@<host>"

    parsed = urlparse(uri)
    if not parsed.netloc:
        raise ValueError("Invalid URI format")

    # Split on the LAST "@" so an "@" inside the owner part does not break parsing.
    auth, at_sign, host = parsed.netloc.rpartition("@")
    if not at_sign or not host:
        raise ValueError(f"Invalid URI format: {expected}")

    # Split on the FIRST ":"; everything after it is the token.
    _, colon, token = auth.partition(":")
    if not colon or not token:
        raise ValueError(f"Invalid URI format: {expected}")

    self._auth_token = token
    self._base_url = f"{'http' if self._is_local else 'https'}://{host}"

    # Basic token validation: only checks that the token decodes as a JWT.
    jwt.decode(self._auth_token, options={"verify_signature": False})
133
+
134
async def _request(
    self,
    method: str,
    endpoint: str,
    data: Optional[Dict[str, Any]] = None,
    files: Optional[Dict[str, Any]] = None,
    params: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Send one HTTP request and return the decoded JSON response.

    Args:
        method: HTTP method.
        endpoint: Path relative to the base URL; leading slashes are stripped.
        data: Sent as the JSON body, or as form fields when ``files`` is given.
        files: When truthy, the request is sent as multipart form data.
        params: URL query parameters.

    Returns:
        The parsed JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for error statuses.
    """
    headers = {}
    if self._auth_token:
        # Anonymous clients (no token) send no Authorization header at all.
        headers["Authorization"] = f"Bearer {self._auth_token}"

    if files:
        # Multipart upload: leave Content-Type to httpx.
        body_kwargs = {"files": files, "data": data}
    else:
        headers["Content-Type"] = "application/json"
        body_kwargs = {"json": data}

    url = f"{self._base_url}/{endpoint.lstrip('/')}"
    response = await self._client.request(
        method, url, headers=headers, params=params, **body_kwargs
    )
    response.raise_for_status()
    return response.json()
166
+
167
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
    """Return ``rule.to_dict()`` when the rule offers it, else the rule itself."""
    return rule.to_dict() if hasattr(rule, "to_dict") else rule
172
+
173
async def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document into Morphik.

    Args:
        content: Text content to ingest
        filename: Optional filename to associate with the text
        metadata: Optional metadata dictionary (defaults to an empty dict)
        rules: Optional list of rules to apply during ingestion; each entry is
            either a rule object exposing ``to_dict()`` (e.g.
            MetadataExtractionRule, NaturalLanguageRule) or a plain dict
        use_colpali: Whether to use the ColPali-style embedding model
    Returns:
        Document: Metadata of the ingested document, bound to this client

    Example:
        ```python
        from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
        from pydantic import BaseModel

        class DocumentInfo(BaseModel):
            title: str
            author: str
            date: str

        doc = await db.ingest_text(
            "Machine learning is fascinating...",
            metadata={"category": "tech"},
            rules=[
                MetadataExtractionRule(schema=DocumentInfo),
                NaturalLanguageRule(prompt="Shorten the content, use keywords")
            ]
        )
        ```
    """
    converted_rules = [self._convert_rule(rule) for rule in rules] if rules else []
    payload = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=converted_rules,
        use_colpali=use_colpali,
    ).model_dump()

    document = Document(**(await self._request("POST", "ingest/text", data=payload)))
    # Attach the client so the returned document can issue follow-up calls.
    document._client = self
    return document
227
+
228
async def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: str,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """Ingest a file document into Morphik.

    Args:
        file: Path (str or Path), raw bytes, or an open binary file object
        filename: Name sent with the upload
        metadata: Optional metadata dictionary, sent JSON-encoded
        rules: Optional rules (objects with ``to_dict()`` or dicts), sent JSON-encoded
        use_colpali: Whether to use the ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document, bound to this client

    Raises:
        ValueError: If ``file`` is a path that does not exist
    """
    owns_stream = isinstance(file, (str, Path))
    if owns_stream:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        # Read the whole file up front and upload from an in-memory buffer.
        with open(source, "rb") as handle:
            stream = BytesIO(handle.read())
    elif isinstance(file, bytes):
        stream = BytesIO(file)
    else:
        stream = file

    try:
        form = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(rule) for rule in (rules or [])]),
            "use_colpali": json.dumps(use_colpali),
        }
        upload = {"file": (filename, stream)}

        response = await self._request("POST", "ingest/file", data=form, files=upload)
        document = Document(**response)
        document._client = self
        return document
    finally:
        # Only buffers created from a path are closed here.
        if owns_stream:
            stream.close()
269
+
270
async def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik.

    Args:
        files: List of files to ingest (path strings, bytes, file objects, or Paths)
        metadata: Optional metadata (single dict for all files or list of dicts)
        rules: Optional rules; either a flat list shared by all files or a
            list of lists with one rule list per file
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of successfully ingested documents. Per-file
        errors reported by the server are logged, not raised.

    Raises:
        ValueError: If metadata list length doesn't match files length
        OSError: If a path in ``files`` cannot be opened
    """
    # Enforce the documented contract before touching any file.
    if isinstance(metadata, list) and len(metadata) != len(files):
        raise ValueError(
            f"Number of metadata items ({len(metadata)}) must match number of files ({len(files)})"
        )

    file_objects = []
    try:
        # Opening happens inside the try so that a failing open() still lets
        # the finally clause close the handles opened before it.
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        if not rules:
            converted_rules = []
        elif all(isinstance(r, list) for r in rules):
            # List of lists - per-file rules
            converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
        else:
            # Flat list - shared rules for all files
            converted_rules = [self._convert_rule(r) for r in rules]

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
        }

        response = await self._request("POST", "ingest/files", data=data, files=file_objects)

        # Log errors but don't raise: the remaining documents are still returned.
        for error in response.get("errors") or []:
            logger.error("Failed to ingest %s: %s", error["filename"], error["error"])

        docs = [Document(**doc) for doc in response["documents"]]
        for doc in docs:
            doc._client = self
        return docs
    finally:
        # NOTE(review): this closes every file object in the request, including
        # caller-supplied ones (unlike ingest_file); kept for backward compatibility.
        for _, (_, file_obj) in file_objects:
            if isinstance(file_obj, IOBase) and not file_obj.closed:
                file_obj.close()
341
+
342
async def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest every matching file of a directory via ``ingest_files``.

    Args:
        directory: Path to directory containing files to ingest
        recursive: Whether to descend into subdirectories
        pattern: Glob pattern used to select files (e.g. "*.pdf")
        metadata: Optional metadata dictionary passed on for all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: Ingested documents; empty when nothing matched

    Raises:
        ValueError: If directory not found
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"Directory not found: {root}")

    # rglob walks subdirectories, glob stays at the top level.
    find = root.rglob if recursive else root.glob
    matches = [entry for entry in find(pattern) if entry.is_file()]
    if not matches:
        return []

    return await self.ingest_files(
        files=matches,
        metadata=metadata,
        rules=rules,
        use_colpali=use_colpali,
        parallel=parallel,
    )
394
+
395
async def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Search for relevant chunks.

    Chunks whose metadata has a truthy ``is_image`` entry are decoded from
    base64 (optionally wrapped in a ``data:`` URI) into PIL images; if
    decoding fails, the failure is logged and the raw content is returned.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model to retrieve chunks
            (only works for documents ingested with `use_colpali=True`)
    Returns:
        List[FinalChunkResult]

    Example:
        ```python
        chunks = await db.retrieve_chunks(
            "What are the key findings?",
            filters={"department": "research"}
        )
        ```
    """
    request = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "use_colpali": use_colpali,
    }

    response = await self._request("POST", "retrieve/chunks", data=request)
    chunks = [ChunkResult(**r) for r in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                # Imported lazily so text-only results never need these modules.
                import base64
                from PIL import Image

                encoded = chunk.content
                if encoded.startswith("data:"):
                    # Data URI "data:image/png;base64,...": keep the part after the comma.
                    encoded = encoded.split(",", 1)[1]
                content = Image.open(BytesIO(base64.b64decode(encoded)))
            except Exception as exc:
                # Best effort: report through the module logger (not stdout)
                # and fall back to the raw content.
                logger.warning("Error processing image: %s", exc)
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
471
+
472
async def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """
    Retrieve relevant documents.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model to retrieve documents
            (only works for documents ingested with `use_colpali=True`)
    Returns:
        List[DocumentResult]

    Example:
        ```python
        docs = await db.retrieve_docs("machine learning", k=5)
        ```
    """
    payload = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
    )
    raw_results = await self._request("POST", "retrieve/docs", data=payload)

    results = []
    for item in raw_results:
        results.append(DocumentResult(**item))
    return results
510
+
511
async def query(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_colpali: bool = True,
    graph_name: Optional[str] = None,
    hop_depth: int = 1,
    include_paths: bool = False,
    prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
) -> CompletionResponse:
    """
    Generate completion using relevant chunks as context.

    Args:
        query: Query text
        filters: Optional metadata filters
        k: Number of chunks to use as context (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        max_tokens: Maximum tokens in completion
        temperature: Model temperature
        use_colpali: Whether to use ColPali-style embedding model to generate the completion
            (only works for documents ingested with `use_colpali=True`)
        graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
        hop_depth: Number of relationship hops to traverse in the graph (1-3)
        include_paths: Whether to include relationship paths in the response
        prompt_overrides: Optional customizations for entity extraction, resolution, and
            query prompts; either a QueryPromptOverrides object or a dictionary with the
            same structure
    Returns:
        CompletionResponse

    Example:
        ```python
        # Standard query
        response = await db.query(
            "What are the key findings about customer satisfaction?",
            filters={"department": "research"},
            temperature=0.7
        )

        # Knowledge graph enhanced query
        response = await db.query(
            "How does product X relate to customer segment Y?",
            graph_name="market_graph",
            hop_depth=2,
            include_paths=True
        )

        # With prompt customization (model object or equivalent dictionary)
        from morphik.models import QueryPromptOverride, QueryPromptOverrides
        response = await db.query(
            "What are the key findings?",
            prompt_overrides=QueryPromptOverrides(
                query=QueryPromptOverride(
                    prompt_template="Answer the question in a formal, academic tone: {question}"
                )
            )
        )

        print(response.completion)

        # If include_paths=True, you can inspect the graph paths
        if response.metadata and "graph" in response.metadata:
            for path in response.metadata["graph"]["paths"]:
                print(" -> ".join(path))
        ```
    """
    overrides = prompt_overrides
    if overrides and isinstance(overrides, QueryPromptOverrides):
        # Model instances are flattened to a dict; dicts pass through untouched.
        overrides = overrides.model_dump(exclude_none=True)

    payload = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        max_tokens=max_tokens,
        temperature=temperature,
        use_colpali=use_colpali,
        graph_name=graph_name,
        hop_depth=hop_depth,
        include_paths=include_paths,
        prompt_overrides=overrides,
    )

    completion = await self._request("POST", "query", data=payload)
    return CompletionResponse(**completion)
610
+
611
async def list_documents(
    self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
    """
    List accessible documents.

    Pagination goes into the URL query string; filters are sent as the
    POST body (an empty dict when no filters are given).

    Args:
        skip: Number of documents to skip
        limit: Maximum number of documents to return
        filters: Optional filters

    Returns:
        List[Document]: Accessible documents, each bound to this client

    Example:
        ```python
        # Get first page
        docs = await db.list_documents(limit=10)

        # Get next page
        next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
        ```
    """
    endpoint = f"documents?skip={skip}&limit={limit}"
    body = filters or {}
    listing = await self._request("POST", endpoint, data=body)

    documents = [Document(**entry) for entry in listing]
    for document in documents:
        document._client = self
    return documents
642
+
643
async def get_document(self, document_id: str) -> Document:
    """
    Fetch a document's metadata by ID.

    Args:
        document_id: ID of the document

    Returns:
        Document: Document metadata, bound to this client

    Example:
        ```python
        doc = await db.get_document("doc_123")
        print(f"Title: {doc.metadata.get('title')}")
        ```
    """
    endpoint = f"documents/{document_id}"
    document = Document(**(await self._request("GET", endpoint)))
    document._client = self
    return document
663
+
664
async def get_document_by_filename(self, filename: str) -> Document:
    """
    Get document metadata by filename.
    If multiple documents have the same filename, returns the most recently updated one.

    Args:
        filename: Filename of the document to retrieve

    Returns:
        Document: Document metadata, bound to this client

    Example:
        ```python
        doc = await db.get_document_by_filename("report.pdf")
        print(f"Document ID: {doc.external_id}")
        ```
    """
    from urllib.parse import quote

    # Percent-encode the filename before placing it in the URL path: a raw
    # "#", "?" or "%" would otherwise be read as fragment, query string or
    # escape sequence. "/" is left as-is (quote's default), as before.
    response = await self._request("GET", f"documents/filename/{quote(filename)}")
    doc = Document(**response)
    doc._client = self
    return doc
685
+
686
async def update_document_with_text(
    self,
    document_id: str,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document with new text content using the specified strategy.

    Args:
        document_id: ID of the document to update
        content: The new content to add
        filename: Optional new filename for the document
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is
            supported); only sent to the server when it differs from "add"
        use_colpali: Whether to use multi-vector embedding (treated as True when None)

    Returns:
        Document: Updated document metadata, bound to this client

    Example:
        ```python
        updated_doc = await db.update_document_with_text(
            document_id="doc_123",
            content="This is additional content that will be appended to the document.",
            filename="updated_document.txt",
            metadata={"category": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    colpali_flag = True if use_colpali is None else use_colpali
    converted_rules = [self._convert_rule(rule) for rule in rules] if rules else []

    payload = IngestTextRequest(
        content=content,
        filename=filename,
        metadata=metadata or {},
        rules=converted_rules,
        use_colpali=colpali_flag,
    )

    # The default strategy is implied; only a non-default one goes on the query string.
    query_params = {} if update_strategy == "add" else {"update_strategy": update_strategy}

    response = await self._request(
        "POST",
        f"documents/{document_id}/update_text",
        data=payload.model_dump(),
        params=query_params,
    )

    document = Document(**response)
    document._client = self
    return document
747
+
748
async def update_document_with_file(
    self,
    document_id: str,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document with content from a file using the specified strategy.

    Args:
        document_id: ID of the document to update
        file: File to add (path string, bytes, file object, or Path)
        filename: Name of the file; defaults to the path's basename when ``file``
            is a path, and is required for bytes and file objects
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported)
        use_colpali: Whether to use multi-vector embedding; omitted from the form when None

    Returns:
        Document: Updated document metadata, bound to this client

    Raises:
        ValueError: If the path does not exist, or ``filename`` is missing for
            bytes / file-object input

    Example:
        ```python
        updated_doc = await db.update_document_with_file(
            document_id="doc_123",
            file="path/to/update.pdf",
            metadata={"status": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    owns_stream = isinstance(file, (str, Path))
    if owns_stream:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        # Read the whole file up front and upload from an in-memory buffer.
        with open(source, "rb") as handle:
            stream = BytesIO(handle.read())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when updating with bytes")
        stream = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when updating with file object")
        stream = file

    try:
        form = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(rule) for rule in (rules or [])]),
            "update_strategy": update_strategy,
        }
        if use_colpali is not None:
            form["use_colpali"] = str(use_colpali).lower()

        upload = {"file": (filename, stream)}
        response = await self._request(
            "POST", f"documents/{document_id}/update_file", data=form, files=upload
        )

        document = Document(**response)
        document._client = self
        return document
    finally:
        # Only buffers created from a path are closed here.
        if owns_stream:
            stream.close()
829
+
830
async def update_document_metadata(
    self,
    document_id: str,
    metadata: Dict[str, Any],
) -> Document:
    """
    Update a document's metadata only.

    Args:
        document_id: ID of the document to update
        metadata: Metadata to update; sent as the request body

    Returns:
        Document: Updated document metadata, bound to this client

    Example:
        ```python
        updated_doc = await db.update_document_metadata(
            document_id="doc_123",
            metadata={"status": "reviewed", "reviewer": "Jane Smith"}
        )
        print(f"Updated metadata: {updated_doc.metadata}")
        ```
    """
    endpoint = f"documents/{document_id}/update_metadata"
    document = Document(**(await self._request("POST", endpoint, data=metadata)))
    document._client = self
    return document
860
+
861
async def update_document_by_filename_with_text(
    self,
    filename: str,
    content: str,
    new_filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document identified by filename with new text content.

    Looks the document up by filename, then delegates to
    ``update_document_with_text`` using its ``external_id``.

    Args:
        filename: Filename of the document to update
        content: The new content to add
        new_filename: Optional new filename for the document
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported)
        use_colpali: Whether to use multi-vector embedding

    Returns:
        Document: Updated document metadata

    Example:
        ```python
        updated_doc = await db.update_document_by_filename_with_text(
            filename="report.pdf",
            content="This is additional content that will be appended to the document.",
            new_filename="updated_report.pdf",
            metadata={"category": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    # Resolve filename -> document ID, then reuse the ID-based update.
    target = await self.get_document_by_filename(filename)
    forwarded = dict(
        document_id=target.external_id,
        content=content,
        filename=new_filename,
        metadata=metadata,
        rules=rules,
        update_strategy=update_strategy,
        use_colpali=use_colpali,
    )
    return await self.update_document_with_text(**forwarded)
912
+
913
async def update_document_by_filename_with_file(
    self,
    filename: str,
    file: Union[str, bytes, BinaryIO, Path],
    new_filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List] = None,
    update_strategy: str = "add",
    use_colpali: Optional[bool] = None,
) -> Document:
    """
    Update a document identified by filename with content from a file.

    Looks the document up by filename, then delegates to
    ``update_document_with_file`` using its ``external_id``.

    Args:
        filename: Filename of the document to update
        file: File to add (path string, bytes, file object, or Path)
        new_filename: Optional new filename for the document (defaults to the filename of the file)
        metadata: Additional metadata to update (optional)
        rules: Optional list of rules to apply to the content
        update_strategy: Strategy for updating the document (currently only 'add' is supported)
        use_colpali: Whether to use multi-vector embedding

    Returns:
        Document: Updated document metadata

    Example:
        ```python
        updated_doc = await db.update_document_by_filename_with_file(
            filename="report.pdf",
            file="path/to/update.pdf",
            metadata={"status": "updated"},
            update_strategy="add"
        )
        print(f"Document version: {updated_doc.system_metadata.get('version')}")
        ```
    """
    # Resolve filename -> document ID, then reuse the ID-based update.
    target = await self.get_document_by_filename(filename)
    forwarded = dict(
        document_id=target.external_id,
        file=file,
        filename=new_filename,
        metadata=metadata,
        rules=rules,
        update_strategy=update_strategy,
        use_colpali=use_colpali,
    )
    return await self.update_document_with_file(**forwarded)
963
+
964
async def update_document_by_filename_metadata(
    self,
    filename: str,
    metadata: Dict[str, Any],
    new_filename: Optional[str] = None,
) -> Document:
    """
    Update a document's metadata using filename to identify the document.

    The document is looked up by filename and its metadata updated by ID.
    When ``new_filename`` is given, a second request to the ``update_text``
    endpoint (empty content, no rules) carries the new filename together
    with the metadata returned by the first update.

    Args:
        filename: Filename of the document to update
        metadata: Metadata to update
        new_filename: Optional new filename to assign to the document

    Returns:
        Document: Updated document metadata

    Example:
        ```python
        updated_doc = await db.update_document_by_filename_metadata(
            filename="report.pdf",
            metadata={"status": "reviewed", "reviewer": "Jane Smith"},
            new_filename="reviewed_report.pdf"  # Optional: rename the file
        )
        print(f"Updated metadata: {updated_doc.metadata}")
        ```
    """
    existing = await self.get_document_by_filename(filename)
    updated = await self.update_document_metadata(
        document_id=existing.external_id,
        metadata=metadata,
    )

    if not new_filename:
        return updated

    # Rename step: resend the just-updated metadata along with the new filename.
    rename_payload = {
        "content": "",
        "filename": new_filename,
        "metadata": updated.metadata.copy(),
        "rules": [],
    }
    response = await self._request(
        "POST",
        f"documents/{existing.external_id}/update_text",
        data=rename_payload,
    )
    renamed = Document(**response)
    renamed._client = self
    return renamed
1021
+
1022
async def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
    """
    Fetch several documents by ID with one request.

    Args:
        document_ids: List of document IDs to retrieve

    Returns:
        List[Document]: One Document per entry in the server response, each
            with its client reference set to this instance

    Example:
        ```python
        docs = await db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
        for doc in docs:
            print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
        ```
    """
    payload = await self._request("POST", "batch/documents", data=document_ids)

    documents = []
    for raw_document in payload:
        document = Document(**raw_document)
        # Attach this client to the returned document.
        document._client = self
        documents.append(document)
    return documents
1044
+
1045
async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
    """
    Retrieve specific chunks by their document ID and chunk number in a single batch operation.

    Chunks whose metadata has a truthy ``is_image`` entry are decoded from
    base64 (either a bare base64 string or a ``data:...;base64,...`` URI) and
    opened as a PIL image. If decoding or opening fails, a warning is logged
    and the chunk's original content is returned unchanged.

    Args:
        sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

    Returns:
        List[FinalChunkResult]: List of chunk results

    Example:
        ```python
        # Using dictionaries
        sources = [
            {"document_id": "doc_123", "chunk_number": 0},
            {"document_id": "doc_456", "chunk_number": 2}
        ]

        # Or using ChunkSource objects
        from morphik.models import ChunkSource
        sources = [
            ChunkSource(document_id="doc_123", chunk_number=0),
            ChunkSource(document_id="doc_456", chunk_number=2)
        ]

        chunks = await db.batch_get_chunks(sources)
        for chunk in chunks:
            print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
        ```
    """
    # Imported once per call instead of once per image chunk. PIL is already a
    # hard dependency of this module (imported at file level), so hoisting the
    # import out of the try block cannot change which chunks fall back to text.
    import base64

    from PIL import Image

    # Dictionaries are sent as-is; model objects are dumped to plain dicts.
    source_dicts = [
        source if isinstance(source, dict) else source.model_dump()
        for source in sources
    ]

    response = await self._request("POST", "batch/chunks", data=source_dicts)
    chunks = [ChunkResult(**raw_chunk) for raw_chunk in response]

    final_chunks = []
    for chunk in chunks:
        content = chunk.content
        if chunk.metadata.get("is_image"):
            try:
                encoded = chunk.content
                # Handle data URI format "data:image/png;base64,..." by keeping
                # only the base64 part after the first comma.
                if encoded.startswith("data:"):
                    encoded = encoded.split(",", 1)[1]
                content = Image.open(BytesIO(base64.b64decode(encoded)))
            except Exception as exc:
                # Deliberate best-effort: any failure falls back to the raw
                # content, but is reported through the module logger rather
                # than printed to stdout.
                logger.warning("Error processing image: %s", exc)
                content = chunk.content

        final_chunks.append(
            FinalChunkResult(
                content=content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=chunk.metadata,
                content_type=chunk.content_type,
                filename=chunk.filename,
                download_url=chunk.download_url,
            )
        )

    return final_chunks
1123
+
1124
async def create_cache(
    self,
    name: str,
    model: str,
    gguf_file: str,
    filters: Optional[Dict[str, Any]] = None,
    docs: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Create a new cache with the given configuration.

    ``name``, ``model`` and ``gguf_file`` are sent as query parameters;
    ``filters`` and ``docs`` are sent in the request body.

    Args:
        name: Name of the cache to create
        model: Name of the model to use (e.g. "llama2")
        gguf_file: Name of the GGUF file to use for the model
        filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided.
        docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters.

    Returns:
        Dict[str, Any]: Created cache configuration

    Example:
        ```python
        # This will include both:
        # 1. Any documents with category="programming"
        # 2. The specific documents "doc1" and "doc2" (regardless of their category)
        cache = await db.create_cache(
            name="programming_cache",
            model="llama2",
            gguf_file="llama-2-7b-chat.Q4_K_M.gguf",
            filters={"category": "programming"},
            docs=["doc1", "doc2"]
        )
        ```
    """
    query_params = dict(name=name, model=model, gguf_file=gguf_file)
    body = dict(filters=filters, docs=docs)
    return await self._request("POST", "cache/create", body, params=query_params)
1167
+
1168
async def get_cache(self, name: str) -> AsyncCache:
    """
    Look up a cache by name.

    Args:
        name: Name of the cache to retrieve

    Returns:
        cache: A cache object that is used to interact with the cache.

    Raises:
        ValueError: If the server response does not report the cache as existing

    Example:
        ```python
        cache = await db.get_cache("programming_cache")
        ```
    """
    status = await self._request("GET", f"cache/{name}")
    # A missing "exists" key is treated the same as an explicit False.
    if not status.get("exists", False):
        raise ValueError(f"Cache '{name}' not found")
    return AsyncCache(self, name)
1187
+
1188
async def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Create a graph from documents.

    Entities and relationships are extracted from the documents selected by
    the given filters and/or document IDs, and a graph is built from them.

    Args:
        name: Name of the graph to create
        filters: Optional metadata filters to determine which documents to include
        documents: Optional list of specific document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: The created graph object

    Example:
        ```python
        # Create a graph from documents with category="research"
        graph = await db.create_graph(
            name="research_graph",
            filters={"category": "research"}
        )

        # Create a graph from specific documents
        graph = await db.create_graph(
            name="custom_graph",
            documents=["doc1", "doc2", "doc3"]
        )

        # With custom entity extraction examples
        from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
        graph = await db.create_graph(
            name="medical_graph",
            filters={"category": "medical"},
            prompt_overrides=GraphPromptOverrides(
                entity_extraction=EntityExtractionPromptOverride(
                    examples=[
                        EntityExtractionExample(label="Insulin", type="MEDICATION"),
                        EntityExtractionExample(label="Diabetes", type="CONDITION")
                    ]
                )
            )
        )
        ```
    """
    # Model instances are serialised to a dict (dropping None fields);
    # dictionaries and None are forwarded untouched.
    overrides_payload = (
        prompt_overrides.model_dump(exclude_none=True)
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides)
        else prompt_overrides
    )

    payload = dict(
        name=name,
        filters=filters,
        documents=documents,
        prompt_overrides=overrides_payload,
    )

    graph_data = await self._request("POST", "graph/create", payload)
    return Graph(**graph_data)
1254
+
1255
async def get_graph(self, name: str) -> Graph:
    """
    Fetch a graph by its name.

    Args:
        name: Name of the graph to retrieve

    Returns:
        Graph: The requested graph object

    Example:
        ```python
        # Get a graph by name
        graph = await db.get_graph("finance_graph")
        print(f"Graph has {len(graph.entities)} entities and {len(graph.relationships)} relationships")
        ```
    """
    graph_data = await self._request("GET", f"graph/{name}")
    graph = Graph(**graph_data)
    return graph
1274
+
1275
async def list_graphs(self) -> List[Graph]:
    """
    List every graph the user has access to.

    Returns:
        List[Graph]: List of graph objects

    Example:
        ```python
        # List all accessible graphs
        graphs = await db.list_graphs()
        for graph in graphs:
            print(f"Graph: {graph.name}, Entities: {len(graph.entities)}")
        ```
    """
    payload = await self._request("GET", "graphs")

    graphs = []
    for graph_data in payload:
        graphs.append(Graph(**graph_data))
    return graphs
1292
+
1293
async def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Update an existing graph with new documents.

    Additional documents matching the original or new filters are processed,
    their entities and relationships are extracted, and the graph is updated
    with the new information.

    Args:
        name: Name of the graph to update
        additional_filters: Optional additional metadata filters to determine which new documents to include
        additional_documents: Optional list of additional document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts
            Either a GraphPromptOverrides object or a dictionary with the same structure

    Returns:
        Graph: The updated graph

    Example:
        ```python
        # Update a graph with new documents
        updated_graph = await db.update_graph(
            name="research_graph",
            additional_filters={"category": "new_research"},
            additional_documents=["doc4", "doc5"]
        )
        print(f"Graph now has {len(updated_graph.entities)} entities")

        # With entity resolution examples
        from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
        updated_graph = await db.update_graph(
            name="research_graph",
            additional_documents=["doc4"],
            prompt_overrides=GraphPromptOverrides(
                entity_resolution=EntityResolutionPromptOverride(
                    examples=[
                        EntityResolutionExample(
                            canonical="Machine Learning",
                            variants=["ML", "machine learning", "AI/ML"]
                        )
                    ]
                )
            )
        )
        ```
    """
    # Model instances are serialised to a dict (dropping None fields);
    # dictionaries and None are forwarded untouched.
    overrides_payload = (
        prompt_overrides.model_dump(exclude_none=True)
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides)
        else prompt_overrides
    )

    payload = dict(
        additional_filters=additional_filters,
        additional_documents=additional_documents,
        prompt_overrides=overrides_payload,
    )

    graph_data = await self._request("POST", f"graph/{name}/update", payload)
    return Graph(**graph_data)
1356
+
1357
async def delete_document(self, document_id: str) -> Dict[str, str]:
    """
    Delete a document together with all of its associated data.

    The deletion covers:
    - Document metadata
    - Document content in storage
    - Document chunks and embeddings in vector store

    Args:
        document_id: ID of the document to delete

    Returns:
        Dict[str, str]: Deletion status

    Example:
        ```python
        # Delete a document
        result = await db.delete_document("doc_123")
        print(result["message"])  # Document doc_123 deleted successfully
        ```
    """
    endpoint = f"documents/{document_id}"
    return await self._request("DELETE", endpoint)
1381
+
1382
async def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Delete the document stored under the given filename.

    Convenience wrapper: the document is looked up by filename to obtain its
    ID, and is then deleted by that ID.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status

    Example:
        ```python
        # Delete a document by filename
        result = await db.delete_document_by_filename("report.pdf")
        print(result["message"])
        ```
    """
    target = await self.get_document_by_filename(filename)
    deletion_status = await self.delete_document(target.external_id)
    return deletion_status
1407
+
1408
async def close(self):
    """Close the underlying HTTP client."""
    http_client = self._client
    await http_client.aclose()
1411
+
1412
async def __aenter__(self):
    """Enter the ``async with`` block, yielding this client itself."""
    return self
1414
+
1415
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave the ``async with`` block by closing the client; exception details are not inspected."""
    await self.close()