morphik 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/sync.py CHANGED
@@ -7,74 +7,1083 @@ import json
7
7
  import logging
8
8
  from pathlib import Path
9
9
  from typing import Dict, Any, List, Optional, Union, BinaryIO
10
- from urllib.parse import urlparse
11
10
 
12
- import jwt
13
- from pydantic import BaseModel, Field
14
- import requests
11
+ import httpx
15
12
 
16
13
  from .models import (
17
- Document,
18
- ChunkResult,
19
- DocumentResult,
20
- CompletionResponse,
21
- IngestTextRequest,
14
+ Document,
15
+ DocumentResult,
16
+ CompletionResponse,
17
+ IngestTextRequest,
22
18
  ChunkSource,
23
19
  Graph,
24
20
  # Prompt override models
25
- EntityExtractionExample,
26
- EntityResolutionExample,
27
- EntityExtractionPromptOverride,
28
- EntityResolutionPromptOverride,
29
- QueryPromptOverride,
30
21
  GraphPromptOverrides,
31
- QueryPromptOverrides
22
+ QueryPromptOverrides,
32
23
  )
33
24
  from .rules import Rule
25
+ from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
34
26
 
35
27
  logger = logging.getLogger(__name__)
36
28
 
37
- # Type alias for rules
38
- RuleOrDict = Union[Rule, Dict[str, Any]]
39
29
 
30
class Cache:
    """
    Handle for a named cache, bound to the Morphik client that sends its requests.

    Args:
        db: The Morphik client instance used to issue requests
        name: The name of the cache; it is interpolated into every endpoint path
    """

    def __init__(self, db: "Morphik", name: str):
        self._db = db
        self._name = name

    def update(self) -> bool:
        """
        POST to ``cache/<name>/update``.

        Returns:
            bool: The "success" field of the response, or False when it is absent
        """
        response = self._db._request("POST", f"cache/{self._name}/update")
        return response.get("success", False)

    def add_docs(self, docs: List[str]) -> bool:
        """
        POST the given documents to ``cache/<name>/add_docs``.

        Args:
            docs: Values sent under the "docs" key of the request body

        Returns:
            bool: The "success" field of the response, or False when it is absent
        """
        response = self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
        return response.get("success", False)

    def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """
        POST a query to ``cache/<name>/query`` and wrap the response.

        Args:
            query: Query text, sent as a query-string parameter
            max_tokens: Optional maximum tokens; omitted from the request when None
            temperature: Optional temperature; omitted from the request when None

        Returns:
            CompletionResponse: Built from the response fields
        """
        # Only send options the caller actually set. httpx serialises a None
        # query value as an empty string ("max_tokens=") rather than dropping it.
        # NOTE(review): assumes the server treats an absent parameter as "use default" - confirm.
        params: Dict[str, Any] = {"query": query}
        if max_tokens is not None:
            params["max_tokens"] = max_tokens
        if temperature is not None:
            params["temperature"] = temperature

        response = self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params=params,
            data="",
        )
        return CompletionResponse(**response)
53
+
54
+
55
class Folder:
    """
    A folder that allows operations to be scoped to a specific folder.

    Every request issued through this object carries the folder name so the
    operation is scoped to that folder.

    Args:
        client: The Morphik client instance
        name: The name of the folder
    """

    def __init__(self, client: "Morphik", name: str):
        self._client = client
        self._name = name

    @property
    def name(self) -> str:
        """Returns the folder name."""
        return self._name

    def signin(self, end_user_id: str) -> "UserScope":
        """
        Returns a UserScope object scoped to this folder and the end user.

        Args:
            end_user_id: The ID of the end user

        Returns:
            UserScope: A user scope scoped to this folder and the end user
        """
        return UserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name)

    def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a text document into Morphik within this folder.

        Args:
            content: Text content to ingest
            filename: Optional file name
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
        # Trailing None is the end-user slot: a plain folder has no end user.
        payload = self._client._logic._prepare_ingest_text_request(
            content, filename, metadata, rules_list, use_colpali, self._name, None
        )
        response = self._client._request("POST", "ingest/text", data=payload)
        doc = self._client._logic._parse_document_response(response)
        # Attach the client so the returned document can issue follow-up requests.
        doc._client = self._client
        return doc

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a file document into Morphik within this folder.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        # Process file input
        file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename)

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj)}

            # Create form data (folder-scoped, no end user)
            form_data = self._client._logic._prepare_ingest_file_form_data(
                metadata, rules, self._name, None
            )

            response = self._client._request(
                "POST",
                f"ingest/file?use_colpali={str(use_colpali).lower()}",
                data=form_data,
                files=files,
            )
            doc = self._client._logic._parse_document_response(response)
            doc._client = self._client
            return doc
        finally:
            # Only close objects created from a path; caller-supplied
            # bytes/file objects are left alone.
            if isinstance(file, (str, Path)):
                file_obj.close()

    def ingest_files(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest multiple files into Morphik within this folder.

        Per-file failures reported by the server under "errors" are logged,
        not raised.

        Args:
            files: List of files to ingest
            metadata: Optional metadata
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        # Convert files to format expected by API
        file_objects = self._client._logic._prepare_files_for_upload(files)

        try:
            # Prepare form data (folder-scoped, no end user)
            data = self._client._logic._prepare_ingest_files_form_data(
                metadata, rules, use_colpali, parallel, self._name, None
            )

            response = self._client._request("POST", "ingest/files", data=data, files=file_objects)

            if response.get("errors"):
                # Log errors but don't raise exception
                for error in response["errors"]:
                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

            docs = [
                self._client._logic._parse_document_response(doc) for doc in response["documents"]
            ]
            for doc in docs:
                doc._client = self._client
            return docs
        finally:
            # Clean up file objects. BytesIO is itself an IOBase, so a single
            # IOBase check covers every closable entry.
            for _, (_, file_obj) in file_objects:
                if isinstance(file_obj, IOBase) and not file_obj.closed:
                    file_obj.close()

    def ingest_directory(
        self,
        directory: Union[str, Path],
        recursive: bool = False,
        pattern: str = "*",
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest all files in a directory into Morphik within this folder.

        Args:
            directory: Path to directory containing files to ingest
            recursive: Whether to recursively process subdirectories
            pattern: Optional glob pattern to filter files
            metadata: Optional metadata dictionary to apply to all files
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents (empty when nothing matches)

        Raises:
            ValueError: If directory is not an existing directory
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory}")

        # Collect all files matching pattern
        if recursive:
            files = list(directory.rglob(pattern))
        else:
            files = list(directory.glob(pattern))

        # Filter out directories
        files = [f for f in files if f.is_file()]

        if not files:
            return []

        # Use ingest_files with collected paths
        return self.ingest_files(
            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
        )

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[FinalChunkResult]:
        """
        Retrieve relevant chunks within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[FinalChunkResult]: List of relevant chunks
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "folder_name": self._name,  # Scope the search to this folder
        }

        response = self._client._request("POST", "retrieve/chunks", request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[DocumentResult]: List of relevant documents
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "folder_name": self._name,  # Scope the search to this folder
        }

        response = self._client._request("POST", "retrieve/docs", request)
        return self._client._logic._parse_document_result_list_response(response)

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context within this folder.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts

        Returns:
            CompletionResponse: Generated completion
        """
        payload = self._client._logic._prepare_query_request(
            query,
            filters,
            k,
            min_score,
            max_tokens,
            temperature,
            use_colpali,
            graph_name,
            hop_depth,
            include_paths,
            prompt_overrides,
            self._name,
            None,  # no end user for a plain folder scope
        )
        response = self._client._request("POST", "query", data=payload)
        return self._client._logic._parse_completion_response(response)

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents within this folder.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of documents
        """
        params, data = self._client._logic._prepare_list_documents_request(
            skip, limit, filters, self._name, None
        )
        response = self._client._request("POST", "documents", data=data, params=params)
        docs = self._client._logic._parse_document_list_response(response)
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation within this folder.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List[Document]: List of document metadata for found documents
        """
        request = {"document_ids": document_ids, "folder_name": self._name}

        response = self._client._request("POST", "batch/documents", data=request)
        # Folder has no _logic of its own; the parsing helpers live on the client.
        docs = [self._client._logic._parse_document_response(doc) for doc in response]
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_chunks(
        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
    ) -> List[FinalChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.

        Args:
            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

        Returns:
            List[FinalChunkResult]: List of chunk results
        """
        # Convert to list of dictionaries if needed
        source_dicts = [
            source if isinstance(source, dict) else source.model_dump() for source in sources
        ]

        # Add folder_name to request
        request = {"sources": source_dicts, "folder_name": self._name}

        response = self._client._request("POST", "batch/chunks", data=request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def create_graph(
        self,
        name: str,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Create a graph from documents within this folder.

        Args:
            name: Name of the graph to create
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The created graph object
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "name": name,
            "filters": filters,
            "documents": documents,
            "prompt_overrides": prompt_overrides,
            "folder_name": self._name,  # Scope graph creation to this folder
        }

        response = self._client._request("POST", "graph/create", request)
        return self._client._logic._parse_graph_response(response)

    def update_graph(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Update an existing graph with new documents from this folder.

        Args:
            name: Name of the graph to update
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The updated graph
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "additional_filters": additional_filters,
            "additional_documents": additional_documents,
            "prompt_overrides": prompt_overrides,
            "folder_name": self._name,  # Scope the update to this folder
        }

        response = self._client._request("POST", f"graph/{name}/update", request)
        return self._client._logic._parse_graph_response(response)

    def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
        """
        Delete a document by its filename within this folder.

        The document is first looked up by filename (scoped to this folder),
        then deleted by the ID found on the parsed result.

        Args:
            filename: Filename of the document to delete

        Returns:
            Dict[str, str]: Deletion status
        """
        # First get the document ID, with folder scope
        response = self._client._request(
            "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
        )
        doc = self._client._logic._parse_document_response(response)

        # Then delete by ID
        return self._client.delete_document(doc.external_id)
529
+
530
+
531
+ class UserScope:
532
+ """
533
+ A user scope that allows operations to be scoped to a specific end user and optionally a folder.
534
+
535
+ Args:
536
+ client: The Morphik client instance
537
+ end_user_id: The ID of the end user
538
+ folder_name: Optional folder name to further scope operations
539
+ """
540
+
541
+ def __init__(self, client: "Morphik", end_user_id: str, folder_name: Optional[str] = None):
542
+ self._client = client
543
+ self._end_user_id = end_user_id
544
+ self._folder_name = folder_name
545
+
546
+ @property
547
+ def end_user_id(self) -> str:
548
+ """Returns the end user ID."""
549
+ return self._end_user_id
550
+
551
+ @property
552
+ def folder_name(self) -> Optional[str]:
553
+ """Returns the folder name if any."""
554
+ return self._folder_name
555
+
556
+ def ingest_text(
557
+ self,
558
+ content: str,
559
+ filename: Optional[str] = None,
560
+ metadata: Optional[Dict[str, Any]] = None,
561
+ rules: Optional[List[RuleOrDict]] = None,
562
+ use_colpali: bool = True,
563
+ ) -> Document:
564
+ """
565
+ Ingest a text document into Morphik as this end user.
566
+
567
+ Args:
568
+ content: Text content to ingest
569
+ filename: Optional file name
570
+ metadata: Optional metadata dictionary
571
+ rules: Optional list of rules to apply during ingestion
572
+ use_colpali: Whether to use ColPali-style embedding model
573
+
574
+ Returns:
575
+ Document: Metadata of the ingested document
576
+ """
577
+ rules_list = [self._client._convert_rule(r) for r in (rules or [])]
578
+ payload = self._client._logic._prepare_ingest_text_request(
579
+ content,
580
+ filename,
581
+ metadata,
582
+ rules_list,
583
+ use_colpali,
584
+ self._folder_name,
585
+ self._end_user_id,
586
+ )
587
+ response = self._client._request("POST", "ingest/text", data=payload)
588
+ doc = self._client._logic._parse_document_response(response)
589
+ doc._client = self._client
590
+ return doc
591
+
592
+ def ingest_file(
593
+ self,
594
+ file: Union[str, bytes, BinaryIO, Path],
595
+ filename: Optional[str] = None,
596
+ metadata: Optional[Dict[str, Any]] = None,
597
+ rules: Optional[List[RuleOrDict]] = None,
598
+ use_colpali: bool = True,
599
+ ) -> Document:
600
+ """
601
+ Ingest a file document into Morphik as this end user.
602
+
603
+ Args:
604
+ file: File to ingest (path string, bytes, file object, or Path)
605
+ filename: Name of the file
606
+ metadata: Optional metadata dictionary
607
+ rules: Optional list of rules to apply during ingestion
608
+ use_colpali: Whether to use ColPali-style embedding model
609
+
610
+ Returns:
611
+ Document: Metadata of the ingested document
612
+ """
613
+ # Handle different file input types
614
+ if isinstance(file, (str, Path)):
615
+ file_path = Path(file)
616
+ if not file_path.exists():
617
+ raise ValueError(f"File not found: {file}")
618
+ filename = file_path.name if filename is None else filename
619
+ with open(file_path, "rb") as f:
620
+ content = f.read()
621
+ file_obj = BytesIO(content)
622
+ elif isinstance(file, bytes):
623
+ if filename is None:
624
+ raise ValueError("filename is required when ingesting bytes")
625
+ file_obj = BytesIO(file)
626
+ else:
627
+ if filename is None:
628
+ raise ValueError("filename is required when ingesting file object")
629
+ file_obj = file
630
+
631
+ try:
632
+ # Prepare multipart form data
633
+ files = {"file": (filename, file_obj)}
634
+
635
+ # Add metadata and rules
636
+ form_data = {
637
+ "metadata": json.dumps(metadata or {}),
638
+ "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]),
639
+ "end_user_id": self._end_user_id, # Add end user ID here
640
+ }
641
+
642
+ # Add folder name if scoped to a folder
643
+ if self._folder_name:
644
+ form_data["folder_name"] = self._folder_name
645
+
646
+ response = self._client._request(
647
+ "POST",
648
+ f"ingest/file?use_colpali={str(use_colpali).lower()}",
649
+ data=form_data,
650
+ files=files,
651
+ )
652
+ doc = self._client._logic._parse_document_response(response)
653
+ doc._client = self._client
654
+ return doc
655
+ finally:
656
+ # Close file if we opened it
657
+ if isinstance(file, (str, Path)):
658
+ file_obj.close()
659
+
660
+ def ingest_files(
661
+ self,
662
+ files: List[Union[str, bytes, BinaryIO, Path]],
663
+ metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
664
+ rules: Optional[List[RuleOrDict]] = None,
665
+ use_colpali: bool = True,
666
+ parallel: bool = True,
667
+ ) -> List[Document]:
668
+ """
669
+ Ingest multiple files into Morphik as this end user.
670
+
671
+ Args:
672
+ files: List of files to ingest
673
+ metadata: Optional metadata
674
+ rules: Optional list of rules to apply
675
+ use_colpali: Whether to use ColPali-style embedding
676
+ parallel: Whether to process files in parallel
677
+
678
+ Returns:
679
+ List[Document]: List of ingested documents
680
+ """
681
+ # Convert files to format expected by API
682
+ file_objects = []
683
+ for file in files:
684
+ if isinstance(file, (str, Path)):
685
+ path = Path(file)
686
+ file_objects.append(("files", (path.name, open(path, "rb"))))
687
+ elif isinstance(file, bytes):
688
+ file_objects.append(("files", ("file.bin", file)))
689
+ else:
690
+ file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
691
+
692
+ try:
693
+ # Prepare request data
694
+ # Convert rules appropriately
695
+ if rules:
696
+ if all(isinstance(r, list) for r in rules):
697
+ # List of lists - per-file rules
698
+ converted_rules = [
699
+ [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
700
+ ]
701
+ else:
702
+ # Flat list - shared rules for all files
703
+ converted_rules = [self._client._convert_rule(r) for r in rules]
704
+ else:
705
+ converted_rules = []
706
+
707
+ data = {
708
+ "metadata": json.dumps(metadata or {}),
709
+ "rules": json.dumps(converted_rules),
710
+ "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
711
+ "parallel": str(parallel).lower(),
712
+ "end_user_id": self._end_user_id, # Add end user ID here
713
+ }
714
+
715
+ # Add folder name if scoped to a folder
716
+ if self._folder_name:
717
+ data["folder_name"] = self._folder_name
718
+
719
+ response = self._client._request("POST", "ingest/files", data=data, files=file_objects)
720
+
721
+ if response.get("errors"):
722
+ # Log errors but don't raise exception
723
+ for error in response["errors"]:
724
+ logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
725
+
726
+ docs = [
727
+ self._client._logic._parse_document_response(doc) for doc in response["documents"]
728
+ ]
729
+ for doc in docs:
730
+ doc._client = self._client
731
+ return docs
732
+ finally:
733
+ # Clean up file objects
734
+ for _, (_, file_obj) in file_objects:
735
+ if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
736
+ file_obj.close()
737
+
738
+ def ingest_directory(
739
+ self,
740
+ directory: Union[str, Path],
741
+ recursive: bool = False,
742
+ pattern: str = "*",
743
+ metadata: Optional[Dict[str, Any]] = None,
744
+ rules: Optional[List[RuleOrDict]] = None,
745
+ use_colpali: bool = True,
746
+ parallel: bool = True,
747
+ ) -> List[Document]:
748
+ """
749
+ Ingest all files in a directory into Morphik as this end user.
750
+
751
+ Args:
752
+ directory: Path to directory containing files to ingest
753
+ recursive: Whether to recursively process subdirectories
754
+ pattern: Optional glob pattern to filter files
755
+ metadata: Optional metadata dictionary to apply to all files
756
+ rules: Optional list of rules to apply
757
+ use_colpali: Whether to use ColPali-style embedding
758
+ parallel: Whether to process files in parallel
759
+
760
+ Returns:
761
+ List[Document]: List of ingested documents
762
+ """
763
+ directory = Path(directory)
764
+ if not directory.is_dir():
765
+ raise ValueError(f"Directory not found: {directory}")
766
+
767
+ # Collect all files matching pattern
768
+ if recursive:
769
+ files = list(directory.rglob(pattern))
770
+ else:
771
+ files = list(directory.glob(pattern))
772
+
773
+ # Filter out directories
774
+ files = [f for f in files if f.is_file()]
775
+
776
+ if not files:
777
+ return []
778
+
779
+ # Use ingest_files with collected paths
780
+ return self.ingest_files(
781
+ files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
782
+ )
783
+
784
+ def retrieve_chunks(
785
+ self,
786
+ query: str,
787
+ filters: Optional[Dict[str, Any]] = None,
788
+ k: int = 4,
789
+ min_score: float = 0.0,
790
+ use_colpali: bool = True,
791
+ ) -> List[FinalChunkResult]:
792
+ """
793
+ Retrieve relevant chunks as this end user.
794
+
795
+ Args:
796
+ query: Search query text
797
+ filters: Optional metadata filters
798
+ k: Number of results (default: 4)
799
+ min_score: Minimum similarity threshold (default: 0.0)
800
+ use_colpali: Whether to use ColPali-style embedding model
801
+
802
+ Returns:
803
+ List[FinalChunkResult]: List of relevant chunks
804
+ """
805
+ request = {
806
+ "query": query,
807
+ "filters": filters,
808
+ "k": k,
809
+ "min_score": min_score,
810
+ "use_colpali": use_colpali,
811
+ "end_user_id": self._end_user_id, # Add end user ID here
812
+ }
813
+
814
+ # Add folder name if scoped to a folder
815
+ if self._folder_name:
816
+ request["folder_name"] = self._folder_name
817
+
818
+ response = self._client._request("POST", "retrieve/chunks", request)
819
+ return self._client._logic._parse_chunk_result_list_response(response)
820
+
821
+ def retrieve_docs(
822
+ self,
823
+ query: str,
824
+ filters: Optional[Dict[str, Any]] = None,
825
+ k: int = 4,
826
+ min_score: float = 0.0,
827
+ use_colpali: bool = True,
828
+ ) -> List[DocumentResult]:
829
+ """
830
+ Retrieve relevant documents as this end user.
831
+
832
+ Args:
833
+ query: Search query text
834
+ filters: Optional metadata filters
835
+ k: Number of results (default: 4)
836
+ min_score: Minimum similarity threshold (default: 0.0)
837
+ use_colpali: Whether to use ColPali-style embedding model
838
+
839
+ Returns:
840
+ List[DocumentResult]: List of relevant documents
841
+ """
842
+ request = {
843
+ "query": query,
844
+ "filters": filters,
845
+ "k": k,
846
+ "min_score": min_score,
847
+ "use_colpali": use_colpali,
848
+ "end_user_id": self._end_user_id, # Add end user ID here
849
+ }
850
+
851
+ # Add folder name if scoped to a folder
852
+ if self._folder_name:
853
+ request["folder_name"] = self._folder_name
854
+
855
+ response = self._client._request("POST", "retrieve/docs", request)
856
+ return self._client._logic._parse_document_result_list_response(response)
857
+
858
+ def query(
859
+ self,
860
+ query: str,
861
+ filters: Optional[Dict[str, Any]] = None,
862
+ k: int = 4,
863
+ min_score: float = 0.0,
864
+ max_tokens: Optional[int] = None,
865
+ temperature: Optional[float] = None,
866
+ use_colpali: bool = True,
867
+ graph_name: Optional[str] = None,
868
+ hop_depth: int = 1,
869
+ include_paths: bool = False,
870
+ prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
871
+ ) -> CompletionResponse:
872
+ """
873
+ Generate completion using relevant chunks as context as this end user.
874
+
875
+ Args:
876
+ query: Query text
877
+ filters: Optional metadata filters
878
+ k: Number of chunks to use as context (default: 4)
879
+ min_score: Minimum similarity threshold (default: 0.0)
880
+ max_tokens: Maximum tokens in completion
881
+ temperature: Model temperature
882
+ use_colpali: Whether to use ColPali-style embedding model
883
+ graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
884
+ hop_depth: Number of relationship hops to traverse in the graph (1-3)
885
+ include_paths: Whether to include relationship paths in the response
886
+ prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
887
+
888
+ Returns:
889
+ CompletionResponse: Generated completion
890
+ """
891
+ payload = self._client._logic._prepare_query_request(
892
+ query,
893
+ filters,
894
+ k,
895
+ min_score,
896
+ max_tokens,
897
+ temperature,
898
+ use_colpali,
899
+ graph_name,
900
+ hop_depth,
901
+ include_paths,
902
+ prompt_overrides,
903
+ self._folder_name,
904
+ self._end_user_id,
905
+ )
906
+ response = self._client._request("POST", "query", data=payload)
907
+ return self._client._logic._parse_completion_response(response)
908
+
909
+ def list_documents(
910
+ self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
911
+ ) -> List[Document]:
912
+ """
913
+ List accessible documents for this end user.
914
+
915
+ Args:
916
+ skip: Number of documents to skip
917
+ limit: Maximum number of documents to return
918
+ filters: Optional filters
919
+
920
+ Returns:
921
+ List[Document]: List of documents
922
+ """
923
+ # Add end_user_id and folder_name to params
924
+ params = {"skip": skip, "limit": limit, "end_user_id": self._end_user_id}
925
+
926
+ # Add folder name if scoped to a folder
927
+ if self._folder_name:
928
+ params["folder_name"] = self._folder_name
929
+
930
+ response = self._client._request("POST", f"documents", data=filters or {}, params=params)
931
+
932
+ docs = [self._logic._parse_document_response(doc) for doc in response]
933
+ for doc in docs:
934
+ doc._client = self._client
935
+ return docs
936
+
937
def batch_get_documents(self, document_ids: List[str]) -> "List[Document]":
    """
    Retrieve multiple documents by their IDs in a single batch operation for this end user.

    Args:
        document_ids: List of document IDs to retrieve

    Returns:
        List[Document]: List of document metadata for found documents, each
            with ``_client`` set to the client this scope wraps
    """
    request = {"document_ids": document_ids, "end_user_id": self._end_user_id}

    # Only narrow to a folder when this scope carries a folder name
    if self._folder_name:
        request["folder_name"] = self._folder_name

    response = self._client._request("POST", "batch/documents", data=request)

    # Parsing helpers are reached through the client, like every other method
    # of this scope; the scope is not relied upon to own a `_logic` attribute.
    logic = self._client._logic
    docs = [logic._parse_document_response(doc) for doc in response]
    for doc in docs:
        doc._client = self._client
    return docs
958
+
959
def batch_get_chunks(
    self, sources: List[Union[ChunkSource, Dict[str, Any]]]
) -> List[FinalChunkResult]:
    """
    Fetch specific chunks, addressed by document ID and chunk number, in one batch call for this end user.

    Args:
        sources: ChunkSource objects or plain dictionaries carrying document_id and chunk_number

    Returns:
        List[FinalChunkResult]: The chunk results as parsed by the client's logic helper
    """
    # Dictionaries are forwarded untouched; anything else is dumped to a dict
    payload = {
        "sources": [
            entry if isinstance(entry, dict) else entry.model_dump() for entry in sources
        ],
        "end_user_id": self._end_user_id,
    }

    # Folder scoping is optional and only sent when present
    if self._folder_name:
        payload["folder_name"] = self._folder_name

    raw_chunks = self._client._request("POST", "batch/chunks", data=payload)
    return self._client._logic._parse_chunk_result_list_response(raw_chunks)
988
+
989
def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Build a new graph from documents on behalf of this end user.

    Args:
        name: Name to give the new graph
        filters: Optional metadata filters selecting which documents to include
        documents: Optional explicit list of document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution
            prompts, as a GraphPromptOverrides instance or an equivalent dictionary

    Returns:
        Graph: The created graph object
    """
    # A GraphPromptOverrides instance is dumped to a dict with None values excluded;
    # a dictionary (or None) is passed through as given.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = {
        "name": name,
        "filters": filters,
        "documents": documents,
        "prompt_overrides": overrides,
        "end_user_id": self._end_user_id,
    }

    # Folder scoping is optional and only sent when present
    if self._folder_name:
        payload["folder_name"] = self._folder_name

    raw_graph = self._client._request("POST", "graph/create", payload)
    return self._client._logic._parse_graph_response(raw_graph)
53
1026
 
54
- def query(
55
- self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
56
- ) -> CompletionResponse:
57
- response = self._db._request(
58
- "POST",
59
- f"cache/{self._name}/query",
60
- params={"query": query, "max_tokens": max_tokens, "temperature": temperature},
61
- data="",
62
- )
63
- return CompletionResponse(**response)
1027
def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Extend an existing graph with new documents on behalf of this end user.

    Args:
        name: Name of the graph to update
        additional_filters: Optional extra metadata filters selecting new documents to include
        additional_documents: Optional extra document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution
            prompts, as a GraphPromptOverrides instance or an equivalent dictionary

    Returns:
        Graph: The updated graph
    """
    # A GraphPromptOverrides instance is dumped to a dict with None values excluded;
    # a dictionary (or None) is passed through as given.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = {
        "additional_filters": additional_filters,
        "additional_documents": additional_documents,
        "prompt_overrides": overrides,
        "end_user_id": self._end_user_id,
    }

    # Folder scoping is optional and only sent when present
    if self._folder_name:
        payload["folder_name"] = self._folder_name

    raw_graph = self._client._request("POST", f"graph/{name}/update", payload)
    return self._client._logic._parse_graph_response(raw_graph)
1063
+
1064
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Remove the document with the given filename on behalf of this end user.

    Works in two steps: the document is first looked up by filename (scoped
    by end user and, when set, folder), then deleted through the client
    using the external ID of the document that was found.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status as returned by the client's delete_document
    """
    lookup_params = {"end_user_id": self._end_user_id}

    # Folder scoping is optional and only sent when present
    if self._folder_name:
        lookup_params["folder_name"] = self._folder_name

    # Step 1: resolve the filename to a document
    found = self._client._request("GET", f"documents/filename/{filename}", params=lookup_params)
    document = self._client._logic._parse_document_response(found)

    # Step 2: delete it by its external ID
    return self._client.delete_document(document.external_id)
78
1087
 
79
1088
 
80
1089
  class Morphik:
@@ -98,33 +1107,8 @@ class Morphik:
98
1107
  """
99
1108
 
100
1109
  def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
101
- self._timeout = timeout
102
- self._session = requests.Session()
103
- if is_local:
104
- self._session.verify = False # Disable SSL for localhost
105
- self._is_local = is_local
106
-
107
- if uri:
108
- self._setup_auth(uri)
109
- else:
110
- self._base_url = "http://localhost:8000"
111
- self._auth_token = None
112
-
113
- def _setup_auth(self, uri: str) -> None:
114
- """Setup authentication from URI"""
115
- parsed = urlparse(uri)
116
- if not parsed.netloc:
117
- raise ValueError("Invalid URI format")
118
-
119
- # Split host and auth parts
120
- auth, host = parsed.netloc.split("@")
121
- _, self._auth_token = auth.split(":")
122
-
123
- # Set base URL
124
- self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
125
-
126
- # Basic token validation
127
- jwt.decode(self._auth_token, options={"verify_signature": False})
1110
+ self._logic = _MorphikClientLogic(uri, timeout, is_local)
1111
+ self._client = httpx.Client(timeout=self._logic._timeout, verify=not self._logic._is_local)
128
1112
 
129
1113
  def _request(
130
1114
  self,
@@ -135,25 +1119,25 @@ class Morphik:
135
1119
  params: Optional[Dict[str, Any]] = None,
136
1120
  ) -> Dict[str, Any]:
137
1121
  """Make HTTP request"""
138
- headers = {}
139
- if self._auth_token: # Only add auth header if we have a token
140
- headers["Authorization"] = f"Bearer {self._auth_token}"
1122
+ url = self._logic._get_url(endpoint)
1123
+ headers = self._logic._get_headers()
1124
+ if self._logic._auth_token: # Only add auth header if we have a token
1125
+ headers["Authorization"] = f"Bearer {self._logic._auth_token}"
141
1126
 
142
1127
  # Configure request data based on type
143
1128
  if files:
144
1129
  # Multipart form data for files
145
1130
  request_data = {"files": files, "data": data}
146
- # Don't set Content-Type, let requests handle it
1131
+ # Don't set Content-Type, let httpx handle it
147
1132
  else:
148
1133
  # JSON for everything else
149
1134
  headers["Content-Type"] = "application/json"
150
1135
  request_data = {"json": data}
151
1136
 
152
- response = self._session.request(
1137
+ response = self._client.request(
153
1138
  method,
154
- f"{self._base_url}/{endpoint.lstrip('/')}",
1139
+ url,
155
1140
  headers=headers,
156
- timeout=self._timeout,
157
1141
  params=params,
158
1142
  **request_data,
159
1143
  )
@@ -162,9 +1146,43 @@ class Morphik:
162
1146
 
163
1147
  def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
164
1148
  """Convert a rule to a dictionary format"""
165
- if hasattr(rule, "to_dict"):
166
- return rule.to_dict()
167
- return rule
1149
+ return self._logic._convert_rule(rule)
1150
+
1151
def create_folder(self, name: str) -> Folder:
    """
    Build a folder object, bound to this client, for scoping operations.

    This method itself only constructs the Folder from this client and the
    given name.

    Args:
        name: The name of the folder

    Returns:
        Folder: A folder object for scoped operations
    """
    folder = Folder(self, name)
    return folder
1162
+
1163
def get_folder(self, name: str) -> Folder:
    """
    Obtain a folder object by name, bound to this client, for scoping operations.

    This method itself only constructs the Folder from this client and the
    given name.

    Args:
        name: The name of the folder

    Returns:
        Folder: A folder object for scoped operations
    """
    folder = Folder(self, name)
    return folder
1174
+
1175
def signin(self, end_user_id: str) -> UserScope:
    """
    Obtain a user scope object, bound to this client, for acting as an end user.

    This method itself only constructs the UserScope from this client and
    the given end user ID.

    Args:
        end_user_id: The ID of the end user

    Returns:
        UserScope: A user scope object for scoped operations
    """
    scope = UserScope(self, end_user_id)
    return scope
168
1186
 
169
1187
  def ingest_text(
170
1188
  self,
@@ -209,15 +1227,12 @@ class Morphik:
209
1227
  )
210
1228
  ```
211
1229
  """
212
- request = IngestTextRequest(
213
- content=content,
214
- filename=filename,
215
- metadata=metadata or {},
216
- rules=[self._convert_rule(r) for r in (rules or [])],
217
- use_colpali=use_colpali,
1230
+ rules_list = [self._convert_rule(r) for r in (rules or [])]
1231
+ payload = self._logic._prepare_ingest_text_request(
1232
+ content, filename, metadata, rules_list, use_colpali, None, None
218
1233
  )
219
- response = self._request("POST", "ingest/text", data=request.model_dump())
220
- doc = Document(**response)
1234
+ response = self._request("POST", "ingest/text", data=payload)
1235
+ doc = self._logic._parse_document_response(response)
221
1236
  doc._client = self
222
1237
  return doc
223
1238
 
@@ -266,38 +1281,23 @@ class Morphik:
266
1281
  )
267
1282
  ```
268
1283
  """
269
- # Handle different file input types
270
- if isinstance(file, (str, Path)):
271
- file_path = Path(file)
272
- if not file_path.exists():
273
- raise ValueError(f"File not found: {file}")
274
- filename = file_path.name if filename is None else filename
275
- with open(file_path, "rb") as f:
276
- content = f.read()
277
- file_obj = BytesIO(content)
278
- elif isinstance(file, bytes):
279
- if filename is None:
280
- raise ValueError("filename is required when ingesting bytes")
281
- file_obj = BytesIO(file)
282
- else:
283
- if filename is None:
284
- raise ValueError("filename is required when ingesting file object")
285
- file_obj = file
1284
+ # Process file input
1285
+ file_obj, filename = self._logic._prepare_file_for_upload(file, filename)
286
1286
 
287
1287
  try:
288
1288
  # Prepare multipart form data
289
1289
  files = {"file": (filename, file_obj)}
290
1290
 
291
- # Add metadata and rules
292
- form_data = {
293
- "metadata": json.dumps(metadata or {}),
294
- "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
295
- }
1291
+ # Create form data
1292
+ form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)
296
1293
 
297
1294
  response = self._request(
298
- "POST", f"ingest/file?use_colpali={str(use_colpali).lower()}", data=form_data, files=files
1295
+ "POST",
1296
+ f"ingest/file?use_colpali={str(use_colpali).lower()}",
1297
+ data=form_data,
1298
+ files=files,
299
1299
  )
300
- doc = Document(**response)
1300
+ doc = self._logic._parse_document_response(response)
301
1301
  doc._client = self
302
1302
  return doc
303
1303
  finally:
@@ -330,44 +1330,22 @@ class Morphik:
330
1330
  ValueError: If metadata list length doesn't match files length
331
1331
  """
332
1332
  # Convert files to format expected by API
333
- file_objects = []
334
- for file in files:
335
- if isinstance(file, (str, Path)):
336
- path = Path(file)
337
- file_objects.append(("files", (path.name, open(path, "rb"))))
338
- elif isinstance(file, bytes):
339
- file_objects.append(("files", ("file.bin", file)))
340
- else:
341
- file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
1333
+ file_objects = self._logic._prepare_files_for_upload(files)
342
1334
 
343
1335
  try:
344
- # Prepare request data
345
- # Convert rules appropriately based on whether it's a flat list or list of lists
346
- if rules:
347
- if all(isinstance(r, list) for r in rules):
348
- # List of lists - per-file rules
349
- converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
350
- else:
351
- # Flat list - shared rules for all files
352
- converted_rules = [self._convert_rule(r) for r in rules]
353
- else:
354
- converted_rules = []
355
-
356
- data = {
357
- "metadata": json.dumps(metadata or {}),
358
- "rules": json.dumps(converted_rules),
359
- "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
360
- "parallel": str(parallel).lower(),
361
- }
1336
+ # Prepare form data
1337
+ data = self._logic._prepare_ingest_files_form_data(
1338
+ metadata, rules, use_colpali, parallel, None, None
1339
+ )
362
1340
 
363
1341
  response = self._request("POST", "ingest/files", data=data, files=file_objects)
364
-
1342
+
365
1343
  if response.get("errors"):
366
1344
  # Log errors but don't raise exception
367
1345
  for error in response["errors"]:
368
1346
  logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
369
-
370
- docs = [Document(**doc) for doc in response["documents"]]
1347
+
1348
+ docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
371
1349
  for doc in docs:
372
1350
  doc._client = self
373
1351
  return docs
@@ -417,17 +1395,13 @@ class Morphik:
417
1395
 
418
1396
  # Filter out directories
419
1397
  files = [f for f in files if f.is_file()]
420
-
1398
+
421
1399
  if not files:
422
1400
  return []
423
1401
 
424
1402
  # Use ingest_files with collected paths
425
1403
  return self.ingest_files(
426
- files=files,
427
- metadata=metadata,
428
- rules=rules,
429
- use_colpali=use_colpali,
430
- parallel=parallel
1404
+ files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
431
1405
  )
432
1406
 
433
1407
  def retrieve_chunks(
@@ -458,52 +1432,11 @@ class Morphik:
458
1432
  )
459
1433
  ```
460
1434
  """
461
- request = {
462
- "query": query,
463
- "filters": filters,
464
- "k": k,
465
- "min_score": min_score,
466
- "use_colpali": use_colpali,
467
- }
468
-
469
- response = self._request("POST", "retrieve/chunks", request)
470
- chunks = [ChunkResult(**r) for r in response]
471
-
472
- final_chunks = []
473
-
474
- for chunk in chunks:
475
- if chunk.metadata.get("is_image"):
476
- try:
477
- # Handle data URI format "data:image/png;base64,..."
478
- content = chunk.content
479
- if content.startswith("data:"):
480
- # Extract the base64 part after the comma
481
- content = content.split(",", 1)[1]
482
-
483
- # Now decode the base64 string
484
- image_bytes = base64.b64decode(content)
485
- content = Image.open(io.BytesIO(image_bytes))
486
- except Exception as e:
487
- print(f"Error processing image: {str(e)}")
488
- # Fall back to using the content as text
489
- print(chunk.content)
490
- else:
491
- content = chunk.content
492
-
493
- final_chunks.append(
494
- FinalChunkResult(
495
- content=content,
496
- score=chunk.score,
497
- document_id=chunk.document_id,
498
- chunk_number=chunk.chunk_number,
499
- metadata=chunk.metadata,
500
- content_type=chunk.content_type,
501
- filename=chunk.filename,
502
- download_url=chunk.download_url,
503
- )
504
- )
505
-
506
- return final_chunks
1435
+ payload = self._logic._prepare_retrieve_chunks_request(
1436
+ query, filters, k, min_score, use_colpali, None, None
1437
+ )
1438
+ response = self._request("POST", "retrieve/chunks", data=payload)
1439
+ return self._logic._parse_chunk_result_list_response(response)
507
1440
 
508
1441
  def retrieve_docs(
509
1442
  self,
@@ -533,16 +1466,11 @@ class Morphik:
533
1466
  )
534
1467
  ```
535
1468
  """
536
- request = {
537
- "query": query,
538
- "filters": filters,
539
- "k": k,
540
- "min_score": min_score,
541
- "use_colpali": use_colpali,
542
- }
543
-
544
- response = self._request("POST", "retrieve/docs", request)
545
- return [DocumentResult(**r) for r in response]
1469
+ payload = self._logic._prepare_retrieve_docs_request(
1470
+ query, filters, k, min_score, use_colpali, None, None
1471
+ )
1472
+ response = self._request("POST", "retrieve/docs", data=payload)
1473
+ return self._logic._parse_document_result_list_response(response)
546
1474
 
547
1475
  def query(
548
1476
  self,
@@ -585,7 +1513,7 @@ class Morphik:
585
1513
  filters={"department": "research"},
586
1514
  temperature=0.7
587
1515
  )
588
-
1516
+
589
1517
  # Knowledge graph enhanced query
590
1518
  response = db.query(
591
1519
  "How does product X relate to customer segment Y?",
@@ -593,7 +1521,7 @@ class Morphik:
593
1521
  hop_depth=2,
594
1522
  include_paths=True
595
1523
  )
596
-
1524
+
597
1525
  # With prompt customization
598
1526
  from morphik.models import QueryPromptOverride, QueryPromptOverrides
599
1527
  response = db.query(
@@ -604,7 +1532,7 @@ class Morphik:
604
1532
  )
605
1533
  )
606
1534
  )
607
-
1535
+
608
1536
  # Or using a dictionary
609
1537
  response = db.query(
610
1538
  "What are the key findings?",
@@ -614,35 +1542,32 @@ class Morphik:
614
1542
  }
615
1543
  }
616
1544
  )
617
-
1545
+
618
1546
  print(response.completion)
619
-
1547
+
620
1548
  # If include_paths=True, you can inspect the graph paths
621
1549
  if response.metadata and "graph" in response.metadata:
622
1550
  for path in response.metadata["graph"]["paths"]:
623
1551
  print(" -> ".join(path))
624
1552
  ```
625
1553
  """
626
- # Convert prompt_overrides to dict if it's a model
627
- if prompt_overrides and isinstance(prompt_overrides, QueryPromptOverrides):
628
- prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
629
-
630
- request = {
631
- "query": query,
632
- "filters": filters,
633
- "k": k,
634
- "min_score": min_score,
635
- "max_tokens": max_tokens,
636
- "temperature": temperature,
637
- "use_colpali": use_colpali,
638
- "graph_name": graph_name,
639
- "hop_depth": hop_depth,
640
- "include_paths": include_paths,
641
- "prompt_overrides": prompt_overrides,
642
- }
643
-
644
- response = self._request("POST", "query", request)
645
- return CompletionResponse(**response)
1554
+ payload = self._logic._prepare_query_request(
1555
+ query,
1556
+ filters,
1557
+ k,
1558
+ min_score,
1559
+ max_tokens,
1560
+ temperature,
1561
+ use_colpali,
1562
+ graph_name,
1563
+ hop_depth,
1564
+ include_paths,
1565
+ prompt_overrides,
1566
+ None,
1567
+ None,
1568
+ )
1569
+ response = self._request("POST", "query", data=payload)
1570
+ return self._logic._parse_completion_response(response)
646
1571
 
647
1572
  def list_documents(
648
1573
  self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
@@ -667,9 +1592,9 @@ class Morphik:
667
1592
  next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
668
1593
  ```
669
1594
  """
670
- # Use query params for pagination and POST body for filters
671
- response = self._request("POST", f"documents?skip={skip}&limit={limit}", data=filters or {})
672
- docs = [Document(**doc) for doc in response]
1595
+ params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
1596
+ response = self._request("POST", "documents", data=data, params=params)
1597
+ docs = self._logic._parse_document_list_response(response)
673
1598
  for doc in docs:
674
1599
  doc._client = self
675
1600
  return docs
@@ -691,10 +1616,10 @@ class Morphik:
691
1616
  ```
692
1617
  """
693
1618
  response = self._request("GET", f"documents/{document_id}")
694
- doc = Document(**response)
1619
+ doc = self._logic._parse_document_response(response)
695
1620
  doc._client = self
696
1621
  return doc
697
-
1622
+
698
1623
  def get_document_by_filename(self, filename: str) -> Document:
699
1624
  """
700
1625
  Get document metadata by filename.
@@ -713,10 +1638,10 @@ class Morphik:
713
1638
  ```
714
1639
  """
715
1640
  response = self._request("GET", f"documents/filename/{filename}")
716
- doc = Document(**response)
1641
+ doc = self._logic._parse_document_response(response)
717
1642
  doc._client = self
718
1643
  return doc
719
-
1644
+
720
1645
  def update_document_with_text(
721
1646
  self,
722
1647
  document_id: str,
@@ -763,19 +1688,16 @@ class Morphik:
763
1688
  rules=[self._convert_rule(r) for r in (rules or [])],
764
1689
  use_colpali=use_colpali if use_colpali is not None else True,
765
1690
  )
766
-
1691
+
767
1692
  params = {}
768
1693
  if update_strategy != "add":
769
1694
  params["update_strategy"] = update_strategy
770
-
1695
+
771
1696
  response = self._request(
772
- "POST",
773
- f"documents/{document_id}/update_text",
774
- data=request.model_dump(),
775
- params=params
1697
+ "POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params
776
1698
  )
777
-
778
- doc = Document(**response)
1699
+
1700
+ doc = self._logic._parse_document_response(response)
779
1701
  doc._client = self
780
1702
  return doc
781
1703
 
@@ -833,34 +1755,34 @@ class Morphik:
833
1755
  if filename is None:
834
1756
  raise ValueError("filename is required when updating with file object")
835
1757
  file_obj = file
836
-
1758
+
837
1759
  try:
838
1760
  # Prepare multipart form data
839
1761
  files = {"file": (filename, file_obj)}
840
-
1762
+
841
1763
  # Convert metadata and rules to JSON strings
842
1764
  form_data = {
843
1765
  "metadata": json.dumps(metadata or {}),
844
1766
  "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
845
1767
  "update_strategy": update_strategy,
846
1768
  }
847
-
1769
+
848
1770
  if use_colpali is not None:
849
1771
  form_data["use_colpali"] = str(use_colpali).lower()
850
-
1772
+
851
1773
  # Use the dedicated file update endpoint
852
1774
  response = self._request(
853
1775
  "POST", f"documents/{document_id}/update_file", data=form_data, files=files
854
1776
  )
855
-
856
- doc = Document(**response)
1777
+
1778
+ doc = self._logic._parse_document_response(response)
857
1779
  doc._client = self
858
1780
  return doc
859
1781
  finally:
860
1782
  # Close file if we opened it
861
1783
  if isinstance(file, (str, Path)):
862
1784
  file_obj.close()
863
-
1785
+
864
1786
  def update_document_metadata(
865
1787
  self,
866
1788
  document_id: str,
@@ -868,14 +1790,14 @@ class Morphik:
868
1790
  ) -> Document:
869
1791
  """
870
1792
  Update a document's metadata only.
871
-
1793
+
872
1794
  Args:
873
1795
  document_id: ID of the document to update
874
1796
  metadata: Metadata to update
875
-
1797
+
876
1798
  Returns:
877
1799
  Document: Updated document metadata
878
-
1800
+
879
1801
  Example:
880
1802
  ```python
881
1803
  # Update just the metadata of a document
@@ -888,10 +1810,10 @@ class Morphik:
888
1810
  """
889
1811
  # Use the dedicated metadata update endpoint
890
1812
  response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
891
- doc = Document(**response)
1813
+ doc = self._logic._parse_document_response(response)
892
1814
  doc._client = self
893
1815
  return doc
894
-
1816
+
895
1817
  def update_document_by_filename_with_text(
896
1818
  self,
897
1819
  filename: str,
@@ -932,7 +1854,7 @@ class Morphik:
932
1854
  """
933
1855
  # First get the document by filename to obtain its ID
934
1856
  doc = self.get_document_by_filename(filename)
935
-
1857
+
936
1858
  # Then use the regular update_document_with_text endpoint with the document ID
937
1859
  return self.update_document_with_text(
938
1860
  document_id=doc.external_id,
@@ -941,9 +1863,9 @@ class Morphik:
941
1863
  metadata=metadata,
942
1864
  rules=rules,
943
1865
  update_strategy=update_strategy,
944
- use_colpali=use_colpali
1866
+ use_colpali=use_colpali,
945
1867
  )
946
-
1868
+
947
1869
  def update_document_by_filename_with_file(
948
1870
  self,
949
1871
  filename: str,
@@ -983,7 +1905,7 @@ class Morphik:
983
1905
  """
984
1906
  # First get the document by filename to obtain its ID
985
1907
  doc = self.get_document_by_filename(filename)
986
-
1908
+
987
1909
  # Then use the regular update_document_with_file endpoint with the document ID
988
1910
  return self.update_document_with_file(
989
1911
  document_id=doc.external_id,
@@ -992,9 +1914,9 @@ class Morphik:
992
1914
  metadata=metadata,
993
1915
  rules=rules,
994
1916
  update_strategy=update_strategy,
995
- use_colpali=use_colpali
1917
+ use_colpali=use_colpali,
996
1918
  )
997
-
1919
+
998
1920
  def update_document_by_filename_metadata(
999
1921
  self,
1000
1922
  filename: str,
@@ -1003,15 +1925,15 @@ class Morphik:
1003
1925
  ) -> Document:
1004
1926
  """
1005
1927
  Update a document's metadata using filename to identify the document.
1006
-
1928
+
1007
1929
  Args:
1008
1930
  filename: Filename of the document to update
1009
1931
  metadata: Metadata to update
1010
1932
  new_filename: Optional new filename to assign to the document
1011
-
1933
+
1012
1934
  Returns:
1013
1935
  Document: Updated document metadata
1014
-
1936
+
1015
1937
  Example:
1016
1938
  ```python
1017
1939
  # Update just the metadata of a document identified by filename
@@ -1025,44 +1947,44 @@ class Morphik:
1025
1947
  """
1026
1948
  # First get the document by filename to obtain its ID
1027
1949
  doc = self.get_document_by_filename(filename)
1028
-
1950
+
1029
1951
  # Update the metadata
1030
1952
  result = self.update_document_metadata(
1031
1953
  document_id=doc.external_id,
1032
1954
  metadata=metadata,
1033
1955
  )
1034
-
1956
+
1035
1957
  # If new_filename is provided, update the filename as well
1036
1958
  if new_filename:
1037
1959
  # Create a request that retains the just-updated metadata but also changes filename
1038
1960
  combined_metadata = result.metadata.copy()
1039
-
1961
+
1040
1962
  # Update the document again with filename change and the same metadata
1041
1963
  response = self._request(
1042
- "POST",
1043
- f"documents/{doc.external_id}/update_text",
1964
+ "POST",
1965
+ f"documents/{doc.external_id}/update_text",
1044
1966
  data={
1045
- "content": "",
1967
+ "content": "",
1046
1968
  "filename": new_filename,
1047
1969
  "metadata": combined_metadata,
1048
- "rules": []
1049
- }
1970
+ "rules": [],
1971
+ },
1050
1972
  )
1051
- result = Document(**response)
1973
+ result = self._logic._parse_document_response(response)
1052
1974
  result._client = self
1053
-
1975
+
1054
1976
  return result
1055
-
1977
+
1056
1978
  def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
1057
1979
  """
1058
1980
  Retrieve multiple documents by their IDs in a single batch operation.
1059
-
1981
+
1060
1982
  Args:
1061
1983
  document_ids: List of document IDs to retrieve
1062
-
1984
+
1063
1985
  Returns:
1064
1986
  List[Document]: List of document metadata for found documents
1065
-
1987
+
1066
1988
  Example:
1067
1989
  ```python
1068
1990
  docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
@@ -1071,21 +1993,23 @@ class Morphik:
1071
1993
  ```
1072
1994
  """
1073
1995
  response = self._request("POST", "batch/documents", data=document_ids)
1074
- docs = [Document(**doc) for doc in response]
1996
+ docs = self._logic._parse_document_list_response(response)
1075
1997
  for doc in docs:
1076
1998
  doc._client = self
1077
1999
  return docs
1078
-
1079
- def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
2000
+
2001
+ def batch_get_chunks(
2002
+ self, sources: List[Union[ChunkSource, Dict[str, Any]]]
2003
+ ) -> List[FinalChunkResult]:
1080
2004
  """
1081
2005
  Retrieve specific chunks by their document ID and chunk number in a single batch operation.
1082
-
2006
+
1083
2007
  Args:
1084
2008
  sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
1085
-
2009
+
1086
2010
  Returns:
1087
2011
  List[FinalChunkResult]: List of chunk results
1088
-
2012
+
1089
2013
  Example:
1090
2014
  ```python
1091
2015
  # Using dictionaries
@@ -1093,14 +2017,14 @@ class Morphik:
1093
2017
  {"document_id": "doc_123", "chunk_number": 0},
1094
2018
  {"document_id": "doc_456", "chunk_number": 2}
1095
2019
  ]
1096
-
2020
+
1097
2021
  # Or using ChunkSource objects
1098
2022
  from morphik.models import ChunkSource
1099
2023
  sources = [
1100
2024
  ChunkSource(document_id="doc_123", chunk_number=0),
1101
2025
  ChunkSource(document_id="doc_456", chunk_number=2)
1102
2026
  ]
1103
-
2027
+
1104
2028
  chunks = db.batch_get_chunks(sources)
1105
2029
  for chunk in chunks:
1106
2030
  print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
@@ -1113,44 +2037,9 @@ class Morphik:
1113
2037
  source_dicts.append(source)
1114
2038
  else:
1115
2039
  source_dicts.append(source.model_dump())
1116
-
2040
+
1117
2041
  response = self._request("POST", "batch/chunks", data=source_dicts)
1118
- chunks = [ChunkResult(**r) for r in response]
1119
-
1120
- final_chunks = []
1121
- for chunk in chunks:
1122
- if chunk.metadata.get("is_image"):
1123
- try:
1124
- # Handle data URI format "data:image/png;base64,..."
1125
- content = chunk.content
1126
- if content.startswith("data:"):
1127
- # Extract the base64 part after the comma
1128
- content = content.split(",", 1)[1]
1129
-
1130
- # Now decode the base64 string
1131
- image_bytes = base64.b64decode(content)
1132
- content = Image.open(io.BytesIO(image_bytes))
1133
- except Exception as e:
1134
- print(f"Error processing image: {str(e)}")
1135
- # Fall back to using the content as text
1136
- content = chunk.content
1137
- else:
1138
- content = chunk.content
1139
-
1140
- final_chunks.append(
1141
- FinalChunkResult(
1142
- content=content,
1143
- score=chunk.score,
1144
- document_id=chunk.document_id,
1145
- chunk_number=chunk.chunk_number,
1146
- metadata=chunk.metadata,
1147
- content_type=chunk.content_type,
1148
- filename=chunk.filename,
1149
- download_url=chunk.download_url,
1150
- )
1151
- )
1152
-
1153
- return final_chunks
2042
+ return self._logic._parse_chunk_result_list_response(response)
1154
2043
 
1155
2044
  def create_cache(
1156
2045
  self,
@@ -1252,11 +2141,11 @@ class Morphik:
1252
2141
  name="custom_graph",
1253
2142
  documents=["doc1", "doc2", "doc3"]
1254
2143
  )
1255
-
2144
+
1256
2145
  # With custom entity extraction examples
1257
2146
  from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
1258
2147
  graph = db.create_graph(
1259
- name="medical_graph",
2148
+ name="medical_graph",
1260
2149
  filters={"category": "medical"},
1261
2150
  prompt_overrides=GraphPromptOverrides(
1262
2151
  entity_extraction=EntityExtractionPromptOverride(
@@ -1272,7 +2161,7 @@ class Morphik:
1272
2161
  # Convert prompt_overrides to dict if it's a model
1273
2162
  if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
1274
2163
  prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
1275
-
2164
+
1276
2165
  request = {
1277
2166
  "name": name,
1278
2167
  "filters": filters,
@@ -1281,8 +2170,8 @@ class Morphik:
1281
2170
  }
1282
2171
 
1283
2172
  response = self._request("POST", "graph/create", request)
1284
- return Graph(**response)
1285
-
2173
+ return self._logic._parse_graph_response(response)
2174
+
1286
2175
  def get_graph(self, name: str) -> Graph:
1287
2176
  """
1288
2177
  Get a graph by name.
@@ -1301,7 +2190,7 @@ class Morphik:
1301
2190
  ```
1302
2191
  """
1303
2192
  response = self._request("GET", f"graph/{name}")
1304
- return Graph(**response)
2193
+ return self._logic._parse_graph_response(response)
1305
2194
 
1306
2195
  def list_graphs(self) -> List[Graph]:
1307
2196
  """
@@ -1319,8 +2208,8 @@ class Morphik:
1319
2208
  ```
1320
2209
  """
1321
2210
  response = self._request("GET", "graphs")
1322
- return [Graph(**graph) for graph in response]
1323
-
2211
+ return self._logic._parse_graph_list_response(response)
2212
+
1324
2213
  def update_graph(
1325
2214
  self,
1326
2215
  name: str,
@@ -1330,20 +2219,20 @@ class Morphik:
1330
2219
  ) -> Graph:
1331
2220
  """
1332
2221
  Update an existing graph with new documents.
1333
-
2222
+
1334
2223
  This method processes additional documents matching the original or new filters,
1335
2224
  extracts entities and relationships, and updates the graph with new information.
1336
-
2225
+
1337
2226
  Args:
1338
2227
  name: Name of the graph to update
1339
2228
  additional_filters: Optional additional metadata filters to determine which new documents to include
1340
2229
  additional_documents: Optional list of additional document IDs to include
1341
2230
  prompt_overrides: Optional customizations for entity extraction and resolution prompts
1342
2231
  Either a GraphPromptOverrides object or a dictionary with the same structure
1343
-
2232
+
1344
2233
  Returns:
1345
2234
  Graph: The updated graph
1346
-
2235
+
1347
2236
  Example:
1348
2237
  ```python
1349
2238
  # Update a graph with new documents
@@ -1353,7 +2242,7 @@ class Morphik:
1353
2242
  additional_documents=["doc4", "doc5"]
1354
2243
  )
1355
2244
  print(f"Graph now has {len(updated_graph.entities)} entities")
1356
-
2245
+
1357
2246
  # With entity resolution examples
1358
2247
  from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
1359
2248
  updated_graph = db.update_graph(
@@ -1363,7 +2252,7 @@ class Morphik:
1363
2252
  entity_resolution=EntityResolutionPromptOverride(
1364
2253
  examples=[
1365
2254
  EntityResolutionExample(
1366
- canonical="Machine Learning",
2255
+ canonical="Machine Learning",
1367
2256
  variants=["ML", "machine learning", "AI/ML"]
1368
2257
  )
1369
2258
  ]
@@ -1375,7 +2264,7 @@ class Morphik:
1375
2264
  # Convert prompt_overrides to dict if it's a model
1376
2265
  if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
1377
2266
  prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
1378
-
2267
+
1379
2268
  request = {
1380
2269
  "additional_filters": additional_filters,
1381
2270
  "additional_documents": additional_documents,
@@ -1383,23 +2272,23 @@ class Morphik:
1383
2272
  }
1384
2273
 
1385
2274
  response = self._request("POST", f"graph/{name}/update", request)
1386
- return Graph(**response)
1387
-
2275
+ return self._logic._parse_graph_response(response)
2276
+
1388
2277
  def delete_document(self, document_id: str) -> Dict[str, str]:
1389
2278
  """
1390
2279
  Delete a document and all its associated data.
1391
-
2280
+
1392
2281
  This method deletes a document and all its associated data, including:
1393
2282
  - Document metadata
1394
2283
  - Document content in storage
1395
2284
  - Document chunks and embeddings in vector store
1396
-
2285
+
1397
2286
  Args:
1398
2287
  document_id: ID of the document to delete
1399
-
2288
+
1400
2289
  Returns:
1401
2290
  Dict[str, str]: Deletion status
1402
-
2291
+
1403
2292
  Example:
1404
2293
  ```python
1405
2294
  # Delete a document
@@ -1409,20 +2298,20 @@ class Morphik:
1409
2298
  """
1410
2299
  response = self._request("DELETE", f"documents/{document_id}")
1411
2300
  return response
1412
-
2301
+
1413
2302
  def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
1414
2303
  """
1415
2304
  Delete a document by its filename.
1416
-
2305
+
1417
2306
  This is a convenience method that first retrieves the document ID by filename
1418
2307
  and then deletes the document by ID.
1419
-
2308
+
1420
2309
  Args:
1421
2310
  filename: Filename of the document to delete
1422
-
2311
+
1423
2312
  Returns:
1424
2313
  Dict[str, str]: Deletion status
1425
-
2314
+
1426
2315
  Example:
1427
2316
  ```python
1428
2317
  # Delete a document by filename
@@ -1432,13 +2321,13 @@ class Morphik:
1432
2321
  """
1433
2322
  # First get the document by filename to obtain its ID
1434
2323
  doc = self.get_document_by_filename(filename)
1435
-
2324
+
1436
2325
  # Then delete the document by ID
1437
2326
  return self.delete_document(doc.external_id)
1438
2327
 
1439
2328
  def close(self):
1440
- """Close the HTTP session"""
1441
- self._session.close()
2329
+ """Close the HTTP client"""
2330
+ self._client.close()
1442
2331
 
1443
2332
  def __enter__(self):
1444
2333
  return self