morphik 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +507 -0
- morphik/async_.py +1159 -381
- morphik/models.py +38 -25
- morphik/sync.py +1247 -358
- {morphik-0.1.0.dist-info → morphik-0.1.1.dist-info}/METADATA +1 -1
- morphik-0.1.1.dist-info/RECORD +10 -0
- morphik-0.1.0.dist-info/RECORD +0 -9
- {morphik-0.1.0.dist-info → morphik-0.1.1.dist-info}/WHEEL +0 -0
morphik/sync.py
CHANGED
@@ -7,74 +7,1083 @@ import json
|
|
7
7
|
import logging
|
8
8
|
from pathlib import Path
|
9
9
|
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
10
|
-
from urllib.parse import urlparse
|
11
10
|
|
12
|
-
import
|
13
|
-
from pydantic import BaseModel, Field
|
14
|
-
import requests
|
11
|
+
import httpx
|
15
12
|
|
16
13
|
from .models import (
|
17
|
-
Document,
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
IngestTextRequest,
|
14
|
+
Document,
|
15
|
+
DocumentResult,
|
16
|
+
CompletionResponse,
|
17
|
+
IngestTextRequest,
|
22
18
|
ChunkSource,
|
23
19
|
Graph,
|
24
20
|
# Prompt override models
|
25
|
-
EntityExtractionExample,
|
26
|
-
EntityResolutionExample,
|
27
|
-
EntityExtractionPromptOverride,
|
28
|
-
EntityResolutionPromptOverride,
|
29
|
-
QueryPromptOverride,
|
30
21
|
GraphPromptOverrides,
|
31
|
-
QueryPromptOverrides
|
22
|
+
QueryPromptOverrides,
|
32
23
|
)
|
33
24
|
from .rules import Rule
|
25
|
+
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
34
26
|
|
35
27
|
logger = logging.getLogger(__name__)
|
36
28
|
|
37
|
-
# Type alias for rules
|
38
|
-
RuleOrDict = Union[Rule, Dict[str, Any]]
|
39
29
|
|
30
|
+
class Cache:
    """
    Handle for a named cache, driven through a Morphik client.

    Every method issues one POST via the client's `_request` method against
    an endpoint under `cache/<name>/`.

    Args:
        db: The Morphik client instance used to send requests
        name: The name of the cache
    """

    def __init__(self, db: "Morphik", name: str):
        self._name = name
        self._db = db

    def update(self) -> bool:
        """
        POST to the cache's `update` endpoint.

        Returns:
            bool: The "success" field of the response, False when it is absent
        """
        endpoint = f"cache/{self._name}/update"
        result = self._db._request("POST", endpoint)
        return result.get("success", False)

    def add_docs(self, docs: List[str]) -> bool:
        """
        POST a list of documents to the cache's `add_docs` endpoint.

        Args:
            docs: Strings sent under the "docs" key (presumably document IDs;
                confirm against the server API)

        Returns:
            bool: The "success" field of the response, False when it is absent
        """
        endpoint = f"cache/{self._name}/add_docs"
        body = {"docs": docs}
        result = self._db._request("POST", endpoint, body)
        return result.get("success", False)

    def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """
        Run a query against the cache.

        The arguments travel as URL parameters; the request body is an empty string.

        Args:
            query: Query text
            max_tokens: Optional value forwarded as the `max_tokens` parameter
            temperature: Optional value forwarded as the `temperature` parameter

        Returns:
            CompletionResponse: Built from the fields of the response
        """
        query_params = {"query": query, "max_tokens": max_tokens, "temperature": temperature}
        result = self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params=query_params,
            data="",
        )
        return CompletionResponse(**result)
|
53
|
+
|
54
|
+
|
55
|
+
class Folder:
    """
    A folder that allows operations to be scoped to a specific folder.

    All requests are sent through the wrapped client's `_request` method, and
    request building / response parsing is delegated to the client's `_logic`
    helper object. The folder itself only stores the client and its own name.

    Args:
        client: The Morphik client instance
        name: The name of the folder
    """

    def __init__(self, client: "Morphik", name: str):
        self._client = client
        self._name = name

    @property
    def name(self) -> str:
        """Returns the folder name."""
        return self._name

    def signin(self, end_user_id: str) -> "UserScope":
        """
        Returns a UserScope object scoped to this folder and the end user.

        Args:
            end_user_id: The ID of the end user

        Returns:
            UserScope: A user scope scoped to this folder and the end user
        """
        return UserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name)

    def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a text document into Morphik within this folder.

        Args:
            content: Text content to ingest
            filename: Optional file name
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
        # Trailing arguments are the folder name and the end user ID (none at folder scope).
        payload = self._client._logic._prepare_ingest_text_request(
            content, filename, metadata, rules_list, use_colpali, self._name, None
        )
        response = self._client._request("POST", "ingest/text", data=payload)
        doc = self._client._logic._parse_document_response(response)
        # Attach the client to the returned document.
        doc._client = self._client
        return doc

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a file document into Morphik within this folder.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        # Process file input
        file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename)

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj)}

            # Create form data (folder name set, no end user at folder scope)
            form_data = self._client._logic._prepare_ingest_file_form_data(
                metadata, rules, self._name, None
            )

            response = self._client._request(
                "POST",
                f"ingest/file?use_colpali={str(use_colpali).lower()}",
                data=form_data,
                files=files,
            )
            doc = self._client._logic._parse_document_response(response)
            doc._client = self._client
            return doc
        finally:
            # Close file if we opened it
            if isinstance(file, (str, Path)):
                file_obj.close()

    def ingest_files(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest multiple files into Morphik within this folder.

        Per-file failures reported by the server are logged, not raised.

        Args:
            files: List of files to ingest
            metadata: Optional metadata
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        # Convert files to format expected by API
        file_objects = self._client._logic._prepare_files_for_upload(files)

        try:
            # Prepare form data
            data = self._client._logic._prepare_ingest_files_form_data(
                metadata, rules, use_colpali, parallel, self._name, None
            )

            response = self._client._request("POST", "ingest/files", data=data, files=file_objects)

            if response.get("errors"):
                # Log errors but don't raise exception
                for error in response["errors"]:
                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

            docs = [
                self._client._logic._parse_document_response(doc) for doc in response["documents"]
            ]
            for doc in docs:
                doc._client = self._client
            return docs
        finally:
            # Clean up file objects
            for _, (_, file_obj) in file_objects:
                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                    file_obj.close()

    def ingest_directory(
        self,
        directory: Union[str, Path],
        recursive: bool = False,
        pattern: str = "*",
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest all files in a directory into Morphik within this folder.

        Args:
            directory: Path to directory containing files to ingest
            recursive: Whether to recursively process subdirectories
            pattern: Optional glob pattern to filter files
            metadata: Optional metadata dictionary to apply to all files
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents

        Raises:
            ValueError: If `directory` is not an existing directory
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory}")

        # Collect all files matching pattern
        if recursive:
            files = list(directory.rglob(pattern))
        else:
            files = list(directory.glob(pattern))

        # Filter out directories
        files = [f for f in files if f.is_file()]

        if not files:
            return []

        # Use ingest_files with collected paths
        return self.ingest_files(
            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
        )

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[FinalChunkResult]:
        """
        Retrieve relevant chunks within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[FinalChunkResult]: List of relevant chunks
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", "retrieve/chunks", request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[DocumentResult]: List of relevant documents
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", "retrieve/docs", request)
        return self._client._logic._parse_document_result_list_response(response)

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context within this folder.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts

        Returns:
            CompletionResponse: Generated completion
        """
        # Trailing arguments are the folder name and the end user ID (none at folder scope).
        payload = self._client._logic._prepare_query_request(
            query,
            filters,
            k,
            min_score,
            max_tokens,
            temperature,
            use_colpali,
            graph_name,
            hop_depth,
            include_paths,
            prompt_overrides,
            self._name,
            None,
        )
        response = self._client._request("POST", "query", data=payload)
        return self._client._logic._parse_completion_response(response)

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents within this folder.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of documents
        """
        params, data = self._client._logic._prepare_list_documents_request(
            skip, limit, filters, self._name, None
        )
        response = self._client._request("POST", "documents", data=data, params=params)
        docs = self._client._logic._parse_document_list_response(response)
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation within this folder.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List[Document]: List of document metadata for found documents
        """
        request = {"document_ids": document_ids, "folder_name": self._name}

        response = self._client._request("POST", "batch/documents", data=request)
        # The parsing helpers live on the client's logic object; Folder has no
        # `_logic` attribute of its own.
        docs = [self._client._logic._parse_document_response(doc) for doc in response]
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_chunks(
        self, sources: List[Union[ChunkSource, Dict[str, Any]]]
    ) -> List[FinalChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.

        Args:
            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

        Returns:
            List[FinalChunkResult]: List of chunk results
        """
        # Convert to list of dictionaries if needed
        source_dicts = []
        for source in sources:
            if isinstance(source, dict):
                source_dicts.append(source)
            else:
                source_dicts.append(source.model_dump())

        # Add folder_name to request
        request = {"sources": source_dicts, "folder_name": self._name}

        response = self._client._request("POST", "batch/chunks", data=request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def create_graph(
        self,
        name: str,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Create a graph from documents within this folder.

        Args:
            name: Name of the graph to create
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The created graph object
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "name": name,
            "filters": filters,
            "documents": documents,
            "prompt_overrides": prompt_overrides,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", "graph/create", request)
        return self._client._logic._parse_graph_response(response)

    def update_graph(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Update an existing graph with new documents from this folder.

        Args:
            name: Name of the graph to update
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The updated graph
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "additional_filters": additional_filters,
            "additional_documents": additional_documents,
            "prompt_overrides": prompt_overrides,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", f"graph/{name}/update", request)
        return self._client._logic._parse_graph_response(response)

    def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
        """
        Delete a document by its filename within this folder.

        The document is first looked up by filename (scoped to this folder) to
        obtain its ID, then deleted through the client's `delete_document`.

        Args:
            filename: Filename of the document to delete

        Returns:
            Dict[str, str]: Deletion status
        """
        # First get the document ID, passing the folder scope as a query parameter
        response = self._client._request(
            "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
        )
        doc = self._client._logic._parse_document_response(response)

        # Then delete by ID
        return self._client.delete_document(doc.external_id)
|
529
|
+
|
530
|
+
|
531
|
+
class UserScope:
|
532
|
+
"""
|
533
|
+
A user scope that allows operations to be scoped to a specific end user and optionally a folder.
|
534
|
+
|
535
|
+
Args:
|
536
|
+
client: The Morphik client instance
|
537
|
+
end_user_id: The ID of the end user
|
538
|
+
folder_name: Optional folder name to further scope operations
|
539
|
+
"""
|
540
|
+
|
541
|
+
def __init__(self, client: "Morphik", end_user_id: str, folder_name: Optional[str] = None):
    """
    Store the client and the scoping identifiers.

    Args:
        client: The Morphik client instance
        end_user_id: The ID of the end user
        folder_name: Optional folder name to further scope operations
    """
    self._client, self._end_user_id, self._folder_name = client, end_user_id, folder_name
|
545
|
+
|
546
|
+
@property
def end_user_id(self) -> str:
    """The ID of the end user this scope was created for (read-only)."""
    user_id = self._end_user_id
    return user_id
|
550
|
+
|
551
|
+
@property
def folder_name(self) -> Optional[str]:
    """The folder name this scope is limited to, or None when no folder was given (read-only)."""
    scoped_folder = self._folder_name
    return scoped_folder
|
555
|
+
|
556
|
+
def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document into Morphik as this end user.

    The request carries this scope's end user ID and its folder name
    (which may be None).

    Args:
        content: Text content to ingest
        filename: Optional file name
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document
    """
    client = self._client

    # Normalise every rule through the client before building the payload.
    converted = []
    for rule in rules or []:
        converted.append(client._convert_rule(rule))

    request_body = client._logic._prepare_ingest_text_request(
        content, filename, metadata, converted, use_colpali, self._folder_name, self._end_user_id
    )
    raw = client._request("POST", "ingest/text", data=request_body)

    document = client._logic._parse_document_response(raw)
    # Attach the client to the returned document.
    document._client = client
    return document
|
591
|
+
|
592
|
+
def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a file document into Morphik as this end user.

    A path input is read fully into memory; bytes are wrapped in a buffer;
    any other object is uploaded as given.

    Args:
        file: File to ingest (path string, bytes, file object, or Path)
        filename: Name of the file; required for bytes and file objects,
            defaults to the path's final component for path inputs
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document

    Raises:
        ValueError: If a path does not exist, or filename is missing for
            bytes / file object input
    """
    opened_from_path = isinstance(file, (str, Path))

    # Normalise the input into (filename, file-like object).
    if opened_from_path:
        source = Path(file)
        if not source.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = source.name
        with open(source, "rb") as handle:
            file_obj = BytesIO(handle.read())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when ingesting bytes")
        file_obj = BytesIO(file)
    elif filename is None:
        raise ValueError("filename is required when ingesting file object")
    else:
        file_obj = file

    try:
        converted_rules = [self._client._convert_rule(r) for r in (rules or [])]

        # Form fields: metadata and rules travel as JSON strings.
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "end_user_id": self._end_user_id,
        }
        # Folder scoping is only sent when this scope has a folder.
        if self._folder_name:
            form_data["folder_name"] = self._folder_name

        raw = self._client._request(
            "POST",
            f"ingest/file?use_colpali={str(use_colpali).lower()}",
            data=form_data,
            files={"file": (filename, file_obj)},
        )
        document = self._client._logic._parse_document_response(raw)
        document._client = self._client
        return document
    finally:
        # Only the buffer created here for a path input is closed;
        # caller-supplied objects are left open.
        if opened_from_path:
            file_obj.close()
|
659
|
+
|
660
|
+
def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik as this end user.

    Path inputs are opened here; bytes are sent under the name "file.bin";
    other objects are sent under their `name` attribute (or "file.bin").
    Per-file failures reported by the server are logged, not raised.

    Args:
        files: List of files to ingest
        metadata: Optional metadata
        rules: Optional list of rules to apply; either a flat list shared by
            all files, or a list of lists with one rule list per file
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents
    """
    file_objects = []

    try:
        # Convert files to format expected by API. This happens inside the
        # try block so that handles already opened are released by the
        # finally clause if a later open() fails.
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        # Convert rules appropriately
        if rules:
            if all(isinstance(r, list) for r in rules):
                # List of lists - per-file rules
                converted_rules = [
                    [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
                ]
            else:
                # Flat list - shared rules for all files
                converted_rules = [self._client._convert_rule(r) for r in rules]
        else:
            converted_rules = []

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
            "end_user_id": self._end_user_id,  # Add end user ID here
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            data["folder_name"] = self._folder_name

        response = self._client._request("POST", "ingest/files", data=data, files=file_objects)

        if response.get("errors"):
            # Log errors but don't raise exception
            for error in response["errors"]:
                logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

        docs = [
            self._client._logic._parse_document_response(doc) for doc in response["documents"]
        ]
        for doc in docs:
            doc._client = self._client
        return docs
    finally:
        # Clean up file objects. Note this closes caller-supplied file
        # objects as well as the handles opened above; raw bytes are skipped.
        for _, (_, file_obj) in file_objects:
            if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                file_obj.close()
|
737
|
+
|
738
|
+
def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest all files in a directory into Morphik as this end user.

    Matching regular files are collected and handed to `ingest_files`;
    nothing is sent when no file matches.

    Args:
        directory: Path to directory containing files to ingest
        recursive: Whether to recursively process subdirectories
        pattern: Optional glob pattern to filter files
        metadata: Optional metadata dictionary to apply to all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents

    Raises:
        ValueError: If `directory` is not an existing directory
    """
    directory = Path(directory)
    if not directory.is_dir():
        raise ValueError(f"Directory not found: {directory}")

    # rglob descends into subdirectories, glob stays at the top level.
    find_matches = directory.rglob if recursive else directory.glob

    # Keep regular files only; the pattern may match directories too.
    files = [entry for entry in find_matches(pattern) if entry.is_file()]
    if not files:
        return []

    return self.ingest_files(
        files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
    )
|
783
|
+
|
784
|
+
def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Retrieve relevant chunks as this end user.

    The request always carries the end user ID; the folder name is added
    only when this scope has one.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[FinalChunkResult]: List of relevant chunks
    """
    # Scoping fields are appended after the search fields.
    scope_fields = {"end_user_id": self._end_user_id}
    if self._folder_name:
        scope_fields["folder_name"] = self._folder_name

    payload = {
        "query": query,
        "filters": filters,
        "k": k,
        "min_score": min_score,
        "use_colpali": use_colpali,
        **scope_fields,
    }

    raw = self._client._request("POST", "retrieve/chunks", payload)
    return self._client._logic._parse_chunk_result_list_response(raw)
|
820
|
+
|
821
|
+
def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """
    Search for the documents most relevant to a query on behalf of this end user.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[DocumentResult]: Documents parsed from the "retrieve/docs" response
    """
    client = self._client

    # The end user ID is always part of the payload
    payload = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
        end_user_id=self._end_user_id,
    )

    # The folder name is only sent when this scope is bound to a folder
    folder_name = self._folder_name
    if folder_name:
        payload["folder_name"] = folder_name

    raw = client._request("POST", "retrieve/docs", payload)
    return client._logic._parse_document_result_list_response(raw)
|
857
|
+
|
858
|
+
def query(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_colpali: bool = True,
    graph_name: Optional[str] = None,
    hop_depth: int = 1,
    include_paths: bool = False,
    prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
) -> CompletionResponse:
    """
    Generate a completion from relevant chunks, acting as this end user.

    Args:
        query: Query text
        filters: Optional metadata filters
        k: Number of chunks to use as context (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        max_tokens: Maximum tokens in completion
        temperature: Model temperature
        use_colpali: Whether to use ColPali-style embedding model
        graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
        hop_depth: Number of relationship hops to traverse in the graph (1-3)
        include_paths: Whether to include relationship paths in the response
        prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts

    Returns:
        CompletionResponse: Generated completion
    """
    client = self._client
    logic = client._logic

    # Positional order must match _prepare_query_request; the scope's folder
    # name and end user ID are the two trailing arguments.
    request_args = (
        query,
        filters,
        k,
        min_score,
        max_tokens,
        temperature,
        use_colpali,
        graph_name,
        hop_depth,
        include_paths,
        prompt_overrides,
        self._folder_name,
        self._end_user_id,
    )
    payload = logic._prepare_query_request(*request_args)
    raw = client._request("POST", "query", data=payload)
    return logic._parse_completion_response(raw)
|
908
|
+
|
909
|
+
def list_documents(
    self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
    """
    List accessible documents for this end user.

    Args:
        skip: Number of documents to skip
        limit: Maximum number of documents to return
        filters: Optional filters; sent as the request body (an empty dict when omitted)

    Returns:
        List[Document]: List of documents, each with its `_client` set to the
            underlying client
    """
    # Paging and end_user_id travel as query parameters
    params = {"skip": skip, "limit": limit, "end_user_id": self._end_user_id}

    # Add folder name if scoped to a folder
    if self._folder_name:
        params["folder_name"] = self._folder_name

    response = self._client._request("POST", "documents", data=filters or {}, params=params)

    # Parse via the client's logic helper, the same path every sibling method
    # in this scope uses; the scope object itself is not relied on for `_logic`.
    parse_document = self._client._logic._parse_document_response
    docs = [parse_document(doc) for doc in response]
    for doc in docs:
        doc._client = self._client
    return docs
|
936
|
+
|
937
|
+
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
    """
    Retrieve multiple documents by their IDs in a single batch operation for this end user.

    Args:
        document_ids: List of document IDs to retrieve

    Returns:
        List[Document]: List of document metadata for found documents, each
            with its `_client` set to the underlying client
    """
    request = {"document_ids": document_ids, "end_user_id": self._end_user_id}

    # Add folder name if scoped to a folder
    if self._folder_name:
        request["folder_name"] = self._folder_name

    response = self._client._request("POST", "batch/documents", data=request)

    # Parse via the client's logic helper, the same path every sibling method
    # in this scope uses; the scope object itself is not relied on for `_logic`.
    parse_document = self._client._logic._parse_document_response
    docs = [parse_document(doc) for doc in response]
    for doc in docs:
        doc._client = self._client
    return docs
|
958
|
+
|
959
|
+
def batch_get_chunks(
    self, sources: List[Union[ChunkSource, Dict[str, Any]]]
) -> List[FinalChunkResult]:
    """
    Fetch specific chunks, identified by document ID and chunk number, in one batch call for this end user.

    Args:
        sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

    Returns:
        List[FinalChunkResult]: List of chunk results
    """
    # Dictionaries pass through untouched; anything else is dumped to a dict
    serialized = [
        entry if isinstance(entry, dict) else entry.model_dump() for entry in sources
    ]

    payload = {"sources": serialized, "end_user_id": self._end_user_id}

    # The folder name is only sent when this scope is bound to a folder
    folder_name = self._folder_name
    if folder_name:
        payload["folder_name"] = folder_name

    client = self._client
    raw = client._request("POST", "batch/chunks", data=payload)
    return client._logic._parse_chunk_result_list_response(raw)
|
988
|
+
|
989
|
+
def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Build a graph from documents on behalf of this end user.

    Args:
        name: Name of the graph to create
        filters: Optional metadata filters to determine which documents to include
        documents: Optional list of specific document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts

    Returns:
        Graph: The created graph object
    """
    # A GraphPromptOverrides model is dumped to a plain dict (None fields
    # excluded); dictionaries and empty values are forwarded as given.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = dict(
        name=name,
        filters=filters,
        documents=documents,
        prompt_overrides=overrides,
        end_user_id=self._end_user_id,
    )

    # The folder name is only sent when this scope is bound to a folder
    folder_name = self._folder_name
    if folder_name:
        payload["folder_name"] = folder_name

    client = self._client
    raw = client._request("POST", "graph/create", payload)
    return client._logic._parse_graph_response(raw)
|
53
1026
|
|
54
|
-
def
|
55
|
-
self,
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
1027
|
+
def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Extend an existing graph with new documents on behalf of this end user.

    Args:
        name: Name of the graph to update
        additional_filters: Optional additional metadata filters to determine which new documents to include
        additional_documents: Optional list of additional document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts

    Returns:
        Graph: The updated graph
    """
    # A GraphPromptOverrides model is dumped to a plain dict (None fields
    # excluded); dictionaries and empty values are forwarded as given.
    overrides = prompt_overrides
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    payload = dict(
        additional_filters=additional_filters,
        additional_documents=additional_documents,
        prompt_overrides=overrides,
        end_user_id=self._end_user_id,
    )

    # The folder name is only sent when this scope is bound to a folder
    folder_name = self._folder_name
    if folder_name:
        payload["folder_name"] = folder_name

    client = self._client
    # The graph name is part of the endpoint path, not the payload
    raw = client._request("POST", f"graph/{name}/update", payload)
    return client._logic._parse_graph_response(raw)
|
1063
|
+
|
1064
|
+
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Delete the document with the given filename on behalf of this end user.

    The document is first looked up by filename (scoped by end user ID and,
    when set, folder name), then deleted through the client by its external ID.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status
    """
    client = self._client

    # Query parameters that scope the filename lookup
    lookup_params = {"end_user_id": self._end_user_id}
    folder_name = self._folder_name
    if folder_name:
        lookup_params["folder_name"] = folder_name

    # Step 1: resolve the filename to a document
    raw = client._request("GET", f"documents/filename/{filename}", params=lookup_params)
    document = client._logic._parse_document_response(raw)

    # Step 2: delete by the resolved ID
    return client.delete_document(document.external_id)
|
78
1087
|
|
79
1088
|
|
80
1089
|
class Morphik:
|
@@ -98,33 +1107,8 @@ class Morphik:
|
|
98
1107
|
"""
|
99
1108
|
|
100
1109
|
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
    """Create the shared client logic and the HTTP client configured from it."""
    logic = _MorphikClientLogic(uri, timeout, is_local)
    self._logic = logic
    # Timeout and the local flag are read back from the logic object;
    # certificate verification is disabled only when it reports a local setup.
    self._client = httpx.Client(timeout=logic._timeout, verify=not logic._is_local)
|
128
1112
|
|
129
1113
|
def _request(
|
130
1114
|
self,
|
@@ -135,25 +1119,25 @@ class Morphik:
|
|
135
1119
|
params: Optional[Dict[str, Any]] = None,
|
136
1120
|
) -> Dict[str, Any]:
|
137
1121
|
"""Make HTTP request"""
|
138
|
-
|
139
|
-
|
140
|
-
|
1122
|
+
url = self._logic._get_url(endpoint)
|
1123
|
+
headers = self._logic._get_headers()
|
1124
|
+
if self._logic._auth_token: # Only add auth header if we have a token
|
1125
|
+
headers["Authorization"] = f"Bearer {self._logic._auth_token}"
|
141
1126
|
|
142
1127
|
# Configure request data based on type
|
143
1128
|
if files:
|
144
1129
|
# Multipart form data for files
|
145
1130
|
request_data = {"files": files, "data": data}
|
146
|
-
# Don't set Content-Type, let
|
1131
|
+
# Don't set Content-Type, let httpx handle it
|
147
1132
|
else:
|
148
1133
|
# JSON for everything else
|
149
1134
|
headers["Content-Type"] = "application/json"
|
150
1135
|
request_data = {"json": data}
|
151
1136
|
|
152
|
-
response = self.
|
1137
|
+
response = self._client.request(
|
153
1138
|
method,
|
154
|
-
|
1139
|
+
url,
|
155
1140
|
headers=headers,
|
156
|
-
timeout=self._timeout,
|
157
1141
|
params=params,
|
158
1142
|
**request_data,
|
159
1143
|
)
|
@@ -162,9 +1146,43 @@ class Morphik:
|
|
162
1146
|
|
163
1147
|
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
|
164
1148
|
"""Convert a rule to a dictionary format"""
|
165
|
-
|
166
|
-
|
167
|
-
|
1149
|
+
return self._logic._convert_rule(rule)
|
1150
|
+
|
1151
|
+
def create_folder(self, name: str) -> Folder:
    """
    Return a folder handle used to scope operations.

    Args:
        name: The name of the folder

    Returns:
        Folder: A folder object, bound to this client, for scoped operations
    """
    folder = Folder(self, name)
    return folder
|
1162
|
+
|
1163
|
+
def get_folder(self, name: str) -> Folder:
    """
    Return a folder handle, looked up by name, used to scope operations.

    Args:
        name: The name of the folder

    Returns:
        Folder: A folder object, bound to this client, for scoped operations
    """
    folder = Folder(self, name)
    return folder
|
1174
|
+
|
1175
|
+
def signin(self, end_user_id: str) -> UserScope:
    """
    Return a user scope so that operations run as the given end user.

    Args:
        end_user_id: The ID of the end user

    Returns:
        UserScope: A user scope object, bound to this client, for scoped operations
    """
    scope = UserScope(self, end_user_id)
    return scope
|
168
1186
|
|
169
1187
|
def ingest_text(
|
170
1188
|
self,
|
@@ -209,15 +1227,12 @@ class Morphik:
|
|
209
1227
|
)
|
210
1228
|
```
|
211
1229
|
"""
|
212
|
-
|
213
|
-
|
214
|
-
filename
|
215
|
-
metadata=metadata or {},
|
216
|
-
rules=[self._convert_rule(r) for r in (rules or [])],
|
217
|
-
use_colpali=use_colpali,
|
1230
|
+
rules_list = [self._convert_rule(r) for r in (rules or [])]
|
1231
|
+
payload = self._logic._prepare_ingest_text_request(
|
1232
|
+
content, filename, metadata, rules_list, use_colpali, None, None
|
218
1233
|
)
|
219
|
-
response = self._request("POST", "ingest/text", data=
|
220
|
-
doc =
|
1234
|
+
response = self._request("POST", "ingest/text", data=payload)
|
1235
|
+
doc = self._logic._parse_document_response(response)
|
221
1236
|
doc._client = self
|
222
1237
|
return doc
|
223
1238
|
|
@@ -266,38 +1281,23 @@ class Morphik:
|
|
266
1281
|
)
|
267
1282
|
```
|
268
1283
|
"""
|
269
|
-
#
|
270
|
-
|
271
|
-
file_path = Path(file)
|
272
|
-
if not file_path.exists():
|
273
|
-
raise ValueError(f"File not found: {file}")
|
274
|
-
filename = file_path.name if filename is None else filename
|
275
|
-
with open(file_path, "rb") as f:
|
276
|
-
content = f.read()
|
277
|
-
file_obj = BytesIO(content)
|
278
|
-
elif isinstance(file, bytes):
|
279
|
-
if filename is None:
|
280
|
-
raise ValueError("filename is required when ingesting bytes")
|
281
|
-
file_obj = BytesIO(file)
|
282
|
-
else:
|
283
|
-
if filename is None:
|
284
|
-
raise ValueError("filename is required when ingesting file object")
|
285
|
-
file_obj = file
|
1284
|
+
# Process file input
|
1285
|
+
file_obj, filename = self._logic._prepare_file_for_upload(file, filename)
|
286
1286
|
|
287
1287
|
try:
|
288
1288
|
# Prepare multipart form data
|
289
1289
|
files = {"file": (filename, file_obj)}
|
290
1290
|
|
291
|
-
#
|
292
|
-
form_data =
|
293
|
-
"metadata": json.dumps(metadata or {}),
|
294
|
-
"rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
|
295
|
-
}
|
1291
|
+
# Create form data
|
1292
|
+
form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)
|
296
1293
|
|
297
1294
|
response = self._request(
|
298
|
-
"POST",
|
1295
|
+
"POST",
|
1296
|
+
f"ingest/file?use_colpali={str(use_colpali).lower()}",
|
1297
|
+
data=form_data,
|
1298
|
+
files=files,
|
299
1299
|
)
|
300
|
-
doc =
|
1300
|
+
doc = self._logic._parse_document_response(response)
|
301
1301
|
doc._client = self
|
302
1302
|
return doc
|
303
1303
|
finally:
|
@@ -330,44 +1330,22 @@ class Morphik:
|
|
330
1330
|
ValueError: If metadata list length doesn't match files length
|
331
1331
|
"""
|
332
1332
|
# Convert files to format expected by API
|
333
|
-
file_objects =
|
334
|
-
for file in files:
|
335
|
-
if isinstance(file, (str, Path)):
|
336
|
-
path = Path(file)
|
337
|
-
file_objects.append(("files", (path.name, open(path, "rb"))))
|
338
|
-
elif isinstance(file, bytes):
|
339
|
-
file_objects.append(("files", ("file.bin", file)))
|
340
|
-
else:
|
341
|
-
file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
|
1333
|
+
file_objects = self._logic._prepare_files_for_upload(files)
|
342
1334
|
|
343
1335
|
try:
|
344
|
-
# Prepare
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
# List of lists - per-file rules
|
349
|
-
converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
|
350
|
-
else:
|
351
|
-
# Flat list - shared rules for all files
|
352
|
-
converted_rules = [self._convert_rule(r) for r in rules]
|
353
|
-
else:
|
354
|
-
converted_rules = []
|
355
|
-
|
356
|
-
data = {
|
357
|
-
"metadata": json.dumps(metadata or {}),
|
358
|
-
"rules": json.dumps(converted_rules),
|
359
|
-
"use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
|
360
|
-
"parallel": str(parallel).lower(),
|
361
|
-
}
|
1336
|
+
# Prepare form data
|
1337
|
+
data = self._logic._prepare_ingest_files_form_data(
|
1338
|
+
metadata, rules, use_colpali, parallel, None, None
|
1339
|
+
)
|
362
1340
|
|
363
1341
|
response = self._request("POST", "ingest/files", data=data, files=file_objects)
|
364
|
-
|
1342
|
+
|
365
1343
|
if response.get("errors"):
|
366
1344
|
# Log errors but don't raise exception
|
367
1345
|
for error in response["errors"]:
|
368
1346
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
369
|
-
|
370
|
-
docs = [
|
1347
|
+
|
1348
|
+
docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
|
371
1349
|
for doc in docs:
|
372
1350
|
doc._client = self
|
373
1351
|
return docs
|
@@ -417,17 +1395,13 @@ class Morphik:
|
|
417
1395
|
|
418
1396
|
# Filter out directories
|
419
1397
|
files = [f for f in files if f.is_file()]
|
420
|
-
|
1398
|
+
|
421
1399
|
if not files:
|
422
1400
|
return []
|
423
1401
|
|
424
1402
|
# Use ingest_files with collected paths
|
425
1403
|
return self.ingest_files(
|
426
|
-
files=files,
|
427
|
-
metadata=metadata,
|
428
|
-
rules=rules,
|
429
|
-
use_colpali=use_colpali,
|
430
|
-
parallel=parallel
|
1404
|
+
files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
|
431
1405
|
)
|
432
1406
|
|
433
1407
|
def retrieve_chunks(
|
@@ -458,52 +1432,11 @@ class Morphik:
|
|
458
1432
|
)
|
459
1433
|
```
|
460
1434
|
"""
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
"use_colpali": use_colpali,
|
467
|
-
}
|
468
|
-
|
469
|
-
response = self._request("POST", "retrieve/chunks", request)
|
470
|
-
chunks = [ChunkResult(**r) for r in response]
|
471
|
-
|
472
|
-
final_chunks = []
|
473
|
-
|
474
|
-
for chunk in chunks:
|
475
|
-
if chunk.metadata.get("is_image"):
|
476
|
-
try:
|
477
|
-
# Handle data URI format "data:image/png;base64,..."
|
478
|
-
content = chunk.content
|
479
|
-
if content.startswith("data:"):
|
480
|
-
# Extract the base64 part after the comma
|
481
|
-
content = content.split(",", 1)[1]
|
482
|
-
|
483
|
-
# Now decode the base64 string
|
484
|
-
image_bytes = base64.b64decode(content)
|
485
|
-
content = Image.open(io.BytesIO(image_bytes))
|
486
|
-
except Exception as e:
|
487
|
-
print(f"Error processing image: {str(e)}")
|
488
|
-
# Fall back to using the content as text
|
489
|
-
print(chunk.content)
|
490
|
-
else:
|
491
|
-
content = chunk.content
|
492
|
-
|
493
|
-
final_chunks.append(
|
494
|
-
FinalChunkResult(
|
495
|
-
content=content,
|
496
|
-
score=chunk.score,
|
497
|
-
document_id=chunk.document_id,
|
498
|
-
chunk_number=chunk.chunk_number,
|
499
|
-
metadata=chunk.metadata,
|
500
|
-
content_type=chunk.content_type,
|
501
|
-
filename=chunk.filename,
|
502
|
-
download_url=chunk.download_url,
|
503
|
-
)
|
504
|
-
)
|
505
|
-
|
506
|
-
return final_chunks
|
1435
|
+
payload = self._logic._prepare_retrieve_chunks_request(
|
1436
|
+
query, filters, k, min_score, use_colpali, None, None
|
1437
|
+
)
|
1438
|
+
response = self._request("POST", "retrieve/chunks", data=payload)
|
1439
|
+
return self._logic._parse_chunk_result_list_response(response)
|
507
1440
|
|
508
1441
|
def retrieve_docs(
|
509
1442
|
self,
|
@@ -533,16 +1466,11 @@ class Morphik:
|
|
533
1466
|
)
|
534
1467
|
```
|
535
1468
|
"""
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
"use_colpali": use_colpali,
|
542
|
-
}
|
543
|
-
|
544
|
-
response = self._request("POST", "retrieve/docs", request)
|
545
|
-
return [DocumentResult(**r) for r in response]
|
1469
|
+
payload = self._logic._prepare_retrieve_docs_request(
|
1470
|
+
query, filters, k, min_score, use_colpali, None, None
|
1471
|
+
)
|
1472
|
+
response = self._request("POST", "retrieve/docs", data=payload)
|
1473
|
+
return self._logic._parse_document_result_list_response(response)
|
546
1474
|
|
547
1475
|
def query(
|
548
1476
|
self,
|
@@ -585,7 +1513,7 @@ class Morphik:
|
|
585
1513
|
filters={"department": "research"},
|
586
1514
|
temperature=0.7
|
587
1515
|
)
|
588
|
-
|
1516
|
+
|
589
1517
|
# Knowledge graph enhanced query
|
590
1518
|
response = db.query(
|
591
1519
|
"How does product X relate to customer segment Y?",
|
@@ -593,7 +1521,7 @@ class Morphik:
|
|
593
1521
|
hop_depth=2,
|
594
1522
|
include_paths=True
|
595
1523
|
)
|
596
|
-
|
1524
|
+
|
597
1525
|
# With prompt customization
|
598
1526
|
from morphik.models import QueryPromptOverride, QueryPromptOverrides
|
599
1527
|
response = db.query(
|
@@ -604,7 +1532,7 @@ class Morphik:
|
|
604
1532
|
)
|
605
1533
|
)
|
606
1534
|
)
|
607
|
-
|
1535
|
+
|
608
1536
|
# Or using a dictionary
|
609
1537
|
response = db.query(
|
610
1538
|
"What are the key findings?",
|
@@ -614,35 +1542,32 @@ class Morphik:
|
|
614
1542
|
}
|
615
1543
|
}
|
616
1544
|
)
|
617
|
-
|
1545
|
+
|
618
1546
|
print(response.completion)
|
619
|
-
|
1547
|
+
|
620
1548
|
# If include_paths=True, you can inspect the graph paths
|
621
1549
|
if response.metadata and "graph" in response.metadata:
|
622
1550
|
for path in response.metadata["graph"]["paths"]:
|
623
1551
|
print(" -> ".join(path))
|
624
1552
|
```
|
625
1553
|
"""
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
response = self._request("POST", "query", request)
|
645
|
-
return CompletionResponse(**response)
|
1554
|
+
payload = self._logic._prepare_query_request(
|
1555
|
+
query,
|
1556
|
+
filters,
|
1557
|
+
k,
|
1558
|
+
min_score,
|
1559
|
+
max_tokens,
|
1560
|
+
temperature,
|
1561
|
+
use_colpali,
|
1562
|
+
graph_name,
|
1563
|
+
hop_depth,
|
1564
|
+
include_paths,
|
1565
|
+
prompt_overrides,
|
1566
|
+
None,
|
1567
|
+
None,
|
1568
|
+
)
|
1569
|
+
response = self._request("POST", "query", data=payload)
|
1570
|
+
return self._logic._parse_completion_response(response)
|
646
1571
|
|
647
1572
|
def list_documents(
|
648
1573
|
self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
|
@@ -667,9 +1592,9 @@ class Morphik:
|
|
667
1592
|
next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
|
668
1593
|
```
|
669
1594
|
"""
|
670
|
-
|
671
|
-
response = self._request("POST",
|
672
|
-
docs =
|
1595
|
+
params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
|
1596
|
+
response = self._request("POST", "documents", data=data, params=params)
|
1597
|
+
docs = self._logic._parse_document_list_response(response)
|
673
1598
|
for doc in docs:
|
674
1599
|
doc._client = self
|
675
1600
|
return docs
|
@@ -691,10 +1616,10 @@ class Morphik:
|
|
691
1616
|
```
|
692
1617
|
"""
|
693
1618
|
response = self._request("GET", f"documents/{document_id}")
|
694
|
-
doc =
|
1619
|
+
doc = self._logic._parse_document_response(response)
|
695
1620
|
doc._client = self
|
696
1621
|
return doc
|
697
|
-
|
1622
|
+
|
698
1623
|
def get_document_by_filename(self, filename: str) -> Document:
|
699
1624
|
"""
|
700
1625
|
Get document metadata by filename.
|
@@ -713,10 +1638,10 @@ class Morphik:
|
|
713
1638
|
```
|
714
1639
|
"""
|
715
1640
|
response = self._request("GET", f"documents/filename/{filename}")
|
716
|
-
doc =
|
1641
|
+
doc = self._logic._parse_document_response(response)
|
717
1642
|
doc._client = self
|
718
1643
|
return doc
|
719
|
-
|
1644
|
+
|
720
1645
|
def update_document_with_text(
|
721
1646
|
self,
|
722
1647
|
document_id: str,
|
@@ -763,19 +1688,16 @@ class Morphik:
|
|
763
1688
|
rules=[self._convert_rule(r) for r in (rules or [])],
|
764
1689
|
use_colpali=use_colpali if use_colpali is not None else True,
|
765
1690
|
)
|
766
|
-
|
1691
|
+
|
767
1692
|
params = {}
|
768
1693
|
if update_strategy != "add":
|
769
1694
|
params["update_strategy"] = update_strategy
|
770
|
-
|
1695
|
+
|
771
1696
|
response = self._request(
|
772
|
-
"POST",
|
773
|
-
f"documents/{document_id}/update_text",
|
774
|
-
data=request.model_dump(),
|
775
|
-
params=params
|
1697
|
+
"POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params
|
776
1698
|
)
|
777
|
-
|
778
|
-
doc =
|
1699
|
+
|
1700
|
+
doc = self._logic._parse_document_response(response)
|
779
1701
|
doc._client = self
|
780
1702
|
return doc
|
781
1703
|
|
@@ -833,34 +1755,34 @@ class Morphik:
|
|
833
1755
|
if filename is None:
|
834
1756
|
raise ValueError("filename is required when updating with file object")
|
835
1757
|
file_obj = file
|
836
|
-
|
1758
|
+
|
837
1759
|
try:
|
838
1760
|
# Prepare multipart form data
|
839
1761
|
files = {"file": (filename, file_obj)}
|
840
|
-
|
1762
|
+
|
841
1763
|
# Convert metadata and rules to JSON strings
|
842
1764
|
form_data = {
|
843
1765
|
"metadata": json.dumps(metadata or {}),
|
844
1766
|
"rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
|
845
1767
|
"update_strategy": update_strategy,
|
846
1768
|
}
|
847
|
-
|
1769
|
+
|
848
1770
|
if use_colpali is not None:
|
849
1771
|
form_data["use_colpali"] = str(use_colpali).lower()
|
850
|
-
|
1772
|
+
|
851
1773
|
# Use the dedicated file update endpoint
|
852
1774
|
response = self._request(
|
853
1775
|
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
854
1776
|
)
|
855
|
-
|
856
|
-
doc =
|
1777
|
+
|
1778
|
+
doc = self._logic._parse_document_response(response)
|
857
1779
|
doc._client = self
|
858
1780
|
return doc
|
859
1781
|
finally:
|
860
1782
|
# Close file if we opened it
|
861
1783
|
if isinstance(file, (str, Path)):
|
862
1784
|
file_obj.close()
|
863
|
-
|
1785
|
+
|
864
1786
|
def update_document_metadata(
|
865
1787
|
self,
|
866
1788
|
document_id: str,
|
@@ -868,14 +1790,14 @@ class Morphik:
|
|
868
1790
|
) -> Document:
|
869
1791
|
"""
|
870
1792
|
Update a document's metadata only.
|
871
|
-
|
1793
|
+
|
872
1794
|
Args:
|
873
1795
|
document_id: ID of the document to update
|
874
1796
|
metadata: Metadata to update
|
875
|
-
|
1797
|
+
|
876
1798
|
Returns:
|
877
1799
|
Document: Updated document metadata
|
878
|
-
|
1800
|
+
|
879
1801
|
Example:
|
880
1802
|
```python
|
881
1803
|
# Update just the metadata of a document
|
@@ -888,10 +1810,10 @@ class Morphik:
|
|
888
1810
|
"""
|
889
1811
|
# Use the dedicated metadata update endpoint
|
890
1812
|
response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
|
891
|
-
doc =
|
1813
|
+
doc = self._logic._parse_document_response(response)
|
892
1814
|
doc._client = self
|
893
1815
|
return doc
|
894
|
-
|
1816
|
+
|
895
1817
|
def update_document_by_filename_with_text(
|
896
1818
|
self,
|
897
1819
|
filename: str,
|
@@ -932,7 +1854,7 @@ class Morphik:
|
|
932
1854
|
"""
|
933
1855
|
# First get the document by filename to obtain its ID
|
934
1856
|
doc = self.get_document_by_filename(filename)
|
935
|
-
|
1857
|
+
|
936
1858
|
# Then use the regular update_document_with_text endpoint with the document ID
|
937
1859
|
return self.update_document_with_text(
|
938
1860
|
document_id=doc.external_id,
|
@@ -941,9 +1863,9 @@ class Morphik:
|
|
941
1863
|
metadata=metadata,
|
942
1864
|
rules=rules,
|
943
1865
|
update_strategy=update_strategy,
|
944
|
-
use_colpali=use_colpali
|
1866
|
+
use_colpali=use_colpali,
|
945
1867
|
)
|
946
|
-
|
1868
|
+
|
947
1869
|
def update_document_by_filename_with_file(
|
948
1870
|
self,
|
949
1871
|
filename: str,
|
@@ -983,7 +1905,7 @@ class Morphik:
|
|
983
1905
|
"""
|
984
1906
|
# First get the document by filename to obtain its ID
|
985
1907
|
doc = self.get_document_by_filename(filename)
|
986
|
-
|
1908
|
+
|
987
1909
|
# Then use the regular update_document_with_file endpoint with the document ID
|
988
1910
|
return self.update_document_with_file(
|
989
1911
|
document_id=doc.external_id,
|
@@ -992,9 +1914,9 @@ class Morphik:
|
|
992
1914
|
metadata=metadata,
|
993
1915
|
rules=rules,
|
994
1916
|
update_strategy=update_strategy,
|
995
|
-
use_colpali=use_colpali
|
1917
|
+
use_colpali=use_colpali,
|
996
1918
|
)
|
997
|
-
|
1919
|
+
|
998
1920
|
def update_document_by_filename_metadata(
|
999
1921
|
self,
|
1000
1922
|
filename: str,
|
@@ -1003,15 +1925,15 @@ class Morphik:
|
|
1003
1925
|
) -> Document:
|
1004
1926
|
"""
|
1005
1927
|
Update a document's metadata using filename to identify the document.
|
1006
|
-
|
1928
|
+
|
1007
1929
|
Args:
|
1008
1930
|
filename: Filename of the document to update
|
1009
1931
|
metadata: Metadata to update
|
1010
1932
|
new_filename: Optional new filename to assign to the document
|
1011
|
-
|
1933
|
+
|
1012
1934
|
Returns:
|
1013
1935
|
Document: Updated document metadata
|
1014
|
-
|
1936
|
+
|
1015
1937
|
Example:
|
1016
1938
|
```python
|
1017
1939
|
# Update just the metadata of a document identified by filename
|
@@ -1025,44 +1947,44 @@ class Morphik:
|
|
1025
1947
|
"""
|
1026
1948
|
# First get the document by filename to obtain its ID
|
1027
1949
|
doc = self.get_document_by_filename(filename)
|
1028
|
-
|
1950
|
+
|
1029
1951
|
# Update the metadata
|
1030
1952
|
result = self.update_document_metadata(
|
1031
1953
|
document_id=doc.external_id,
|
1032
1954
|
metadata=metadata,
|
1033
1955
|
)
|
1034
|
-
|
1956
|
+
|
1035
1957
|
# If new_filename is provided, update the filename as well
|
1036
1958
|
if new_filename:
|
1037
1959
|
# Create a request that retains the just-updated metadata but also changes filename
|
1038
1960
|
combined_metadata = result.metadata.copy()
|
1039
|
-
|
1961
|
+
|
1040
1962
|
# Update the document again with filename change and the same metadata
|
1041
1963
|
response = self._request(
|
1042
|
-
"POST",
|
1043
|
-
f"documents/{doc.external_id}/update_text",
|
1964
|
+
"POST",
|
1965
|
+
f"documents/{doc.external_id}/update_text",
|
1044
1966
|
data={
|
1045
|
-
"content": "",
|
1967
|
+
"content": "",
|
1046
1968
|
"filename": new_filename,
|
1047
1969
|
"metadata": combined_metadata,
|
1048
|
-
"rules": []
|
1049
|
-
}
|
1970
|
+
"rules": [],
|
1971
|
+
},
|
1050
1972
|
)
|
1051
|
-
result =
|
1973
|
+
result = self._logic._parse_document_response(response)
|
1052
1974
|
result._client = self
|
1053
|
-
|
1975
|
+
|
1054
1976
|
return result
|
1055
|
-
|
1977
|
+
|
1056
1978
|
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
|
1057
1979
|
"""
|
1058
1980
|
Retrieve multiple documents by their IDs in a single batch operation.
|
1059
|
-
|
1981
|
+
|
1060
1982
|
Args:
|
1061
1983
|
document_ids: List of document IDs to retrieve
|
1062
|
-
|
1984
|
+
|
1063
1985
|
Returns:
|
1064
1986
|
List[Document]: List of document metadata for found documents
|
1065
|
-
|
1987
|
+
|
1066
1988
|
Example:
|
1067
1989
|
```python
|
1068
1990
|
docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
|
@@ -1071,21 +1993,23 @@ class Morphik:
|
|
1071
1993
|
```
|
1072
1994
|
"""
|
1073
1995
|
response = self._request("POST", "batch/documents", data=document_ids)
|
1074
|
-
docs =
|
1996
|
+
docs = self._logic._parse_document_list_response(response)
|
1075
1997
|
for doc in docs:
|
1076
1998
|
doc._client = self
|
1077
1999
|
return docs
|
1078
|
-
|
1079
|
-
def batch_get_chunks(
|
2000
|
+
|
2001
|
+
def batch_get_chunks(
|
2002
|
+
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
2003
|
+
) -> List[FinalChunkResult]:
|
1080
2004
|
"""
|
1081
2005
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
1082
|
-
|
2006
|
+
|
1083
2007
|
Args:
|
1084
2008
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
1085
|
-
|
2009
|
+
|
1086
2010
|
Returns:
|
1087
2011
|
List[FinalChunkResult]: List of chunk results
|
1088
|
-
|
2012
|
+
|
1089
2013
|
Example:
|
1090
2014
|
```python
|
1091
2015
|
# Using dictionaries
|
@@ -1093,14 +2017,14 @@ class Morphik:
|
|
1093
2017
|
{"document_id": "doc_123", "chunk_number": 0},
|
1094
2018
|
{"document_id": "doc_456", "chunk_number": 2}
|
1095
2019
|
]
|
1096
|
-
|
2020
|
+
|
1097
2021
|
# Or using ChunkSource objects
|
1098
2022
|
from morphik.models import ChunkSource
|
1099
2023
|
sources = [
|
1100
2024
|
ChunkSource(document_id="doc_123", chunk_number=0),
|
1101
2025
|
ChunkSource(document_id="doc_456", chunk_number=2)
|
1102
2026
|
]
|
1103
|
-
|
2027
|
+
|
1104
2028
|
chunks = db.batch_get_chunks(sources)
|
1105
2029
|
for chunk in chunks:
|
1106
2030
|
print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
|
@@ -1113,44 +2037,9 @@ class Morphik:
|
|
1113
2037
|
source_dicts.append(source)
|
1114
2038
|
else:
|
1115
2039
|
source_dicts.append(source.model_dump())
|
1116
|
-
|
2040
|
+
|
1117
2041
|
response = self._request("POST", "batch/chunks", data=source_dicts)
|
1118
|
-
|
1119
|
-
|
1120
|
-
final_chunks = []
|
1121
|
-
for chunk in chunks:
|
1122
|
-
if chunk.metadata.get("is_image"):
|
1123
|
-
try:
|
1124
|
-
# Handle data URI format "data:image/png;base64,..."
|
1125
|
-
content = chunk.content
|
1126
|
-
if content.startswith("data:"):
|
1127
|
-
# Extract the base64 part after the comma
|
1128
|
-
content = content.split(",", 1)[1]
|
1129
|
-
|
1130
|
-
# Now decode the base64 string
|
1131
|
-
image_bytes = base64.b64decode(content)
|
1132
|
-
content = Image.open(io.BytesIO(image_bytes))
|
1133
|
-
except Exception as e:
|
1134
|
-
print(f"Error processing image: {str(e)}")
|
1135
|
-
# Fall back to using the content as text
|
1136
|
-
content = chunk.content
|
1137
|
-
else:
|
1138
|
-
content = chunk.content
|
1139
|
-
|
1140
|
-
final_chunks.append(
|
1141
|
-
FinalChunkResult(
|
1142
|
-
content=content,
|
1143
|
-
score=chunk.score,
|
1144
|
-
document_id=chunk.document_id,
|
1145
|
-
chunk_number=chunk.chunk_number,
|
1146
|
-
metadata=chunk.metadata,
|
1147
|
-
content_type=chunk.content_type,
|
1148
|
-
filename=chunk.filename,
|
1149
|
-
download_url=chunk.download_url,
|
1150
|
-
)
|
1151
|
-
)
|
1152
|
-
|
1153
|
-
return final_chunks
|
2042
|
+
return self._logic._parse_chunk_result_list_response(response)
|
1154
2043
|
|
1155
2044
|
def create_cache(
|
1156
2045
|
self,
|
@@ -1252,11 +2141,11 @@ class Morphik:
|
|
1252
2141
|
name="custom_graph",
|
1253
2142
|
documents=["doc1", "doc2", "doc3"]
|
1254
2143
|
)
|
1255
|
-
|
2144
|
+
|
1256
2145
|
# With custom entity extraction examples
|
1257
2146
|
from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
|
1258
2147
|
graph = db.create_graph(
|
1259
|
-
name="medical_graph",
|
2148
|
+
name="medical_graph",
|
1260
2149
|
filters={"category": "medical"},
|
1261
2150
|
prompt_overrides=GraphPromptOverrides(
|
1262
2151
|
entity_extraction=EntityExtractionPromptOverride(
|
@@ -1272,7 +2161,7 @@ class Morphik:
|
|
1272
2161
|
# Convert prompt_overrides to dict if it's a model
|
1273
2162
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
1274
2163
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
1275
|
-
|
2164
|
+
|
1276
2165
|
request = {
|
1277
2166
|
"name": name,
|
1278
2167
|
"filters": filters,
|
@@ -1281,8 +2170,8 @@ class Morphik:
|
|
1281
2170
|
}
|
1282
2171
|
|
1283
2172
|
response = self._request("POST", "graph/create", request)
|
1284
|
-
return
|
1285
|
-
|
2173
|
+
return self._logic._parse_graph_response(response)
|
2174
|
+
|
1286
2175
|
def get_graph(self, name: str) -> Graph:
|
1287
2176
|
"""
|
1288
2177
|
Get a graph by name.
|
@@ -1301,7 +2190,7 @@ class Morphik:
|
|
1301
2190
|
```
|
1302
2191
|
"""
|
1303
2192
|
response = self._request("GET", f"graph/{name}")
|
1304
|
-
return
|
2193
|
+
return self._logic._parse_graph_response(response)
|
1305
2194
|
|
1306
2195
|
def list_graphs(self) -> List[Graph]:
|
1307
2196
|
"""
|
@@ -1319,8 +2208,8 @@ class Morphik:
|
|
1319
2208
|
```
|
1320
2209
|
"""
|
1321
2210
|
response = self._request("GET", "graphs")
|
1322
|
-
return
|
1323
|
-
|
2211
|
+
return self._logic._parse_graph_list_response(response)
|
2212
|
+
|
1324
2213
|
def update_graph(
|
1325
2214
|
self,
|
1326
2215
|
name: str,
|
@@ -1330,20 +2219,20 @@ class Morphik:
|
|
1330
2219
|
) -> Graph:
|
1331
2220
|
"""
|
1332
2221
|
Update an existing graph with new documents.
|
1333
|
-
|
2222
|
+
|
1334
2223
|
This method processes additional documents matching the original or new filters,
|
1335
2224
|
extracts entities and relationships, and updates the graph with new information.
|
1336
|
-
|
2225
|
+
|
1337
2226
|
Args:
|
1338
2227
|
name: Name of the graph to update
|
1339
2228
|
additional_filters: Optional additional metadata filters to determine which new documents to include
|
1340
2229
|
additional_documents: Optional list of additional document IDs to include
|
1341
2230
|
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
1342
2231
|
Either a GraphPromptOverrides object or a dictionary with the same structure
|
1343
|
-
|
2232
|
+
|
1344
2233
|
Returns:
|
1345
2234
|
Graph: The updated graph
|
1346
|
-
|
2235
|
+
|
1347
2236
|
Example:
|
1348
2237
|
```python
|
1349
2238
|
# Update a graph with new documents
|
@@ -1353,7 +2242,7 @@ class Morphik:
|
|
1353
2242
|
additional_documents=["doc4", "doc5"]
|
1354
2243
|
)
|
1355
2244
|
print(f"Graph now has {len(updated_graph.entities)} entities")
|
1356
|
-
|
2245
|
+
|
1357
2246
|
# With entity resolution examples
|
1358
2247
|
from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
|
1359
2248
|
updated_graph = db.update_graph(
|
@@ -1363,7 +2252,7 @@ class Morphik:
|
|
1363
2252
|
entity_resolution=EntityResolutionPromptOverride(
|
1364
2253
|
examples=[
|
1365
2254
|
EntityResolutionExample(
|
1366
|
-
canonical="Machine Learning",
|
2255
|
+
canonical="Machine Learning",
|
1367
2256
|
variants=["ML", "machine learning", "AI/ML"]
|
1368
2257
|
)
|
1369
2258
|
]
|
@@ -1375,7 +2264,7 @@ class Morphik:
|
|
1375
2264
|
# Convert prompt_overrides to dict if it's a model
|
1376
2265
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
1377
2266
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
1378
|
-
|
2267
|
+
|
1379
2268
|
request = {
|
1380
2269
|
"additional_filters": additional_filters,
|
1381
2270
|
"additional_documents": additional_documents,
|
@@ -1383,23 +2272,23 @@ class Morphik:
|
|
1383
2272
|
}
|
1384
2273
|
|
1385
2274
|
response = self._request("POST", f"graph/{name}/update", request)
|
1386
|
-
return
|
1387
|
-
|
2275
|
+
return self._logic._parse_graph_response(response)
|
2276
|
+
|
1388
2277
|
def delete_document(self, document_id: str) -> Dict[str, str]:
|
1389
2278
|
"""
|
1390
2279
|
Delete a document and all its associated data.
|
1391
|
-
|
2280
|
+
|
1392
2281
|
This method deletes a document and all its associated data, including:
|
1393
2282
|
- Document metadata
|
1394
2283
|
- Document content in storage
|
1395
2284
|
- Document chunks and embeddings in vector store
|
1396
|
-
|
2285
|
+
|
1397
2286
|
Args:
|
1398
2287
|
document_id: ID of the document to delete
|
1399
|
-
|
2288
|
+
|
1400
2289
|
Returns:
|
1401
2290
|
Dict[str, str]: Deletion status
|
1402
|
-
|
2291
|
+
|
1403
2292
|
Example:
|
1404
2293
|
```python
|
1405
2294
|
# Delete a document
|
@@ -1409,20 +2298,20 @@ class Morphik:
|
|
1409
2298
|
"""
|
1410
2299
|
response = self._request("DELETE", f"documents/{document_id}")
|
1411
2300
|
return response
|
1412
|
-
|
2301
|
+
|
1413
2302
|
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
1414
2303
|
"""
|
1415
2304
|
Delete a document by its filename.
|
1416
|
-
|
2305
|
+
|
1417
2306
|
This is a convenience method that first retrieves the document ID by filename
|
1418
2307
|
and then deletes the document by ID.
|
1419
|
-
|
2308
|
+
|
1420
2309
|
Args:
|
1421
2310
|
filename: Filename of the document to delete
|
1422
|
-
|
2311
|
+
|
1423
2312
|
Returns:
|
1424
2313
|
Dict[str, str]: Deletion status
|
1425
|
-
|
2314
|
+
|
1426
2315
|
Example:
|
1427
2316
|
```python
|
1428
2317
|
# Delete a document by filename
|
@@ -1432,13 +2321,13 @@ class Morphik:
|
|
1432
2321
|
"""
|
1433
2322
|
# First get the document by filename to obtain its ID
|
1434
2323
|
doc = self.get_document_by_filename(filename)
|
1435
|
-
|
2324
|
+
|
1436
2325
|
# Then delete the document by ID
|
1437
2326
|
return self.delete_document(doc.external_id)
|
1438
2327
|
|
1439
2328
|
def close(self):
|
1440
|
-
"""Close the HTTP
|
1441
|
-
self.
|
2329
|
+
"""Close the HTTP client"""
|
2330
|
+
self._client.close()
|
1442
2331
|
|
1443
2332
|
def __enter__(self):
|
1444
2333
|
return self
|