morphik 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +507 -0
- morphik/async_.py +1174 -402
- morphik/models.py +38 -25
- morphik/sync.py +1259 -371
- {morphik-0.1.0.dist-info → morphik-0.1.2.dist-info}/METADATA +1 -1
- morphik-0.1.2.dist-info/RECORD +10 -0
- morphik-0.1.0.dist-info/RECORD +0 -9
- {morphik-0.1.0.dist-info → morphik-0.1.2.dist-info}/WHEEL +0 -0
morphik/sync.py
CHANGED
@@ -1,80 +1,1088 @@
|
|
1
|
-
import base64
|
2
|
-
from io import BytesIO, IOBase
|
3
|
-
import io
|
4
|
-
from PIL.Image import Image as PILImage
|
5
|
-
from PIL import Image
|
6
1
|
import json
|
7
2
|
import logging
|
3
|
+
from io import BytesIO, IOBase
|
8
4
|
from pathlib import Path
|
9
5
|
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
10
|
-
from urllib.parse import urlparse
|
11
6
|
|
12
|
-
import
|
13
|
-
from
|
14
|
-
|
7
|
+
from PIL import Image
|
8
|
+
from PIL.Image import Image as PILImage
|
9
|
+
|
10
|
+
import httpx
|
11
|
+
|
12
|
+
from .models import (
|
13
|
+
Document,
|
14
|
+
DocumentResult,
|
15
|
+
CompletionResponse,
|
16
|
+
IngestTextRequest,
|
17
|
+
ChunkSource,
|
18
|
+
Graph,
|
19
|
+
# Prompt override models
|
20
|
+
GraphPromptOverrides,
|
21
|
+
QueryPromptOverrides,
|
22
|
+
)
|
23
|
+
from .rules import Rule
|
24
|
+
from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict
|
25
|
+
|
26
|
+
logger = logging.getLogger(__name__)
|
27
|
+
|
28
|
+
|
29
|
+
class Cache:
    """
    Handle for a named cache, reached through a Morphik client's ``_request`` method.

    Args:
        db: The Morphik client used to send requests
        name: The cache name; it is interpolated into every endpoint path
    """

    def __init__(self, db: "Morphik", name: str):
        self._db, self._name = db, name

    def update(self) -> bool:
        """
        POST to ``cache/<name>/update``.

        Returns:
            bool: The response's "success" value, or False when that key is absent
        """
        endpoint = f"cache/{self._name}/update"
        return self._db._request("POST", endpoint).get("success", False)

    def add_docs(self, docs: List[str]) -> bool:
        """
        POST the given documents to ``cache/<name>/add_docs``.

        Args:
            docs: Values sent under the "docs" key of the request body

        Returns:
            bool: The response's "success" value, or False when that key is absent
        """
        endpoint = f"cache/{self._name}/add_docs"
        body = {"docs": docs}
        return self._db._request("POST", endpoint, body).get("success", False)

    def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        """
        POST a query to ``cache/<name>/query``.

        The query text and generation settings travel as URL parameters; the
        request body is an empty string.

        Args:
            query: Query text
            max_tokens: Optional value forwarded as the "max_tokens" parameter
            temperature: Optional value forwarded as the "temperature" parameter

        Returns:
            CompletionResponse: Built from the response's fields
        """
        query_params = {"query": query, "max_tokens": max_tokens, "temperature": temperature}
        raw = self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params=query_params,
            data="",
        )
        return CompletionResponse(**raw)
|
52
|
+
|
53
|
+
|
54
|
+
class Folder:
|
55
|
+
"""
|
56
|
+
A folder that allows operations to be scoped to a specific folder.
|
57
|
+
|
58
|
+
Args:
|
59
|
+
client: The Morphik client instance
|
60
|
+
name: The name of the folder
|
61
|
+
"""
|
62
|
+
|
63
|
+
def __init__(self, client: "Morphik", name: str):
    """Store the client and the folder name this object is scoped to."""
    self._client, self._name = client, name
|
66
|
+
|
67
|
+
@property
def name(self) -> str:
    """The folder name given at construction time."""
    folder_name = self._name
    return folder_name
|
71
|
+
|
72
|
+
def signin(self, end_user_id: str) -> "UserScope":
    """
    Create a UserScope bound to this folder and the given end user.

    Args:
        end_user_id: The ID of the end user

    Returns:
        UserScope: A scope that shares this folder's client and folder name
    """
    scope_kwargs = {
        "client": self._client,
        "end_user_id": end_user_id,
        "folder_name": self._name,
    }
    return UserScope(**scope_kwargs)
|
83
|
+
|
84
|
+
def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document into this folder.

    Args:
        content: Text content to ingest
        filename: Optional file name
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document, with its client attached
    """
    client = self._client

    # Each rule is passed through the client's converter before the payload is built.
    converted = []
    for rule in rules or []:
        converted.append(client._convert_rule(rule))

    # Last two arguments: this folder's name, then None.
    payload = client._logic._prepare_ingest_text_request(
        content, filename, metadata, converted, use_colpali, self._name, None
    )
    raw = client._request("POST", "ingest/text", data=payload)

    document = client._logic._parse_document_response(raw)
    document._client = client
    return document
|
113
|
+
|
114
|
+
def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a single file into this folder.

    Args:
        file: File to ingest (path string, bytes, file object, or Path)
        filename: Name of the file
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document, with its client attached
    """
    client = self._client
    logic = client._logic

    # The shared helper normalises the input into an uploadable object plus a name.
    upload, filename = logic._prepare_file_for_upload(file, filename)
    came_from_path = isinstance(file, (str, Path))

    try:
        multipart = {"file": (filename, upload)}
        form_data = logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)

        # use_colpali travels in the query string as "true"/"false".
        endpoint = f"ingest/file?use_colpali={str(use_colpali).lower()}"
        raw = client._request("POST", endpoint, data=form_data, files=multipart)

        document = logic._parse_document_response(raw)
        document._client = client
        return document
    finally:
        # Only close the upload object when the caller handed us a path.
        if came_from_path:
            upload.close()
|
160
|
+
|
161
|
+
def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest several files into this folder with one request.

    Per-file failures reported by the server are logged, not raised.

    Args:
        files: List of files to ingest
        metadata: Optional metadata
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: The documents listed in the response, each with its client attached
    """
    client = self._client
    logic = client._logic

    # The helper returns ("files", (name, file_obj)) tuples ready for multipart upload.
    uploads = logic._prepare_files_for_upload(files)

    try:
        form = logic._prepare_ingest_files_form_data(
            metadata, rules, use_colpali, parallel, self._name, None
        )
        response = client._request("POST", "ingest/files", data=form, files=uploads)

        # Report failures without aborting the whole batch.
        for failure in response.get("errors") or ():
            logger.error(f"Failed to ingest {failure['filename']}: {failure['error']}")

        documents = []
        for raw_doc in response["documents"]:
            parsed = logic._parse_document_response(raw_doc)
            parsed._client = client
            documents.append(parsed)
        return documents
    finally:
        # Close every file-like upload that is still open.
        for _, (_, handle) in uploads:
            if isinstance(handle, (IOBase, BytesIO)) and not handle.closed:
                handle.close()
|
209
|
+
|
210
|
+
def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest every matching file of a directory into this folder.

    Args:
        directory: Path to directory containing files to ingest
        recursive: Whether to recursively process subdirectories
        pattern: Optional glob pattern to filter files
        metadata: Optional metadata dictionary to apply to all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents (empty when nothing matches)

    Raises:
        ValueError: If ``directory`` is not an existing directory
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"Directory not found: {root}")

    # rglob descends into subdirectories, glob stays at the top level.
    find = root.rglob if recursive else root.glob

    # Globs can also match directories; keep regular files only.
    matched = [candidate for candidate in find(pattern) if candidate.is_file()]
    if not matched:
        return []

    return self.ingest_files(
        files=matched, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
    )
|
255
|
+
|
256
|
+
def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Search for relevant chunks, restricted to this folder.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[FinalChunkResult]: List of relevant chunks
    """
    client = self._client
    body = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
        folder_name=self._name,  # scopes the search to this folder
    )
    raw = client._request("POST", "retrieve/chunks", body)
    return client._logic._parse_chunk_result_list_response(raw)
|
288
|
+
|
289
|
+
def retrieve_docs(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[DocumentResult]:
    """
    Search for relevant documents, restricted to this folder.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[DocumentResult]: List of relevant documents
    """
    client = self._client
    body = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
        folder_name=self._name,  # scopes the search to this folder
    )
    raw = client._request("POST", "retrieve/docs", body)
    return client._logic._parse_document_result_list_response(raw)
|
321
|
+
|
322
|
+
def query(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_colpali: bool = True,
    graph_name: Optional[str] = None,
    hop_depth: int = 1,
    include_paths: bool = False,
    prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
) -> CompletionResponse:
    """
    Generate a completion from relevant chunks, restricted to this folder.

    Args:
        query: Query text
        filters: Optional metadata filters
        k: Number of chunks to use as context (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        max_tokens: Maximum tokens in completion
        temperature: Model temperature
        use_colpali: Whether to use ColPali-style embedding model
        graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
        hop_depth: Number of relationship hops to traverse in the graph (1-3)
        include_paths: Whether to include relationship paths in the response
        prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts

    Returns:
        CompletionResponse: Generated completion
    """
    client = self._client
    logic = client._logic

    # Positional order matters: the payload builder is called without keywords.
    builder_args = (
        query,
        filters,
        k,
        min_score,
        max_tokens,
        temperature,
        use_colpali,
        graph_name,
        hop_depth,
        include_paths,
        prompt_overrides,
        self._name,
        None,  # presumably the end-user slot; confirm against _prepare_query_request
    )
    payload = logic._prepare_query_request(*builder_args)
    raw = client._request("POST", "query", data=payload)
    return logic._parse_completion_response(raw)
|
372
|
+
|
373
|
+
def list_documents(
    self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
    """
    List the accessible documents of this folder.

    Args:
        skip: Number of documents to skip
        limit: Maximum number of documents to return
        filters: Optional filters

    Returns:
        List[Document]: List of documents, each with its client attached
    """
    client = self._client
    logic = client._logic

    # The helper splits the request into URL parameters and a body.
    url_params, body = logic._prepare_list_documents_request(
        skip, limit, filters, self._name, None
    )
    raw = client._request("POST", "documents", data=body, params=url_params)

    documents = logic._parse_document_list_response(raw)
    for document in documents:
        document._client = client
    return documents
|
395
|
+
|
396
|
+
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
    """
    Fetch several documents by ID in one request, restricted to this folder.

    Args:
        document_ids: List of document IDs to retrieve

    Returns:
        List[Document]: List of document metadata for found documents
    """
    client = self._client
    body = {"document_ids": document_ids, "folder_name": self._name}
    raw_documents = client._request("POST", "batch/documents", data=body)

    documents = []
    for raw in raw_documents:
        parsed = client._logic._parse_document_response(raw)
        parsed._client = client
        documents.append(parsed)
    return documents
|
413
|
+
|
414
|
+
def batch_get_chunks(
    self, sources: List[Union[ChunkSource, Dict[str, Any]]]
) -> List[FinalChunkResult]:
    """
    Fetch specific chunks by document ID and chunk number in one request, restricted to this folder.

    Args:
        sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

    Returns:
        List[FinalChunkResult]: List of chunk results
    """
    # Dictionaries go through untouched; anything else is dumped to a dict.
    serialised = [
        source if isinstance(source, dict) else source.model_dump() for source in sources
    ]
    body = {"sources": serialised, "folder_name": self._name}

    client = self._client
    raw = client._request("POST", "batch/chunks", data=body)
    return client._logic._parse_chunk_result_list_response(raw)
|
439
|
+
|
440
|
+
def create_graph(
    self,
    name: str,
    filters: Optional[Dict[str, Any]] = None,
    documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Build a graph from documents of this folder.

    Args:
        name: Name of the graph to create
        filters: Optional metadata filters to determine which documents to include
        documents: Optional list of specific document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts

    Returns:
        Graph: The created graph object
    """
    overrides = prompt_overrides
    # A model instance is dumped to a plain dict without its None fields;
    # dictionaries are sent as given.
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    body = dict(
        name=name,
        filters=filters,
        documents=documents,
        prompt_overrides=overrides,
        folder_name=self._name,  # scopes the graph to this folder
    )

    client = self._client
    raw = client._request("POST", "graph/create", body)
    return client._logic._parse_graph_response(raw)
|
473
|
+
|
474
|
+
def update_graph(
    self,
    name: str,
    additional_filters: Optional[Dict[str, Any]] = None,
    additional_documents: Optional[List[str]] = None,
    prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
) -> Graph:
    """
    Extend an existing graph with further documents from this folder.

    Args:
        name: Name of the graph to update
        additional_filters: Optional additional metadata filters to determine which new documents to include
        additional_documents: Optional list of additional document IDs to include
        prompt_overrides: Optional customizations for entity extraction and resolution prompts

    Returns:
        Graph: The updated graph
    """
    overrides = prompt_overrides
    # A model instance is dumped to a plain dict without its None fields;
    # dictionaries are sent as given.
    if overrides and isinstance(overrides, GraphPromptOverrides):
        overrides = overrides.model_dump(exclude_none=True)

    body = dict(
        additional_filters=additional_filters,
        additional_documents=additional_documents,
        prompt_overrides=overrides,
        folder_name=self._name,  # scopes the update to this folder
    )

    client = self._client
    # The graph name is part of the endpoint path, not of the body.
    raw = client._request("POST", f"graph/{name}/update", body)
    return client._logic._parse_graph_response(raw)
|
506
|
+
|
507
|
+
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
    """
    Delete a document by its filename within this folder.

    The document is first looked up by filename (scoped to this folder) to
    obtain its ID, then deleted by that ID through the client.

    Args:
        filename: Filename of the document to delete

    Returns:
        Dict[str, str]: Deletion status
    """
    # Step 1: resolve the filename to a document; the folder scope is sent
    # as a URL parameter.
    # NOTE(review): the filename is interpolated into the URL path as-is;
    # names containing "/", "?" or "#" may need percent-encoding. Confirm
    # against the server route before changing.
    response = self._client._request(
        "GET", f"documents/filename/{filename}", params={"folder_name": self._name}
    )
    doc = self._client._logic._parse_document_response(response)

    # Step 2: delete by ID.
    return self._client.delete_document(doc.external_id)
|
528
|
+
|
529
|
+
|
530
|
+
class UserScope:
|
531
|
+
"""
|
532
|
+
A user scope that allows operations to be scoped to a specific end user and optionally a folder.
|
533
|
+
|
534
|
+
Args:
|
535
|
+
client: The Morphik client instance
|
536
|
+
end_user_id: The ID of the end user
|
537
|
+
folder_name: Optional folder name to further scope operations
|
538
|
+
"""
|
539
|
+
|
540
|
+
def __init__(self, client: "Morphik", end_user_id: str, folder_name: Optional[str] = None):
    """Store the client, the end user ID and the optional folder name."""
    self._client, self._end_user_id, self._folder_name = client, end_user_id, folder_name
|
544
|
+
|
545
|
+
@property
def end_user_id(self) -> str:
    """The end user ID this scope was created with."""
    user_id = self._end_user_id
    return user_id
|
549
|
+
|
550
|
+
@property
def folder_name(self) -> Optional[str]:
    """The folder name this scope is limited to, or None when unscoped."""
    scoped_folder = self._folder_name
    return scoped_folder
|
554
|
+
|
555
|
+
def ingest_text(
    self,
    content: str,
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a text document on behalf of this end user.

    Args:
        content: Text content to ingest
        filename: Optional file name
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document, with its client attached
    """
    client = self._client

    # Each rule is passed through the client's converter before the payload is built.
    converted = []
    for rule in rules or []:
        converted.append(client._convert_rule(rule))

    # Last two arguments: the optional folder scope, then the end user ID.
    payload = client._logic._prepare_ingest_text_request(
        content,
        filename,
        metadata,
        converted,
        use_colpali,
        self._folder_name,
        self._end_user_id,
    )
    raw = client._request("POST", "ingest/text", data=payload)

    document = client._logic._parse_document_response(raw)
    document._client = client
    return document
|
590
|
+
|
591
|
+
def ingest_file(
    self,
    file: Union[str, bytes, BinaryIO, Path],
    filename: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
) -> Document:
    """
    Ingest a single file on behalf of this end user.

    Args:
        file: File to ingest (path string, bytes, file object, or Path)
        filename: Name of the file; required for bytes and file objects,
            defaults to the path's base name for paths
        metadata: Optional metadata dictionary
        rules: Optional list of rules to apply during ingestion
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        Document: Metadata of the ingested document, with its client attached

    Raises:
        ValueError: If a path does not exist, or if ``filename`` is missing
            for bytes or file-object input
    """
    given_as_path = isinstance(file, (str, Path))

    # Normalise the three accepted input kinds into one file-like object.
    if given_as_path:
        file_path = Path(file)
        if not file_path.exists():
            raise ValueError(f"File not found: {file}")
        if filename is None:
            filename = file_path.name
        # The whole file is read into memory, so no OS handle stays open.
        file_obj = BytesIO(file_path.read_bytes())
    elif isinstance(file, bytes):
        if filename is None:
            raise ValueError("filename is required when ingesting bytes")
        file_obj = BytesIO(file)
    else:
        if filename is None:
            raise ValueError("filename is required when ingesting file object")
        file_obj = file

    client = self._client
    try:
        multipart = {"file": (filename, file_obj)}

        # Metadata and rules are sent as JSON strings inside the form.
        metadata_json = json.dumps(metadata or {})
        rules_json = json.dumps([client._convert_rule(rule) for rule in (rules or [])])
        form_data = {
            "metadata": metadata_json,
            "rules": rules_json,
            "end_user_id": self._end_user_id,
        }

        # The folder is only sent when this scope is limited to one.
        scoped_folder = self._folder_name
        if scoped_folder:
            form_data["folder_name"] = scoped_folder

        endpoint = f"ingest/file?use_colpali={str(use_colpali).lower()}"
        raw = client._request("POST", endpoint, data=form_data, files=multipart)

        document = client._logic._parse_document_response(raw)
        document._client = client
        return document
    finally:
        # Only the buffer created for a path input is closed here.
        if given_as_path:
            file_obj.close()
|
658
|
+
|
659
|
+
def ingest_files(
    self,
    files: List[Union[str, bytes, BinaryIO, Path]],
    metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest multiple files into Morphik as this end user.

    Per-file failures reported by the server are logged, not raised.

    Args:
        files: List of files to ingest (path strings, Paths, bytes, or file objects)
        metadata: Optional metadata
        rules: Optional list of rules to apply; either a flat list shared by
            all files or a list of lists with one rule list per file
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents, each with its client attached

    Raises:
        OSError: If one of the given paths cannot be opened; handles already
            opened by this call are closed before the error propagates
    """
    # Convert files to the ("files", (name, content)) tuples sent as multipart.
    file_objects = []
    opened_here = []  # handles this call opened itself (path inputs only)
    try:
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                handle = open(path, "rb")
                opened_here.append(handle)
                file_objects.append(("files", (path.name, handle)))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
    except BaseException:
        # A later open() failed: release what we opened so nothing leaks.
        # Caller-supplied file objects are left untouched on this path.
        for handle in opened_here:
            handle.close()
        raise

    try:
        # Convert rules appropriately
        if rules:
            if all(isinstance(r, list) for r in rules):
                # List of lists - per-file rules
                converted_rules = [
                    [self._client._convert_rule(r) for r in rule_list] for rule_list in rules
                ]
            else:
                # Flat list - shared rules for all files
                converted_rules = [self._client._convert_rule(r) for r in rules]
        else:
            converted_rules = []

        # Form fields are strings: JSON for structured values, "true"/"false" for flags.
        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
            "end_user_id": self._end_user_id,
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            data["folder_name"] = self._folder_name

        response = self._client._request("POST", "ingest/files", data=data, files=file_objects)

        if response.get("errors"):
            # Log errors but don't raise exception
            for error in response["errors"]:
                logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

        docs = [
            self._client._logic._parse_document_response(doc) for doc in response["documents"]
        ]
        for doc in docs:
            doc._client = self._client
        return docs
    finally:
        # Clean up every file-like object that is still open (raw bytes are skipped).
        for _, (_, file_obj) in file_objects:
            if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                file_obj.close()
|
736
|
+
|
737
|
+
def ingest_directory(
    self,
    directory: Union[str, Path],
    recursive: bool = False,
    pattern: str = "*",
    metadata: Optional[Dict[str, Any]] = None,
    rules: Optional[List[RuleOrDict]] = None,
    use_colpali: bool = True,
    parallel: bool = True,
) -> List[Document]:
    """
    Ingest every matching file of a directory on behalf of this end user.

    Args:
        directory: Path to directory containing files to ingest
        recursive: Whether to recursively process subdirectories
        pattern: Optional glob pattern to filter files
        metadata: Optional metadata dictionary to apply to all files
        rules: Optional list of rules to apply
        use_colpali: Whether to use ColPali-style embedding
        parallel: Whether to process files in parallel

    Returns:
        List[Document]: List of ingested documents (empty when nothing matches)

    Raises:
        ValueError: If ``directory`` is not an existing directory
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"Directory not found: {root}")

    # rglob descends into subdirectories, glob stays at the top level.
    find = root.rglob if recursive else root.glob

    # Globs can also match directories; keep regular files only.
    matched = [candidate for candidate in find(pattern) if candidate.is_file()]
    if not matched:
        return []

    return self.ingest_files(
        files=matched, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
    )
|
782
|
+
|
783
|
+
def retrieve_chunks(
    self,
    query: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 4,
    min_score: float = 0.0,
    use_colpali: bool = True,
) -> List[FinalChunkResult]:
    """
    Search for relevant chunks on behalf of this end user.

    Args:
        query: Search query text
        filters: Optional metadata filters
        k: Number of results (default: 4)
        min_score: Minimum similarity threshold (default: 0.0)
        use_colpali: Whether to use ColPali-style embedding model

    Returns:
        List[FinalChunkResult]: List of relevant chunks
    """
    body = dict(
        query=query,
        filters=filters,
        k=k,
        min_score=min_score,
        use_colpali=use_colpali,
        end_user_id=self._end_user_id,
    )

    # The folder is only sent when this scope is limited to one.
    scoped_folder = self._folder_name
    if scoped_folder:
        body["folder_name"] = scoped_folder

    client = self._client
    raw = client._request("POST", "retrieve/chunks", body)
    return client._logic._parse_chunk_result_list_response(raw)
|
819
|
+
|
820
|
+
def retrieve_docs(
|
821
|
+
self,
|
822
|
+
query: str,
|
823
|
+
filters: Optional[Dict[str, Any]] = None,
|
824
|
+
k: int = 4,
|
825
|
+
min_score: float = 0.0,
|
826
|
+
use_colpali: bool = True,
|
827
|
+
) -> List[DocumentResult]:
|
828
|
+
"""
|
829
|
+
Retrieve relevant documents as this end user.
|
830
|
+
|
831
|
+
Args:
|
832
|
+
query: Search query text
|
833
|
+
filters: Optional metadata filters
|
834
|
+
k: Number of results (default: 4)
|
835
|
+
min_score: Minimum similarity threshold (default: 0.0)
|
836
|
+
use_colpali: Whether to use ColPali-style embedding model
|
837
|
+
|
838
|
+
Returns:
|
839
|
+
List[DocumentResult]: List of relevant documents
|
840
|
+
"""
|
841
|
+
request = {
|
842
|
+
"query": query,
|
843
|
+
"filters": filters,
|
844
|
+
"k": k,
|
845
|
+
"min_score": min_score,
|
846
|
+
"use_colpali": use_colpali,
|
847
|
+
"end_user_id": self._end_user_id, # Add end user ID here
|
848
|
+
}
|
849
|
+
|
850
|
+
# Add folder name if scoped to a folder
|
851
|
+
if self._folder_name:
|
852
|
+
request["folder_name"] = self._folder_name
|
853
|
+
|
854
|
+
response = self._client._request("POST", "retrieve/docs", request)
|
855
|
+
return self._client._logic._parse_document_result_list_response(response)
|
856
|
+
|
857
|
+
def query(
|
858
|
+
self,
|
859
|
+
query: str,
|
860
|
+
filters: Optional[Dict[str, Any]] = None,
|
861
|
+
k: int = 4,
|
862
|
+
min_score: float = 0.0,
|
863
|
+
max_tokens: Optional[int] = None,
|
864
|
+
temperature: Optional[float] = None,
|
865
|
+
use_colpali: bool = True,
|
866
|
+
graph_name: Optional[str] = None,
|
867
|
+
hop_depth: int = 1,
|
868
|
+
include_paths: bool = False,
|
869
|
+
prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
|
870
|
+
) -> CompletionResponse:
|
871
|
+
"""
|
872
|
+
Generate completion using relevant chunks as context as this end user.
|
873
|
+
|
874
|
+
Args:
|
875
|
+
query: Query text
|
876
|
+
filters: Optional metadata filters
|
877
|
+
k: Number of chunks to use as context (default: 4)
|
878
|
+
min_score: Minimum similarity threshold (default: 0.0)
|
879
|
+
max_tokens: Maximum tokens in completion
|
880
|
+
temperature: Model temperature
|
881
|
+
use_colpali: Whether to use ColPali-style embedding model
|
882
|
+
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
883
|
+
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
884
|
+
include_paths: Whether to include relationship paths in the response
|
885
|
+
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
886
|
+
|
887
|
+
Returns:
|
888
|
+
CompletionResponse: Generated completion
|
889
|
+
"""
|
890
|
+
payload = self._client._logic._prepare_query_request(
|
891
|
+
query,
|
892
|
+
filters,
|
893
|
+
k,
|
894
|
+
min_score,
|
895
|
+
max_tokens,
|
896
|
+
temperature,
|
897
|
+
use_colpali,
|
898
|
+
graph_name,
|
899
|
+
hop_depth,
|
900
|
+
include_paths,
|
901
|
+
prompt_overrides,
|
902
|
+
self._folder_name,
|
903
|
+
self._end_user_id,
|
904
|
+
)
|
905
|
+
response = self._client._request("POST", "query", data=payload)
|
906
|
+
return self._client._logic._parse_completion_response(response)
|
907
|
+
|
908
|
+
def list_documents(
|
909
|
+
self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
|
910
|
+
) -> List[Document]:
|
911
|
+
"""
|
912
|
+
List accessible documents for this end user.
|
913
|
+
|
914
|
+
Args:
|
915
|
+
skip: Number of documents to skip
|
916
|
+
limit: Maximum number of documents to return
|
917
|
+
filters: Optional filters
|
918
|
+
|
919
|
+
Returns:
|
920
|
+
List[Document]: List of documents
|
921
|
+
"""
|
922
|
+
# Add end_user_id and folder_name to params
|
923
|
+
params = {"skip": skip, "limit": limit, "end_user_id": self._end_user_id}
|
924
|
+
|
925
|
+
# Add folder name if scoped to a folder
|
926
|
+
if self._folder_name:
|
927
|
+
params["folder_name"] = self._folder_name
|
928
|
+
|
929
|
+
response = self._client._request("POST", f"documents", data=filters or {}, params=params)
|
930
|
+
|
931
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
932
|
+
for doc in docs:
|
933
|
+
doc._client = self._client
|
934
|
+
return docs
|
935
|
+
|
936
|
+
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
|
937
|
+
"""
|
938
|
+
Retrieve multiple documents by their IDs in a single batch operation for this end user.
|
939
|
+
|
940
|
+
Args:
|
941
|
+
document_ids: List of document IDs to retrieve
|
942
|
+
|
943
|
+
Returns:
|
944
|
+
List[Document]: List of document metadata for found documents
|
945
|
+
"""
|
946
|
+
request = {"document_ids": document_ids, "end_user_id": self._end_user_id}
|
947
|
+
|
948
|
+
# Add folder name if scoped to a folder
|
949
|
+
if self._folder_name:
|
950
|
+
request["folder_name"] = self._folder_name
|
951
|
+
|
952
|
+
response = self._client._request("POST", "batch/documents", data=request)
|
953
|
+
docs = [self._client._logic._parse_document_response(doc) for doc in response]
|
954
|
+
for doc in docs:
|
955
|
+
doc._client = self._client
|
956
|
+
return docs
|
957
|
+
|
958
|
+
def batch_get_chunks(
|
959
|
+
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
960
|
+
) -> List[FinalChunkResult]:
|
961
|
+
"""
|
962
|
+
Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.
|
963
|
+
|
964
|
+
Args:
|
965
|
+
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
966
|
+
|
967
|
+
Returns:
|
968
|
+
List[FinalChunkResult]: List of chunk results
|
969
|
+
"""
|
970
|
+
# Convert to list of dictionaries if needed
|
971
|
+
source_dicts = []
|
972
|
+
for source in sources:
|
973
|
+
if isinstance(source, dict):
|
974
|
+
source_dicts.append(source)
|
975
|
+
else:
|
976
|
+
source_dicts.append(source.model_dump())
|
977
|
+
|
978
|
+
# Add end_user_id and folder_name to request
|
979
|
+
request = {"sources": source_dicts, "end_user_id": self._end_user_id}
|
980
|
+
|
981
|
+
# Add folder name if scoped to a folder
|
982
|
+
if self._folder_name:
|
983
|
+
request["folder_name"] = self._folder_name
|
984
|
+
|
985
|
+
response = self._client._request("POST", "batch/chunks", data=request)
|
986
|
+
return self._client._logic._parse_chunk_result_list_response(response)
|
987
|
+
|
988
|
+
def create_graph(
|
989
|
+
self,
|
990
|
+
name: str,
|
991
|
+
filters: Optional[Dict[str, Any]] = None,
|
992
|
+
documents: Optional[List[str]] = None,
|
993
|
+
prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
|
994
|
+
) -> Graph:
|
995
|
+
"""
|
996
|
+
Create a graph from documents for this end user.
|
997
|
+
|
998
|
+
Args:
|
999
|
+
name: Name of the graph to create
|
1000
|
+
filters: Optional metadata filters to determine which documents to include
|
1001
|
+
documents: Optional list of specific document IDs to include
|
1002
|
+
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
1003
|
+
|
1004
|
+
Returns:
|
1005
|
+
Graph: The created graph object
|
1006
|
+
"""
|
1007
|
+
# Convert prompt_overrides to dict if it's a model
|
1008
|
+
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
1009
|
+
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
1010
|
+
|
1011
|
+
request = {
|
1012
|
+
"name": name,
|
1013
|
+
"filters": filters,
|
1014
|
+
"documents": documents,
|
1015
|
+
"prompt_overrides": prompt_overrides,
|
1016
|
+
"end_user_id": self._end_user_id, # Add end user ID here
|
1017
|
+
}
|
1018
|
+
|
1019
|
+
# Add folder name if scoped to a folder
|
1020
|
+
if self._folder_name:
|
1021
|
+
request["folder_name"] = self._folder_name
|
15
1022
|
|
16
|
-
|
17
|
-
|
18
|
-
ChunkResult,
|
19
|
-
DocumentResult,
|
20
|
-
CompletionResponse,
|
21
|
-
IngestTextRequest,
|
22
|
-
ChunkSource,
|
23
|
-
Graph,
|
24
|
-
# Prompt override models
|
25
|
-
EntityExtractionExample,
|
26
|
-
EntityResolutionExample,
|
27
|
-
EntityExtractionPromptOverride,
|
28
|
-
EntityResolutionPromptOverride,
|
29
|
-
QueryPromptOverride,
|
30
|
-
GraphPromptOverrides,
|
31
|
-
QueryPromptOverrides
|
32
|
-
)
|
33
|
-
from .rules import Rule
|
1023
|
+
response = self._client._request("POST", "graph/create", request)
|
1024
|
+
return self._client._logic._parse_graph_response(response)
|
34
1025
|
|
35
|
-
|
1026
|
+
def update_graph(
|
1027
|
+
self,
|
1028
|
+
name: str,
|
1029
|
+
additional_filters: Optional[Dict[str, Any]] = None,
|
1030
|
+
additional_documents: Optional[List[str]] = None,
|
1031
|
+
prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
|
1032
|
+
) -> Graph:
|
1033
|
+
"""
|
1034
|
+
Update an existing graph with new documents for this end user.
|
1035
|
+
|
1036
|
+
Args:
|
1037
|
+
name: Name of the graph to update
|
1038
|
+
additional_filters: Optional additional metadata filters to determine which new documents to include
|
1039
|
+
additional_documents: Optional list of additional document IDs to include
|
1040
|
+
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
1041
|
+
|
1042
|
+
Returns:
|
1043
|
+
Graph: The updated graph
|
1044
|
+
"""
|
1045
|
+
# Convert prompt_overrides to dict if it's a model
|
1046
|
+
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
1047
|
+
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
36
1048
|
|
37
|
-
|
38
|
-
|
1049
|
+
request = {
|
1050
|
+
"additional_filters": additional_filters,
|
1051
|
+
"additional_documents": additional_documents,
|
1052
|
+
"prompt_overrides": prompt_overrides,
|
1053
|
+
"end_user_id": self._end_user_id, # Add end user ID here
|
1054
|
+
}
|
39
1055
|
|
1056
|
+
# Add folder name if scoped to a folder
|
1057
|
+
if self._folder_name:
|
1058
|
+
request["folder_name"] = self._folder_name
|
40
1059
|
|
41
|
-
|
42
|
-
|
43
|
-
self._db = db
|
44
|
-
self._name = name
|
1060
|
+
response = self._client._request("POST", f"graph/{name}/update", request)
|
1061
|
+
return self._client._logic._parse_graph_response(response)
|
45
1062
|
|
46
|
-
def
|
47
|
-
|
48
|
-
|
1063
|
+
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
1064
|
+
"""
|
1065
|
+
Delete a document by its filename for this end user.
|
49
1066
|
|
50
|
-
|
51
|
-
|
52
|
-
return response.get("success", False)
|
1067
|
+
Args:
|
1068
|
+
filename: Filename of the document to delete
|
53
1069
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
f"cache/{self._name}/query",
|
60
|
-
params={"query": query, "max_tokens": max_tokens, "temperature": temperature},
|
61
|
-
data="",
|
62
|
-
)
|
63
|
-
return CompletionResponse(**response)
|
1070
|
+
Returns:
|
1071
|
+
Dict[str, str]: Deletion status
|
1072
|
+
"""
|
1073
|
+
# Build parameters for the filename lookup
|
1074
|
+
params = {"end_user_id": self._end_user_id}
|
64
1075
|
|
1076
|
+
# Add folder name if scoped to a folder
|
1077
|
+
if self._folder_name:
|
1078
|
+
params["folder_name"] = self._folder_name
|
65
1079
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
document_id: str = Field(..., description="Parent document ID")
|
70
|
-
chunk_number: int = Field(..., description="Chunk sequence number")
|
71
|
-
metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
|
72
|
-
content_type: str = Field(..., description="Content type")
|
73
|
-
filename: Optional[str] = Field(None, description="Original filename")
|
74
|
-
download_url: Optional[str] = Field(None, description="URL to download full document")
|
1080
|
+
# First get the document ID
|
1081
|
+
response = self._client._request("GET", f"documents/filename/{filename}", params=params)
|
1082
|
+
doc = self._client._logic._parse_document_response(response)
|
75
1083
|
|
76
|
-
|
77
|
-
|
1084
|
+
# Then delete by ID
|
1085
|
+
return self._client.delete_document(doc.external_id)
|
78
1086
|
|
79
1087
|
|
80
1088
|
class Morphik:
|
@@ -98,33 +1106,8 @@ class Morphik:
|
|
98
1106
|
"""
|
99
1107
|
|
100
1108
|
def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
|
101
|
-
self.
|
102
|
-
self.
|
103
|
-
if is_local:
|
104
|
-
self._session.verify = False # Disable SSL for localhost
|
105
|
-
self._is_local = is_local
|
106
|
-
|
107
|
-
if uri:
|
108
|
-
self._setup_auth(uri)
|
109
|
-
else:
|
110
|
-
self._base_url = "http://localhost:8000"
|
111
|
-
self._auth_token = None
|
112
|
-
|
113
|
-
def _setup_auth(self, uri: str) -> None:
|
114
|
-
"""Setup authentication from URI"""
|
115
|
-
parsed = urlparse(uri)
|
116
|
-
if not parsed.netloc:
|
117
|
-
raise ValueError("Invalid URI format")
|
118
|
-
|
119
|
-
# Split host and auth parts
|
120
|
-
auth, host = parsed.netloc.split("@")
|
121
|
-
_, self._auth_token = auth.split(":")
|
122
|
-
|
123
|
-
# Set base URL
|
124
|
-
self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
|
125
|
-
|
126
|
-
# Basic token validation
|
127
|
-
jwt.decode(self._auth_token, options={"verify_signature": False})
|
1109
|
+
self._logic = _MorphikClientLogic(uri, timeout, is_local)
|
1110
|
+
self._client = httpx.Client(timeout=self._logic._timeout, verify=not self._logic._is_local)
|
128
1111
|
|
129
1112
|
def _request(
|
130
1113
|
self,
|
@@ -135,25 +1118,25 @@ class Morphik:
|
|
135
1118
|
params: Optional[Dict[str, Any]] = None,
|
136
1119
|
) -> Dict[str, Any]:
|
137
1120
|
"""Make HTTP request"""
|
138
|
-
|
139
|
-
|
140
|
-
|
1121
|
+
url = self._logic._get_url(endpoint)
|
1122
|
+
headers = self._logic._get_headers()
|
1123
|
+
if self._logic._auth_token: # Only add auth header if we have a token
|
1124
|
+
headers["Authorization"] = f"Bearer {self._logic._auth_token}"
|
141
1125
|
|
142
1126
|
# Configure request data based on type
|
143
1127
|
if files:
|
144
1128
|
# Multipart form data for files
|
145
1129
|
request_data = {"files": files, "data": data}
|
146
|
-
# Don't set Content-Type, let
|
1130
|
+
# Don't set Content-Type, let httpx handle it
|
147
1131
|
else:
|
148
1132
|
# JSON for everything else
|
149
1133
|
headers["Content-Type"] = "application/json"
|
150
1134
|
request_data = {"json": data}
|
151
1135
|
|
152
|
-
response = self.
|
1136
|
+
response = self._client.request(
|
153
1137
|
method,
|
154
|
-
|
1138
|
+
url,
|
155
1139
|
headers=headers,
|
156
|
-
timeout=self._timeout,
|
157
1140
|
params=params,
|
158
1141
|
**request_data,
|
159
1142
|
)
|
@@ -162,9 +1145,43 @@ class Morphik:
|
|
162
1145
|
|
163
1146
|
def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
|
164
1147
|
"""Convert a rule to a dictionary format"""
|
165
|
-
|
166
|
-
|
167
|
-
|
1148
|
+
return self._logic._convert_rule(rule)
|
1149
|
+
|
1150
|
+
def create_folder(self, name: str) -> Folder:
|
1151
|
+
"""
|
1152
|
+
Create a folder to scope operations.
|
1153
|
+
|
1154
|
+
Args:
|
1155
|
+
name: The name of the folder
|
1156
|
+
|
1157
|
+
Returns:
|
1158
|
+
Folder: A folder object for scoped operations
|
1159
|
+
"""
|
1160
|
+
return Folder(self, name)
|
1161
|
+
|
1162
|
+
def get_folder(self, name: str) -> Folder:
|
1163
|
+
"""
|
1164
|
+
Get a folder by name to scope operations.
|
1165
|
+
|
1166
|
+
Args:
|
1167
|
+
name: The name of the folder
|
1168
|
+
|
1169
|
+
Returns:
|
1170
|
+
Folder: A folder object for scoped operations
|
1171
|
+
"""
|
1172
|
+
return Folder(self, name)
|
1173
|
+
|
1174
|
+
def signin(self, end_user_id: str) -> UserScope:
|
1175
|
+
"""
|
1176
|
+
Sign in as an end user to scope operations.
|
1177
|
+
|
1178
|
+
Args:
|
1179
|
+
end_user_id: The ID of the end user
|
1180
|
+
|
1181
|
+
Returns:
|
1182
|
+
UserScope: A user scope object for scoped operations
|
1183
|
+
"""
|
1184
|
+
return UserScope(self, end_user_id)
|
168
1185
|
|
169
1186
|
def ingest_text(
|
170
1187
|
self,
|
@@ -209,15 +1226,12 @@ class Morphik:
|
|
209
1226
|
)
|
210
1227
|
```
|
211
1228
|
"""
|
212
|
-
|
213
|
-
|
214
|
-
filename
|
215
|
-
metadata=metadata or {},
|
216
|
-
rules=[self._convert_rule(r) for r in (rules or [])],
|
217
|
-
use_colpali=use_colpali,
|
1229
|
+
rules_list = [self._convert_rule(r) for r in (rules or [])]
|
1230
|
+
payload = self._logic._prepare_ingest_text_request(
|
1231
|
+
content, filename, metadata, rules_list, use_colpali, None, None
|
218
1232
|
)
|
219
|
-
response = self._request("POST", "ingest/text", data=
|
220
|
-
doc =
|
1233
|
+
response = self._request("POST", "ingest/text", data=payload)
|
1234
|
+
doc = self._logic._parse_document_response(response)
|
221
1235
|
doc._client = self
|
222
1236
|
return doc
|
223
1237
|
|
@@ -266,38 +1280,23 @@ class Morphik:
|
|
266
1280
|
)
|
267
1281
|
```
|
268
1282
|
"""
|
269
|
-
#
|
270
|
-
|
271
|
-
file_path = Path(file)
|
272
|
-
if not file_path.exists():
|
273
|
-
raise ValueError(f"File not found: {file}")
|
274
|
-
filename = file_path.name if filename is None else filename
|
275
|
-
with open(file_path, "rb") as f:
|
276
|
-
content = f.read()
|
277
|
-
file_obj = BytesIO(content)
|
278
|
-
elif isinstance(file, bytes):
|
279
|
-
if filename is None:
|
280
|
-
raise ValueError("filename is required when ingesting bytes")
|
281
|
-
file_obj = BytesIO(file)
|
282
|
-
else:
|
283
|
-
if filename is None:
|
284
|
-
raise ValueError("filename is required when ingesting file object")
|
285
|
-
file_obj = file
|
1283
|
+
# Process file input
|
1284
|
+
file_obj, filename = self._logic._prepare_file_for_upload(file, filename)
|
286
1285
|
|
287
1286
|
try:
|
288
1287
|
# Prepare multipart form data
|
289
1288
|
files = {"file": (filename, file_obj)}
|
290
1289
|
|
291
|
-
#
|
292
|
-
form_data =
|
293
|
-
"metadata": json.dumps(metadata or {}),
|
294
|
-
"rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
|
295
|
-
}
|
1290
|
+
# Create form data
|
1291
|
+
form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)
|
296
1292
|
|
297
1293
|
response = self._request(
|
298
|
-
"POST",
|
1294
|
+
"POST",
|
1295
|
+
f"ingest/file?use_colpali={str(use_colpali).lower()}",
|
1296
|
+
data=form_data,
|
1297
|
+
files=files,
|
299
1298
|
)
|
300
|
-
doc =
|
1299
|
+
doc = self._logic._parse_document_response(response)
|
301
1300
|
doc._client = self
|
302
1301
|
return doc
|
303
1302
|
finally:
|
@@ -330,44 +1329,22 @@ class Morphik:
|
|
330
1329
|
ValueError: If metadata list length doesn't match files length
|
331
1330
|
"""
|
332
1331
|
# Convert files to format expected by API
|
333
|
-
file_objects =
|
334
|
-
for file in files:
|
335
|
-
if isinstance(file, (str, Path)):
|
336
|
-
path = Path(file)
|
337
|
-
file_objects.append(("files", (path.name, open(path, "rb"))))
|
338
|
-
elif isinstance(file, bytes):
|
339
|
-
file_objects.append(("files", ("file.bin", file)))
|
340
|
-
else:
|
341
|
-
file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))
|
1332
|
+
file_objects = self._logic._prepare_files_for_upload(files)
|
342
1333
|
|
343
1334
|
try:
|
344
|
-
# Prepare
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
# List of lists - per-file rules
|
349
|
-
converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules]
|
350
|
-
else:
|
351
|
-
# Flat list - shared rules for all files
|
352
|
-
converted_rules = [self._convert_rule(r) for r in rules]
|
353
|
-
else:
|
354
|
-
converted_rules = []
|
355
|
-
|
356
|
-
data = {
|
357
|
-
"metadata": json.dumps(metadata or {}),
|
358
|
-
"rules": json.dumps(converted_rules),
|
359
|
-
"use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
|
360
|
-
"parallel": str(parallel).lower(),
|
361
|
-
}
|
1335
|
+
# Prepare form data
|
1336
|
+
data = self._logic._prepare_ingest_files_form_data(
|
1337
|
+
metadata, rules, use_colpali, parallel, None, None
|
1338
|
+
)
|
362
1339
|
|
363
1340
|
response = self._request("POST", "ingest/files", data=data, files=file_objects)
|
364
|
-
|
1341
|
+
|
365
1342
|
if response.get("errors"):
|
366
1343
|
# Log errors but don't raise exception
|
367
1344
|
for error in response["errors"]:
|
368
1345
|
logger.error(f"Failed to ingest {error['filename']}: {error['error']}")
|
369
|
-
|
370
|
-
docs = [
|
1346
|
+
|
1347
|
+
docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
|
371
1348
|
for doc in docs:
|
372
1349
|
doc._client = self
|
373
1350
|
return docs
|
@@ -417,17 +1394,13 @@ class Morphik:
|
|
417
1394
|
|
418
1395
|
# Filter out directories
|
419
1396
|
files = [f for f in files if f.is_file()]
|
420
|
-
|
1397
|
+
|
421
1398
|
if not files:
|
422
1399
|
return []
|
423
1400
|
|
424
1401
|
# Use ingest_files with collected paths
|
425
1402
|
return self.ingest_files(
|
426
|
-
files=files,
|
427
|
-
metadata=metadata,
|
428
|
-
rules=rules,
|
429
|
-
use_colpali=use_colpali,
|
430
|
-
parallel=parallel
|
1403
|
+
files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
|
431
1404
|
)
|
432
1405
|
|
433
1406
|
def retrieve_chunks(
|
@@ -458,52 +1431,11 @@ class Morphik:
|
|
458
1431
|
)
|
459
1432
|
```
|
460
1433
|
"""
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
"use_colpali": use_colpali,
|
467
|
-
}
|
468
|
-
|
469
|
-
response = self._request("POST", "retrieve/chunks", request)
|
470
|
-
chunks = [ChunkResult(**r) for r in response]
|
471
|
-
|
472
|
-
final_chunks = []
|
473
|
-
|
474
|
-
for chunk in chunks:
|
475
|
-
if chunk.metadata.get("is_image"):
|
476
|
-
try:
|
477
|
-
# Handle data URI format "data:image/png;base64,..."
|
478
|
-
content = chunk.content
|
479
|
-
if content.startswith("data:"):
|
480
|
-
# Extract the base64 part after the comma
|
481
|
-
content = content.split(",", 1)[1]
|
482
|
-
|
483
|
-
# Now decode the base64 string
|
484
|
-
image_bytes = base64.b64decode(content)
|
485
|
-
content = Image.open(io.BytesIO(image_bytes))
|
486
|
-
except Exception as e:
|
487
|
-
print(f"Error processing image: {str(e)}")
|
488
|
-
# Fall back to using the content as text
|
489
|
-
print(chunk.content)
|
490
|
-
else:
|
491
|
-
content = chunk.content
|
492
|
-
|
493
|
-
final_chunks.append(
|
494
|
-
FinalChunkResult(
|
495
|
-
content=content,
|
496
|
-
score=chunk.score,
|
497
|
-
document_id=chunk.document_id,
|
498
|
-
chunk_number=chunk.chunk_number,
|
499
|
-
metadata=chunk.metadata,
|
500
|
-
content_type=chunk.content_type,
|
501
|
-
filename=chunk.filename,
|
502
|
-
download_url=chunk.download_url,
|
503
|
-
)
|
504
|
-
)
|
505
|
-
|
506
|
-
return final_chunks
|
1434
|
+
payload = self._logic._prepare_retrieve_chunks_request(
|
1435
|
+
query, filters, k, min_score, use_colpali, None, None
|
1436
|
+
)
|
1437
|
+
response = self._request("POST", "retrieve/chunks", data=payload)
|
1438
|
+
return self._logic._parse_chunk_result_list_response(response)
|
507
1439
|
|
508
1440
|
def retrieve_docs(
|
509
1441
|
self,
|
@@ -533,16 +1465,11 @@ class Morphik:
|
|
533
1465
|
)
|
534
1466
|
```
|
535
1467
|
"""
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
"use_colpali": use_colpali,
|
542
|
-
}
|
543
|
-
|
544
|
-
response = self._request("POST", "retrieve/docs", request)
|
545
|
-
return [DocumentResult(**r) for r in response]
|
1468
|
+
payload = self._logic._prepare_retrieve_docs_request(
|
1469
|
+
query, filters, k, min_score, use_colpali, None, None
|
1470
|
+
)
|
1471
|
+
response = self._request("POST", "retrieve/docs", data=payload)
|
1472
|
+
return self._logic._parse_document_result_list_response(response)
|
546
1473
|
|
547
1474
|
def query(
|
548
1475
|
self,
|
@@ -585,7 +1512,7 @@ class Morphik:
|
|
585
1512
|
filters={"department": "research"},
|
586
1513
|
temperature=0.7
|
587
1514
|
)
|
588
|
-
|
1515
|
+
|
589
1516
|
# Knowledge graph enhanced query
|
590
1517
|
response = db.query(
|
591
1518
|
"How does product X relate to customer segment Y?",
|
@@ -593,7 +1520,7 @@ class Morphik:
|
|
593
1520
|
hop_depth=2,
|
594
1521
|
include_paths=True
|
595
1522
|
)
|
596
|
-
|
1523
|
+
|
597
1524
|
# With prompt customization
|
598
1525
|
from morphik.models import QueryPromptOverride, QueryPromptOverrides
|
599
1526
|
response = db.query(
|
@@ -604,7 +1531,7 @@ class Morphik:
|
|
604
1531
|
)
|
605
1532
|
)
|
606
1533
|
)
|
607
|
-
|
1534
|
+
|
608
1535
|
# Or using a dictionary
|
609
1536
|
response = db.query(
|
610
1537
|
"What are the key findings?",
|
@@ -614,35 +1541,32 @@ class Morphik:
|
|
614
1541
|
}
|
615
1542
|
}
|
616
1543
|
)
|
617
|
-
|
1544
|
+
|
618
1545
|
print(response.completion)
|
619
|
-
|
1546
|
+
|
620
1547
|
# If include_paths=True, you can inspect the graph paths
|
621
1548
|
if response.metadata and "graph" in response.metadata:
|
622
1549
|
for path in response.metadata["graph"]["paths"]:
|
623
1550
|
print(" -> ".join(path))
|
624
1551
|
```
|
625
1552
|
"""
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
response = self._request("POST", "query", request)
|
645
|
-
return CompletionResponse(**response)
|
1553
|
+
payload = self._logic._prepare_query_request(
|
1554
|
+
query,
|
1555
|
+
filters,
|
1556
|
+
k,
|
1557
|
+
min_score,
|
1558
|
+
max_tokens,
|
1559
|
+
temperature,
|
1560
|
+
use_colpali,
|
1561
|
+
graph_name,
|
1562
|
+
hop_depth,
|
1563
|
+
include_paths,
|
1564
|
+
prompt_overrides,
|
1565
|
+
None,
|
1566
|
+
None,
|
1567
|
+
)
|
1568
|
+
response = self._request("POST", "query", data=payload)
|
1569
|
+
return self._logic._parse_completion_response(response)
|
646
1570
|
|
647
1571
|
def list_documents(
|
648
1572
|
self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
|
@@ -667,9 +1591,9 @@ class Morphik:
|
|
667
1591
|
next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
|
668
1592
|
```
|
669
1593
|
"""
|
670
|
-
|
671
|
-
response = self._request("POST",
|
672
|
-
docs =
|
1594
|
+
params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
|
1595
|
+
response = self._request("POST", "documents", data=data, params=params)
|
1596
|
+
docs = self._logic._parse_document_list_response(response)
|
673
1597
|
for doc in docs:
|
674
1598
|
doc._client = self
|
675
1599
|
return docs
|
@@ -691,10 +1615,10 @@ class Morphik:
|
|
691
1615
|
```
|
692
1616
|
"""
|
693
1617
|
response = self._request("GET", f"documents/{document_id}")
|
694
|
-
doc =
|
1618
|
+
doc = self._logic._parse_document_response(response)
|
695
1619
|
doc._client = self
|
696
1620
|
return doc
|
697
|
-
|
1621
|
+
|
698
1622
|
def get_document_by_filename(self, filename: str) -> Document:
|
699
1623
|
"""
|
700
1624
|
Get document metadata by filename.
|
@@ -713,10 +1637,10 @@ class Morphik:
|
|
713
1637
|
```
|
714
1638
|
"""
|
715
1639
|
response = self._request("GET", f"documents/filename/{filename}")
|
716
|
-
doc =
|
1640
|
+
doc = self._logic._parse_document_response(response)
|
717
1641
|
doc._client = self
|
718
1642
|
return doc
|
719
|
-
|
1643
|
+
|
720
1644
|
def update_document_with_text(
|
721
1645
|
self,
|
722
1646
|
document_id: str,
|
@@ -763,19 +1687,16 @@ class Morphik:
|
|
763
1687
|
rules=[self._convert_rule(r) for r in (rules or [])],
|
764
1688
|
use_colpali=use_colpali if use_colpali is not None else True,
|
765
1689
|
)
|
766
|
-
|
1690
|
+
|
767
1691
|
params = {}
|
768
1692
|
if update_strategy != "add":
|
769
1693
|
params["update_strategy"] = update_strategy
|
770
|
-
|
1694
|
+
|
771
1695
|
response = self._request(
|
772
|
-
"POST",
|
773
|
-
f"documents/{document_id}/update_text",
|
774
|
-
data=request.model_dump(),
|
775
|
-
params=params
|
1696
|
+
"POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params
|
776
1697
|
)
|
777
|
-
|
778
|
-
doc =
|
1698
|
+
|
1699
|
+
doc = self._logic._parse_document_response(response)
|
779
1700
|
doc._client = self
|
780
1701
|
return doc
|
781
1702
|
|
@@ -833,34 +1754,34 @@ class Morphik:
|
|
833
1754
|
if filename is None:
|
834
1755
|
raise ValueError("filename is required when updating with file object")
|
835
1756
|
file_obj = file
|
836
|
-
|
1757
|
+
|
837
1758
|
try:
|
838
1759
|
# Prepare multipart form data
|
839
1760
|
files = {"file": (filename, file_obj)}
|
840
|
-
|
1761
|
+
|
841
1762
|
# Convert metadata and rules to JSON strings
|
842
1763
|
form_data = {
|
843
1764
|
"metadata": json.dumps(metadata or {}),
|
844
1765
|
"rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
|
845
1766
|
"update_strategy": update_strategy,
|
846
1767
|
}
|
847
|
-
|
1768
|
+
|
848
1769
|
if use_colpali is not None:
|
849
1770
|
form_data["use_colpali"] = str(use_colpali).lower()
|
850
|
-
|
1771
|
+
|
851
1772
|
# Use the dedicated file update endpoint
|
852
1773
|
response = self._request(
|
853
1774
|
"POST", f"documents/{document_id}/update_file", data=form_data, files=files
|
854
1775
|
)
|
855
|
-
|
856
|
-
doc =
|
1776
|
+
|
1777
|
+
doc = self._logic._parse_document_response(response)
|
857
1778
|
doc._client = self
|
858
1779
|
return doc
|
859
1780
|
finally:
|
860
1781
|
# Close file if we opened it
|
861
1782
|
if isinstance(file, (str, Path)):
|
862
1783
|
file_obj.close()
|
863
|
-
|
1784
|
+
|
864
1785
|
def update_document_metadata(
|
865
1786
|
self,
|
866
1787
|
document_id: str,
|
@@ -868,14 +1789,14 @@ class Morphik:
|
|
868
1789
|
) -> Document:
|
869
1790
|
"""
|
870
1791
|
Update a document's metadata only.
|
871
|
-
|
1792
|
+
|
872
1793
|
Args:
|
873
1794
|
document_id: ID of the document to update
|
874
1795
|
metadata: Metadata to update
|
875
|
-
|
1796
|
+
|
876
1797
|
Returns:
|
877
1798
|
Document: Updated document metadata
|
878
|
-
|
1799
|
+
|
879
1800
|
Example:
|
880
1801
|
```python
|
881
1802
|
# Update just the metadata of a document
|
@@ -888,10 +1809,10 @@ class Morphik:
|
|
888
1809
|
"""
|
889
1810
|
# Use the dedicated metadata update endpoint
|
890
1811
|
response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
|
891
|
-
doc =
|
1812
|
+
doc = self._logic._parse_document_response(response)
|
892
1813
|
doc._client = self
|
893
1814
|
return doc
|
894
|
-
|
1815
|
+
|
895
1816
|
def update_document_by_filename_with_text(
|
896
1817
|
self,
|
897
1818
|
filename: str,
|
@@ -932,7 +1853,7 @@ class Morphik:
|
|
932
1853
|
"""
|
933
1854
|
# First get the document by filename to obtain its ID
|
934
1855
|
doc = self.get_document_by_filename(filename)
|
935
|
-
|
1856
|
+
|
936
1857
|
# Then use the regular update_document_with_text endpoint with the document ID
|
937
1858
|
return self.update_document_with_text(
|
938
1859
|
document_id=doc.external_id,
|
@@ -941,9 +1862,9 @@ class Morphik:
|
|
941
1862
|
metadata=metadata,
|
942
1863
|
rules=rules,
|
943
1864
|
update_strategy=update_strategy,
|
944
|
-
use_colpali=use_colpali
|
1865
|
+
use_colpali=use_colpali,
|
945
1866
|
)
|
946
|
-
|
1867
|
+
|
947
1868
|
def update_document_by_filename_with_file(
|
948
1869
|
self,
|
949
1870
|
filename: str,
|
@@ -983,7 +1904,7 @@ class Morphik:
|
|
983
1904
|
"""
|
984
1905
|
# First get the document by filename to obtain its ID
|
985
1906
|
doc = self.get_document_by_filename(filename)
|
986
|
-
|
1907
|
+
|
987
1908
|
# Then use the regular update_document_with_file endpoint with the document ID
|
988
1909
|
return self.update_document_with_file(
|
989
1910
|
document_id=doc.external_id,
|
@@ -992,9 +1913,9 @@ class Morphik:
|
|
992
1913
|
metadata=metadata,
|
993
1914
|
rules=rules,
|
994
1915
|
update_strategy=update_strategy,
|
995
|
-
use_colpali=use_colpali
|
1916
|
+
use_colpali=use_colpali,
|
996
1917
|
)
|
997
|
-
|
1918
|
+
|
998
1919
|
def update_document_by_filename_metadata(
|
999
1920
|
self,
|
1000
1921
|
filename: str,
|
@@ -1003,15 +1924,15 @@ class Morphik:
|
|
1003
1924
|
) -> Document:
|
1004
1925
|
"""
|
1005
1926
|
Update a document's metadata using filename to identify the document.
|
1006
|
-
|
1927
|
+
|
1007
1928
|
Args:
|
1008
1929
|
filename: Filename of the document to update
|
1009
1930
|
metadata: Metadata to update
|
1010
1931
|
new_filename: Optional new filename to assign to the document
|
1011
|
-
|
1932
|
+
|
1012
1933
|
Returns:
|
1013
1934
|
Document: Updated document metadata
|
1014
|
-
|
1935
|
+
|
1015
1936
|
Example:
|
1016
1937
|
```python
|
1017
1938
|
# Update just the metadata of a document identified by filename
|
@@ -1025,44 +1946,44 @@ class Morphik:
|
|
1025
1946
|
"""
|
1026
1947
|
# First get the document by filename to obtain its ID
|
1027
1948
|
doc = self.get_document_by_filename(filename)
|
1028
|
-
|
1949
|
+
|
1029
1950
|
# Update the metadata
|
1030
1951
|
result = self.update_document_metadata(
|
1031
1952
|
document_id=doc.external_id,
|
1032
1953
|
metadata=metadata,
|
1033
1954
|
)
|
1034
|
-
|
1955
|
+
|
1035
1956
|
# If new_filename is provided, update the filename as well
|
1036
1957
|
if new_filename:
|
1037
1958
|
# Create a request that retains the just-updated metadata but also changes filename
|
1038
1959
|
combined_metadata = result.metadata.copy()
|
1039
|
-
|
1960
|
+
|
1040
1961
|
# Update the document again with filename change and the same metadata
|
1041
1962
|
response = self._request(
|
1042
|
-
"POST",
|
1043
|
-
f"documents/{doc.external_id}/update_text",
|
1963
|
+
"POST",
|
1964
|
+
f"documents/{doc.external_id}/update_text",
|
1044
1965
|
data={
|
1045
|
-
"content": "",
|
1966
|
+
"content": "",
|
1046
1967
|
"filename": new_filename,
|
1047
1968
|
"metadata": combined_metadata,
|
1048
|
-
"rules": []
|
1049
|
-
}
|
1969
|
+
"rules": [],
|
1970
|
+
},
|
1050
1971
|
)
|
1051
|
-
result =
|
1972
|
+
result = self._logic._parse_document_response(response)
|
1052
1973
|
result._client = self
|
1053
|
-
|
1974
|
+
|
1054
1975
|
return result
|
1055
|
-
|
1976
|
+
|
1056
1977
|
def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
|
1057
1978
|
"""
|
1058
1979
|
Retrieve multiple documents by their IDs in a single batch operation.
|
1059
|
-
|
1980
|
+
|
1060
1981
|
Args:
|
1061
1982
|
document_ids: List of document IDs to retrieve
|
1062
|
-
|
1983
|
+
|
1063
1984
|
Returns:
|
1064
1985
|
List[Document]: List of document metadata for found documents
|
1065
|
-
|
1986
|
+
|
1066
1987
|
Example:
|
1067
1988
|
```python
|
1068
1989
|
docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
|
@@ -1071,21 +1992,23 @@ class Morphik:
|
|
1071
1992
|
```
|
1072
1993
|
"""
|
1073
1994
|
response = self._request("POST", "batch/documents", data=document_ids)
|
1074
|
-
docs =
|
1995
|
+
docs = self._logic._parse_document_list_response(response)
|
1075
1996
|
for doc in docs:
|
1076
1997
|
doc._client = self
|
1077
1998
|
return docs
|
1078
|
-
|
1079
|
-
def batch_get_chunks(
|
1999
|
+
|
2000
|
+
def batch_get_chunks(
|
2001
|
+
self, sources: List[Union[ChunkSource, Dict[str, Any]]]
|
2002
|
+
) -> List[FinalChunkResult]:
|
1080
2003
|
"""
|
1081
2004
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
1082
|
-
|
2005
|
+
|
1083
2006
|
Args:
|
1084
2007
|
sources: List of ChunkSource objects or dictionaries with document_id and chunk_number
|
1085
|
-
|
2008
|
+
|
1086
2009
|
Returns:
|
1087
2010
|
List[FinalChunkResult]: List of chunk results
|
1088
|
-
|
2011
|
+
|
1089
2012
|
Example:
|
1090
2013
|
```python
|
1091
2014
|
# Using dictionaries
|
@@ -1093,14 +2016,14 @@ class Morphik:
|
|
1093
2016
|
{"document_id": "doc_123", "chunk_number": 0},
|
1094
2017
|
{"document_id": "doc_456", "chunk_number": 2}
|
1095
2018
|
]
|
1096
|
-
|
2019
|
+
|
1097
2020
|
# Or using ChunkSource objects
|
1098
2021
|
from morphik.models import ChunkSource
|
1099
2022
|
sources = [
|
1100
2023
|
ChunkSource(document_id="doc_123", chunk_number=0),
|
1101
2024
|
ChunkSource(document_id="doc_456", chunk_number=2)
|
1102
2025
|
]
|
1103
|
-
|
2026
|
+
|
1104
2027
|
chunks = db.batch_get_chunks(sources)
|
1105
2028
|
for chunk in chunks:
|
1106
2029
|
print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
|
@@ -1113,44 +2036,9 @@ class Morphik:
|
|
1113
2036
|
source_dicts.append(source)
|
1114
2037
|
else:
|
1115
2038
|
source_dicts.append(source.model_dump())
|
1116
|
-
|
2039
|
+
|
1117
2040
|
response = self._request("POST", "batch/chunks", data=source_dicts)
|
1118
|
-
|
1119
|
-
|
1120
|
-
final_chunks = []
|
1121
|
-
for chunk in chunks:
|
1122
|
-
if chunk.metadata.get("is_image"):
|
1123
|
-
try:
|
1124
|
-
# Handle data URI format "data:image/png;base64,..."
|
1125
|
-
content = chunk.content
|
1126
|
-
if content.startswith("data:"):
|
1127
|
-
# Extract the base64 part after the comma
|
1128
|
-
content = content.split(",", 1)[1]
|
1129
|
-
|
1130
|
-
# Now decode the base64 string
|
1131
|
-
image_bytes = base64.b64decode(content)
|
1132
|
-
content = Image.open(io.BytesIO(image_bytes))
|
1133
|
-
except Exception as e:
|
1134
|
-
print(f"Error processing image: {str(e)}")
|
1135
|
-
# Fall back to using the content as text
|
1136
|
-
content = chunk.content
|
1137
|
-
else:
|
1138
|
-
content = chunk.content
|
1139
|
-
|
1140
|
-
final_chunks.append(
|
1141
|
-
FinalChunkResult(
|
1142
|
-
content=content,
|
1143
|
-
score=chunk.score,
|
1144
|
-
document_id=chunk.document_id,
|
1145
|
-
chunk_number=chunk.chunk_number,
|
1146
|
-
metadata=chunk.metadata,
|
1147
|
-
content_type=chunk.content_type,
|
1148
|
-
filename=chunk.filename,
|
1149
|
-
download_url=chunk.download_url,
|
1150
|
-
)
|
1151
|
-
)
|
1152
|
-
|
1153
|
-
return final_chunks
|
2041
|
+
return self._logic._parse_chunk_result_list_response(response)
|
1154
2042
|
|
1155
2043
|
def create_cache(
|
1156
2044
|
self,
|
@@ -1252,11 +2140,11 @@ class Morphik:
|
|
1252
2140
|
name="custom_graph",
|
1253
2141
|
documents=["doc1", "doc2", "doc3"]
|
1254
2142
|
)
|
1255
|
-
|
2143
|
+
|
1256
2144
|
# With custom entity extraction examples
|
1257
2145
|
from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
|
1258
2146
|
graph = db.create_graph(
|
1259
|
-
name="medical_graph",
|
2147
|
+
name="medical_graph",
|
1260
2148
|
filters={"category": "medical"},
|
1261
2149
|
prompt_overrides=GraphPromptOverrides(
|
1262
2150
|
entity_extraction=EntityExtractionPromptOverride(
|
@@ -1272,7 +2160,7 @@ class Morphik:
|
|
1272
2160
|
# Convert prompt_overrides to dict if it's a model
|
1273
2161
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
1274
2162
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
1275
|
-
|
2163
|
+
|
1276
2164
|
request = {
|
1277
2165
|
"name": name,
|
1278
2166
|
"filters": filters,
|
@@ -1281,8 +2169,8 @@ class Morphik:
|
|
1281
2169
|
}
|
1282
2170
|
|
1283
2171
|
response = self._request("POST", "graph/create", request)
|
1284
|
-
return
|
1285
|
-
|
2172
|
+
return self._logic._parse_graph_response(response)
|
2173
|
+
|
1286
2174
|
def get_graph(self, name: str) -> Graph:
|
1287
2175
|
"""
|
1288
2176
|
Get a graph by name.
|
@@ -1301,7 +2189,7 @@ class Morphik:
|
|
1301
2189
|
```
|
1302
2190
|
"""
|
1303
2191
|
response = self._request("GET", f"graph/{name}")
|
1304
|
-
return
|
2192
|
+
return self._logic._parse_graph_response(response)
|
1305
2193
|
|
1306
2194
|
def list_graphs(self) -> List[Graph]:
|
1307
2195
|
"""
|
@@ -1319,8 +2207,8 @@ class Morphik:
|
|
1319
2207
|
```
|
1320
2208
|
"""
|
1321
2209
|
response = self._request("GET", "graphs")
|
1322
|
-
return
|
1323
|
-
|
2210
|
+
return self._logic._parse_graph_list_response(response)
|
2211
|
+
|
1324
2212
|
def update_graph(
|
1325
2213
|
self,
|
1326
2214
|
name: str,
|
@@ -1330,20 +2218,20 @@ class Morphik:
|
|
1330
2218
|
) -> Graph:
|
1331
2219
|
"""
|
1332
2220
|
Update an existing graph with new documents.
|
1333
|
-
|
2221
|
+
|
1334
2222
|
This method processes additional documents matching the original or new filters,
|
1335
2223
|
extracts entities and relationships, and updates the graph with new information.
|
1336
|
-
|
2224
|
+
|
1337
2225
|
Args:
|
1338
2226
|
name: Name of the graph to update
|
1339
2227
|
additional_filters: Optional additional metadata filters to determine which new documents to include
|
1340
2228
|
additional_documents: Optional list of additional document IDs to include
|
1341
2229
|
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
1342
2230
|
Either a GraphPromptOverrides object or a dictionary with the same structure
|
1343
|
-
|
2231
|
+
|
1344
2232
|
Returns:
|
1345
2233
|
Graph: The updated graph
|
1346
|
-
|
2234
|
+
|
1347
2235
|
Example:
|
1348
2236
|
```python
|
1349
2237
|
# Update a graph with new documents
|
@@ -1353,7 +2241,7 @@ class Morphik:
|
|
1353
2241
|
additional_documents=["doc4", "doc5"]
|
1354
2242
|
)
|
1355
2243
|
print(f"Graph now has {len(updated_graph.entities)} entities")
|
1356
|
-
|
2244
|
+
|
1357
2245
|
# With entity resolution examples
|
1358
2246
|
from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
|
1359
2247
|
updated_graph = db.update_graph(
|
@@ -1363,7 +2251,7 @@ class Morphik:
|
|
1363
2251
|
entity_resolution=EntityResolutionPromptOverride(
|
1364
2252
|
examples=[
|
1365
2253
|
EntityResolutionExample(
|
1366
|
-
canonical="Machine Learning",
|
2254
|
+
canonical="Machine Learning",
|
1367
2255
|
variants=["ML", "machine learning", "AI/ML"]
|
1368
2256
|
)
|
1369
2257
|
]
|
@@ -1375,7 +2263,7 @@ class Morphik:
|
|
1375
2263
|
# Convert prompt_overrides to dict if it's a model
|
1376
2264
|
if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
|
1377
2265
|
prompt_overrides = prompt_overrides.model_dump(exclude_none=True)
|
1378
|
-
|
2266
|
+
|
1379
2267
|
request = {
|
1380
2268
|
"additional_filters": additional_filters,
|
1381
2269
|
"additional_documents": additional_documents,
|
@@ -1383,23 +2271,23 @@ class Morphik:
|
|
1383
2271
|
}
|
1384
2272
|
|
1385
2273
|
response = self._request("POST", f"graph/{name}/update", request)
|
1386
|
-
return
|
1387
|
-
|
2274
|
+
return self._logic._parse_graph_response(response)
|
2275
|
+
|
1388
2276
|
def delete_document(self, document_id: str) -> Dict[str, str]:
|
1389
2277
|
"""
|
1390
2278
|
Delete a document and all its associated data.
|
1391
|
-
|
2279
|
+
|
1392
2280
|
This method deletes a document and all its associated data, including:
|
1393
2281
|
- Document metadata
|
1394
2282
|
- Document content in storage
|
1395
2283
|
- Document chunks and embeddings in vector store
|
1396
|
-
|
2284
|
+
|
1397
2285
|
Args:
|
1398
2286
|
document_id: ID of the document to delete
|
1399
|
-
|
2287
|
+
|
1400
2288
|
Returns:
|
1401
2289
|
Dict[str, str]: Deletion status
|
1402
|
-
|
2290
|
+
|
1403
2291
|
Example:
|
1404
2292
|
```python
|
1405
2293
|
# Delete a document
|
@@ -1409,20 +2297,20 @@ class Morphik:
|
|
1409
2297
|
"""
|
1410
2298
|
response = self._request("DELETE", f"documents/{document_id}")
|
1411
2299
|
return response
|
1412
|
-
|
2300
|
+
|
1413
2301
|
def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
|
1414
2302
|
"""
|
1415
2303
|
Delete a document by its filename.
|
1416
|
-
|
2304
|
+
|
1417
2305
|
This is a convenience method that first retrieves the document ID by filename
|
1418
2306
|
and then deletes the document by ID.
|
1419
|
-
|
2307
|
+
|
1420
2308
|
Args:
|
1421
2309
|
filename: Filename of the document to delete
|
1422
|
-
|
2310
|
+
|
1423
2311
|
Returns:
|
1424
2312
|
Dict[str, str]: Deletion status
|
1425
|
-
|
2313
|
+
|
1426
2314
|
Example:
|
1427
2315
|
```python
|
1428
2316
|
# Delete a document by filename
|
@@ -1432,13 +2320,13 @@ class Morphik:
|
|
1432
2320
|
"""
|
1433
2321
|
# First get the document by filename to obtain its ID
|
1434
2322
|
doc = self.get_document_by_filename(filename)
|
1435
|
-
|
2323
|
+
|
1436
2324
|
# Then delete the document by ID
|
1437
2325
|
return self.delete_document(doc.external_id)
|
1438
2326
|
|
1439
2327
|
def close(self):
|
1440
|
-
"""Close the HTTP
|
1441
|
-
self.
|
2328
|
+
"""Close the HTTP client"""
|
2329
|
+
self._client.close()
|
1442
2330
|
|
1443
2331
|
def __enter__(self):
|
1444
2332
|
return self
|