morphik 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: morphik
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Morphik Python Client
5
5
  Author-email: Morphik <founders@morphik.ai>
6
6
  Requires-Python: >=3.8
@@ -12,4 +12,4 @@ __all__ = [
12
12
  "Document",
13
13
  ]
14
14
 
15
- __version__ = "0.1.0"
15
+ __version__ = "0.1.1"
@@ -0,0 +1,507 @@
1
+ import base64
2
+ import io
3
+ import json
4
+ from io import BytesIO, IOBase
5
+ from PIL import Image
6
+ from PIL.Image import Image as PILImage
7
+ from pathlib import Path
8
+ from typing import Dict, Any, List, Optional, Union, Tuple, BinaryIO
9
+ from urllib.parse import urlparse
10
+
11
+ import jwt
12
+ from pydantic import BaseModel, Field
13
+
14
+ from .models import (
15
+ Document,
16
+ ChunkResult,
17
+ DocumentResult,
18
+ CompletionResponse,
19
+ IngestTextRequest,
20
+ ChunkSource,
21
+ Graph,
22
+ # Prompt override models
23
+ GraphPromptOverrides,
24
+ )
25
+ from .rules import Rule
26
+
27
+ # Type alias for rules
28
+ RuleOrDict = Union[Rule, Dict[str, Any]]
29
+
30
+
31
class FinalChunkResult(BaseModel):
    """A chunk result as returned to callers, with image chunks decoded.

    ``content`` holds the chunk text, or — when the server flagged the chunk
    as an image and decoding succeeded — a ``PIL.Image.Image`` instance
    (hence ``arbitrary_types_allowed``).
    """

    # NOTE: use typing.Union rather than the PEP 604 ``str | PILImage`` form.
    # This annotation is evaluated at class-definition time, and the package
    # declares Requires-Python >= 3.8, where ``X | Y`` raises TypeError.
    content: Union[str, PILImage] = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score")
    document_id: str = Field(..., description="Parent document ID")
    chunk_number: int = Field(..., description="Chunk sequence number")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
    content_type: str = Field(..., description="Content type")
    filename: Optional[str] = Field(None, description="Original filename")
    download_url: Optional[str] = Field(None, description="URL to download full document")

    class Config:
        # Required so pydantic accepts the non-pydantic PILImage type above.
        arbitrary_types_allowed = True
43
+
44
+
45
class _MorphikClientLogic:
    """
    Internal shared logic for Morphik clients.

    This class contains the shared logic between synchronous and asynchronous clients.
    It handles URL generation, request preparation, and response parsing; it performs
    no I/O itself, so both client flavors can delegate to it.
    """

    def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
        """Initialize shared client logic.

        Args:
            uri: Connection URI of the form ``<scheme>://<owner>:<token>@<host>``.
                When omitted, the client targets a local unauthenticated server.
            timeout: Request timeout in seconds (stored for the owning client to use).
            is_local: When True, generated base URLs use ``http`` instead of ``https``.
        """
        self._timeout = timeout
        self._is_local = is_local

        if uri:
            self._setup_auth(uri)
        else:
            # No URI given: assume a local development server with no auth token.
            self._base_url = "http://localhost:8000"
            self._auth_token = None

    def _setup_auth(self, uri: str) -> None:
        """Parse *uri* and populate ``_base_url`` and ``_auth_token``.

        Raises:
            ValueError: If the URI lacks a ``<owner>:<token>@<host>`` netloc.
            jwt.exceptions.DecodeError: If the token is not a parseable JWT.
        """
        parsed = urlparse(uri)
        if not parsed.netloc:
            raise ValueError("Invalid URI format")

        # Split host and auth parts. Guard the splits and use maxsplit so a
        # missing "@" or stray ":"/"@" in the credentials yields the intended
        # "Invalid URI format" error instead of an opaque unpacking ValueError.
        if "@" not in parsed.netloc:
            raise ValueError("Invalid URI format")
        auth, host = parsed.netloc.rsplit("@", 1)
        if ":" not in auth:
            raise ValueError("Invalid URI format")
        _, self._auth_token = auth.split(":", 1)

        # Set base URL; plain http only for explicitly-local clients
        self._base_url = f"{'http' if self._is_local else 'https'}://{host}"

        # Basic token validation: confirm it parses as a JWT. The signature is
        # deliberately not verified client-side — the server does that.
        jwt.decode(self._auth_token, options={"verify_signature": False})

    def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
        """Convert a rule to a dictionary format.

        Rule objects expose ``to_dict()``; plain dicts pass through unchanged.
        """
        if hasattr(rule, "to_dict"):
            return rule.to_dict()
        return rule

    def _get_url(self, endpoint: str) -> str:
        """Get the full URL for an API endpoint (joins base URL and path)."""
        return f"{self._base_url}/{endpoint.lstrip('/')}"

    def _get_headers(self) -> Dict[str, str]:
        """Get base headers for API requests (auth is added by the clients)."""
        headers = {"Content-Type": "application/json"}
        return headers

    # Request preparation methods

    def _prepare_ingest_text_request(
        self,
        content: str,
        filename: Optional[str],
        metadata: Optional[Dict[str, Any]],
        rules: Optional[List[RuleOrDict]],
        use_colpali: bool,
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the JSON payload for the ingest_text endpoint.

        ``folder_name`` / ``end_user_id`` scoping keys are included only when
        set, so unscoped requests stay minimal.
        """
        rules_dict = [self._convert_rule(r) for r in (rules or [])]
        payload = {
            "content": content,
            "filename": filename,
            "metadata": metadata or {},
            "rules": rules_dict,
            "use_colpali": use_colpali,
        }
        if folder_name:
            payload["folder_name"] = folder_name
        if end_user_id:
            payload["end_user_id"] = end_user_id
        return payload

    def _prepare_file_for_upload(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
    ) -> Tuple[BinaryIO, str]:
        """
        Process file input and return ``(file_object, filename)``.

        Handles different file input types:
        - str / Path: read from disk into a BytesIO (filename defaults to the
          file's own name);
        - bytes: wrapped in BytesIO (explicit filename required);
        - file-like object: passed through as-is (explicit filename required).

        Raises:
            ValueError: If a path does not exist, or filename is missing where required.
        """
        if isinstance(file, (str, Path)):
            file_path = Path(file)
            if not file_path.exists():
                raise ValueError(f"File not found: {file}")
            filename = file_path.name if filename is None else filename
            # Read into memory so the caller never holds an open OS handle.
            with open(file_path, "rb") as f:
                content = f.read()
                file_obj = BytesIO(content)
        elif isinstance(file, bytes):
            if filename is None:
                raise ValueError("filename is required when ingesting bytes")
            file_obj = BytesIO(file)
        else:
            if filename is None:
                raise ValueError("filename is required when ingesting file object")
            file_obj = file

        return file_obj, filename

    def _prepare_files_for_upload(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
    ) -> List[Tuple[str, Tuple[str, BinaryIO]]]:
        """
        Process multiple files and return a list of file objects in the format
        expected by the API: [("files", (filename, file_obj)), ...]

        bytes and anonymous file-like objects fall back to the name "file.bin".

        NOTE(review): path inputs are opened here and never closed by this
        class — the HTTP layer is presumably expected to consume and release
        them; confirm callers close these handles after the request.
        """
        file_objects = []
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", BytesIO(file))))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        return file_objects

    def _prepare_ingest_file_form_data(
        self,
        metadata: Optional[Dict[str, Any]],
        rules: Optional[List[RuleOrDict]],
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare multipart form data for the ingest_file endpoint.

        metadata and rules are JSON-encoded strings, as the endpoint expects
        form fields rather than a JSON body.
        """
        form_data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
        }
        if folder_name:
            form_data["folder_name"] = folder_name
        if end_user_id:
            form_data["end_user_id"] = end_user_id
        return form_data

    def _prepare_ingest_files_form_data(
        self,
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]],
        rules: Optional[List[RuleOrDict]],
        use_colpali: bool,
        parallel: bool,
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare multipart form data for the ingest_files (batch) endpoint.

        ``rules`` may be a flat list (shared by every file) or a list of lists
        (one rule list per file); both shapes are converted and JSON-encoded.
        Booleans are lowercased strings because they travel as form fields.
        """
        # Convert rules appropriately based on whether it's a flat list or list of lists
        if rules:
            if all(isinstance(r, list) for r in rules):
                # List of lists - per-file rules
                converted_rules = [
                    [self._convert_rule(r) for r in rule_list] for rule_list in rules
                ]
            else:
                # Flat list - shared rules for all files
                converted_rules = [self._convert_rule(r) for r in rules]
        else:
            converted_rules = []

        data = {
            "metadata": json.dumps(metadata or {}),
            "rules": json.dumps(converted_rules),
            # NOTE(review): when use_colpali is None this leaves a None form
            # value in the dict — verify the HTTP layer drops or tolerates it.
            "use_colpali": str(use_colpali).lower() if use_colpali is not None else None,
            "parallel": str(parallel).lower(),
        }

        if folder_name:
            data["folder_name"] = folder_name
        if end_user_id:
            data["end_user_id"] = end_user_id

        return data

    def _prepare_query_request(
        self,
        query: str,
        filters: Optional[Dict[str, Any]],
        k: int,
        min_score: float,
        max_tokens: Optional[int],
        temperature: Optional[float],
        use_colpali: bool,
        graph_name: Optional[str],
        hop_depth: int,
        include_paths: bool,
        prompt_overrides: Optional[Dict],
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the JSON payload for the query (completion) endpoint.

        All None-valued entries are stripped so the server applies its own
        defaults for omitted parameters.
        """
        payload = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "use_colpali": use_colpali,
            "graph_name": graph_name,
            "hop_depth": hop_depth,
            "include_paths": include_paths,
            "prompt_overrides": prompt_overrides,
        }
        if folder_name:
            payload["folder_name"] = folder_name
        if end_user_id:
            payload["end_user_id"] = end_user_id
        # Filter out None values before sending (k_p/v_p avoid shadowing param k)
        return {k_p: v_p for k_p, v_p in payload.items() if v_p is not None}

    def _prepare_retrieve_chunks_request(
        self,
        query: str,
        filters: Optional[Dict[str, Any]],
        k: int,
        min_score: float,
        use_colpali: bool,
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the JSON payload for the retrieve_chunks endpoint."""
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
        }
        if folder_name:
            request["folder_name"] = folder_name
        if end_user_id:
            request["end_user_id"] = end_user_id
        return request

    def _prepare_retrieve_docs_request(
        self,
        query: str,
        filters: Optional[Dict[str, Any]],
        k: int,
        min_score: float,
        use_colpali: bool,
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the JSON payload for the retrieve_docs endpoint."""
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
        }
        if folder_name:
            request["folder_name"] = folder_name
        if end_user_id:
            request["end_user_id"] = end_user_id
        return request

    def _prepare_list_documents_request(
        self,
        skip: int,
        limit: int,
        filters: Optional[Dict[str, Any]],
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Prepare the list_documents request.

        Returns:
            ``(params, data)`` — pagination/scoping goes in the query-string
            params; metadata filters travel in the request body.
        """
        params = {
            "skip": skip,
            "limit": limit,
        }
        if folder_name:
            params["folder_name"] = folder_name
        if end_user_id:
            params["end_user_id"] = end_user_id
        data = filters or {}
        return params, data

    def _prepare_batch_get_documents_request(
        self, document_ids: List[str], folder_name: Optional[str], end_user_id: Optional[str]
    ) -> Dict[str, Any]:
        """Prepare the batch_get_documents request.

        Returns a scoped dict when folder/user scoping is requested, otherwise
        the bare ID list (the endpoint accepts both shapes).
        """
        if folder_name or end_user_id:
            request = {"document_ids": document_ids}
            if folder_name:
                request["folder_name"] = folder_name
            if end_user_id:
                request["end_user_id"] = end_user_id
            return request
        return document_ids  # Return just IDs list if no scoping is needed

    def _prepare_batch_get_chunks_request(
        self,
        sources: List[Union[ChunkSource, Dict[str, Any]]],
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the batch_get_chunks request.

        ChunkSource models are dumped to dicts; as with batch_get_documents,
        the bare source list is returned when no scoping is requested.
        """
        source_dicts = []
        for source in sources:
            if isinstance(source, dict):
                source_dicts.append(source)
            else:
                source_dicts.append(source.model_dump())

        if folder_name or end_user_id:
            request = {"sources": source_dicts}
            if folder_name:
                request["folder_name"] = folder_name
            if end_user_id:
                request["end_user_id"] = end_user_id
            return request
        return source_dicts  # Return just sources list if no scoping is needed

    def _prepare_create_graph_request(
        self,
        name: str,
        filters: Optional[Dict[str, Any]],
        documents: Optional[List[str]],
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]],
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the JSON payload for the create_graph endpoint."""
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "name": name,
            "filters": filters,
            "documents": documents,
            "prompt_overrides": prompt_overrides,
        }
        if folder_name:
            request["folder_name"] = folder_name
        if end_user_id:
            request["end_user_id"] = end_user_id
        return request

    def _prepare_update_graph_request(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]],
        additional_documents: Optional[List[str]],
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]],
        folder_name: Optional[str],
        end_user_id: Optional[str],
    ) -> Dict[str, Any]:
        """Prepare the JSON payload for the update_graph endpoint.

        Note: ``name`` identifies the graph in the URL path and is accepted
        here for signature symmetry; it is not part of the body.
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "additional_filters": additional_filters,
            "additional_documents": additional_documents,
            "prompt_overrides": prompt_overrides,
        }
        if folder_name:
            request["folder_name"] = folder_name
        if end_user_id:
            request["end_user_id"] = end_user_id
        return request

    def _prepare_update_document_with_text_request(
        self,
        document_id: str,
        content: str,
        filename: Optional[str],
        metadata: Optional[Dict[str, Any]],
        rules: Optional[List],
        update_strategy: str,
        use_colpali: Optional[bool],
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Prepare the update_document_with_text request.

        Returns:
            ``(params, body)`` — params carry ``update_strategy`` only when it
            differs from the server default ("add"); the body is the dumped
            IngestTextRequest. ``use_colpali`` defaults to True when unset.
        """
        request = IngestTextRequest(
            content=content,
            filename=filename,
            metadata=metadata or {},
            rules=[self._convert_rule(r) for r in (rules or [])],
            use_colpali=use_colpali if use_colpali is not None else True,
        )

        params = {}
        if update_strategy != "add":
            params["update_strategy"] = update_strategy

        return params, request.model_dump()

    # Response parsing methods

    def _parse_document_response(self, response_json: Dict[str, Any]) -> Document:
        """Parse a single-document response into a Document model."""
        return Document(**response_json)

    def _parse_completion_response(self, response_json: Dict[str, Any]) -> CompletionResponse:
        """Parse a completion response into a CompletionResponse model."""
        return CompletionResponse(**response_json)

    def _parse_document_list_response(self, response_json: List[Dict[str, Any]]) -> List[Document]:
        """Parse a document list response into Document models."""
        docs = [Document(**doc) for doc in response_json]
        return docs

    def _parse_document_result_list_response(
        self, response_json: List[Dict[str, Any]]
    ) -> List[DocumentResult]:
        """Parse a document result list response into DocumentResult models."""
        return [DocumentResult(**r) for r in response_json]

    def _parse_chunk_result_list_response(
        self, response_json: List[Dict[str, Any]]
    ) -> List[FinalChunkResult]:
        """Parse a chunk result list, decoding image chunks to PIL images.

        Chunks whose metadata flags ``is_image`` have their content
        base64-decoded (with or without a ``data:`` URI prefix) into a PIL
        image; on any decoding failure the original text content is kept.
        """
        chunks = [ChunkResult(**r) for r in response_json]

        final_chunks = []
        for chunk in chunks:
            content = chunk.content
            if chunk.metadata.get("is_image"):
                try:
                    # Handle data URI format "data:image/png;base64,..."
                    if content.startswith("data:"):
                        # Extract the base64 part after the comma
                        content = content.split(",", 1)[1]

                    # Now decode the base64 string
                    image_bytes = base64.b64decode(content)
                    content = Image.open(io.BytesIO(image_bytes))
                except Exception:
                    # Fall back to using the content as text
                    content = chunk.content

            final_chunks.append(
                FinalChunkResult(
                    content=content,
                    score=chunk.score,
                    document_id=chunk.document_id,
                    chunk_number=chunk.chunk_number,
                    metadata=chunk.metadata,
                    content_type=chunk.content_type,
                    filename=chunk.filename,
                    download_url=chunk.download_url,
                )
            )

        return final_chunks

    def _parse_graph_response(self, response_json: Dict[str, Any]) -> Graph:
        """Parse a graph response into a Graph model."""
        return Graph(**response_json)

    def _parse_graph_list_response(self, response_json: List[Dict[str, Any]]) -> List[Graph]:
        """Parse a graph list response into Graph models."""
        return [Graph(**graph) for graph in response_json]