morphik 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +2 -2
- morphik/_internal.py +36 -27
- morphik/async_.py +294 -127
- morphik/models.py +79 -58
- morphik/rules.py +28 -5
- morphik/sync.py +352 -144
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/METADATA +4 -8
- morphik-0.1.6.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.6.dist-info}/WHEEL +0 -0
morphik/models.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
|
2
|
-
from pathlib import Path
|
3
1
|
from datetime import datetime
|
4
|
-
from
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Any, BinaryIO, Dict, List, Literal, Optional, Union
|
4
|
+
|
5
|
+
from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
|
5
6
|
|
6
7
|
|
7
8
|
class Document(BaseModel):
|
@@ -11,20 +12,14 @@ class Document(BaseModel):
|
|
11
12
|
content_type: str = Field(..., description="Content type of the document")
|
12
13
|
filename: Optional[str] = Field(None, description="Original filename if available")
|
13
14
|
metadata: Dict[str, Any] = Field(default_factory=dict, description="User-defined metadata")
|
14
|
-
storage_info: Dict[str, str] = Field(
|
15
|
-
|
16
|
-
)
|
17
|
-
system_metadata: Dict[str, Any] = Field(
|
18
|
-
default_factory=dict, description="System-managed metadata"
|
19
|
-
)
|
20
|
-
access_control: Dict[str, Any] = Field(
|
21
|
-
default_factory=dict, description="Access control information"
|
22
|
-
)
|
15
|
+
storage_info: Dict[str, str] = Field(default_factory=dict, description="Storage-related information")
|
16
|
+
system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
|
17
|
+
access_control: Dict[str, Any] = Field(default_factory=dict, description="Access control information")
|
23
18
|
chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks")
|
24
19
|
|
25
20
|
# Client reference for update methods
|
26
21
|
_client = None
|
27
|
-
|
22
|
+
|
28
23
|
@property
|
29
24
|
def status(self) -> Dict[str, Any]:
|
30
25
|
"""Get the latest processing status of the document from the API.
|
@@ -37,38 +32,38 @@ class Document(BaseModel):
|
|
37
32
|
"Document instance not connected to a client. Use a document returned from a Morphik client method."
|
38
33
|
)
|
39
34
|
return self._client.get_document_status(self.external_id)
|
40
|
-
|
35
|
+
|
41
36
|
@property
|
42
37
|
def is_processing(self) -> bool:
|
43
38
|
"""Check if the document is still being processed."""
|
44
39
|
return self.status.get("status") == "processing"
|
45
|
-
|
40
|
+
|
46
41
|
@property
|
47
42
|
def is_ingested(self) -> bool:
|
48
43
|
"""Check if the document has completed processing."""
|
49
44
|
return self.status.get("status") == "completed"
|
50
|
-
|
45
|
+
|
51
46
|
@property
|
52
47
|
def is_failed(self) -> bool:
|
53
48
|
"""Check if document processing has failed."""
|
54
49
|
return self.status.get("status") == "failed"
|
55
|
-
|
50
|
+
|
56
51
|
@property
|
57
52
|
def error(self) -> Optional[str]:
|
58
53
|
"""Get the error message if processing failed."""
|
59
54
|
status_info = self.status
|
60
55
|
return status_info.get("error") if status_info.get("status") == "failed" else None
|
61
|
-
|
56
|
+
|
62
57
|
def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
|
63
58
|
"""Wait for document processing to complete.
|
64
|
-
|
59
|
+
|
65
60
|
Args:
|
66
61
|
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
67
62
|
check_interval_seconds: Time between status checks (default: 2 seconds)
|
68
|
-
|
63
|
+
|
69
64
|
Returns:
|
70
65
|
Document: Updated document with the latest status
|
71
|
-
|
66
|
+
|
72
67
|
Raises:
|
73
68
|
TimeoutError: If processing doesn't complete within the timeout period
|
74
69
|
ValueError: If processing fails with an error
|
@@ -173,9 +168,7 @@ class Document(BaseModel):
|
|
173
168
|
"Document instance not connected to a client. Use a document returned from a Morphik client method."
|
174
169
|
)
|
175
170
|
|
176
|
-
return self._client.update_document_metadata(
|
177
|
-
document_id=self.external_id, metadata=metadata
|
178
|
-
)
|
171
|
+
return self._client.update_document_metadata(document_id=self.external_id, metadata=metadata)
|
179
172
|
|
180
173
|
|
181
174
|
class ChunkResult(BaseModel):
|
@@ -227,12 +220,13 @@ class ChunkSource(BaseModel):
|
|
227
220
|
class CompletionResponse(BaseModel):
|
228
221
|
"""Completion response model"""
|
229
222
|
|
230
|
-
completion: str
|
231
|
-
|
232
|
-
sources: List[ChunkSource] = Field(
|
233
|
-
default_factory=list, description="Sources of chunks used in the completion"
|
223
|
+
completion: Optional[Union[str, Dict[str, Any], None]] = Field(
|
224
|
+
None, description="Generated text completion or structured output"
|
234
225
|
)
|
226
|
+
usage: Dict[str, int]
|
227
|
+
sources: List[ChunkSource] = Field(default_factory=list, description="Sources of chunks used in the completion")
|
235
228
|
metadata: Optional[Dict[str, Any]] = None
|
229
|
+
finish_reason: Optional[str] = Field(None, description="Reason the generation finished (e.g., 'stop', 'length')")
|
236
230
|
|
237
231
|
|
238
232
|
class IngestTextRequest(BaseModel):
|
@@ -253,9 +247,7 @@ class Entity(BaseModel):
|
|
253
247
|
type: str = Field(..., description="Entity type")
|
254
248
|
properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties")
|
255
249
|
document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
|
256
|
-
chunk_sources: Dict[str, List[int]] = Field(
|
257
|
-
default_factory=dict, description="Source chunk numbers by document ID"
|
258
|
-
)
|
250
|
+
chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
|
259
251
|
|
260
252
|
def __hash__(self):
|
261
253
|
return hash(self.id)
|
@@ -274,9 +266,7 @@ class Relationship(BaseModel):
|
|
274
266
|
target_id: str = Field(..., description="Target entity ID")
|
275
267
|
type: str = Field(..., description="Relationship type")
|
276
268
|
document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
|
277
|
-
chunk_sources: Dict[str, List[int]] = Field(
|
278
|
-
default_factory=dict, description="Source chunk numbers by document ID"
|
279
|
-
)
|
269
|
+
chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
|
280
270
|
|
281
271
|
def __hash__(self):
|
282
272
|
return hash(self.id)
|
@@ -293,20 +283,56 @@ class Graph(BaseModel):
|
|
293
283
|
id: str = Field(..., description="Unique graph identifier")
|
294
284
|
name: str = Field(..., description="Graph name")
|
295
285
|
entities: List[Entity] = Field(default_factory=list, description="Entities in the graph")
|
296
|
-
relationships: List[Relationship] = Field(
|
297
|
-
default_factory=list, description="Relationships in the graph"
|
298
|
-
)
|
286
|
+
relationships: List[Relationship] = Field(default_factory=list, description="Relationships in the graph")
|
299
287
|
metadata: Dict[str, Any] = Field(default_factory=dict, description="Graph metadata")
|
288
|
+
system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
|
300
289
|
document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
|
301
|
-
filters: Optional[Dict[str, Any]] = Field(
|
302
|
-
None, description="Document filters used to create the graph"
|
303
|
-
)
|
290
|
+
filters: Optional[Dict[str, Any]] = Field(None, description="Document filters used to create the graph")
|
304
291
|
created_at: datetime = Field(..., description="Creation timestamp")
|
305
292
|
updated_at: datetime = Field(..., description="Last update timestamp")
|
306
293
|
owner: Dict[str, str] = Field(default_factory=dict, description="Graph owner information")
|
307
|
-
access_control: Dict[str, List[str]] = Field(
|
308
|
-
|
309
|
-
)
|
294
|
+
access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
|
295
|
+
|
296
|
+
_client: Any | None = PrivateAttr(default=None)
|
297
|
+
|
298
|
+
# ---------------- Convenience helpers ----------------
|
299
|
+
@property
|
300
|
+
def status(self) -> str | None:
|
301
|
+
"""Return processing status if available."""
|
302
|
+
return self.system_metadata.get("status") if self.system_metadata else None
|
303
|
+
|
304
|
+
@property
|
305
|
+
def is_processing(self) -> bool:
|
306
|
+
return self.status == "processing"
|
307
|
+
|
308
|
+
@property
|
309
|
+
def is_completed(self) -> bool:
|
310
|
+
return self.status == "completed"
|
311
|
+
|
312
|
+
@property
|
313
|
+
def is_failed(self) -> bool:
|
314
|
+
return self.status == "failed"
|
315
|
+
|
316
|
+
@property
|
317
|
+
def error(self) -> str | None:
|
318
|
+
return self.system_metadata.get("error") if self.system_metadata else None
|
319
|
+
|
320
|
+
def wait_for_completion(self, timeout_seconds: int = 300, check_interval_seconds: int = 5) -> "Graph":
|
321
|
+
"""Poll the server until the graph processing is finished."""
|
322
|
+
import time
|
323
|
+
|
324
|
+
if not self._client:
|
325
|
+
raise RuntimeError("Graph object has no client reference for polling")
|
326
|
+
|
327
|
+
start = time.time()
|
328
|
+
while time.time() - start < timeout_seconds:
|
329
|
+
refreshed = self._client.get_graph(self.name)
|
330
|
+
if refreshed.is_completed:
|
331
|
+
return refreshed
|
332
|
+
if refreshed.is_failed:
|
333
|
+
raise RuntimeError(refreshed.error or "Graph creation failed")
|
334
|
+
time.sleep(check_interval_seconds)
|
335
|
+
raise TimeoutError("Timed out waiting for graph completion")
|
310
336
|
|
311
337
|
|
312
338
|
class EntityExtractionExample(BaseModel):
|
@@ -318,9 +344,7 @@ class EntityExtractionExample(BaseModel):
|
|
318
344
|
"""
|
319
345
|
|
320
346
|
label: str = Field(..., description="The entity label (e.g., 'John Doe', 'Apple Inc.')")
|
321
|
-
type: str = Field(
|
322
|
-
..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')"
|
323
|
-
)
|
347
|
+
type: str = Field(..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')")
|
324
348
|
properties: Optional[Dict[str, Any]] = Field(
|
325
349
|
default_factory=dict,
|
326
350
|
description="Optional properties of the entity (e.g., {'role': 'CEO', 'age': 42})",
|
@@ -337,9 +361,7 @@ class EntityResolutionExample(BaseModel):
|
|
337
361
|
"""
|
338
362
|
|
339
363
|
canonical: str = Field(..., description="The canonical (standard/preferred) form of the entity")
|
340
|
-
variants: List[str] = Field(
|
341
|
-
..., description="List of variant forms that should resolve to the canonical form"
|
342
|
-
)
|
364
|
+
variants: List[str] = Field(..., description="List of variant forms that should resolve to the canonical form")
|
343
365
|
|
344
366
|
|
345
367
|
class EntityExtractionPromptOverride(BaseModel):
|
@@ -425,11 +447,13 @@ class GraphPromptOverrides(BaseModel):
|
|
425
447
|
|
426
448
|
entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
|
427
449
|
None,
|
428
|
-
description="Overrides for entity extraction prompts - controls how entities are identified in text
|
450
|
+
description="Overrides for entity extraction prompts - controls how entities are identified in text "
|
451
|
+
"during graph operations",
|
429
452
|
)
|
430
453
|
entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
|
431
454
|
None,
|
432
|
-
description="Overrides for entity resolution prompts - controls how variant forms are grouped
|
455
|
+
description="Overrides for entity resolution prompts - controls how variant forms are grouped "
|
456
|
+
"during graph operations",
|
433
457
|
)
|
434
458
|
|
435
459
|
@model_validator(mode="after")
|
@@ -455,7 +479,8 @@ class QueryPromptOverrides(BaseModel):
|
|
455
479
|
|
456
480
|
entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
|
457
481
|
None,
|
458
|
-
description="Overrides for entity extraction prompts - controls how entities are identified in text
|
482
|
+
description="Overrides for entity extraction prompts - controls how entities are identified in text "
|
483
|
+
"during queries",
|
459
484
|
)
|
460
485
|
entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
|
461
486
|
None,
|
@@ -475,9 +500,5 @@ class FolderInfo(BaseModel):
|
|
475
500
|
description: Optional[str] = Field(None, description="Folder description")
|
476
501
|
owner: Dict[str, str] = Field(..., description="Owner information")
|
477
502
|
document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
|
478
|
-
system_metadata: Dict[str, Any] = Field(
|
479
|
-
|
480
|
-
)
|
481
|
-
access_control: Dict[str, List[str]] = Field(
|
482
|
-
default_factory=dict, description="Access control information"
|
483
|
-
)
|
503
|
+
system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
|
504
|
+
access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
|
morphik/rules.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
from typing import Dict, Any, Type, Union
|
2
1
|
from abc import ABC, abstractmethod
|
2
|
+
from typing import Any, Dict, Literal, Type, Union
|
3
|
+
|
3
4
|
from pydantic import BaseModel
|
4
5
|
|
5
6
|
|
@@ -15,8 +16,22 @@ class Rule(ABC):
|
|
15
16
|
class MetadataExtractionRule(Rule):
|
16
17
|
"""Server-side rule for extracting metadata using a schema"""
|
17
18
|
|
18
|
-
def __init__(
|
19
|
+
def __init__(
|
20
|
+
self,
|
21
|
+
schema: Union[Type[BaseModel], Dict[str, Any]],
|
22
|
+
stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
|
23
|
+
use_images: bool = False,
|
24
|
+
):
|
25
|
+
"""
|
26
|
+
Args:
|
27
|
+
schema: Pydantic model or dict schema defining metadata fields to extract
|
28
|
+
stage: When to apply the rule - either "post_parsing" (full document text) or
|
29
|
+
"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
|
30
|
+
use_images: Whether to process image chunks instead of text chunks. Defaults to False.
|
31
|
+
"""
|
19
32
|
self.schema = schema
|
33
|
+
self.stage = stage
|
34
|
+
self.use_images = use_images
|
20
35
|
|
21
36
|
def to_dict(self) -> Dict[str, Any]:
|
22
37
|
if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
|
@@ -26,22 +41,30 @@ class MetadataExtractionRule(Rule):
|
|
26
41
|
# Assume it's already a dict schema
|
27
42
|
schema_dict = self.schema
|
28
43
|
|
29
|
-
return {
|
44
|
+
return {
|
45
|
+
"type": "metadata_extraction",
|
46
|
+
"schema": schema_dict,
|
47
|
+
"stage": self.stage,
|
48
|
+
"use_images": self.use_images,
|
49
|
+
}
|
30
50
|
|
31
51
|
|
32
52
|
class NaturalLanguageRule(Rule):
|
33
53
|
"""Server-side rule for transforming content using natural language"""
|
34
54
|
|
35
|
-
def __init__(self, prompt: str):
|
55
|
+
def __init__(self, prompt: str, stage: Literal["post_parsing", "post_chunking"] = "post_parsing"):
|
36
56
|
"""
|
37
57
|
Args:
|
38
58
|
prompt: Instruction for how to transform the content
|
39
59
|
e.g. "Remove any personal information" or "Convert to bullet points"
|
60
|
+
stage: When to apply the rule - either "post_parsing" (full document text) or
|
61
|
+
"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
|
40
62
|
"""
|
41
63
|
self.prompt = prompt
|
64
|
+
self.stage = stage
|
42
65
|
|
43
66
|
def to_dict(self) -> Dict[str, Any]:
|
44
|
-
return {"type": "natural_language", "prompt": self.prompt}
|
67
|
+
return {"type": "natural_language", "prompt": self.prompt, "stage": self.stage}
|
45
68
|
|
46
69
|
|
47
70
|
__all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]
|