morphik-0.1.4-py3-none-any.whl → morphik-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphik/__init__.py +1 -1
- morphik/_internal.py +28 -19
- morphik/async_.py +121 -110
- morphik/models.py +36 -57
- morphik/rules.py +28 -5
- morphik/sync.py +156 -109
- morphik/tests/README.md +1 -1
- morphik/tests/example_usage.py +69 -69
- morphik/tests/test_async.py +166 -82
- morphik/tests/test_docs/sample1.txt +1 -1
- morphik/tests/test_docs/sample2.txt +2 -2
- morphik/tests/test_docs/sample3.txt +1 -1
- morphik/tests/test_sync.py +162 -84
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/METADATA +4 -8
- morphik-0.1.5.dist-info/RECORD +18 -0
- morphik-0.1.4.dist-info/RECORD +0 -18
- {morphik-0.1.4.dist-info → morphik-0.1.5.dist-info}/WHEEL +0 -0
morphik/models.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
|
2
|
-
from pathlib import Path
|
3
1
|
from datetime import datetime
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Any, BinaryIO, Dict, List, Literal, Optional, Union
|
4
|
+
|
4
5
|
from pydantic import BaseModel, Field, field_validator, model_validator
|
5
6
|
|
6
7
|
|
@@ -11,20 +12,14 @@ class Document(BaseModel):
|
|
11
12
|
content_type: str = Field(..., description="Content type of the document")
|
12
13
|
filename: Optional[str] = Field(None, description="Original filename if available")
|
13
14
|
metadata: Dict[str, Any] = Field(default_factory=dict, description="User-defined metadata")
|
14
|
-
storage_info: Dict[str, str] = Field(
|
15
|
-
|
16
|
-
)
|
17
|
-
system_metadata: Dict[str, Any] = Field(
|
18
|
-
default_factory=dict, description="System-managed metadata"
|
19
|
-
)
|
20
|
-
access_control: Dict[str, Any] = Field(
|
21
|
-
default_factory=dict, description="Access control information"
|
22
|
-
)
|
15
|
+
storage_info: Dict[str, str] = Field(default_factory=dict, description="Storage-related information")
|
16
|
+
system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
|
17
|
+
access_control: Dict[str, Any] = Field(default_factory=dict, description="Access control information")
|
23
18
|
chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks")
|
24
19
|
|
25
20
|
# Client reference for update methods
|
26
21
|
_client = None
|
27
|
-
|
22
|
+
|
28
23
|
@property
|
29
24
|
def status(self) -> Dict[str, Any]:
|
30
25
|
"""Get the latest processing status of the document from the API.
|
@@ -37,38 +32,38 @@ class Document(BaseModel):
|
|
37
32
|
"Document instance not connected to a client. Use a document returned from a Morphik client method."
|
38
33
|
)
|
39
34
|
return self._client.get_document_status(self.external_id)
|
40
|
-
|
35
|
+
|
41
36
|
@property
|
42
37
|
def is_processing(self) -> bool:
|
43
38
|
"""Check if the document is still being processed."""
|
44
39
|
return self.status.get("status") == "processing"
|
45
|
-
|
40
|
+
|
46
41
|
@property
|
47
42
|
def is_ingested(self) -> bool:
|
48
43
|
"""Check if the document has completed processing."""
|
49
44
|
return self.status.get("status") == "completed"
|
50
|
-
|
45
|
+
|
51
46
|
@property
|
52
47
|
def is_failed(self) -> bool:
|
53
48
|
"""Check if document processing has failed."""
|
54
49
|
return self.status.get("status") == "failed"
|
55
|
-
|
50
|
+
|
56
51
|
@property
|
57
52
|
def error(self) -> Optional[str]:
|
58
53
|
"""Get the error message if processing failed."""
|
59
54
|
status_info = self.status
|
60
55
|
return status_info.get("error") if status_info.get("status") == "failed" else None
|
61
|
-
|
56
|
+
|
62
57
|
def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
|
63
58
|
"""Wait for document processing to complete.
|
64
|
-
|
59
|
+
|
65
60
|
Args:
|
66
61
|
timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
|
67
62
|
check_interval_seconds: Time between status checks (default: 2 seconds)
|
68
|
-
|
63
|
+
|
69
64
|
Returns:
|
70
65
|
Document: Updated document with the latest status
|
71
|
-
|
66
|
+
|
72
67
|
Raises:
|
73
68
|
TimeoutError: If processing doesn't complete within the timeout period
|
74
69
|
ValueError: If processing fails with an error
|
@@ -173,9 +168,7 @@ class Document(BaseModel):
|
|
173
168
|
"Document instance not connected to a client. Use a document returned from a Morphik client method."
|
174
169
|
)
|
175
170
|
|
176
|
-
return self._client.update_document_metadata(
|
177
|
-
document_id=self.external_id, metadata=metadata
|
178
|
-
)
|
171
|
+
return self._client.update_document_metadata(document_id=self.external_id, metadata=metadata)
|
179
172
|
|
180
173
|
|
181
174
|
class ChunkResult(BaseModel):
|
@@ -227,12 +220,13 @@ class ChunkSource(BaseModel):
|
|
227
220
|
class CompletionResponse(BaseModel):
|
228
221
|
"""Completion response model"""
|
229
222
|
|
230
|
-
completion: str
|
231
|
-
|
232
|
-
sources: List[ChunkSource] = Field(
|
233
|
-
default_factory=list, description="Sources of chunks used in the completion"
|
223
|
+
completion: Optional[Union[str, Dict[str, Any], None]] = Field(
|
224
|
+
None, description="Generated text completion or structured output"
|
234
225
|
)
|
226
|
+
usage: Dict[str, int]
|
227
|
+
sources: List[ChunkSource] = Field(default_factory=list, description="Sources of chunks used in the completion")
|
235
228
|
metadata: Optional[Dict[str, Any]] = None
|
229
|
+
finish_reason: Optional[str] = Field(None, description="Reason the generation finished (e.g., 'stop', 'length')")
|
236
230
|
|
237
231
|
|
238
232
|
class IngestTextRequest(BaseModel):
|
@@ -253,9 +247,7 @@ class Entity(BaseModel):
|
|
253
247
|
type: str = Field(..., description="Entity type")
|
254
248
|
properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties")
|
255
249
|
document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
|
256
|
-
chunk_sources: Dict[str, List[int]] = Field(
|
257
|
-
default_factory=dict, description="Source chunk numbers by document ID"
|
258
|
-
)
|
250
|
+
chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
|
259
251
|
|
260
252
|
def __hash__(self):
|
261
253
|
return hash(self.id)
|
@@ -274,9 +266,7 @@ class Relationship(BaseModel):
|
|
274
266
|
target_id: str = Field(..., description="Target entity ID")
|
275
267
|
type: str = Field(..., description="Relationship type")
|
276
268
|
document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
|
277
|
-
chunk_sources: Dict[str, List[int]] = Field(
|
278
|
-
default_factory=dict, description="Source chunk numbers by document ID"
|
279
|
-
)
|
269
|
+
chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
|
280
270
|
|
281
271
|
def __hash__(self):
|
282
272
|
return hash(self.id)
|
@@ -293,20 +283,14 @@ class Graph(BaseModel):
|
|
293
283
|
id: str = Field(..., description="Unique graph identifier")
|
294
284
|
name: str = Field(..., description="Graph name")
|
295
285
|
entities: List[Entity] = Field(default_factory=list, description="Entities in the graph")
|
296
|
-
relationships: List[Relationship] = Field(
|
297
|
-
default_factory=list, description="Relationships in the graph"
|
298
|
-
)
|
286
|
+
relationships: List[Relationship] = Field(default_factory=list, description="Relationships in the graph")
|
299
287
|
metadata: Dict[str, Any] = Field(default_factory=dict, description="Graph metadata")
|
300
288
|
document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
|
301
|
-
filters: Optional[Dict[str, Any]] = Field(
|
302
|
-
None, description="Document filters used to create the graph"
|
303
|
-
)
|
289
|
+
filters: Optional[Dict[str, Any]] = Field(None, description="Document filters used to create the graph")
|
304
290
|
created_at: datetime = Field(..., description="Creation timestamp")
|
305
291
|
updated_at: datetime = Field(..., description="Last update timestamp")
|
306
292
|
owner: Dict[str, str] = Field(default_factory=dict, description="Graph owner information")
|
307
|
-
access_control: Dict[str, List[str]] = Field(
|
308
|
-
default_factory=dict, description="Access control information"
|
309
|
-
)
|
293
|
+
access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
|
310
294
|
|
311
295
|
|
312
296
|
class EntityExtractionExample(BaseModel):
|
@@ -318,9 +302,7 @@ class EntityExtractionExample(BaseModel):
|
|
318
302
|
"""
|
319
303
|
|
320
304
|
label: str = Field(..., description="The entity label (e.g., 'John Doe', 'Apple Inc.')")
|
321
|
-
type: str = Field(
|
322
|
-
..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')"
|
323
|
-
)
|
305
|
+
type: str = Field(..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')")
|
324
306
|
properties: Optional[Dict[str, Any]] = Field(
|
325
307
|
default_factory=dict,
|
326
308
|
description="Optional properties of the entity (e.g., {'role': 'CEO', 'age': 42})",
|
@@ -337,9 +319,7 @@ class EntityResolutionExample(BaseModel):
|
|
337
319
|
"""
|
338
320
|
|
339
321
|
canonical: str = Field(..., description="The canonical (standard/preferred) form of the entity")
|
340
|
-
variants: List[str] = Field(
|
341
|
-
..., description="List of variant forms that should resolve to the canonical form"
|
342
|
-
)
|
322
|
+
variants: List[str] = Field(..., description="List of variant forms that should resolve to the canonical form")
|
343
323
|
|
344
324
|
|
345
325
|
class EntityExtractionPromptOverride(BaseModel):
|
@@ -425,11 +405,13 @@ class GraphPromptOverrides(BaseModel):
|
|
425
405
|
|
426
406
|
entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
|
427
407
|
None,
|
428
|
-
description="Overrides for entity extraction prompts - controls how entities are identified in text during graph operations",
|
408
|
+
description="Overrides for entity extraction prompts - controls how entities are identified in text "
|
409
|
+
"during graph operations",
|
429
410
|
)
|
430
411
|
entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
|
431
412
|
None,
|
432
|
-
description="Overrides for entity resolution prompts - controls how variant forms are grouped during graph operations",
|
413
|
+
description="Overrides for entity resolution prompts - controls how variant forms are grouped "
|
414
|
+
"during graph operations",
|
433
415
|
)
|
434
416
|
|
435
417
|
@model_validator(mode="after")
|
@@ -455,7 +437,8 @@ class QueryPromptOverrides(BaseModel):
|
|
455
437
|
|
456
438
|
entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
|
457
439
|
None,
|
458
|
-
description="Overrides for entity extraction prompts - controls how entities are identified in text during queries",
|
440
|
+
description="Overrides for entity extraction prompts - controls how entities are identified in text "
|
441
|
+
"during queries",
|
459
442
|
)
|
460
443
|
entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
|
461
444
|
None,
|
@@ -475,9 +458,5 @@ class FolderInfo(BaseModel):
|
|
475
458
|
description: Optional[str] = Field(None, description="Folder description")
|
476
459
|
owner: Dict[str, str] = Field(..., description="Owner information")
|
477
460
|
document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
|
478
|
-
system_metadata: Dict[str, Any] = Field(
|
479
|
-
|
480
|
-
)
|
481
|
-
access_control: Dict[str, List[str]] = Field(
|
482
|
-
default_factory=dict, description="Access control information"
|
483
|
-
)
|
461
|
+
system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
|
462
|
+
access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
|
morphik/rules.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
from typing import Dict, Any, Type, Union
|
2
1
|
from abc import ABC, abstractmethod
|
2
|
+
from typing import Any, Dict, Literal, Type, Union
|
3
|
+
|
3
4
|
from pydantic import BaseModel
|
4
5
|
|
5
6
|
|
@@ -15,8 +16,22 @@ class Rule(ABC):
|
|
15
16
|
class MetadataExtractionRule(Rule):
|
16
17
|
"""Server-side rule for extracting metadata using a schema"""
|
17
18
|
|
18
|
-
def __init__(
|
19
|
+
def __init__(
|
20
|
+
self,
|
21
|
+
schema: Union[Type[BaseModel], Dict[str, Any]],
|
22
|
+
stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
|
23
|
+
use_images: bool = False,
|
24
|
+
):
|
25
|
+
"""
|
26
|
+
Args:
|
27
|
+
schema: Pydantic model or dict schema defining metadata fields to extract
|
28
|
+
stage: When to apply the rule - either "post_parsing" (full document text) or
|
29
|
+
"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
|
30
|
+
use_images: Whether to process image chunks instead of text chunks. Defaults to False.
|
31
|
+
"""
|
19
32
|
self.schema = schema
|
33
|
+
self.stage = stage
|
34
|
+
self.use_images = use_images
|
20
35
|
|
21
36
|
def to_dict(self) -> Dict[str, Any]:
|
22
37
|
if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
|
@@ -26,22 +41,30 @@ class MetadataExtractionRule(Rule):
|
|
26
41
|
# Assume it's already a dict schema
|
27
42
|
schema_dict = self.schema
|
28
43
|
|
29
|
-
return {
|
44
|
+
return {
|
45
|
+
"type": "metadata_extraction",
|
46
|
+
"schema": schema_dict,
|
47
|
+
"stage": self.stage,
|
48
|
+
"use_images": self.use_images,
|
49
|
+
}
|
30
50
|
|
31
51
|
|
32
52
|
class NaturalLanguageRule(Rule):
|
33
53
|
"""Server-side rule for transforming content using natural language"""
|
34
54
|
|
35
|
-
def __init__(self, prompt: str):
|
55
|
+
def __init__(self, prompt: str, stage: Literal["post_parsing", "post_chunking"] = "post_parsing"):
|
36
56
|
"""
|
37
57
|
Args:
|
38
58
|
prompt: Instruction for how to transform the content
|
39
59
|
e.g. "Remove any personal information" or "Convert to bullet points"
|
60
|
+
stage: When to apply the rule - either "post_parsing" (full document text) or
|
61
|
+
"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
|
40
62
|
"""
|
41
63
|
self.prompt = prompt
|
64
|
+
self.stage = stage
|
42
65
|
|
43
66
|
def to_dict(self) -> Dict[str, Any]:
|
44
|
-
return {"type": "natural_language", "prompt": self.prompt}
|
67
|
+
return {"type": "natural_language", "prompt": self.prompt, "stage": self.stage}
|
45
68
|
|
46
69
|
|
47
70
|
__all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]
|