morphik 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morphik/models.py CHANGED
@@ -1,6 +1,7 @@
1
- from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
2
- from pathlib import Path
3
1
  from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Any, BinaryIO, Dict, List, Literal, Optional, Union
4
+
4
5
  from pydantic import BaseModel, Field, field_validator, model_validator
5
6
 
6
7
 
@@ -11,20 +12,14 @@ class Document(BaseModel):
11
12
  content_type: str = Field(..., description="Content type of the document")
12
13
  filename: Optional[str] = Field(None, description="Original filename if available")
13
14
  metadata: Dict[str, Any] = Field(default_factory=dict, description="User-defined metadata")
14
- storage_info: Dict[str, str] = Field(
15
- default_factory=dict, description="Storage-related information"
16
- )
17
- system_metadata: Dict[str, Any] = Field(
18
- default_factory=dict, description="System-managed metadata"
19
- )
20
- access_control: Dict[str, Any] = Field(
21
- default_factory=dict, description="Access control information"
22
- )
15
+ storage_info: Dict[str, str] = Field(default_factory=dict, description="Storage-related information")
16
+ system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
17
+ access_control: Dict[str, Any] = Field(default_factory=dict, description="Access control information")
23
18
  chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks")
24
19
 
25
20
  # Client reference for update methods
26
21
  _client = None
27
-
22
+
28
23
  @property
29
24
  def status(self) -> Dict[str, Any]:
30
25
  """Get the latest processing status of the document from the API.
@@ -37,38 +32,38 @@ class Document(BaseModel):
37
32
  "Document instance not connected to a client. Use a document returned from a Morphik client method."
38
33
  )
39
34
  return self._client.get_document_status(self.external_id)
40
-
35
+
41
36
  @property
42
37
  def is_processing(self) -> bool:
43
38
  """Check if the document is still being processed."""
44
39
  return self.status.get("status") == "processing"
45
-
40
+
46
41
  @property
47
42
  def is_ingested(self) -> bool:
48
43
  """Check if the document has completed processing."""
49
44
  return self.status.get("status") == "completed"
50
-
45
+
51
46
  @property
52
47
  def is_failed(self) -> bool:
53
48
  """Check if document processing has failed."""
54
49
  return self.status.get("status") == "failed"
55
-
50
+
56
51
  @property
57
52
  def error(self) -> Optional[str]:
58
53
  """Get the error message if processing failed."""
59
54
  status_info = self.status
60
55
  return status_info.get("error") if status_info.get("status") == "failed" else None
61
-
56
+
62
57
  def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
63
58
  """Wait for document processing to complete.
64
-
59
+
65
60
  Args:
66
61
  timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
67
62
  check_interval_seconds: Time between status checks (default: 2 seconds)
68
-
63
+
69
64
  Returns:
70
65
  Document: Updated document with the latest status
71
-
66
+
72
67
  Raises:
73
68
  TimeoutError: If processing doesn't complete within the timeout period
74
69
  ValueError: If processing fails with an error
@@ -173,9 +168,7 @@ class Document(BaseModel):
173
168
  "Document instance not connected to a client. Use a document returned from a Morphik client method."
174
169
  )
175
170
 
176
- return self._client.update_document_metadata(
177
- document_id=self.external_id, metadata=metadata
178
- )
171
+ return self._client.update_document_metadata(document_id=self.external_id, metadata=metadata)
179
172
 
180
173
 
181
174
  class ChunkResult(BaseModel):
@@ -227,12 +220,13 @@ class ChunkSource(BaseModel):
227
220
  class CompletionResponse(BaseModel):
228
221
  """Completion response model"""
229
222
 
230
- completion: str
231
- usage: Dict[str, int]
232
- sources: List[ChunkSource] = Field(
233
- default_factory=list, description="Sources of chunks used in the completion"
223
+ completion: Optional[Union[str, Dict[str, Any], None]] = Field(
224
+ None, description="Generated text completion or structured output"
234
225
  )
226
+ usage: Dict[str, int]
227
+ sources: List[ChunkSource] = Field(default_factory=list, description="Sources of chunks used in the completion")
235
228
  metadata: Optional[Dict[str, Any]] = None
229
+ finish_reason: Optional[str] = Field(None, description="Reason the generation finished (e.g., 'stop', 'length')")
236
230
 
237
231
 
238
232
  class IngestTextRequest(BaseModel):
@@ -253,9 +247,7 @@ class Entity(BaseModel):
253
247
  type: str = Field(..., description="Entity type")
254
248
  properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties")
255
249
  document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
256
- chunk_sources: Dict[str, List[int]] = Field(
257
- default_factory=dict, description="Source chunk numbers by document ID"
258
- )
250
+ chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
259
251
 
260
252
  def __hash__(self):
261
253
  return hash(self.id)
@@ -274,9 +266,7 @@ class Relationship(BaseModel):
274
266
  target_id: str = Field(..., description="Target entity ID")
275
267
  type: str = Field(..., description="Relationship type")
276
268
  document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
277
- chunk_sources: Dict[str, List[int]] = Field(
278
- default_factory=dict, description="Source chunk numbers by document ID"
279
- )
269
+ chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
280
270
 
281
271
  def __hash__(self):
282
272
  return hash(self.id)
@@ -293,20 +283,14 @@ class Graph(BaseModel):
293
283
  id: str = Field(..., description="Unique graph identifier")
294
284
  name: str = Field(..., description="Graph name")
295
285
  entities: List[Entity] = Field(default_factory=list, description="Entities in the graph")
296
- relationships: List[Relationship] = Field(
297
- default_factory=list, description="Relationships in the graph"
298
- )
286
+ relationships: List[Relationship] = Field(default_factory=list, description="Relationships in the graph")
299
287
  metadata: Dict[str, Any] = Field(default_factory=dict, description="Graph metadata")
300
288
  document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
301
- filters: Optional[Dict[str, Any]] = Field(
302
- None, description="Document filters used to create the graph"
303
- )
289
+ filters: Optional[Dict[str, Any]] = Field(None, description="Document filters used to create the graph")
304
290
  created_at: datetime = Field(..., description="Creation timestamp")
305
291
  updated_at: datetime = Field(..., description="Last update timestamp")
306
292
  owner: Dict[str, str] = Field(default_factory=dict, description="Graph owner information")
307
- access_control: Dict[str, List[str]] = Field(
308
- default_factory=dict, description="Access control information"
309
- )
293
+ access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
310
294
 
311
295
 
312
296
  class EntityExtractionExample(BaseModel):
@@ -318,9 +302,7 @@ class EntityExtractionExample(BaseModel):
318
302
  """
319
303
 
320
304
  label: str = Field(..., description="The entity label (e.g., 'John Doe', 'Apple Inc.')")
321
- type: str = Field(
322
- ..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')"
323
- )
305
+ type: str = Field(..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')")
324
306
  properties: Optional[Dict[str, Any]] = Field(
325
307
  default_factory=dict,
326
308
  description="Optional properties of the entity (e.g., {'role': 'CEO', 'age': 42})",
@@ -337,9 +319,7 @@ class EntityResolutionExample(BaseModel):
337
319
  """
338
320
 
339
321
  canonical: str = Field(..., description="The canonical (standard/preferred) form of the entity")
340
- variants: List[str] = Field(
341
- ..., description="List of variant forms that should resolve to the canonical form"
342
- )
322
+ variants: List[str] = Field(..., description="List of variant forms that should resolve to the canonical form")
343
323
 
344
324
 
345
325
  class EntityExtractionPromptOverride(BaseModel):
@@ -425,11 +405,13 @@ class GraphPromptOverrides(BaseModel):
425
405
 
426
406
  entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
427
407
  None,
428
- description="Overrides for entity extraction prompts - controls how entities are identified in text during graph operations",
408
+ description="Overrides for entity extraction prompts - controls how entities are identified in text "
409
+ "during graph operations",
429
410
  )
430
411
  entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
431
412
  None,
432
- description="Overrides for entity resolution prompts - controls how variant forms are grouped during graph operations",
413
+ description="Overrides for entity resolution prompts - controls how variant forms are grouped "
414
+ "during graph operations",
433
415
  )
434
416
 
435
417
  @model_validator(mode="after")
@@ -455,7 +437,8 @@ class QueryPromptOverrides(BaseModel):
455
437
 
456
438
  entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
457
439
  None,
458
- description="Overrides for entity extraction prompts - controls how entities are identified in text during queries",
440
+ description="Overrides for entity extraction prompts - controls how entities are identified in text "
441
+ "during queries",
459
442
  )
460
443
  entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
461
444
  None,
@@ -475,9 +458,5 @@ class FolderInfo(BaseModel):
475
458
  description: Optional[str] = Field(None, description="Folder description")
476
459
  owner: Dict[str, str] = Field(..., description="Owner information")
477
460
  document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
478
- system_metadata: Dict[str, Any] = Field(
479
- default_factory=dict, description="System-managed metadata"
480
- )
481
- access_control: Dict[str, List[str]] = Field(
482
- default_factory=dict, description="Access control information"
483
- )
461
+ system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
462
+ access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
morphik/rules.py CHANGED
@@ -1,5 +1,6 @@
1
- from typing import Dict, Any, Type, Union
2
1
  from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, Literal, Type, Union
3
+
3
4
  from pydantic import BaseModel
4
5
 
5
6
 
@@ -15,8 +16,22 @@ class Rule(ABC):
15
16
  class MetadataExtractionRule(Rule):
16
17
  """Server-side rule for extracting metadata using a schema"""
17
18
 
18
- def __init__(self, schema: Union[Type[BaseModel], Dict[str, Any]]):
19
+ def __init__(
20
+ self,
21
+ schema: Union[Type[BaseModel], Dict[str, Any]],
22
+ stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
23
+ use_images: bool = False,
24
+ ):
25
+ """
26
+ Args:
27
+ schema: Pydantic model or dict schema defining metadata fields to extract
28
+ stage: When to apply the rule - either "post_parsing" (full document text) or
29
+ "post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
30
+ use_images: Whether to process image chunks instead of text chunks. Defaults to False.
31
+ """
19
32
  self.schema = schema
33
+ self.stage = stage
34
+ self.use_images = use_images
20
35
 
21
36
  def to_dict(self) -> Dict[str, Any]:
22
37
  if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
@@ -26,22 +41,30 @@ class MetadataExtractionRule(Rule):
26
41
  # Assume it's already a dict schema
27
42
  schema_dict = self.schema
28
43
 
29
- return {"type": "metadata_extraction", "schema": schema_dict}
44
+ return {
45
+ "type": "metadata_extraction",
46
+ "schema": schema_dict,
47
+ "stage": self.stage,
48
+ "use_images": self.use_images,
49
+ }
30
50
 
31
51
 
32
52
  class NaturalLanguageRule(Rule):
33
53
  """Server-side rule for transforming content using natural language"""
34
54
 
35
- def __init__(self, prompt: str):
55
+ def __init__(self, prompt: str, stage: Literal["post_parsing", "post_chunking"] = "post_parsing"):
36
56
  """
37
57
  Args:
38
58
  prompt: Instruction for how to transform the content
39
59
  e.g. "Remove any personal information" or "Convert to bullet points"
60
+ stage: When to apply the rule - either "post_parsing" (full document text) or
61
+ "post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
40
62
  """
41
63
  self.prompt = prompt
64
+ self.stage = stage
42
65
 
43
66
  def to_dict(self) -> Dict[str, Any]:
44
- return {"type": "natural_language", "prompt": self.prompt}
67
+ return {"type": "natural_language", "prompt": self.prompt, "stage": self.stage}
45
68
 
46
69
 
47
70
  __all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]