morphik 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
morphik/models.py CHANGED
@@ -1,7 +1,8 @@
1
- from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
2
- from pathlib import Path
3
1
  from datetime import datetime
4
- from pydantic import BaseModel, Field, field_validator, model_validator
2
+ from pathlib import Path
3
+ from typing import Any, BinaryIO, Dict, List, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
5
6
 
6
7
 
7
8
  class Document(BaseModel):
@@ -11,20 +12,14 @@ class Document(BaseModel):
11
12
  content_type: str = Field(..., description="Content type of the document")
12
13
  filename: Optional[str] = Field(None, description="Original filename if available")
13
14
  metadata: Dict[str, Any] = Field(default_factory=dict, description="User-defined metadata")
14
- storage_info: Dict[str, str] = Field(
15
- default_factory=dict, description="Storage-related information"
16
- )
17
- system_metadata: Dict[str, Any] = Field(
18
- default_factory=dict, description="System-managed metadata"
19
- )
20
- access_control: Dict[str, Any] = Field(
21
- default_factory=dict, description="Access control information"
22
- )
15
+ storage_info: Dict[str, str] = Field(default_factory=dict, description="Storage-related information")
16
+ system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
17
+ access_control: Dict[str, Any] = Field(default_factory=dict, description="Access control information")
23
18
  chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks")
24
19
 
25
20
  # Client reference for update methods
26
21
  _client = None
27
-
22
+
28
23
  @property
29
24
  def status(self) -> Dict[str, Any]:
30
25
  """Get the latest processing status of the document from the API.
@@ -37,38 +32,38 @@ class Document(BaseModel):
37
32
  "Document instance not connected to a client. Use a document returned from a Morphik client method."
38
33
  )
39
34
  return self._client.get_document_status(self.external_id)
40
-
35
+
41
36
  @property
42
37
  def is_processing(self) -> bool:
43
38
  """Check if the document is still being processed."""
44
39
  return self.status.get("status") == "processing"
45
-
40
+
46
41
  @property
47
42
  def is_ingested(self) -> bool:
48
43
  """Check if the document has completed processing."""
49
44
  return self.status.get("status") == "completed"
50
-
45
+
51
46
  @property
52
47
  def is_failed(self) -> bool:
53
48
  """Check if document processing has failed."""
54
49
  return self.status.get("status") == "failed"
55
-
50
+
56
51
  @property
57
52
  def error(self) -> Optional[str]:
58
53
  """Get the error message if processing failed."""
59
54
  status_info = self.status
60
55
  return status_info.get("error") if status_info.get("status") == "failed" else None
61
-
56
+
62
57
  def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
63
58
  """Wait for document processing to complete.
64
-
59
+
65
60
  Args:
66
61
  timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
67
62
  check_interval_seconds: Time between status checks (default: 2 seconds)
68
-
63
+
69
64
  Returns:
70
65
  Document: Updated document with the latest status
71
-
66
+
72
67
  Raises:
73
68
  TimeoutError: If processing doesn't complete within the timeout period
74
69
  ValueError: If processing fails with an error
@@ -173,9 +168,7 @@ class Document(BaseModel):
173
168
  "Document instance not connected to a client. Use a document returned from a Morphik client method."
174
169
  )
175
170
 
176
- return self._client.update_document_metadata(
177
- document_id=self.external_id, metadata=metadata
178
- )
171
+ return self._client.update_document_metadata(document_id=self.external_id, metadata=metadata)
179
172
 
180
173
 
181
174
  class ChunkResult(BaseModel):
@@ -227,12 +220,13 @@ class ChunkSource(BaseModel):
227
220
  class CompletionResponse(BaseModel):
228
221
  """Completion response model"""
229
222
 
230
- completion: str
231
- usage: Dict[str, int]
232
- sources: List[ChunkSource] = Field(
233
- default_factory=list, description="Sources of chunks used in the completion"
223
+ completion: Optional[Union[str, Dict[str, Any], None]] = Field(
224
+ None, description="Generated text completion or structured output"
234
225
  )
226
+ usage: Dict[str, int]
227
+ sources: List[ChunkSource] = Field(default_factory=list, description="Sources of chunks used in the completion")
235
228
  metadata: Optional[Dict[str, Any]] = None
229
+ finish_reason: Optional[str] = Field(None, description="Reason the generation finished (e.g., 'stop', 'length')")
236
230
 
237
231
 
238
232
  class IngestTextRequest(BaseModel):
@@ -253,9 +247,7 @@ class Entity(BaseModel):
253
247
  type: str = Field(..., description="Entity type")
254
248
  properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties")
255
249
  document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
256
- chunk_sources: Dict[str, List[int]] = Field(
257
- default_factory=dict, description="Source chunk numbers by document ID"
258
- )
250
+ chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
259
251
 
260
252
  def __hash__(self):
261
253
  return hash(self.id)
@@ -274,9 +266,7 @@ class Relationship(BaseModel):
274
266
  target_id: str = Field(..., description="Target entity ID")
275
267
  type: str = Field(..., description="Relationship type")
276
268
  document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
277
- chunk_sources: Dict[str, List[int]] = Field(
278
- default_factory=dict, description="Source chunk numbers by document ID"
279
- )
269
+ chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID")
280
270
 
281
271
  def __hash__(self):
282
272
  return hash(self.id)
@@ -293,20 +283,56 @@ class Graph(BaseModel):
293
283
  id: str = Field(..., description="Unique graph identifier")
294
284
  name: str = Field(..., description="Graph name")
295
285
  entities: List[Entity] = Field(default_factory=list, description="Entities in the graph")
296
- relationships: List[Relationship] = Field(
297
- default_factory=list, description="Relationships in the graph"
298
- )
286
+ relationships: List[Relationship] = Field(default_factory=list, description="Relationships in the graph")
299
287
  metadata: Dict[str, Any] = Field(default_factory=dict, description="Graph metadata")
288
+ system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
300
289
  document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
301
- filters: Optional[Dict[str, Any]] = Field(
302
- None, description="Document filters used to create the graph"
303
- )
290
+ filters: Optional[Dict[str, Any]] = Field(None, description="Document filters used to create the graph")
304
291
  created_at: datetime = Field(..., description="Creation timestamp")
305
292
  updated_at: datetime = Field(..., description="Last update timestamp")
306
293
  owner: Dict[str, str] = Field(default_factory=dict, description="Graph owner information")
307
- access_control: Dict[str, List[str]] = Field(
308
- default_factory=dict, description="Access control information"
309
- )
294
+ access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
295
+
296
+ _client: Any | None = PrivateAttr(default=None)
297
+
298
+ # ---------------- Convenience helpers ----------------
299
+ @property
300
+ def status(self) -> str | None:
301
+ """Return processing status if available."""
302
+ return self.system_metadata.get("status") if self.system_metadata else None
303
+
304
+ @property
305
+ def is_processing(self) -> bool:
306
+ return self.status == "processing"
307
+
308
+ @property
309
+ def is_completed(self) -> bool:
310
+ return self.status == "completed"
311
+
312
+ @property
313
+ def is_failed(self) -> bool:
314
+ return self.status == "failed"
315
+
316
+ @property
317
+ def error(self) -> str | None:
318
+ return self.system_metadata.get("error") if self.system_metadata else None
319
+
320
+ def wait_for_completion(self, timeout_seconds: int = 300, check_interval_seconds: int = 5) -> "Graph":
321
+ """Poll the server until the graph processing is finished."""
322
+ import time
323
+
324
+ if not self._client:
325
+ raise RuntimeError("Graph object has no client reference for polling")
326
+
327
+ start = time.time()
328
+ while time.time() - start < timeout_seconds:
329
+ refreshed = self._client.get_graph(self.name)
330
+ if refreshed.is_completed:
331
+ return refreshed
332
+ if refreshed.is_failed:
333
+ raise RuntimeError(refreshed.error or "Graph creation failed")
334
+ time.sleep(check_interval_seconds)
335
+ raise TimeoutError("Timed out waiting for graph completion")
310
336
 
311
337
 
312
338
  class EntityExtractionExample(BaseModel):
@@ -318,9 +344,7 @@ class EntityExtractionExample(BaseModel):
318
344
  """
319
345
 
320
346
  label: str = Field(..., description="The entity label (e.g., 'John Doe', 'Apple Inc.')")
321
- type: str = Field(
322
- ..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')"
323
- )
347
+ type: str = Field(..., description="The entity type (e.g., 'PERSON', 'ORGANIZATION', 'PRODUCT')")
324
348
  properties: Optional[Dict[str, Any]] = Field(
325
349
  default_factory=dict,
326
350
  description="Optional properties of the entity (e.g., {'role': 'CEO', 'age': 42})",
@@ -337,9 +361,7 @@ class EntityResolutionExample(BaseModel):
337
361
  """
338
362
 
339
363
  canonical: str = Field(..., description="The canonical (standard/preferred) form of the entity")
340
- variants: List[str] = Field(
341
- ..., description="List of variant forms that should resolve to the canonical form"
342
- )
364
+ variants: List[str] = Field(..., description="List of variant forms that should resolve to the canonical form")
343
365
 
344
366
 
345
367
  class EntityExtractionPromptOverride(BaseModel):
@@ -425,11 +447,13 @@ class GraphPromptOverrides(BaseModel):
425
447
 
426
448
  entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
427
449
  None,
428
- description="Overrides for entity extraction prompts - controls how entities are identified in text during graph operations",
450
+ description="Overrides for entity extraction prompts - controls how entities are identified in text "
451
+ "during graph operations",
429
452
  )
430
453
  entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
431
454
  None,
432
- description="Overrides for entity resolution prompts - controls how variant forms are grouped during graph operations",
455
+ description="Overrides for entity resolution prompts - controls how variant forms are grouped "
456
+ "during graph operations",
433
457
  )
434
458
 
435
459
  @model_validator(mode="after")
@@ -455,7 +479,8 @@ class QueryPromptOverrides(BaseModel):
455
479
 
456
480
  entity_extraction: Optional[EntityExtractionPromptOverride] = Field(
457
481
  None,
458
- description="Overrides for entity extraction prompts - controls how entities are identified in text during queries",
482
+ description="Overrides for entity extraction prompts - controls how entities are identified in text "
483
+ "during queries",
459
484
  )
460
485
  entity_resolution: Optional[EntityResolutionPromptOverride] = Field(
461
486
  None,
@@ -475,9 +500,5 @@ class FolderInfo(BaseModel):
475
500
  description: Optional[str] = Field(None, description="Folder description")
476
501
  owner: Dict[str, str] = Field(..., description="Owner information")
477
502
  document_ids: List[str] = Field(default_factory=list, description="IDs of documents in the folder")
478
- system_metadata: Dict[str, Any] = Field(
479
- default_factory=dict, description="System-managed metadata"
480
- )
481
- access_control: Dict[str, List[str]] = Field(
482
- default_factory=dict, description="Access control information"
483
- )
503
+ system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
504
+ access_control: Dict[str, List[str]] = Field(default_factory=dict, description="Access control information")
morphik/rules.py CHANGED
@@ -1,5 +1,6 @@
1
- from typing import Dict, Any, Type, Union
2
1
  from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, Literal, Type, Union
3
+
3
4
  from pydantic import BaseModel
4
5
 
5
6
 
@@ -15,8 +16,22 @@ class Rule(ABC):
15
16
  class MetadataExtractionRule(Rule):
16
17
  """Server-side rule for extracting metadata using a schema"""
17
18
 
18
- def __init__(self, schema: Union[Type[BaseModel], Dict[str, Any]]):
19
+ def __init__(
20
+ self,
21
+ schema: Union[Type[BaseModel], Dict[str, Any]],
22
+ stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
23
+ use_images: bool = False,
24
+ ):
25
+ """
26
+ Args:
27
+ schema: Pydantic model or dict schema defining metadata fields to extract
28
+ stage: When to apply the rule - either "post_parsing" (full document text) or
29
+ "post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
30
+ use_images: Whether to process image chunks instead of text chunks. Defaults to False.
31
+ """
19
32
  self.schema = schema
33
+ self.stage = stage
34
+ self.use_images = use_images
20
35
 
21
36
  def to_dict(self) -> Dict[str, Any]:
22
37
  if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
@@ -26,22 +41,30 @@ class MetadataExtractionRule(Rule):
26
41
  # Assume it's already a dict schema
27
42
  schema_dict = self.schema
28
43
 
29
- return {"type": "metadata_extraction", "schema": schema_dict}
44
+ return {
45
+ "type": "metadata_extraction",
46
+ "schema": schema_dict,
47
+ "stage": self.stage,
48
+ "use_images": self.use_images,
49
+ }
30
50
 
31
51
 
32
52
  class NaturalLanguageRule(Rule):
33
53
  """Server-side rule for transforming content using natural language"""
34
54
 
35
- def __init__(self, prompt: str):
55
+ def __init__(self, prompt: str, stage: Literal["post_parsing", "post_chunking"] = "post_parsing"):
36
56
  """
37
57
  Args:
38
58
  prompt: Instruction for how to transform the content
39
59
  e.g. "Remove any personal information" or "Convert to bullet points"
60
+ stage: When to apply the rule - either "post_parsing" (full document text) or
61
+ "post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
40
62
  """
41
63
  self.prompt = prompt
64
+ self.stage = stage
42
65
 
43
66
  def to_dict(self) -> Dict[str, Any]:
44
- return {"type": "natural_language", "prompt": self.prompt}
67
+ return {"type": "natural_language", "prompt": self.prompt, "stage": self.stage}
45
68
 
46
69
 
47
70
  __all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]