ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ai_pipeline_core/__init__.py +21 -13
  2. ai_pipeline_core/documents/document.py +202 -51
  3. ai_pipeline_core/documents/document_list.py +148 -24
  4. ai_pipeline_core/documents/flow_document.py +2 -6
  5. ai_pipeline_core/documents/task_document.py +0 -4
  6. ai_pipeline_core/documents/temporary_document.py +1 -8
  7. ai_pipeline_core/flow/config.py +174 -5
  8. ai_pipeline_core/llm/__init__.py +1 -6
  9. ai_pipeline_core/llm/ai_messages.py +137 -4
  10. ai_pipeline_core/llm/client.py +118 -65
  11. ai_pipeline_core/llm/model_options.py +6 -7
  12. ai_pipeline_core/llm/model_response.py +17 -16
  13. ai_pipeline_core/llm/model_types.py +3 -7
  14. ai_pipeline_core/logging/__init__.py +0 -2
  15. ai_pipeline_core/logging/logging_config.py +0 -6
  16. ai_pipeline_core/logging/logging_mixin.py +2 -10
  17. ai_pipeline_core/pipeline.py +54 -68
  18. ai_pipeline_core/prefect.py +12 -3
  19. ai_pipeline_core/prompt_manager.py +14 -7
  20. ai_pipeline_core/settings.py +13 -5
  21. ai_pipeline_core/simple_runner/__init__.py +1 -11
  22. ai_pipeline_core/simple_runner/cli.py +13 -12
  23. ai_pipeline_core/simple_runner/simple_runner.py +34 -189
  24. ai_pipeline_core/storage/__init__.py +8 -0
  25. ai_pipeline_core/storage/storage.py +628 -0
  26. ai_pipeline_core/tracing.py +234 -30
  27. {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/METADATA +35 -20
  28. ai_pipeline_core-0.2.1.dist-info/RECORD +38 -0
  29. ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
  30. {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/WHEEL +0 -0
  31. {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/__init__.py

@@ -7,7 +7,7 @@ It combines document processing, LLM integration, and workflow orchestration int
 system designed for production use.
 
 The framework enforces best practices through strong typing (Pydantic), automatic retries,
-cost tracking, and distributed tracing. All I/O operations are async for maximum throughput.
+and cost tracking. All I/O operations are async for maximum throughput.
 
 **CRITICAL IMPORT RULE**:
 Always import from the top-level package:
@@ -18,12 +18,12 @@ cost tracking, and distributed tracing. All I/O operations are async for maximum
 from ai_pipeline_core.llm import generate  # NO!
 from ai_pipeline_core.documents import FlowDocument  # NO!
 
-FRAMEWORK RULES (90% Use Cases):
-1. Decorators: Use @trace, @pipeline_task, @pipeline_flow WITHOUT parameters
+FRAMEWORK RULES (Use by default, unless instructed otherwise):
+1. Decorators: Use @pipeline_task WITHOUT parameters, @pipeline_flow WITH config
 2. Logging: Use get_pipeline_logger(__name__) - NEVER print() or logging module
 3. LLM calls: Use AIMessages or str. Wrap Documents in AIMessages; do not call .text yourself
-4. Options: Omit ModelOptions unless specifically needed (defaults are optimal)
-5. Documents: Create with just name and content - skip description
+4. Options: DO NOT use options parameter - omit it entirely (defaults are optimal)
+5. Documents: Create with just name and content - skip description unless needed
 6. FlowConfig: OUTPUT_DOCUMENT_TYPE must differ from all INPUT_DOCUMENT_TYPES
 7. Initialization: PromptManager and logger at module scope, not in functions
 8. DocumentList: Use default constructor - no validation flags needed
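
Taken together, rules 1-8 describe the default shape of task code in 0.2.x. A minimal sketch under those rules; summarize is an illustrative name, and it assumes get_pipeline_logger is exported at the top level per the import rule above:

    from ai_pipeline_core import AIMessages, get_pipeline_logger, llm, pipeline_task

    logger = get_pipeline_logger(__name__)  # rules 2 and 7: module scope, never print()

    @pipeline_task  # rule 1: no parameters on @pipeline_task
    async def summarize(doc):  # doc: any Document subclass instance
        # rule 3: wrap the Document in AIMessages; rule 4: no options argument
        response = await llm.generate("gpt-5", messages=AIMessages([doc]))
        logger.info("Summary generated")
        return response
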
@@ -36,18 +36,22 @@ Core Capabilities:
 - **LLM Integration**: Unified interface to any model via LiteLLM with caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with retries
-- **Observability**: Distributed tracing via Laminar (LMNR) for debugging
+- **Observability**: Built-in monitoring and debugging capabilities
 - **Local Development**: Simple runner for testing without infrastructure
 
 Quick Start:
     >>> from ai_pipeline_core import (
-    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, llm, AIMessages
+    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, FlowConfig, llm, AIMessages
     ... )
     >>>
     >>> class OutputDoc(FlowDocument):
    ...     '''Analysis result document.'''
    >>>
-    >>> @pipeline_flow
+    >>> class MyFlowConfig(FlowConfig):
+    ...     INPUT_DOCUMENT_TYPES = []
+    ...     OUTPUT_DOCUMENT_TYPE = OutputDoc
+    >>>
+    >>> @pipeline_flow(config=MyFlowConfig)
     >>> async def analyze_flow(
     ...     project_name: str,
     ...     documents: DocumentList,
@@ -55,7 +59,7 @@ Quick Start:
     ... ) -> DocumentList:
     ...     # Messages accept AIMessages or str. Wrap documents: AIMessages([doc])
     ...     response = await llm.generate(
-    ...         model="gpt-5",
+    ...         "gpt-5",
     ...         messages=AIMessages([documents[0]])
     ...     )
     ...     result = OutputDoc.create(
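
Assembled without diff markers, the 0.2.x Quick Start reads as below. A sketch only: the flow_options parameter and the closing lines of analyze_flow fall outside the hunks shown, so their exact spelling here is an assumption, as is extracting the response text with str():

    from ai_pipeline_core import (
        AIMessages, DocumentList, FlowConfig, FlowDocument, FlowOptions,
        llm, pipeline_flow,
    )

    class OutputDoc(FlowDocument):
        '''Analysis result document.'''

    class MyFlowConfig(FlowConfig):
        INPUT_DOCUMENT_TYPES = []
        OUTPUT_DOCUMENT_TYPE = OutputDoc

    @pipeline_flow(config=MyFlowConfig)
    async def analyze_flow(
        project_name: str,
        documents: DocumentList,
        flow_options: FlowOptions,  # assumed: this parameter sits between the two hunks
    ) -> DocumentList:
        response = await llm.generate("gpt-5", messages=AIMessages([documents[0]]))
        result = OutputDoc.create(name="analysis.md", content=str(response))  # str() assumed
        return DocumentList([result])
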
@@ -76,8 +80,6 @@ Optional Environment Variables:
 - PREFECT_API_KEY: Prefect API authentication key
 - LMNR_PROJECT_API_KEY: Laminar (LMNR) API key for tracing
 - LMNR_DEBUG: Set to "true" to enable debug-level traces
-- LMNR_SESSION_ID: Default session ID for traces
-- LMNR_USER_ID: Default user ID for traces
 """
 
 from . import llm
@@ -99,6 +101,8 @@ from .llm import (
     ModelOptions,
     ModelResponse,
     StructuredModelResponse,
+    generate,
+    generate_structured,
 )
 from .logging import (
     LoggerMixin,
@@ -114,7 +118,7 @@ from .prompt_manager import PromptManager
 from .settings import Settings
 from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
 
-__version__ = "0.1.14"
+__version__ = "0.2.1"
 
 __all__ = [
     # Config/Settings
@@ -145,7 +149,9 @@ __all__ = [
     "prefect_test_harness",
     "disable_run_logger",
     # LLM
-    "llm",
+    "llm",  # for backward compatibility
+    "generate",
+    "generate_structured",
     "ModelName",
     "ModelOptions",
     "ModelResponse",
@@ -159,4 +165,6 @@ __all__ = [
     "set_trace_cost",
     # Utils
     "PromptManager",
+    "generate",
+    "generate_structured",
 ]
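
The net effect of the export changes: generate and generate_structured are now reachable from the top-level package, which is exactly what the CRITICAL IMPORT RULE asks for. A minimal sketch, assuming the top-level generate shares llm.generate's signature:

    from ai_pipeline_core import AIMessages, generate

    async def ask(question: str):
        # messages accepts AIMessages or str; a bare string would also work here
        return await generate("gpt-5", messages=AIMessages([question]))
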
ai_pipeline_core/documents/document.py

@@ -51,6 +51,7 @@ from .mime_type import (
 )
 
 TModel = TypeVar("TModel", bound=BaseModel)
+TDocument = TypeVar("TDocument", bound="Document")
 
 
 class Document(BaseModel, ABC):
@@ -61,8 +62,7 @@ class Document(BaseModel, ABC):
     Document is the fundamental data abstraction for all content flowing through
     pipelines. It provides automatic encoding, MIME type detection, serialization,
     and validation. All documents must be subclassed from FlowDocument or TaskDocument
-    based on their persistence requirements. TemporaryDocument is a special concrete
-    class that can be instantiated directly (not abstract).
+    based on their persistence requirements.
 
     VALIDATION IS AUTOMATIC - Do not add manual validation!
     Size validation, name validation, and MIME type detection are built-in.
@@ -74,7 +74,7 @@ class Document(BaseModel, ABC):
         document.validate_file_name(document.name)  # NO! Automatic
 
     Best Practices:
-    - Use create() classmethod for automatic type conversion (90% of cases)
+    - Use create() classmethod for automatic type conversion (default preferred)
     - Omit description parameter unless truly needed for metadata
     - When using LLM functions, pass AIMessages or str. Wrap any Document values
       in AIMessages([...]). Do not call .text yourself
@@ -98,6 +98,8 @@ class Document(BaseModel, ABC):
     - Support for text, JSON, YAML, PDF, and image formats
     - Conversion utilities between different formats
     - Source provenance tracking via sources field
+    - Document type conversion via model_convert() method
+    - Standard Pydantic model_copy() for same-type copying
 
     Class Variables:
         MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
@@ -131,10 +133,62 @@ class Document(BaseModel, ABC):
     2. Embed metadata in content (e.g., JSON with data + metadata fields)
     3. Create a separate MetadataDocument type to accompany data documents
     4. Use document naming conventions (e.g., "data_v2_2024.json")
-    5. Store metadata in flow_options or pass through TraceInfo
+    5. Store metadata in flow_options
+
+    FILES Enum Best Practice:
+        When defining a FILES enum, NEVER use magic strings to reference files.
+        Always use the enum values to maintain type safety and refactorability.
+
+        WRONG - Magic strings/numbers:
+            doc = ConfigDocument.create(name="config.yaml", content=data)  # NO!
+            doc = docs.get_by("settings.json")  # NO! Magic string
+            files = ["config.yaml", "settings.json"]  # NO! Magic strings
+
+        CORRECT - Use enum references:
+            doc = ConfigDocument.create(
+                name=ConfigDocument.FILES.CONFIG,  # YES! Type-safe
+                content=data
+            )
+            doc = docs.get_by(ConfigDocument.FILES.SETTINGS)  # YES!
+            files = [
+                ConfigDocument.FILES.CONFIG,
+                ConfigDocument.FILES.SETTINGS
+            ]  # YES! Refactorable
+
+    Pydantic Model Interaction:
+        Documents provide DIRECT support for Pydantic models. Use the built-in
+        methods instead of manual JSON conversion.
+
+        WRONG - Manual JSON conversion:
+            # Don't do this - manual JSON handling
+            json_str = doc.text
+            json_data = json.loads(json_str)
+            model = MyModel(**json_data)  # NO! Use as_pydantic_model
+
+            # Don't do this - manual serialization
+            json_str = model.model_dump_json()
+            doc = MyDocument.create(name="data.json", content=json_str)  # NO!
+
+        CORRECT - Direct Pydantic interaction:
+            # Reading Pydantic model from document
+            model = doc.as_pydantic_model(MyModel)  # Direct conversion
+            models = doc.as_pydantic_model(list[MyModel])  # List support
+
+            # Creating document from Pydantic model
+            doc = MyDocument.create(
+                name="data.json",
+                content=model  # Direct BaseModel support
+            )
+
+            # Round-trip is seamless
+            original_model = MyModel(field="value")
+            doc = MyDocument.create(name="data.json", content=original_model)
+            restored = doc.as_pydantic_model(MyModel)
+            assert restored == original_model  # Perfect round-trip
 
     Example:
         >>> from enum import StrEnum
+        >>> from pydantic import BaseModel
         >>>
         >>> # Simple document:
         >>> class MyDocument(FlowDocument):
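
The Pydantic guidance added above reduces to a short round trip. A runnable sketch, assuming MyDocument is a concrete FlowDocument subclass as in the surrounding example:

    from pydantic import BaseModel

    class Point(BaseModel):
        x: int
        y: int

    doc = MyDocument.create(name="point.json", content=Point(x=1, y=2))  # model -> JSON bytes
    restored = doc.as_pydantic_model(Point)                              # JSON bytes -> model
    assert restored == Point(x=1, y=2)                                   # lossless round trip
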
@@ -146,10 +200,23 @@ class Document(BaseModel, ABC):
         ...     CONFIG = "config.yaml"
         ...     SETTINGS = "settings.json"
         >>>
-        >>> # RECOMMENDED: Use create for automatic conversion
-        >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-        >>> print(doc.is_text)  # True
-        >>> data = doc.as_json()  # {'key': 'value'}
+        >>> # CORRECT FILES usage - no magic strings:
+        >>> doc = ConfigDocument.create(
+        ...     name=ConfigDocument.FILES.CONFIG,  # Use enum
+        ...     content={"key": "value"}
+        ... )
+        >>>
+        >>> # CORRECT Pydantic usage:
+        >>> class Config(BaseModel):
+        ...     key: str
+        >>>
+        >>> # Direct creation from Pydantic model
+        >>> config_model = Config(key="value")
+        >>> doc = MyDocument.create(name="data.json", content=config_model)
+        >>>
+        >>> # Direct extraction to Pydantic model
+        >>> restored = doc.as_pydantic_model(Config)
+        >>> print(restored.key)  # "value"
         >>>
         >>> # Track document provenance with sources
         >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
@@ -159,6 +226,14 @@ class Document(BaseModel, ABC):
         ...     sources=[source_doc.sha256]  # Reference source document
         ... )
         >>> processed.has_source(source_doc)  # True
+        >>>
+        >>> # Document copying and type conversion:
+        >>> # Standard Pydantic model_copy (doesn't validate updates)
+        >>> copied = doc.model_copy(update={"name": "new_name.json"})
+        >>> # Type conversion with validation via model_convert
+        >>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
+        >>> flow_doc = task_doc.model_convert(MyFlowDoc)  # Convert to FlowDocument
+        >>> flow_doc.is_flow  # True
     """
 
     MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -170,6 +245,9 @@ class Document(BaseModel, ABC):
     DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
     """File extension for description files."""
 
+    SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
+    """File extension for sources metadata files."""
+
     MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
     """Separator for markdown list items."""
 
@@ -288,7 +366,7 @@ class Document(BaseModel, ABC):
         content types and automatically converts them to bytes based on the file
         extension. Use the `parse` method to reverse this conversion.
 
-        Best Practice (90% of cases):
+        Best Practice (by default, unless instructed otherwise):
             Only provide name and content. The description parameter is RARELY needed.
 
         Args:
@@ -302,8 +380,8 @@ class Document(BaseModel, ABC):
                 - bytes: Used directly without conversion
                 - str: Encoded to UTF-8 bytes
                 - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
-                - list[str]: Joined with separator for .md (validates no items
-                  contain separator), else JSON/YAML
+                - list[str]: Joined automatically for .md (validates format compatibility),
+                  else JSON/YAML
                 - list[BaseModel]: Serialized to JSON or YAML based on extension
                 - BaseModel: Serialized to JSON or YAML based on extension
             description: Optional description - USUALLY OMIT THIS (defaults to None).
@@ -319,7 +397,7 @@ class Document(BaseModel, ABC):
 
         Raises:
             ValueError: If content type is not supported for the file extension,
-                or if markdown list items contain the separator
+                or if markdown list format is incompatible
             DocumentNameError: If filename violates validation rules
             DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
 
@@ -329,7 +407,7 @@ class Document(BaseModel, ABC):
         returns the original dictionary {"key": "value"}.
 
         Example:
-            >>> # CORRECT - no description needed (90% of cases)
+            >>> # CORRECT - no description needed (by default, unless instructed otherwise)
            >>> doc = MyDocument.create(name="test.txt", content="Hello World")
            >>> doc.content  # b'Hello World'
            >>> doc.parse(str)  # "Hello World"
@@ -427,10 +505,6 @@ class Document(BaseModel, ABC):
            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
            >>> doc = MyDocument.create(name="config.yaml", content=my_model)
            >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
-
-        See Also:
-            create: Recommended factory method with automatic type conversion
-            parse: Method to reverse the conversion done by create
         """
         if type(self) is Document:
             raise TypeError("Cannot instantiate abstract Document class directly")
@@ -467,8 +541,7 @@ class Document(BaseModel, ABC):
 
         Note:
             This method determines document persistence and lifecycle.
-            FlowDocument returns "flow", TaskDocument returns "task",
-            TemporaryDocument returns "temporary".
+            FlowDocument returns "flow", TaskDocument returns "task".
         """
         raise NotImplementedError("Subclasses must implement this method")
 
@@ -520,7 +593,7 @@ class Document(BaseModel, ABC):
         during execution.
 
         Returns:
-            True if this is a TemporaryDocument, False otherwise.
+            True if this document is temporary, False otherwise.
         """
         return self.get_base_type() == "temporary"
 
@@ -565,8 +638,6 @@ class Document(BaseModel, ABC):
     def validate_file_name(cls, name: str) -> None:
         """Validate that a file name matches allowed patterns.
 
-        @public
-
         DO NOT OVERRIDE this method if you define a FILES enum!
         The validation is automatic when FILES enum is present.
 
@@ -610,7 +681,7 @@ class Document(BaseModel, ABC):
 
         Ensures the document name is secure and follows conventions:
         - No path traversal characters (.., \\, /)
-        - Cannot end with .description.md
+        - Cannot end with .description.md or .sources.json
        - No leading/trailing whitespace
        - Must match FILES enum if defined
 
@@ -635,6 +706,9 @@ class Document(BaseModel, ABC):
                 f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
             )
 
+        if v.endswith(cls.SOURCES_EXTENSION):
+            raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
+
         if ".." in v or "\\" in v or "/" in v:
             raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
 
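With SOURCES_EXTENSION added to the validator, two suffixes are now reserved for sidecar metadata files. A sketch of the observable behavior, assuming MyDocument is a concrete subclass:

    MyDocument.create(name="notes.md", content="x")              # fine
    MyDocument.create(name="notes.description.md", content="x")  # raises DocumentNameError
    MyDocument.create(name="notes.sources.json", content="x")    # raises DocumentNameError (new in 0.2.x)
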
@@ -659,7 +733,7 @@ class Document(BaseModel, ABC):
         2. str → UTF-8 encoding
         3. dict/BaseModel + .json → JSON serialization (indented)
         4. dict/BaseModel + .yaml/.yml → YAML serialization
-        5. list[str] + .md → Join with markdown separator (validates no items contain separator)
+        5. list[str] + .md → Join with markdown sections (validates format compatibility)
         6. list[Any] + .json/.yaml → JSON/YAML array
         7. int/float/bool + .json → JSON primitive
 
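The conversion rules are easiest to see side by side. A sketch, assuming MyDocument is a concrete subclass; the exact JSON whitespace is the library's choice:

    text_doc = MyDocument.create(name="a.txt", content="hello")            # rule 2: UTF-8 bytes
    json_doc = MyDocument.create(name="a.json", content={"k": "v"})        # rule 3: indented JSON
    md_doc = MyDocument.create(name="a.md", content=["Part 1", "Part 2"])  # rule 5: joined sections

    text_doc.parse(str)        # "hello"
    json_doc.as_json()         # {'k': 'v'}
    md_doc.as_markdown_list()  # ['Part 1', 'Part 2']
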
@@ -1028,8 +1102,6 @@ class Document(BaseModel, ABC):
     def as_yaml(self) -> Any:
         r"""Parse document content as YAML.
 
-        @public
-
         Parses the document's text content as YAML and returns Python objects.
         Uses ruamel.yaml which is safe by default (no code execution).
 
@@ -1057,8 +1129,6 @@ class Document(BaseModel, ABC):
     def as_json(self) -> Any:
         """Parse document content as JSON.
 
-        @public
-
         Parses the document's text content as JSON and returns Python objects.
         Document must contain valid JSON text.
 
@@ -1153,7 +1223,7 @@ class Document(BaseModel, ABC):
 
         @public
 
-        Splits text content using markdown separator ("\n\n-----------------\n\n").
+        Splits text content automatically using markdown section separators.
         Designed for markdown documents with multiple sections.
 
         Returns:
@@ -1168,9 +1238,9 @@ class Document(BaseModel, ABC):
             >>> doc = MyDocument.create(name="book.md", content=sections)
             >>> doc.as_markdown_list()  # Returns original sections
 
-            >>> # Manual creation with separator
-            >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
-            >>> doc2 = MyDocument(name="parts.md", content=content.encode())
+            >>> # Round-trip conversion works automatically
+            >>> sections = ["Part 1", "Part 2", "Part 3"]
+            >>> doc2 = MyDocument.create(name="parts.md", content=sections)
             >>> doc2.as_markdown_list()  # ['Part 1', 'Part 2', 'Part 3']
         """
         return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
@@ -1207,7 +1277,7 @@ class Document(BaseModel, ABC):
         Extension Rules:
         - .json → JSON parsing for dict/list/BaseModel
         - .yaml/.yml → YAML parsing for dict/list/BaseModel
-        - .md + list → Split by markdown separator
+        - .md + list → Split automatically into sections
         - Any + str → UTF-8 decode
         - Any + bytes → Raw content
 
@@ -1223,8 +1293,7 @@ class Document(BaseModel, ABC):
 
             >>> # Markdown list
             >>> items = ["Item 1", "Item 2"]
-            >>> content = "\n\n---\n\n".join(items).encode()
-            >>> doc = MyDocument(name="list.md", content=content)
+            >>> doc = MyDocument.create(name="list.md", content=items)
             >>> doc.parse(list)
             ['Item 1', 'Item 2']
         """
@@ -1330,11 +1399,6 @@ class Document(BaseModel, ABC):
             >>> # Check if specific document is a source
             >>> if source1.sha256 in doc_refs:
             ...     print("Document derived from source1")
-
-        See Also:
-            - get_source_references: Get non-document source references (URLs, etc.)
-            - has_source: Check if a specific source is tracked
-            - Document.create: Add sources when creating documents
         """
         return [src for src in self.sources if is_document_sha256(src)]
 
@@ -1372,11 +1436,6 @@ class Document(BaseModel, ABC):
             >>> # Use for attribution or debugging
             >>> for ref in refs:
             ...     print(f"Data sourced from: {ref}")
-
-        See Also:
-            - get_source_documents: Get document SHA256 references
-            - has_source: Check if a specific source is tracked
-            - Document.create: Add sources when creating documents
         """
         return [src for src in self.sources if not is_document_sha256(src)]
 
@@ -1422,11 +1481,6 @@ class Document(BaseModel, ABC):
             >>> # Check by SHA256 directly
             >>> if derived.has_source(source_doc.sha256):
             ...     print("Has specific hash")
-
-        See Also:
-            - get_source_documents: Get all document sources
-            - get_source_references: Get all reference sources
-            - Document.create: Add sources when creating documents
         """
         if isinstance(source, str):
             # Direct string comparison
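
The three source accessors split a single sources list by whether an entry looks like a document hash. A sketch, assuming MyDocument is a concrete subclass; the URL is an illustrative reference value:

    source = MyDocument.create(name="input.txt", content="raw data")
    derived = MyDocument.create(
        name="output.txt",
        content="processed",
        sources=[source.sha256, "https://example.com/dataset"],  # hash + free-form reference
    )

    derived.has_source(source)       # True - matched via the document's sha256
    derived.get_source_documents()   # [source.sha256]
    derived.get_source_references()  # ["https://example.com/dataset"]
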
@@ -1455,6 +1509,8 @@ class Document(BaseModel, ABC):
             - sha256: Full SHA256 hash in base32 encoding without padding (str)
             - mime_type: Detected MIME type (str)
             - sources: List of source strings (list[dict])
+            - canonical_name: Canonical snake_case name for debug tracing (str)
+            - class_name: Name of the actual document class for debug tracing (str)
             - content: Encoded content (str)
             - content_encoding: Either "utf-8" or "base64" (str)
 
@@ -1478,10 +1534,12 @@ class Document(BaseModel, ABC):
             "sha256": self.sha256,
             "mime_type": self.mime_type,
             "sources": self.sources,
+            "canonical_name": canonical_name_key(self.__class__),
+            "class_name": self.__class__.__name__,
         }
 
         # Try to encode content as UTF-8, fall back to base64
-        if self.is_text or self.mime_type.startswith("text/"):
+        if self.is_text:
             try:
                 result["content"] = self.content.decode("utf-8")
                 result["content_encoding"] = "utf-8"
@@ -1557,3 +1615,96 @@ class Document(BaseModel, ABC):
             description=data.get("description"),
             sources=data.get("sources", []),
         )
+
+    @final
+    def model_convert(
+        self,
+        new_type: type[TDocument],
+        *,
+        update: dict[str, Any] | None = None,
+        deep: bool = False,
+    ) -> TDocument:
+        """Convert document to a different Document type with optional updates.
+
+        @public
+
+        Creates a new document of a different type, preserving all attributes
+        while allowing updates. This is useful for converting between document
+        types (e.g., TaskDocument to FlowDocument) while maintaining data integrity.
+
+        Args:
+            new_type: Target Document class for conversion. Must be a concrete
+                subclass of Document (not abstract classes like Document,
+                FlowDocument, or TaskDocument).
+            update: Dictionary of attributes to update. Supports any attributes
+                that the Document constructor accepts (name, content,
+                description, sources).
+            deep: Whether to perform a deep copy of mutable attributes.
+
+        Returns:
+            New Document instance of the specified type.
+
+        Raises:
+            TypeError: If new_type is not a subclass of Document, is an abstract
+                class, or if update contains invalid attributes.
+            DocumentNameError: If the name violates the target type's FILES enum.
+            DocumentSizeError: If content exceeds MAX_CONTENT_SIZE.
+
+        Example:
+            >>> # Convert TaskDocument to FlowDocument
+            >>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
+            >>> flow_doc = task_doc.model_convert(MyFlowDoc)
+            >>> assert flow_doc.is_flow
+            >>> assert flow_doc.content == task_doc.content
+            >>>
+            >>> # Convert with updates
+            >>> updated = task_doc.model_convert(
+            ...     MyFlowDoc,
+            ...     update={"name": "permanent.json", "description": "Converted"}
+            ... )
+            >>>
+            >>> # Track document lineage
+            >>> derived = doc.model_convert(
+            ...     ProcessedDoc,
+            ...     update={"sources": [doc.sha256]}
+            ... )
+        """
+        # Validate new_type
+        try:
+            # Use a runtime check to ensure it's a class
+            if not isinstance(new_type, type):  # type: ignore[reportIncompatibleArgumentType]
+                raise TypeError(f"new_type must be a class, got {new_type}")
+            if not issubclass(new_type, Document):  # type: ignore[reportIncompatibleArgumentType]
+                raise TypeError(f"new_type must be a subclass of Document, got {new_type}")
+        except (TypeError, AttributeError):
+            # Not a class at all
+            raise TypeError(f"new_type must be a subclass of Document, got {new_type}")
+
+        # Check for abstract classes by name (avoid circular imports)
+        class_name = new_type.__name__
+        if class_name == "Document":
+            raise TypeError("Cannot instantiate abstract Document class directly")
+        if class_name == "FlowDocument":
+            raise TypeError("Cannot instantiate abstract FlowDocument class directly")
+        if class_name == "TaskDocument":
+            raise TypeError("Cannot instantiate abstract TaskDocument class directly")
+
+        # Get current document data with proper typing
+        data: dict[str, Any] = {
+            "name": self.name,
+            "content": self.content,
+            "description": self.description,
+            "sources": self.sources.copy() if deep else self.sources,
+        }
+
+        # Apply updates if provided
+        if update:
+            data.update(update)
+
+        # Create new document of target type
+        return new_type(
+            name=data["name"],
+            content=data["content"],
+            description=data.get("description"),
+            sources=data.get("sources", []),
+        )
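
The practical split the new method creates: model_copy() remains Pydantic's same-type copy and does not re-validate updates, while model_convert() re-runs the target constructor, so name, size, and FILES-enum rules are enforced against the new type. A usage sketch with illustrative MyTaskDoc and MyFlowDoc subclasses:

    task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})

    # Cross-type conversion with lineage, validated against MyFlowDoc's rules
    flow_doc = task_doc.model_convert(MyFlowDoc, update={"sources": [task_doc.sha256]})
    assert flow_doc.is_flow
    assert flow_doc.has_source(task_doc)

    # Same-type rename: plain Pydantic copy, updates are not validated
    renamed = task_doc.model_copy(update={"name": "renamed.json"})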