ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +21 -13
- ai_pipeline_core/documents/document.py +202 -51
- ai_pipeline_core/documents/document_list.py +148 -24
- ai_pipeline_core/documents/flow_document.py +2 -6
- ai_pipeline_core/documents/task_document.py +0 -4
- ai_pipeline_core/documents/temporary_document.py +1 -8
- ai_pipeline_core/flow/config.py +174 -5
- ai_pipeline_core/llm/__init__.py +1 -6
- ai_pipeline_core/llm/ai_messages.py +137 -4
- ai_pipeline_core/llm/client.py +118 -65
- ai_pipeline_core/llm/model_options.py +6 -7
- ai_pipeline_core/llm/model_response.py +17 -16
- ai_pipeline_core/llm/model_types.py +3 -7
- ai_pipeline_core/logging/__init__.py +0 -2
- ai_pipeline_core/logging/logging_config.py +0 -6
- ai_pipeline_core/logging/logging_mixin.py +2 -10
- ai_pipeline_core/pipeline.py +54 -68
- ai_pipeline_core/prefect.py +12 -3
- ai_pipeline_core/prompt_manager.py +14 -7
- ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core/simple_runner/__init__.py +1 -11
- ai_pipeline_core/simple_runner/cli.py +13 -12
- ai_pipeline_core/simple_runner/simple_runner.py +34 -189
- ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core/storage/storage.py +628 -0
- ai_pipeline_core/tracing.py +234 -30
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/METADATA +35 -20
- ai_pipeline_core-0.2.1.dist-info/RECORD +38 -0
- ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/__init__.py
CHANGED
|
@@ -7,7 +7,7 @@ It combines document processing, LLM integration, and workflow orchestration int
|
|
|
7
7
|
system designed for production use.
|
|
8
8
|
|
|
9
9
|
The framework enforces best practices through strong typing (Pydantic), automatic retries,
|
|
10
|
-
cost tracking
|
|
10
|
+
and cost tracking. All I/O operations are async for maximum throughput.
|
|
11
11
|
|
|
12
12
|
**CRITICAL IMPORT RULE**:
|
|
13
13
|
Always import from the top-level package:
|
|
@@ -18,12 +18,12 @@ cost tracking, and distributed tracing. All I/O operations are async for maximum
|
|
|
18
18
|
from ai_pipeline_core.llm import generate # NO!
|
|
19
19
|
from ai_pipeline_core.documents import FlowDocument # NO!
|
|
20
20
|
|
|
21
|
-
FRAMEWORK RULES (
|
|
22
|
-
1. Decorators: Use @
|
|
21
|
+
FRAMEWORK RULES (Use by default, unless instructed otherwise):
|
|
22
|
+
1. Decorators: Use @pipeline_task WITHOUT parameters, @pipeline_flow WITH config
|
|
23
23
|
2. Logging: Use get_pipeline_logger(__name__) - NEVER print() or logging module
|
|
24
24
|
3. LLM calls: Use AIMessages or str. Wrap Documents in AIMessages; do not call .text yourself
|
|
25
|
-
4. Options:
|
|
26
|
-
5. Documents: Create with just name and content - skip description
|
|
25
|
+
4. Options: DO NOT use options parameter - omit it entirely (defaults are optimal)
|
|
26
|
+
5. Documents: Create with just name and content - skip description unless needed
|
|
27
27
|
6. FlowConfig: OUTPUT_DOCUMENT_TYPE must differ from all INPUT_DOCUMENT_TYPES
|
|
28
28
|
7. Initialization: PromptManager and logger at module scope, not in functions
|
|
29
29
|
8. DocumentList: Use default constructor - no validation flags needed
|
|
@@ -36,18 +36,22 @@ Core Capabilities:
|
|
|
36
36
|
- **LLM Integration**: Unified interface to any model via LiteLLM with caching
|
|
37
37
|
- **Structured Output**: Type-safe generation with Pydantic model validation
|
|
38
38
|
- **Workflow Orchestration**: Prefect-based flows and tasks with retries
|
|
39
|
-
- **Observability**:
|
|
39
|
+
- **Observability**: Built-in monitoring and debugging capabilities
|
|
40
40
|
- **Local Development**: Simple runner for testing without infrastructure
|
|
41
41
|
|
|
42
42
|
Quick Start:
|
|
43
43
|
>>> from ai_pipeline_core import (
|
|
44
|
-
... pipeline_flow, FlowDocument, DocumentList, FlowOptions, llm, AIMessages
|
|
44
|
+
... pipeline_flow, FlowDocument, DocumentList, FlowOptions, FlowConfig, llm, AIMessages
|
|
45
45
|
... )
|
|
46
46
|
>>>
|
|
47
47
|
>>> class OutputDoc(FlowDocument):
|
|
48
48
|
... '''Analysis result document.'''
|
|
49
49
|
>>>
|
|
50
|
-
>>>
|
|
50
|
+
>>> class MyFlowConfig(FlowConfig):
|
|
51
|
+
... INPUT_DOCUMENT_TYPES = []
|
|
52
|
+
... OUTPUT_DOCUMENT_TYPE = OutputDoc
|
|
53
|
+
>>>
|
|
54
|
+
>>> @pipeline_flow(config=MyFlowConfig)
|
|
51
55
|
>>> async def analyze_flow(
|
|
52
56
|
... project_name: str,
|
|
53
57
|
... documents: DocumentList,
|
|
@@ -55,7 +59,7 @@ Quick Start:
|
|
|
55
59
|
... ) -> DocumentList:
|
|
56
60
|
... # Messages accept AIMessages or str. Wrap documents: AIMessages([doc])
|
|
57
61
|
... response = await llm.generate(
|
|
58
|
-
...
|
|
62
|
+
... "gpt-5",
|
|
59
63
|
... messages=AIMessages([documents[0]])
|
|
60
64
|
... )
|
|
61
65
|
... result = OutputDoc.create(
|
|
@@ -76,8 +80,6 @@ Optional Environment Variables:
|
|
|
76
80
|
- PREFECT_API_KEY: Prefect API authentication key
|
|
77
81
|
- LMNR_PROJECT_API_KEY: Laminar (LMNR) API key for tracing
|
|
78
82
|
- LMNR_DEBUG: Set to "true" to enable debug-level traces
|
|
79
|
-
- LMNR_SESSION_ID: Default session ID for traces
|
|
80
|
-
- LMNR_USER_ID: Default user ID for traces
|
|
81
83
|
"""
|
|
82
84
|
|
|
83
85
|
from . import llm
|
|
@@ -99,6 +101,8 @@ from .llm import (
|
|
|
99
101
|
ModelOptions,
|
|
100
102
|
ModelResponse,
|
|
101
103
|
StructuredModelResponse,
|
|
104
|
+
generate,
|
|
105
|
+
generate_structured,
|
|
102
106
|
)
|
|
103
107
|
from .logging import (
|
|
104
108
|
LoggerMixin,
|
|
@@ -114,7 +118,7 @@ from .prompt_manager import PromptManager
|
|
|
114
118
|
from .settings import Settings
|
|
115
119
|
from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
|
|
116
120
|
|
|
117
|
-
__version__ = "0.1
|
|
121
|
+
__version__ = "0.2.1"
|
|
118
122
|
|
|
119
123
|
__all__ = [
|
|
120
124
|
# Config/Settings
|
|
@@ -145,7 +149,9 @@ __all__ = [
|
|
|
145
149
|
"prefect_test_harness",
|
|
146
150
|
"disable_run_logger",
|
|
147
151
|
# LLM
|
|
148
|
-
"llm",
|
|
152
|
+
"llm", # for backward compatibility
|
|
153
|
+
"generate",
|
|
154
|
+
"generate_structured",
|
|
149
155
|
"ModelName",
|
|
150
156
|
"ModelOptions",
|
|
151
157
|
"ModelResponse",
|
|
@@ -159,4 +165,6 @@ __all__ = [
|
|
|
159
165
|
"set_trace_cost",
|
|
160
166
|
# Utils
|
|
161
167
|
"PromptManager",
|
|
168
|
+
"generate",
|
|
169
|
+
"generate_structured",
|
|
162
170
|
]
|
|
@@ -51,6 +51,7 @@ from .mime_type import (
|
|
|
51
51
|
)
|
|
52
52
|
|
|
53
53
|
TModel = TypeVar("TModel", bound=BaseModel)
|
|
54
|
+
TDocument = TypeVar("TDocument", bound="Document")
|
|
54
55
|
|
|
55
56
|
|
|
56
57
|
class Document(BaseModel, ABC):
|
|
@@ -61,8 +62,7 @@ class Document(BaseModel, ABC):
|
|
|
61
62
|
Document is the fundamental data abstraction for all content flowing through
|
|
62
63
|
pipelines. It provides automatic encoding, MIME type detection, serialization,
|
|
63
64
|
and validation. All documents must be subclassed from FlowDocument or TaskDocument
|
|
64
|
-
based on their persistence requirements.
|
|
65
|
-
class that can be instantiated directly (not abstract).
|
|
65
|
+
based on their persistence requirements.
|
|
66
66
|
|
|
67
67
|
VALIDATION IS AUTOMATIC - Do not add manual validation!
|
|
68
68
|
Size validation, name validation, and MIME type detection are built-in.
|
|
@@ -74,7 +74,7 @@ class Document(BaseModel, ABC):
|
|
|
74
74
|
document.validate_file_name(document.name) # NO! Automatic
|
|
75
75
|
|
|
76
76
|
Best Practices:
|
|
77
|
-
- Use create() classmethod for automatic type conversion (
|
|
77
|
+
- Use create() classmethod for automatic type conversion (default preferred)
|
|
78
78
|
- Omit description parameter unless truly needed for metadata
|
|
79
79
|
- When using LLM functions, pass AIMessages or str. Wrap any Document values
|
|
80
80
|
in AIMessages([...]). Do not call .text yourself
|
|
@@ -98,6 +98,8 @@ class Document(BaseModel, ABC):
|
|
|
98
98
|
- Support for text, JSON, YAML, PDF, and image formats
|
|
99
99
|
- Conversion utilities between different formats
|
|
100
100
|
- Source provenance tracking via sources field
|
|
101
|
+
- Document type conversion via model_convert() method
|
|
102
|
+
- Standard Pydantic model_copy() for same-type copying
|
|
101
103
|
|
|
102
104
|
Class Variables:
|
|
103
105
|
MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
|
|
@@ -131,10 +133,62 @@ class Document(BaseModel, ABC):
|
|
|
131
133
|
2. Embed metadata in content (e.g., JSON with data + metadata fields)
|
|
132
134
|
3. Create a separate MetadataDocument type to accompany data documents
|
|
133
135
|
4. Use document naming conventions (e.g., "data_v2_2024.json")
|
|
134
|
-
5. Store metadata in flow_options
|
|
136
|
+
5. Store metadata in flow_options
|
|
137
|
+
|
|
138
|
+
FILES Enum Best Practice:
|
|
139
|
+
When defining a FILES enum, NEVER use magic strings to reference files.
|
|
140
|
+
Always use the enum values to maintain type safety and refactorability.
|
|
141
|
+
|
|
142
|
+
WRONG - Magic strings/numbers:
|
|
143
|
+
doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
|
|
144
|
+
doc = docs.get_by("settings.json") # NO! Magic string
|
|
145
|
+
files = ["config.yaml", "settings.json"] # NO! Magic strings
|
|
146
|
+
|
|
147
|
+
CORRECT - Use enum references:
|
|
148
|
+
doc = ConfigDocument.create(
|
|
149
|
+
name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
|
|
150
|
+
content=data
|
|
151
|
+
)
|
|
152
|
+
doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
|
|
153
|
+
files = [
|
|
154
|
+
ConfigDocument.FILES.CONFIG,
|
|
155
|
+
ConfigDocument.FILES.SETTINGS
|
|
156
|
+
] # YES! Refactorable
|
|
157
|
+
|
|
158
|
+
Pydantic Model Interaction:
|
|
159
|
+
Documents provide DIRECT support for Pydantic models. Use the built-in
|
|
160
|
+
methods instead of manual JSON conversion.
|
|
161
|
+
|
|
162
|
+
WRONG - Manual JSON conversion:
|
|
163
|
+
# Don't do this - manual JSON handling
|
|
164
|
+
json_str = doc.text
|
|
165
|
+
json_data = json.loads(json_str)
|
|
166
|
+
model = MyModel(**json_data) # NO! Use as_pydantic_model
|
|
167
|
+
|
|
168
|
+
# Don't do this - manual serialization
|
|
169
|
+
json_str = model.model_dump_json()
|
|
170
|
+
doc = MyDocument.create(name="data.json", content=json_str) # NO!
|
|
171
|
+
|
|
172
|
+
CORRECT - Direct Pydantic interaction:
|
|
173
|
+
# Reading Pydantic model from document
|
|
174
|
+
model = doc.as_pydantic_model(MyModel) # Direct conversion
|
|
175
|
+
models = doc.as_pydantic_model(list[MyModel]) # List support
|
|
176
|
+
|
|
177
|
+
# Creating document from Pydantic model
|
|
178
|
+
doc = MyDocument.create(
|
|
179
|
+
name="data.json",
|
|
180
|
+
content=model # Direct BaseModel support
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Round-trip is seamless
|
|
184
|
+
original_model = MyModel(field="value")
|
|
185
|
+
doc = MyDocument.create(name="data.json", content=original_model)
|
|
186
|
+
restored = doc.as_pydantic_model(MyModel)
|
|
187
|
+
assert restored == original_model # Perfect round-trip
|
|
135
188
|
|
|
136
189
|
Example:
|
|
137
190
|
>>> from enum import StrEnum
|
|
191
|
+
>>> from pydantic import BaseModel
|
|
138
192
|
>>>
|
|
139
193
|
>>> # Simple document:
|
|
140
194
|
>>> class MyDocument(FlowDocument):
|
|
@@ -146,10 +200,23 @@ class Document(BaseModel, ABC):
|
|
|
146
200
|
... CONFIG = "config.yaml"
|
|
147
201
|
... SETTINGS = "settings.json"
|
|
148
202
|
>>>
|
|
149
|
-
>>> #
|
|
150
|
-
>>> doc =
|
|
151
|
-
|
|
152
|
-
|
|
203
|
+
>>> # CORRECT FILES usage - no magic strings:
|
|
204
|
+
>>> doc = ConfigDocument.create(
|
|
205
|
+
... name=ConfigDocument.FILES.CONFIG, # Use enum
|
|
206
|
+
... content={"key": "value"}
|
|
207
|
+
... )
|
|
208
|
+
>>>
|
|
209
|
+
>>> # CORRECT Pydantic usage:
|
|
210
|
+
>>> class Config(BaseModel):
|
|
211
|
+
... key: str
|
|
212
|
+
>>>
|
|
213
|
+
>>> # Direct creation from Pydantic model
|
|
214
|
+
>>> config_model = Config(key="value")
|
|
215
|
+
>>> doc = MyDocument.create(name="data.json", content=config_model)
|
|
216
|
+
>>>
|
|
217
|
+
>>> # Direct extraction to Pydantic model
|
|
218
|
+
>>> restored = doc.as_pydantic_model(Config)
|
|
219
|
+
>>> print(restored.key) # "value"
|
|
153
220
|
>>>
|
|
154
221
|
>>> # Track document provenance with sources
|
|
155
222
|
>>> source_doc = MyDocument.create(name="input.txt", content="raw data")
|
|
@@ -159,6 +226,14 @@ class Document(BaseModel, ABC):
|
|
|
159
226
|
... sources=[source_doc.sha256] # Reference source document
|
|
160
227
|
... )
|
|
161
228
|
>>> processed.has_source(source_doc) # True
|
|
229
|
+
>>>
|
|
230
|
+
>>> # Document copying and type conversion:
|
|
231
|
+
>>> # Standard Pydantic model_copy (doesn't validate updates)
|
|
232
|
+
>>> copied = doc.model_copy(update={"name": "new_name.json"})
|
|
233
|
+
>>> # Type conversion with validation via model_convert
|
|
234
|
+
>>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
|
|
235
|
+
>>> flow_doc = task_doc.model_convert(MyFlowDoc) # Convert to FlowDocument
|
|
236
|
+
>>> flow_doc.is_flow # True
|
|
162
237
|
"""
|
|
163
238
|
|
|
164
239
|
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
|
|
@@ -170,6 +245,9 @@ class Document(BaseModel, ABC):
|
|
|
170
245
|
DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
|
|
171
246
|
"""File extension for description files."""
|
|
172
247
|
|
|
248
|
+
SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
|
|
249
|
+
"""File extension for sources metadata files."""
|
|
250
|
+
|
|
173
251
|
MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
|
|
174
252
|
"""Separator for markdown list items."""
|
|
175
253
|
|
|
@@ -288,7 +366,7 @@ class Document(BaseModel, ABC):
|
|
|
288
366
|
content types and automatically converts them to bytes based on the file
|
|
289
367
|
extension. Use the `parse` method to reverse this conversion.
|
|
290
368
|
|
|
291
|
-
Best Practice (
|
|
369
|
+
Best Practice (by default, unless instructed otherwise):
|
|
292
370
|
Only provide name and content. The description parameter is RARELY needed.
|
|
293
371
|
|
|
294
372
|
Args:
|
|
@@ -302,8 +380,8 @@ class Document(BaseModel, ABC):
|
|
|
302
380
|
- bytes: Used directly without conversion
|
|
303
381
|
- str: Encoded to UTF-8 bytes
|
|
304
382
|
- dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
|
|
305
|
-
- list[str]: Joined
|
|
306
|
-
|
|
383
|
+
- list[str]: Joined automatically for .md (validates format compatibility),
|
|
384
|
+
else JSON/YAML
|
|
307
385
|
- list[BaseModel]: Serialized to JSON or YAML based on extension
|
|
308
386
|
- BaseModel: Serialized to JSON or YAML based on extension
|
|
309
387
|
description: Optional description - USUALLY OMIT THIS (defaults to None).
|
|
@@ -319,7 +397,7 @@ class Document(BaseModel, ABC):
|
|
|
319
397
|
|
|
320
398
|
Raises:
|
|
321
399
|
ValueError: If content type is not supported for the file extension,
|
|
322
|
-
or if markdown list
|
|
400
|
+
or if markdown list format is incompatible
|
|
323
401
|
DocumentNameError: If filename violates validation rules
|
|
324
402
|
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
325
403
|
|
|
@@ -329,7 +407,7 @@ class Document(BaseModel, ABC):
|
|
|
329
407
|
returns the original dictionary {"key": "value"}.
|
|
330
408
|
|
|
331
409
|
Example:
|
|
332
|
-
>>> # CORRECT - no description needed (
|
|
410
|
+
>>> # CORRECT - no description needed (by default, unless instructed otherwise)
|
|
333
411
|
>>> doc = MyDocument.create(name="test.txt", content="Hello World")
|
|
334
412
|
>>> doc.content # b'Hello World'
|
|
335
413
|
>>> doc.parse(str) # "Hello World"
|
|
@@ -427,10 +505,6 @@ class Document(BaseModel, ABC):
|
|
|
427
505
|
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
428
506
|
>>> doc = MyDocument.create(name="config.yaml", content=my_model)
|
|
429
507
|
>>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
|
|
430
|
-
|
|
431
|
-
See Also:
|
|
432
|
-
create: Recommended factory method with automatic type conversion
|
|
433
|
-
parse: Method to reverse the conversion done by create
|
|
434
508
|
"""
|
|
435
509
|
if type(self) is Document:
|
|
436
510
|
raise TypeError("Cannot instantiate abstract Document class directly")
|
|
@@ -467,8 +541,7 @@ class Document(BaseModel, ABC):
|
|
|
467
541
|
|
|
468
542
|
Note:
|
|
469
543
|
This method determines document persistence and lifecycle.
|
|
470
|
-
FlowDocument returns "flow", TaskDocument returns "task"
|
|
471
|
-
TemporaryDocument returns "temporary".
|
|
544
|
+
FlowDocument returns "flow", TaskDocument returns "task".
|
|
472
545
|
"""
|
|
473
546
|
raise NotImplementedError("Subclasses must implement this method")
|
|
474
547
|
|
|
@@ -520,7 +593,7 @@ class Document(BaseModel, ABC):
|
|
|
520
593
|
during execution.
|
|
521
594
|
|
|
522
595
|
Returns:
|
|
523
|
-
True if this is
|
|
596
|
+
True if this document is temporary, False otherwise.
|
|
524
597
|
"""
|
|
525
598
|
return self.get_base_type() == "temporary"
|
|
526
599
|
|
|
@@ -565,8 +638,6 @@ class Document(BaseModel, ABC):
|
|
|
565
638
|
def validate_file_name(cls, name: str) -> None:
|
|
566
639
|
"""Validate that a file name matches allowed patterns.
|
|
567
640
|
|
|
568
|
-
@public
|
|
569
|
-
|
|
570
641
|
DO NOT OVERRIDE this method if you define a FILES enum!
|
|
571
642
|
The validation is automatic when FILES enum is present.
|
|
572
643
|
|
|
@@ -610,7 +681,7 @@ class Document(BaseModel, ABC):
|
|
|
610
681
|
|
|
611
682
|
Ensures the document name is secure and follows conventions:
|
|
612
683
|
- No path traversal characters (.., \\, /)
|
|
613
|
-
- Cannot end with .description.md
|
|
684
|
+
- Cannot end with .description.md or .sources.json
|
|
614
685
|
- No leading/trailing whitespace
|
|
615
686
|
- Must match FILES enum if defined
|
|
616
687
|
|
|
@@ -635,6 +706,9 @@ class Document(BaseModel, ABC):
|
|
|
635
706
|
f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
|
|
636
707
|
)
|
|
637
708
|
|
|
709
|
+
if v.endswith(cls.SOURCES_EXTENSION):
|
|
710
|
+
raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
|
|
711
|
+
|
|
638
712
|
if ".." in v or "\\" in v or "/" in v:
|
|
639
713
|
raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
|
|
640
714
|
|
|
@@ -659,7 +733,7 @@ class Document(BaseModel, ABC):
|
|
|
659
733
|
2. str → UTF-8 encoding
|
|
660
734
|
3. dict/BaseModel + .json → JSON serialization (indented)
|
|
661
735
|
4. dict/BaseModel + .yaml/.yml → YAML serialization
|
|
662
|
-
5. list[str] + .md → Join with markdown
|
|
736
|
+
5. list[str] + .md → Join with markdown sections (validates format compatibility)
|
|
663
737
|
6. list[Any] + .json/.yaml → JSON/YAML array
|
|
664
738
|
7. int/float/bool + .json → JSON primitive
|
|
665
739
|
|
|
@@ -1028,8 +1102,6 @@ class Document(BaseModel, ABC):
|
|
|
1028
1102
|
def as_yaml(self) -> Any:
|
|
1029
1103
|
r"""Parse document content as YAML.
|
|
1030
1104
|
|
|
1031
|
-
@public
|
|
1032
|
-
|
|
1033
1105
|
Parses the document's text content as YAML and returns Python objects.
|
|
1034
1106
|
Uses ruamel.yaml which is safe by default (no code execution).
|
|
1035
1107
|
|
|
@@ -1057,8 +1129,6 @@ class Document(BaseModel, ABC):
|
|
|
1057
1129
|
def as_json(self) -> Any:
|
|
1058
1130
|
"""Parse document content as JSON.
|
|
1059
1131
|
|
|
1060
|
-
@public
|
|
1061
|
-
|
|
1062
1132
|
Parses the document's text content as JSON and returns Python objects.
|
|
1063
1133
|
Document must contain valid JSON text.
|
|
1064
1134
|
|
|
@@ -1153,7 +1223,7 @@ class Document(BaseModel, ABC):
|
|
|
1153
1223
|
|
|
1154
1224
|
@public
|
|
1155
1225
|
|
|
1156
|
-
Splits text content using markdown
|
|
1226
|
+
Splits text content automatically using markdown section separators.
|
|
1157
1227
|
Designed for markdown documents with multiple sections.
|
|
1158
1228
|
|
|
1159
1229
|
Returns:
|
|
@@ -1168,9 +1238,9 @@ class Document(BaseModel, ABC):
|
|
|
1168
1238
|
>>> doc = MyDocument.create(name="book.md", content=sections)
|
|
1169
1239
|
>>> doc.as_markdown_list() # Returns original sections
|
|
1170
1240
|
|
|
1171
|
-
>>> #
|
|
1172
|
-
>>>
|
|
1173
|
-
>>> doc2 = MyDocument(name="parts.md", content=
|
|
1241
|
+
>>> # Round-trip conversion works automatically
|
|
1242
|
+
>>> sections = ["Part 1", "Part 2", "Part 3"]
|
|
1243
|
+
>>> doc2 = MyDocument.create(name="parts.md", content=sections)
|
|
1174
1244
|
>>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
|
|
1175
1245
|
"""
|
|
1176
1246
|
return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
|
|
@@ -1207,7 +1277,7 @@ class Document(BaseModel, ABC):
|
|
|
1207
1277
|
Extension Rules:
|
|
1208
1278
|
- .json → JSON parsing for dict/list/BaseModel
|
|
1209
1279
|
- .yaml/.yml → YAML parsing for dict/list/BaseModel
|
|
1210
|
-
- .md + list → Split
|
|
1280
|
+
- .md + list → Split automatically into sections
|
|
1211
1281
|
- Any + str → UTF-8 decode
|
|
1212
1282
|
- Any + bytes → Raw content
|
|
1213
1283
|
|
|
@@ -1223,8 +1293,7 @@ class Document(BaseModel, ABC):
|
|
|
1223
1293
|
|
|
1224
1294
|
>>> # Markdown list
|
|
1225
1295
|
>>> items = ["Item 1", "Item 2"]
|
|
1226
|
-
>>>
|
|
1227
|
-
>>> doc = MyDocument(name="list.md", content=content)
|
|
1296
|
+
>>> doc = MyDocument.create(name="list.md", content=items)
|
|
1228
1297
|
>>> doc.parse(list)
|
|
1229
1298
|
['Item 1', 'Item 2']
|
|
1230
1299
|
"""
|
|
@@ -1330,11 +1399,6 @@ class Document(BaseModel, ABC):
|
|
|
1330
1399
|
>>> # Check if specific document is a source
|
|
1331
1400
|
>>> if source1.sha256 in doc_refs:
|
|
1332
1401
|
... print("Document derived from source1")
|
|
1333
|
-
|
|
1334
|
-
See Also:
|
|
1335
|
-
- get_source_references: Get non-document source references (URLs, etc.)
|
|
1336
|
-
- has_source: Check if a specific source is tracked
|
|
1337
|
-
- Document.create: Add sources when creating documents
|
|
1338
1402
|
"""
|
|
1339
1403
|
return [src for src in self.sources if is_document_sha256(src)]
|
|
1340
1404
|
|
|
@@ -1372,11 +1436,6 @@ class Document(BaseModel, ABC):
|
|
|
1372
1436
|
>>> # Use for attribution or debugging
|
|
1373
1437
|
>>> for ref in refs:
|
|
1374
1438
|
... print(f"Data sourced from: {ref}")
|
|
1375
|
-
|
|
1376
|
-
See Also:
|
|
1377
|
-
- get_source_documents: Get document SHA256 references
|
|
1378
|
-
- has_source: Check if a specific source is tracked
|
|
1379
|
-
- Document.create: Add sources when creating documents
|
|
1380
1439
|
"""
|
|
1381
1440
|
return [src for src in self.sources if not is_document_sha256(src)]
|
|
1382
1441
|
|
|
@@ -1422,11 +1481,6 @@ class Document(BaseModel, ABC):
|
|
|
1422
1481
|
>>> # Check by SHA256 directly
|
|
1423
1482
|
>>> if derived.has_source(source_doc.sha256):
|
|
1424
1483
|
... print("Has specific hash")
|
|
1425
|
-
|
|
1426
|
-
See Also:
|
|
1427
|
-
- get_source_documents: Get all document sources
|
|
1428
|
-
- get_source_references: Get all reference sources
|
|
1429
|
-
- Document.create: Add sources when creating documents
|
|
1430
1484
|
"""
|
|
1431
1485
|
if isinstance(source, str):
|
|
1432
1486
|
# Direct string comparison
|
|
@@ -1455,6 +1509,8 @@ class Document(BaseModel, ABC):
|
|
|
1455
1509
|
- sha256: Full SHA256 hash in base32 encoding without padding (str)
|
|
1456
1510
|
- mime_type: Detected MIME type (str)
|
|
1457
1511
|
- sources: List of source strings (list[str])
|
|
1512
|
+
- canonical_name: Canonical snake_case name for debug tracing (str)
|
|
1513
|
+
- class_name: Name of the actual document class for debug tracing (str)
|
|
1458
1514
|
- content: Encoded content (str)
|
|
1459
1515
|
- content_encoding: Either "utf-8" or "base64" (str)
|
|
1460
1516
|
|
|
@@ -1478,10 +1534,12 @@ class Document(BaseModel, ABC):
|
|
|
1478
1534
|
"sha256": self.sha256,
|
|
1479
1535
|
"mime_type": self.mime_type,
|
|
1480
1536
|
"sources": self.sources,
|
|
1537
|
+
"canonical_name": canonical_name_key(self.__class__),
|
|
1538
|
+
"class_name": self.__class__.__name__,
|
|
1481
1539
|
}
|
|
1482
1540
|
|
|
1483
1541
|
# Try to encode content as UTF-8, fall back to base64
|
|
1484
|
-
if self.is_text
|
|
1542
|
+
if self.is_text:
|
|
1485
1543
|
try:
|
|
1486
1544
|
result["content"] = self.content.decode("utf-8")
|
|
1487
1545
|
result["content_encoding"] = "utf-8"
|
|
@@ -1557,3 +1615,96 @@ class Document(BaseModel, ABC):
|
|
|
1557
1615
|
description=data.get("description"),
|
|
1558
1616
|
sources=data.get("sources", []),
|
|
1559
1617
|
)
|
|
1618
|
+
|
|
1619
|
+
@final
|
|
1620
|
+
def model_convert(
|
|
1621
|
+
self,
|
|
1622
|
+
new_type: type[TDocument],
|
|
1623
|
+
*,
|
|
1624
|
+
update: dict[str, Any] | None = None,
|
|
1625
|
+
deep: bool = False,
|
|
1626
|
+
) -> TDocument:
|
|
1627
|
+
"""Convert document to a different Document type with optional updates.
|
|
1628
|
+
|
|
1629
|
+
@public
|
|
1630
|
+
|
|
1631
|
+
Creates a new document of a different type, preserving all attributes
|
|
1632
|
+
while allowing updates. This is useful for converting between document
|
|
1633
|
+
types (e.g., TaskDocument to FlowDocument) while maintaining data integrity.
|
|
1634
|
+
|
|
1635
|
+
Args:
|
|
1636
|
+
new_type: Target Document class for conversion. Must be a concrete
|
|
1637
|
+
subclass of Document (not abstract classes like Document,
|
|
1638
|
+
FlowDocument, or TaskDocument).
|
|
1639
|
+
update: Dictionary of attributes to update. Supports any attributes
|
|
1640
|
+
that the Document constructor accepts (name, content,
|
|
1641
|
+
description, sources).
|
|
1642
|
+
deep: Whether to perform a deep copy of mutable attributes.
|
|
1643
|
+
|
|
1644
|
+
Returns:
|
|
1645
|
+
New Document instance of the specified type.
|
|
1646
|
+
|
|
1647
|
+
Raises:
|
|
1648
|
+
TypeError: If new_type is not a subclass of Document, is an abstract
|
|
1649
|
+
class, or if update contains invalid attributes.
|
|
1650
|
+
DocumentNameError: If the name violates the target type's FILES enum.
|
|
1651
|
+
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE.
|
|
1652
|
+
|
|
1653
|
+
Example:
|
|
1654
|
+
>>> # Convert TaskDocument to FlowDocument
|
|
1655
|
+
>>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
|
|
1656
|
+
>>> flow_doc = task_doc.model_convert(MyFlowDoc)
|
|
1657
|
+
>>> assert flow_doc.is_flow
|
|
1658
|
+
>>> assert flow_doc.content == task_doc.content
|
|
1659
|
+
>>>
|
|
1660
|
+
>>> # Convert with updates
|
|
1661
|
+
>>> updated = task_doc.model_convert(
|
|
1662
|
+
... MyFlowDoc,
|
|
1663
|
+
... update={"name": "permanent.json", "description": "Converted"}
|
|
1664
|
+
... )
|
|
1665
|
+
>>>
|
|
1666
|
+
>>> # Track document lineage
|
|
1667
|
+
>>> derived = doc.model_convert(
|
|
1668
|
+
... ProcessedDoc,
|
|
1669
|
+
... update={"sources": [doc.sha256]}
|
|
1670
|
+
... )
|
|
1671
|
+
"""
|
|
1672
|
+
# Validate new_type
|
|
1673
|
+
try:
|
|
1674
|
+
# Use a runtime check to ensure it's a class
|
|
1675
|
+
if not isinstance(new_type, type): # type: ignore[reportIncompatibleArgumentType]
|
|
1676
|
+
raise TypeError(f"new_type must be a class, got {new_type}")
|
|
1677
|
+
if not issubclass(new_type, Document): # type: ignore[reportIncompatibleArgumentType]
|
|
1678
|
+
raise TypeError(f"new_type must be a subclass of Document, got {new_type}")
|
|
1679
|
+
except (TypeError, AttributeError):
|
|
1680
|
+
# Not a class at all
|
|
1681
|
+
raise TypeError(f"new_type must be a subclass of Document, got {new_type}")
|
|
1682
|
+
|
|
1683
|
+
# Check for abstract classes by name (avoid circular imports)
|
|
1684
|
+
class_name = new_type.__name__
|
|
1685
|
+
if class_name == "Document":
|
|
1686
|
+
raise TypeError("Cannot instantiate abstract Document class directly")
|
|
1687
|
+
if class_name == "FlowDocument":
|
|
1688
|
+
raise TypeError("Cannot instantiate abstract FlowDocument class directly")
|
|
1689
|
+
if class_name == "TaskDocument":
|
|
1690
|
+
raise TypeError("Cannot instantiate abstract TaskDocument class directly")
|
|
1691
|
+
|
|
1692
|
+
# Get current document data with proper typing
|
|
1693
|
+
data: dict[str, Any] = {
|
|
1694
|
+
"name": self.name,
|
|
1695
|
+
"content": self.content,
|
|
1696
|
+
"description": self.description,
|
|
1697
|
+
"sources": self.sources.copy() if deep else self.sources,
|
|
1698
|
+
}
|
|
1699
|
+
|
|
1700
|
+
# Apply updates if provided
|
|
1701
|
+
if update:
|
|
1702
|
+
data.update(update)
|
|
1703
|
+
|
|
1704
|
+
# Create new document of target type
|
|
1705
|
+
return new_type(
|
|
1706
|
+
name=data["name"],
|
|
1707
|
+
content=data["content"],
|
|
1708
|
+
description=data.get("description"),
|
|
1709
|
+
sources=data.get("sources", []),
|
|
1710
|
+
)
|