ai-pipeline-core 0.1.14__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/.gitignore +1 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/PKG-INFO +19 -17
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/README.md +17 -16
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/__init__.py +21 -13
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/document.py +93 -50
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/document_list.py +70 -23
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/flow_document.py +2 -6
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/task_document.py +0 -4
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/temporary_document.py +1 -8
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/config.py +174 -5
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/ai_messages.py +14 -4
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/client.py +116 -59
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_options.py +2 -5
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_response.py +17 -16
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_types.py +0 -4
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/__init__.py +0 -2
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging_config.py +0 -6
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging_mixin.py +2 -10
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/pipeline.py +45 -68
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/prefect.py +12 -3
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/prompt_manager.py +6 -7
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core-0.2.0/ai_pipeline_core/simple_runner/__init__.py +14 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/simple_runner/cli.py +13 -12
- ai_pipeline_core-0.2.0/ai_pipeline_core/simple_runner/simple_runner.py +247 -0
- ai_pipeline_core-0.2.0/ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core-0.2.0/ai_pipeline_core/storage/storage.py +628 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/tracing.py +3 -26
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/pyproject.toml +4 -2
- ai_pipeline_core-0.1.14/ai_pipeline_core/simple_runner/__init__.py +0 -24
- ai_pipeline_core-0.1.14/ai_pipeline_core/simple_runner/simple_runner.py +0 -402
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/LICENSE +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/__init__.py +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/mime_type.py +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/utils.py +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/exceptions.py +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/__init__.py +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/options.py +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/__init__.py +1 -1
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging.yml +0 -0
- {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ai-pipeline-core
|
|
3
|
-
Version: 0.1.14
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Core utilities for AI-powered processing pipelines using prefect
|
|
5
5
|
Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
|
|
6
6
|
Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
|
|
@@ -22,6 +22,7 @@ Requires-Dist: httpx>=0.28.1
|
|
|
22
22
|
Requires-Dist: jinja2>=3.1.6
|
|
23
23
|
Requires-Dist: lmnr>=0.7.6
|
|
24
24
|
Requires-Dist: openai>=1.99.9
|
|
25
|
+
Requires-Dist: prefect-gcp[cloud-storage]>=0.6.10
|
|
25
26
|
Requires-Dist: prefect>=3.4.13
|
|
26
27
|
Requires-Dist: pydantic-settings>=2.10.1
|
|
27
28
|
Requires-Dist: pydantic>=2.11.7
|
|
@@ -111,15 +112,13 @@ class AnalysisConfig(FlowConfig):
|
|
|
111
112
|
INPUT_DOCUMENT_TYPES = [InputDoc]
|
|
112
113
|
OUTPUT_DOCUMENT_TYPE = OutputDoc
|
|
113
114
|
|
|
114
|
-
# Create pipeline flow
|
|
115
|
-
@pipeline_flow
|
|
115
|
+
# Create pipeline flow with required config
|
|
116
|
+
@pipeline_flow(config=AnalysisConfig)
|
|
116
117
|
async def analyze_flow(
|
|
117
118
|
project_name: str,
|
|
118
119
|
documents: DocumentList,
|
|
119
120
|
flow_options: FlowOptions
|
|
120
121
|
) -> DocumentList:
|
|
121
|
-
config = AnalysisConfig()
|
|
122
|
-
|
|
123
122
|
# Process documents
|
|
124
123
|
outputs = []
|
|
125
124
|
for doc in documents:
|
|
@@ -136,7 +135,7 @@ async def analyze_flow(
|
|
|
136
135
|
outputs.append(output)
|
|
137
136
|
|
|
138
137
|
# RECOMMENDED: Always validate output
|
|
139
|
-
return
|
|
138
|
+
return AnalysisConfig.create_and_validate_output(outputs)
|
|
140
139
|
```
|
|
141
140
|
|
|
142
141
|
### Structured Output
|
|
@@ -289,15 +288,15 @@ class ProcessingConfig(FlowConfig):
|
|
|
289
288
|
INPUT_DOCUMENT_TYPES = [RawDataDocument]
|
|
290
289
|
OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Must be different!
|
|
291
290
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
291
|
+
# Use in flows for validation
|
|
292
|
+
@pipeline_flow(config=ProcessingConfig)
|
|
293
|
+
async def process(
|
|
294
|
+
project_name: str,
|
|
295
|
+
documents: DocumentList,
|
|
296
|
+
flow_options: FlowOptions
|
|
297
|
+
) -> DocumentList:
|
|
298
|
+
# ... processing logic ...
|
|
299
|
+
return ProcessingConfig.create_and_validate_output(outputs)
|
|
301
300
|
```
|
|
302
301
|
|
|
303
302
|
### Pipeline Decorators
|
|
@@ -313,7 +312,7 @@ async def process_chunk(data: str) -> str:
|
|
|
313
312
|
set_trace_cost(0.05) # Track costs (new in v0.1.14)
|
|
314
313
|
return result
|
|
315
314
|
|
|
316
|
-
@pipeline_flow # Full observability and orchestration
|
|
315
|
+
@pipeline_flow(config=MyFlowConfig) # Full observability and orchestration
|
|
317
316
|
async def main_flow(
|
|
318
317
|
project_name: str,
|
|
319
318
|
documents: DocumentList,
|
|
@@ -339,6 +338,9 @@ LMNR_DEBUG=true # Enable debug traces
|
|
|
339
338
|
# Optional: Orchestration
|
|
340
339
|
PREFECT_API_URL=http://localhost:4200/api
|
|
341
340
|
PREFECT_API_KEY=your-prefect-key
|
|
341
|
+
|
|
342
|
+
# Optional: Storage (for Google Cloud Storage)
|
|
343
|
+
GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json # GCS auth file
|
|
342
344
|
```
|
|
343
345
|
|
|
344
346
|
### Settings Management
|
|
@@ -366,7 +368,7 @@ print(settings.app_name)
|
|
|
366
368
|
|
|
367
369
|
### Framework Rules (90% Use Cases)
|
|
368
370
|
|
|
369
|
-
1. **Decorators**: Use `@
|
|
371
|
+
1. **Decorators**: Use `@pipeline_task` WITHOUT parameters, `@pipeline_flow` WITH config
|
|
370
372
|
2. **Logging**: Use `get_pipeline_logger(__name__)` - NEVER `print()` or `logging` module
|
|
371
373
|
3. **LLM calls**: Use `AIMessages` or `str`. Wrap Documents in `AIMessages`
|
|
372
374
|
4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are optimal)
|
|
@@ -67,15 +67,13 @@ class AnalysisConfig(FlowConfig):
|
|
|
67
67
|
INPUT_DOCUMENT_TYPES = [InputDoc]
|
|
68
68
|
OUTPUT_DOCUMENT_TYPE = OutputDoc
|
|
69
69
|
|
|
70
|
-
# Create pipeline flow
|
|
71
|
-
@pipeline_flow
|
|
70
|
+
# Create pipeline flow with required config
|
|
71
|
+
@pipeline_flow(config=AnalysisConfig)
|
|
72
72
|
async def analyze_flow(
|
|
73
73
|
project_name: str,
|
|
74
74
|
documents: DocumentList,
|
|
75
75
|
flow_options: FlowOptions
|
|
76
76
|
) -> DocumentList:
|
|
77
|
-
config = AnalysisConfig()
|
|
78
|
-
|
|
79
77
|
# Process documents
|
|
80
78
|
outputs = []
|
|
81
79
|
for doc in documents:
|
|
@@ -92,7 +90,7 @@ async def analyze_flow(
|
|
|
92
90
|
outputs.append(output)
|
|
93
91
|
|
|
94
92
|
# RECOMMENDED: Always validate output
|
|
95
|
-
return
|
|
93
|
+
return AnalysisConfig.create_and_validate_output(outputs)
|
|
96
94
|
```
|
|
97
95
|
|
|
98
96
|
### Structured Output
|
|
@@ -245,15 +243,15 @@ class ProcessingConfig(FlowConfig):
|
|
|
245
243
|
INPUT_DOCUMENT_TYPES = [RawDataDocument]
|
|
246
244
|
OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Must be different!
|
|
247
245
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
246
|
+
# Use in flows for validation
|
|
247
|
+
@pipeline_flow(config=ProcessingConfig)
|
|
248
|
+
async def process(
|
|
249
|
+
project_name: str,
|
|
250
|
+
documents: DocumentList,
|
|
251
|
+
flow_options: FlowOptions
|
|
252
|
+
) -> DocumentList:
|
|
253
|
+
# ... processing logic ...
|
|
254
|
+
return ProcessingConfig.create_and_validate_output(outputs)
|
|
257
255
|
```
|
|
258
256
|
|
|
259
257
|
### Pipeline Decorators
|
|
@@ -269,7 +267,7 @@ async def process_chunk(data: str) -> str:
|
|
|
269
267
|
set_trace_cost(0.05) # Track costs (new in v0.1.14)
|
|
270
268
|
return result
|
|
271
269
|
|
|
272
|
-
@pipeline_flow # Full observability and orchestration
|
|
270
|
+
@pipeline_flow(config=MyFlowConfig) # Full observability and orchestration
|
|
273
271
|
async def main_flow(
|
|
274
272
|
project_name: str,
|
|
275
273
|
documents: DocumentList,
|
|
@@ -295,6 +293,9 @@ LMNR_DEBUG=true # Enable debug traces
|
|
|
295
293
|
# Optional: Orchestration
|
|
296
294
|
PREFECT_API_URL=http://localhost:4200/api
|
|
297
295
|
PREFECT_API_KEY=your-prefect-key
|
|
296
|
+
|
|
297
|
+
# Optional: Storage (for Google Cloud Storage)
|
|
298
|
+
GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json # GCS auth file
|
|
298
299
|
```
|
|
299
300
|
|
|
300
301
|
### Settings Management
|
|
@@ -322,7 +323,7 @@ print(settings.app_name)
|
|
|
322
323
|
|
|
323
324
|
### Framework Rules (90% Use Cases)
|
|
324
325
|
|
|
325
|
-
1. **Decorators**: Use `@
|
|
326
|
+
1. **Decorators**: Use `@pipeline_task` WITHOUT parameters, `@pipeline_flow` WITH config
|
|
326
327
|
2. **Logging**: Use `get_pipeline_logger(__name__)` - NEVER `print()` or `logging` module
|
|
327
328
|
3. **LLM calls**: Use `AIMessages` or `str`. Wrap Documents in `AIMessages`
|
|
328
329
|
4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are optimal)
|
|
@@ -7,7 +7,7 @@ It combines document processing, LLM integration, and workflow orchestration int
|
|
|
7
7
|
system designed for production use.
|
|
8
8
|
|
|
9
9
|
The framework enforces best practices through strong typing (Pydantic), automatic retries,
|
|
10
|
-
cost tracking, and distributed tracing. All I/O operations are async for maximum throughput.
|
|
10
|
+
and cost tracking. All I/O operations are async for maximum throughput.
|
|
11
11
|
|
|
12
12
|
**CRITICAL IMPORT RULE**:
|
|
13
13
|
Always import from the top-level package:
|
|
@@ -18,12 +18,12 @@ cost tracking, and distributed tracing. All I/O operations are async for maximum
|
|
|
18
18
|
from ai_pipeline_core.llm import generate # NO!
|
|
19
19
|
from ai_pipeline_core.documents import FlowDocument # NO!
|
|
20
20
|
|
|
21
|
-
FRAMEWORK RULES (
|
|
22
|
-
1. Decorators: Use @
|
|
21
|
+
FRAMEWORK RULES (Use by default, unless instructed otherwise):
|
|
22
|
+
1. Decorators: Use @pipeline_task WITHOUT parameters, @pipeline_flow WITH config
|
|
23
23
|
2. Logging: Use get_pipeline_logger(__name__) - NEVER print() or logging module
|
|
24
24
|
3. LLM calls: Use AIMessages or str. Wrap Documents in AIMessages; do not call .text yourself
|
|
25
|
-
4. Options:
|
|
26
|
-
5. Documents: Create with just name and content - skip description
|
|
25
|
+
4. Options: DO NOT use options parameter - omit it entirely (defaults are optimal)
|
|
26
|
+
5. Documents: Create with just name and content - skip description unless needed
|
|
27
27
|
6. FlowConfig: OUTPUT_DOCUMENT_TYPE must differ from all INPUT_DOCUMENT_TYPES
|
|
28
28
|
7. Initialization: PromptManager and logger at module scope, not in functions
|
|
29
29
|
8. DocumentList: Use default constructor - no validation flags needed
|
|
@@ -36,18 +36,22 @@ Core Capabilities:
|
|
|
36
36
|
- **LLM Integration**: Unified interface to any model via LiteLLM with caching
|
|
37
37
|
- **Structured Output**: Type-safe generation with Pydantic model validation
|
|
38
38
|
- **Workflow Orchestration**: Prefect-based flows and tasks with retries
|
|
39
|
-
- **Observability**:
|
|
39
|
+
- **Observability**: Built-in monitoring and debugging capabilities
|
|
40
40
|
- **Local Development**: Simple runner for testing without infrastructure
|
|
41
41
|
|
|
42
42
|
Quick Start:
|
|
43
43
|
>>> from ai_pipeline_core import (
|
|
44
|
-
... pipeline_flow, FlowDocument, DocumentList, FlowOptions, llm, AIMessages
|
|
44
|
+
... pipeline_flow, FlowDocument, DocumentList, FlowOptions, FlowConfig, llm, AIMessages
|
|
45
45
|
... )
|
|
46
46
|
>>>
|
|
47
47
|
>>> class OutputDoc(FlowDocument):
|
|
48
48
|
... '''Analysis result document.'''
|
|
49
49
|
>>>
|
|
50
|
-
>>>
|
|
50
|
+
>>> class MyFlowConfig(FlowConfig):
|
|
51
|
+
... INPUT_DOCUMENT_TYPES = []
|
|
52
|
+
... OUTPUT_DOCUMENT_TYPE = OutputDoc
|
|
53
|
+
>>>
|
|
54
|
+
>>> @pipeline_flow(config=MyFlowConfig)
|
|
51
55
|
>>> async def analyze_flow(
|
|
52
56
|
... project_name: str,
|
|
53
57
|
... documents: DocumentList,
|
|
@@ -55,7 +59,7 @@ Quick Start:
|
|
|
55
59
|
... ) -> DocumentList:
|
|
56
60
|
... # Messages accept AIMessages or str. Wrap documents: AIMessages([doc])
|
|
57
61
|
... response = await llm.generate(
|
|
58
|
-
...
|
|
62
|
+
... "gpt-5",
|
|
59
63
|
... messages=AIMessages([documents[0]])
|
|
60
64
|
... )
|
|
61
65
|
... result = OutputDoc.create(
|
|
@@ -76,8 +80,6 @@ Optional Environment Variables:
|
|
|
76
80
|
- PREFECT_API_KEY: Prefect API authentication key
|
|
77
81
|
- LMNR_PROJECT_API_KEY: Laminar (LMNR) API key for tracing
|
|
78
82
|
- LMNR_DEBUG: Set to "true" to enable debug-level traces
|
|
79
|
-
- LMNR_SESSION_ID: Default session ID for traces
|
|
80
|
-
- LMNR_USER_ID: Default user ID for traces
|
|
81
83
|
"""
|
|
82
84
|
|
|
83
85
|
from . import llm
|
|
@@ -99,6 +101,8 @@ from .llm import (
|
|
|
99
101
|
ModelOptions,
|
|
100
102
|
ModelResponse,
|
|
101
103
|
StructuredModelResponse,
|
|
104
|
+
generate,
|
|
105
|
+
generate_structured,
|
|
102
106
|
)
|
|
103
107
|
from .logging import (
|
|
104
108
|
LoggerMixin,
|
|
@@ -114,7 +118,7 @@ from .prompt_manager import PromptManager
|
|
|
114
118
|
from .settings import Settings
|
|
115
119
|
from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
|
|
116
120
|
|
|
117
|
-
__version__ = "0.1.14"
|
|
121
|
+
__version__ = "0.2.0"
|
|
118
122
|
|
|
119
123
|
__all__ = [
|
|
120
124
|
# Config/Settings
|
|
@@ -145,7 +149,9 @@ __all__ = [
|
|
|
145
149
|
"prefect_test_harness",
|
|
146
150
|
"disable_run_logger",
|
|
147
151
|
# LLM
|
|
148
|
-
"llm",
|
|
152
|
+
"llm", # for backward compatibility
|
|
153
|
+
"generate",
|
|
154
|
+
"generate_structured",
|
|
149
155
|
"ModelName",
|
|
150
156
|
"ModelOptions",
|
|
151
157
|
"ModelResponse",
|
|
@@ -159,4 +165,6 @@ __all__ = [
|
|
|
159
165
|
"set_trace_cost",
|
|
160
166
|
# Utils
|
|
161
167
|
"PromptManager",
|
|
168
|
+
"generate",
|
|
169
|
+
"generate_structured",
|
|
162
170
|
]
|
|
@@ -61,8 +61,7 @@ class Document(BaseModel, ABC):
|
|
|
61
61
|
Document is the fundamental data abstraction for all content flowing through
|
|
62
62
|
pipelines. It provides automatic encoding, MIME type detection, serialization,
|
|
63
63
|
and validation. All documents must be subclassed from FlowDocument or TaskDocument
|
|
64
|
-
based on their persistence requirements.
|
|
65
|
-
class that can be instantiated directly (not abstract).
|
|
64
|
+
based on their persistence requirements.
|
|
66
65
|
|
|
67
66
|
VALIDATION IS AUTOMATIC - Do not add manual validation!
|
|
68
67
|
Size validation, name validation, and MIME type detection are built-in.
|
|
@@ -74,7 +73,7 @@ class Document(BaseModel, ABC):
|
|
|
74
73
|
document.validate_file_name(document.name) # NO! Automatic
|
|
75
74
|
|
|
76
75
|
Best Practices:
|
|
77
|
-
- Use create() classmethod for automatic type conversion (
|
|
76
|
+
- Use create() classmethod for automatic type conversion (default preferred)
|
|
78
77
|
- Omit description parameter unless truly needed for metadata
|
|
79
78
|
- When using LLM functions, pass AIMessages or str. Wrap any Document values
|
|
80
79
|
in AIMessages([...]). Do not call .text yourself
|
|
@@ -131,10 +130,62 @@ class Document(BaseModel, ABC):
|
|
|
131
130
|
2. Embed metadata in content (e.g., JSON with data + metadata fields)
|
|
132
131
|
3. Create a separate MetadataDocument type to accompany data documents
|
|
133
132
|
4. Use document naming conventions (e.g., "data_v2_2024.json")
|
|
134
|
-
5. Store metadata in flow_options
|
|
133
|
+
5. Store metadata in flow_options
|
|
134
|
+
|
|
135
|
+
FILES Enum Best Practice:
|
|
136
|
+
When defining a FILES enum, NEVER use magic strings to reference files.
|
|
137
|
+
Always use the enum values to maintain type safety and refactorability.
|
|
138
|
+
|
|
139
|
+
WRONG - Magic strings/numbers:
|
|
140
|
+
doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
|
|
141
|
+
doc = docs.get_by("settings.json") # NO! Magic string
|
|
142
|
+
files = ["config.yaml", "settings.json"] # NO! Magic strings
|
|
143
|
+
|
|
144
|
+
CORRECT - Use enum references:
|
|
145
|
+
doc = ConfigDocument.create(
|
|
146
|
+
name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
|
|
147
|
+
content=data
|
|
148
|
+
)
|
|
149
|
+
doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
|
|
150
|
+
files = [
|
|
151
|
+
ConfigDocument.FILES.CONFIG,
|
|
152
|
+
ConfigDocument.FILES.SETTINGS
|
|
153
|
+
] # YES! Refactorable
|
|
154
|
+
|
|
155
|
+
Pydantic Model Interaction:
|
|
156
|
+
Documents provide DIRECT support for Pydantic models. Use the built-in
|
|
157
|
+
methods instead of manual JSON conversion.
|
|
158
|
+
|
|
159
|
+
WRONG - Manual JSON conversion:
|
|
160
|
+
# Don't do this - manual JSON handling
|
|
161
|
+
json_str = doc.text
|
|
162
|
+
json_data = json.loads(json_str)
|
|
163
|
+
model = MyModel(**json_data) # NO! Use as_pydantic_model
|
|
164
|
+
|
|
165
|
+
# Don't do this - manual serialization
|
|
166
|
+
json_str = model.model_dump_json()
|
|
167
|
+
doc = MyDocument.create(name="data.json", content=json_str) # NO!
|
|
168
|
+
|
|
169
|
+
CORRECT - Direct Pydantic interaction:
|
|
170
|
+
# Reading Pydantic model from document
|
|
171
|
+
model = doc.as_pydantic_model(MyModel) # Direct conversion
|
|
172
|
+
models = doc.as_pydantic_model(list[MyModel]) # List support
|
|
173
|
+
|
|
174
|
+
# Creating document from Pydantic model
|
|
175
|
+
doc = MyDocument.create(
|
|
176
|
+
name="data.json",
|
|
177
|
+
content=model # Direct BaseModel support
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# Round-trip is seamless
|
|
181
|
+
original_model = MyModel(field="value")
|
|
182
|
+
doc = MyDocument.create(name="data.json", content=original_model)
|
|
183
|
+
restored = doc.as_pydantic_model(MyModel)
|
|
184
|
+
assert restored == original_model # Perfect round-trip
|
|
135
185
|
|
|
136
186
|
Example:
|
|
137
187
|
>>> from enum import StrEnum
|
|
188
|
+
>>> from pydantic import BaseModel
|
|
138
189
|
>>>
|
|
139
190
|
>>> # Simple document:
|
|
140
191
|
>>> class MyDocument(FlowDocument):
|
|
@@ -146,10 +197,23 @@ class Document(BaseModel, ABC):
|
|
|
146
197
|
... CONFIG = "config.yaml"
|
|
147
198
|
... SETTINGS = "settings.json"
|
|
148
199
|
>>>
|
|
149
|
-
>>> #
|
|
150
|
-
>>> doc =
|
|
151
|
-
|
|
152
|
-
|
|
200
|
+
>>> # CORRECT FILES usage - no magic strings:
|
|
201
|
+
>>> doc = ConfigDocument.create(
|
|
202
|
+
... name=ConfigDocument.FILES.CONFIG, # Use enum
|
|
203
|
+
... content={"key": "value"}
|
|
204
|
+
... )
|
|
205
|
+
>>>
|
|
206
|
+
>>> # CORRECT Pydantic usage:
|
|
207
|
+
>>> class Config(BaseModel):
|
|
208
|
+
... key: str
|
|
209
|
+
>>>
|
|
210
|
+
>>> # Direct creation from Pydantic model
|
|
211
|
+
>>> config_model = Config(key="value")
|
|
212
|
+
>>> doc = MyDocument.create(name="data.json", content=config_model)
|
|
213
|
+
>>>
|
|
214
|
+
>>> # Direct extraction to Pydantic model
|
|
215
|
+
>>> restored = doc.as_pydantic_model(Config)
|
|
216
|
+
>>> print(restored.key) # "value"
|
|
153
217
|
>>>
|
|
154
218
|
>>> # Track document provenance with sources
|
|
155
219
|
>>> source_doc = MyDocument.create(name="input.txt", content="raw data")
|
|
@@ -170,6 +234,9 @@ class Document(BaseModel, ABC):
|
|
|
170
234
|
DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
|
|
171
235
|
"""File extension for description files."""
|
|
172
236
|
|
|
237
|
+
SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
|
|
238
|
+
"""File extension for sources metadata files."""
|
|
239
|
+
|
|
173
240
|
MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
|
|
174
241
|
"""Separator for markdown list items."""
|
|
175
242
|
|
|
@@ -288,7 +355,7 @@ class Document(BaseModel, ABC):
|
|
|
288
355
|
content types and automatically converts them to bytes based on the file
|
|
289
356
|
extension. Use the `parse` method to reverse this conversion.
|
|
290
357
|
|
|
291
|
-
Best Practice (
|
|
358
|
+
Best Practice (by default, unless instructed otherwise):
|
|
292
359
|
Only provide name and content. The description parameter is RARELY needed.
|
|
293
360
|
|
|
294
361
|
Args:
|
|
@@ -302,8 +369,8 @@ class Document(BaseModel, ABC):
|
|
|
302
369
|
- bytes: Used directly without conversion
|
|
303
370
|
- str: Encoded to UTF-8 bytes
|
|
304
371
|
- dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
|
|
305
|
-
- list[str]: Joined
|
|
306
|
-
|
|
372
|
+
- list[str]: Joined automatically for .md (validates format compatibility),
|
|
373
|
+
else JSON/YAML
|
|
307
374
|
- list[BaseModel]: Serialized to JSON or YAML based on extension
|
|
308
375
|
- BaseModel: Serialized to JSON or YAML based on extension
|
|
309
376
|
description: Optional description - USUALLY OMIT THIS (defaults to None).
|
|
@@ -319,7 +386,7 @@ class Document(BaseModel, ABC):
|
|
|
319
386
|
|
|
320
387
|
Raises:
|
|
321
388
|
ValueError: If content type is not supported for the file extension,
|
|
322
|
-
or if markdown list
|
|
389
|
+
or if markdown list format is incompatible
|
|
323
390
|
DocumentNameError: If filename violates validation rules
|
|
324
391
|
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
325
392
|
|
|
@@ -329,7 +396,7 @@ class Document(BaseModel, ABC):
|
|
|
329
396
|
returns the original dictionary {"key": "value"}.
|
|
330
397
|
|
|
331
398
|
Example:
|
|
332
|
-
>>> # CORRECT - no description needed (
|
|
399
|
+
>>> # CORRECT - no description needed (by default, unless instructed otherwise)
|
|
333
400
|
>>> doc = MyDocument.create(name="test.txt", content="Hello World")
|
|
334
401
|
>>> doc.content # b'Hello World'
|
|
335
402
|
>>> doc.parse(str) # "Hello World"
|
|
@@ -427,10 +494,6 @@ class Document(BaseModel, ABC):
|
|
|
427
494
|
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
428
495
|
>>> doc = MyDocument.create(name="config.yaml", content=my_model)
|
|
429
496
|
>>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
|
|
430
|
-
|
|
431
|
-
See Also:
|
|
432
|
-
create: Recommended factory method with automatic type conversion
|
|
433
|
-
parse: Method to reverse the conversion done by create
|
|
434
497
|
"""
|
|
435
498
|
if type(self) is Document:
|
|
436
499
|
raise TypeError("Cannot instantiate abstract Document class directly")
|
|
@@ -467,8 +530,7 @@ class Document(BaseModel, ABC):
|
|
|
467
530
|
|
|
468
531
|
Note:
|
|
469
532
|
This method determines document persistence and lifecycle.
|
|
470
|
-
FlowDocument returns "flow", TaskDocument returns "task"
|
|
471
|
-
TemporaryDocument returns "temporary".
|
|
533
|
+
FlowDocument returns "flow", TaskDocument returns "task".
|
|
472
534
|
"""
|
|
473
535
|
raise NotImplementedError("Subclasses must implement this method")
|
|
474
536
|
|
|
@@ -520,7 +582,7 @@ class Document(BaseModel, ABC):
|
|
|
520
582
|
during execution.
|
|
521
583
|
|
|
522
584
|
Returns:
|
|
523
|
-
True if this is
|
|
585
|
+
True if this document is temporary, False otherwise.
|
|
524
586
|
"""
|
|
525
587
|
return self.get_base_type() == "temporary"
|
|
526
588
|
|
|
@@ -565,8 +627,6 @@ class Document(BaseModel, ABC):
|
|
|
565
627
|
def validate_file_name(cls, name: str) -> None:
|
|
566
628
|
"""Validate that a file name matches allowed patterns.
|
|
567
629
|
|
|
568
|
-
@public
|
|
569
|
-
|
|
570
630
|
DO NOT OVERRIDE this method if you define a FILES enum!
|
|
571
631
|
The validation is automatic when FILES enum is present.
|
|
572
632
|
|
|
@@ -610,7 +670,7 @@ class Document(BaseModel, ABC):
|
|
|
610
670
|
|
|
611
671
|
Ensures the document name is secure and follows conventions:
|
|
612
672
|
- No path traversal characters (.., \\, /)
|
|
613
|
-
- Cannot end with .description.md
|
|
673
|
+
- Cannot end with .description.md or .sources.json
|
|
614
674
|
- No leading/trailing whitespace
|
|
615
675
|
- Must match FILES enum if defined
|
|
616
676
|
|
|
@@ -635,6 +695,9 @@ class Document(BaseModel, ABC):
|
|
|
635
695
|
f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
|
|
636
696
|
)
|
|
637
697
|
|
|
698
|
+
if v.endswith(cls.SOURCES_EXTENSION):
|
|
699
|
+
raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
|
|
700
|
+
|
|
638
701
|
if ".." in v or "\\" in v or "/" in v:
|
|
639
702
|
raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
|
|
640
703
|
|
|
@@ -659,7 +722,7 @@ class Document(BaseModel, ABC):
|
|
|
659
722
|
2. str → UTF-8 encoding
|
|
660
723
|
3. dict/BaseModel + .json → JSON serialization (indented)
|
|
661
724
|
4. dict/BaseModel + .yaml/.yml → YAML serialization
|
|
662
|
-
5. list[str] + .md → Join with markdown
|
|
725
|
+
5. list[str] + .md → Join with markdown sections (validates format compatibility)
|
|
663
726
|
6. list[Any] + .json/.yaml → JSON/YAML array
|
|
664
727
|
7. int/float/bool + .json → JSON primitive
|
|
665
728
|
|
|
@@ -1028,8 +1091,6 @@ class Document(BaseModel, ABC):
|
|
|
1028
1091
|
def as_yaml(self) -> Any:
|
|
1029
1092
|
r"""Parse document content as YAML.
|
|
1030
1093
|
|
|
1031
|
-
@public
|
|
1032
|
-
|
|
1033
1094
|
Parses the document's text content as YAML and returns Python objects.
|
|
1034
1095
|
Uses ruamel.yaml which is safe by default (no code execution).
|
|
1035
1096
|
|
|
@@ -1057,8 +1118,6 @@ class Document(BaseModel, ABC):
|
|
|
1057
1118
|
def as_json(self) -> Any:
|
|
1058
1119
|
"""Parse document content as JSON.
|
|
1059
1120
|
|
|
1060
|
-
@public
|
|
1061
|
-
|
|
1062
1121
|
Parses the document's text content as JSON and returns Python objects.
|
|
1063
1122
|
Document must contain valid JSON text.
|
|
1064
1123
|
|
|
@@ -1153,7 +1212,7 @@ class Document(BaseModel, ABC):
|
|
|
1153
1212
|
|
|
1154
1213
|
@public
|
|
1155
1214
|
|
|
1156
|
-
Splits text content using markdown
|
|
1215
|
+
Splits text content automatically using markdown section separators.
|
|
1157
1216
|
Designed for markdown documents with multiple sections.
|
|
1158
1217
|
|
|
1159
1218
|
Returns:
|
|
@@ -1168,9 +1227,9 @@ class Document(BaseModel, ABC):
|
|
|
1168
1227
|
>>> doc = MyDocument.create(name="book.md", content=sections)
|
|
1169
1228
|
>>> doc.as_markdown_list() # Returns original sections
|
|
1170
1229
|
|
|
1171
|
-
>>> #
|
|
1172
|
-
>>>
|
|
1173
|
-
>>> doc2 = MyDocument(name="parts.md", content=
|
|
1230
|
+
>>> # Round-trip conversion works automatically
|
|
1231
|
+
>>> sections = ["Part 1", "Part 2", "Part 3"]
|
|
1232
|
+
>>> doc2 = MyDocument.create(name="parts.md", content=sections)
|
|
1174
1233
|
>>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
|
|
1175
1234
|
"""
|
|
1176
1235
|
return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
|
|
@@ -1207,7 +1266,7 @@ class Document(BaseModel, ABC):
|
|
|
1207
1266
|
Extension Rules:
|
|
1208
1267
|
- .json → JSON parsing for dict/list/BaseModel
|
|
1209
1268
|
- .yaml/.yml → YAML parsing for dict/list/BaseModel
|
|
1210
|
-
- .md + list → Split
|
|
1269
|
+
- .md + list → Split automatically into sections
|
|
1211
1270
|
- Any + str → UTF-8 decode
|
|
1212
1271
|
- Any + bytes → Raw content
|
|
1213
1272
|
|
|
@@ -1223,8 +1282,7 @@ class Document(BaseModel, ABC):
|
|
|
1223
1282
|
|
|
1224
1283
|
>>> # Markdown list
|
|
1225
1284
|
>>> items = ["Item 1", "Item 2"]
|
|
1226
|
-
>>>
|
|
1227
|
-
>>> doc = MyDocument(name="list.md", content=content)
|
|
1285
|
+
>>> doc = MyDocument.create(name="list.md", content=items)
|
|
1228
1286
|
>>> doc.parse(list)
|
|
1229
1287
|
['Item 1', 'Item 2']
|
|
1230
1288
|
"""
|
|
@@ -1330,11 +1388,6 @@ class Document(BaseModel, ABC):
|
|
|
1330
1388
|
>>> # Check if specific document is a source
|
|
1331
1389
|
>>> if source1.sha256 in doc_refs:
|
|
1332
1390
|
... print("Document derived from source1")
|
|
1333
|
-
|
|
1334
|
-
See Also:
|
|
1335
|
-
- get_source_references: Get non-document source references (URLs, etc.)
|
|
1336
|
-
- has_source: Check if a specific source is tracked
|
|
1337
|
-
- Document.create: Add sources when creating documents
|
|
1338
1391
|
"""
|
|
1339
1392
|
return [src for src in self.sources if is_document_sha256(src)]
|
|
1340
1393
|
|
|
@@ -1372,11 +1425,6 @@ class Document(BaseModel, ABC):
|
|
|
1372
1425
|
>>> # Use for attribution or debugging
|
|
1373
1426
|
>>> for ref in refs:
|
|
1374
1427
|
... print(f"Data sourced from: {ref}")
|
|
1375
|
-
|
|
1376
|
-
See Also:
|
|
1377
|
-
- get_source_documents: Get document SHA256 references
|
|
1378
|
-
- has_source: Check if a specific source is tracked
|
|
1379
|
-
- Document.create: Add sources when creating documents
|
|
1380
1428
|
"""
|
|
1381
1429
|
return [src for src in self.sources if not is_document_sha256(src)]
|
|
1382
1430
|
|
|
@@ -1422,11 +1470,6 @@ class Document(BaseModel, ABC):
|
|
|
1422
1470
|
>>> # Check by SHA256 directly
|
|
1423
1471
|
>>> if derived.has_source(source_doc.sha256):
|
|
1424
1472
|
... print("Has specific hash")
|
|
1425
|
-
|
|
1426
|
-
See Also:
|
|
1427
|
-
- get_source_documents: Get all document sources
|
|
1428
|
-
- get_source_references: Get all reference sources
|
|
1429
|
-
- Document.create: Add sources when creating documents
|
|
1430
1473
|
"""
|
|
1431
1474
|
if isinstance(source, str):
|
|
1432
1475
|
# Direct string comparison
|