ai-pipeline-core 0.1.14__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/.gitignore +1 -0
  2. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/PKG-INFO +19 -17
  3. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/README.md +17 -16
  4. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/__init__.py +21 -13
  5. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/document.py +93 -50
  6. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/document_list.py +70 -23
  7. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/flow_document.py +2 -6
  8. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/task_document.py +0 -4
  9. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/temporary_document.py +1 -8
  10. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/config.py +174 -5
  11. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/ai_messages.py +14 -4
  12. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/client.py +116 -59
  13. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_options.py +2 -5
  14. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_response.py +17 -16
  15. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/model_types.py +0 -4
  16. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/__init__.py +0 -2
  17. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging_config.py +0 -6
  18. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging_mixin.py +2 -10
  19. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/pipeline.py +45 -68
  20. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/prefect.py +12 -3
  21. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/prompt_manager.py +6 -7
  22. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/settings.py +13 -5
  23. ai_pipeline_core-0.2.0/ai_pipeline_core/simple_runner/__init__.py +14 -0
  24. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/simple_runner/cli.py +13 -12
  25. ai_pipeline_core-0.2.0/ai_pipeline_core/simple_runner/simple_runner.py +247 -0
  26. ai_pipeline_core-0.2.0/ai_pipeline_core/storage/__init__.py +8 -0
  27. ai_pipeline_core-0.2.0/ai_pipeline_core/storage/storage.py +628 -0
  28. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/tracing.py +3 -26
  29. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/pyproject.toml +4 -2
  30. ai_pipeline_core-0.1.14/ai_pipeline_core/simple_runner/__init__.py +0 -24
  31. ai_pipeline_core-0.1.14/ai_pipeline_core/simple_runner/simple_runner.py +0 -402
  32. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/LICENSE +0 -0
  33. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/__init__.py +0 -0
  34. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/mime_type.py +0 -0
  35. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/utils.py +0 -0
  36. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/exceptions.py +0 -0
  37. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/__init__.py +0 -0
  38. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/flow/options.py +0 -0
  39. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/llm/__init__.py +1 -1
  40. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/logging/logging.yml +0 -0
  41. {ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/py.typed +0 -0
{ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/.gitignore
@@ -112,6 +112,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+key.json
 
 # Spyder project settings
 .spyderproject
{ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai-pipeline-core
-Version: 0.1.14
+Version: 0.2.0
 Summary: Core utilities for AI-powered processing pipelines using prefect
 Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
 Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -22,6 +22,7 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: lmnr>=0.7.6
 Requires-Dist: openai>=1.99.9
+Requires-Dist: prefect-gcp[cloud-storage]>=0.6.10
 Requires-Dist: prefect>=3.4.13
 Requires-Dist: pydantic-settings>=2.10.1
 Requires-Dist: pydantic>=2.11.7
@@ -111,15 +112,13 @@ class AnalysisConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [InputDoc]
     OUTPUT_DOCUMENT_TYPE = OutputDoc
 
-# Create pipeline flow
-@pipeline_flow
+# Create pipeline flow with required config
+@pipeline_flow(config=AnalysisConfig)
 async def analyze_flow(
     project_name: str,
     documents: DocumentList,
     flow_options: FlowOptions
 ) -> DocumentList:
-    config = AnalysisConfig()
-
     # Process documents
     outputs = []
     for doc in documents:
@@ -136,7 +135,7 @@ async def analyze_flow(
         outputs.append(output)
 
     # RECOMMENDED: Always validate output
-    return config.create_and_validate_output(outputs)
+    return AnalysisConfig.create_and_validate_output(outputs)
 ```
 
 ### Structured Output
@@ -289,15 +288,15 @@ class ProcessingConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [RawDataDocument]
     OUTPUT_DOCUMENT_TYPE = ProcessedDocument  # Must be different!
 
-# Use in flows for validation
-@pipeline_flow
-async def process(
-    config: ProcessingConfig,
-    documents: DocumentList,
-    flow_options: FlowOptions
-) -> DocumentList:
-    # ... processing logic ...
-    return config.create_and_validate_output(outputs)
+# Use in flows for validation
+@pipeline_flow(config=ProcessingConfig)
+async def process(
+    project_name: str,
+    documents: DocumentList,
+    flow_options: FlowOptions
+) -> DocumentList:
+    # ... processing logic ...
+    return ProcessingConfig.create_and_validate_output(outputs)
 ```
 
 ### Pipeline Decorators
@@ -313,7 +312,7 @@ async def process_chunk(data: str) -> str:
     set_trace_cost(0.05)  # Track costs (new in v0.1.14)
     return result
 
-@pipeline_flow  # Full observability and orchestration
+@pipeline_flow(config=MyFlowConfig)  # Full observability and orchestration
 async def main_flow(
     project_name: str,
     documents: DocumentList,
@@ -339,6 +338,9 @@ LMNR_DEBUG=true # Enable debug traces
 # Optional: Orchestration
 PREFECT_API_URL=http://localhost:4200/api
 PREFECT_API_KEY=your-prefect-key
+
+# Optional: Storage (for Google Cloud Storage)
+GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json  # GCS auth file
 ```
 
 ### Settings Management
@@ -366,7 +368,7 @@ print(settings.app_name)
 
 ### Framework Rules (90% Use Cases)
 
-1. **Decorators**: Use `@trace`, `@pipeline_task`, `@pipeline_flow` WITHOUT parameters
+1. **Decorators**: Use `@pipeline_task` WITHOUT parameters, `@pipeline_flow` WITH config
 2. **Logging**: Use `get_pipeline_logger(__name__)` - NEVER `print()` or `logging` module
 3. **LLM calls**: Use `AIMessages` or `str`. Wrap Documents in `AIMessages`
 4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are optimal)
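Note on the hunks above: 0.2.0 makes the `FlowConfig` a required decorator argument and turns `create_and_validate_output` into a classmethod call, so flows no longer instantiate their config. A minimal before/after sketch assembled from these hunks (document and flow names are illustrative, not from the package):

```python
from ai_pipeline_core import (
    DocumentList, FlowConfig, FlowDocument, FlowOptions, pipeline_flow
)

class InputDoc(FlowDocument):
    """Input document."""

class OutputDoc(FlowDocument):
    """Analysis result document."""

class AnalysisConfig(FlowConfig):
    INPUT_DOCUMENT_TYPES = [InputDoc]
    OUTPUT_DOCUMENT_TYPE = OutputDoc  # must differ from input types

# 0.1.x: bare @pipeline_flow, with config = AnalysisConfig() inside the body.
# 0.2.0: the config class is passed to the decorator and never instantiated.
@pipeline_flow(config=AnalysisConfig)
async def analyze_flow(
    project_name: str,
    documents: DocumentList,
    flow_options: FlowOptions,
) -> DocumentList:
    outputs = [
        OutputDoc.create(name="result.md", content=doc.text) for doc in documents
    ]
    # Classmethod call replaces the old config.create_and_validate_output(...)
    return AnalysisConfig.create_and_validate_output(outputs)
```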
{ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/README.md
@@ -67,15 +67,13 @@ class AnalysisConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [InputDoc]
     OUTPUT_DOCUMENT_TYPE = OutputDoc
 
-# Create pipeline flow
-@pipeline_flow
+# Create pipeline flow with required config
+@pipeline_flow(config=AnalysisConfig)
 async def analyze_flow(
     project_name: str,
     documents: DocumentList,
     flow_options: FlowOptions
 ) -> DocumentList:
-    config = AnalysisConfig()
-
     # Process documents
     outputs = []
     for doc in documents:
@@ -92,7 +90,7 @@ async def analyze_flow(
         outputs.append(output)
 
     # RECOMMENDED: Always validate output
-    return config.create_and_validate_output(outputs)
+    return AnalysisConfig.create_and_validate_output(outputs)
 ```
 
 ### Structured Output
@@ -245,15 +243,15 @@ class ProcessingConfig(FlowConfig):
     INPUT_DOCUMENT_TYPES = [RawDataDocument]
     OUTPUT_DOCUMENT_TYPE = ProcessedDocument  # Must be different!
 
-# Use in flows for validation
-@pipeline_flow
-async def process(
-    config: ProcessingConfig,
-    documents: DocumentList,
-    flow_options: FlowOptions
-) -> DocumentList:
-    # ... processing logic ...
-    return config.create_and_validate_output(outputs)
+# Use in flows for validation
+@pipeline_flow(config=ProcessingConfig)
+async def process(
+    project_name: str,
+    documents: DocumentList,
+    flow_options: FlowOptions
+) -> DocumentList:
+    # ... processing logic ...
+    return ProcessingConfig.create_and_validate_output(outputs)
 ```
 
 ### Pipeline Decorators
@@ -269,7 +267,7 @@ async def process_chunk(data: str) -> str:
     set_trace_cost(0.05)  # Track costs (new in v0.1.14)
     return result
 
-@pipeline_flow  # Full observability and orchestration
+@pipeline_flow(config=MyFlowConfig)  # Full observability and orchestration
 async def main_flow(
     project_name: str,
     documents: DocumentList,
@@ -295,6 +293,9 @@ LMNR_DEBUG=true # Enable debug traces
 # Optional: Orchestration
 PREFECT_API_URL=http://localhost:4200/api
 PREFECT_API_KEY=your-prefect-key
+
+# Optional: Storage (for Google Cloud Storage)
+GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json  # GCS auth file
 ```
 
 ### Settings Management
@@ -322,7 +323,7 @@ print(settings.app_name)
 
 ### Framework Rules (90% Use Cases)
 
-1. **Decorators**: Use `@trace`, `@pipeline_task`, `@pipeline_flow` WITHOUT parameters
+1. **Decorators**: Use `@pipeline_task` WITHOUT parameters, `@pipeline_flow` WITH config
 2. **Logging**: Use `get_pipeline_logger(__name__)` - NEVER `print()` or `logging` module
 3. **LLM calls**: Use `AIMessages` or `str`. Wrap Documents in `AIMessages`
 4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are optimal)
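The storage additions are only hinted at in these README hunks: a new `prefect-gcp[cloud-storage]` dependency, a `GCS_SERVICE_ACCOUNT_FILE` variable, and a newly git-ignored `key.json` (presumably a local copy of such a key). The diff does not show how the new `ai_pipeline_core/storage/storage.py` consumes the variable; a neutral standard-library sketch of the configuration check an application might perform:

```python
import os
from pathlib import Path

# GCS_SERVICE_ACCOUNT_FILE is optional per the README; GCS-backed storage
# is only usable when it points at a readable service-account JSON file.
key_path = os.environ.get("GCS_SERVICE_ACCOUNT_FILE")
if key_path and Path(key_path).is_file():
    print(f"GCS storage available (credentials: {key_path})")
else:
    print("GCS_SERVICE_ACCOUNT_FILE not set; skipping GCS-backed storage")
```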
{ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/__init__.py
@@ -7,7 +7,7 @@ It combines document processing, LLM integration, and workflow orchestration int
 system designed for production use.
 
 The framework enforces best practices through strong typing (Pydantic), automatic retries,
-cost tracking, and distributed tracing. All I/O operations are async for maximum throughput.
+and cost tracking. All I/O operations are async for maximum throughput.
 
 **CRITICAL IMPORT RULE**:
 Always import from the top-level package:
@@ -18,12 +18,12 @@ cost tracking, and distributed tracing. All I/O operations are async for maximum
     from ai_pipeline_core.llm import generate  # NO!
     from ai_pipeline_core.documents import FlowDocument  # NO!
 
-FRAMEWORK RULES (90% Use Cases):
-1. Decorators: Use @trace, @pipeline_task, @pipeline_flow WITHOUT parameters
+FRAMEWORK RULES (Use by default, unless instructed otherwise):
+1. Decorators: Use @pipeline_task WITHOUT parameters, @pipeline_flow WITH config
 2. Logging: Use get_pipeline_logger(__name__) - NEVER print() or logging module
 3. LLM calls: Use AIMessages or str. Wrap Documents in AIMessages; do not call .text yourself
-4. Options: Omit ModelOptions unless specifically needed (defaults are optimal)
-5. Documents: Create with just name and content - skip description
+4. Options: DO NOT use options parameter - omit it entirely (defaults are optimal)
+5. Documents: Create with just name and content - skip description unless needed
 6. FlowConfig: OUTPUT_DOCUMENT_TYPE must differ from all INPUT_DOCUMENT_TYPES
 7. Initialization: PromptManager and logger at module scope, not in functions
 8. DocumentList: Use default constructor - no validation flags needed
@@ -36,18 +36,22 @@ Core Capabilities:
 - **LLM Integration**: Unified interface to any model via LiteLLM with caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with retries
-- **Observability**: Distributed tracing via Laminar (LMNR) for debugging
+- **Observability**: Built-in monitoring and debugging capabilities
 - **Local Development**: Simple runner for testing without infrastructure
 
 Quick Start:
     >>> from ai_pipeline_core import (
-    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, llm, AIMessages
+    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, FlowConfig, llm, AIMessages
     ... )
     >>>
     >>> class OutputDoc(FlowDocument):
    ...     '''Analysis result document.'''
     >>>
-    >>> @pipeline_flow
+    >>> class MyFlowConfig(FlowConfig):
+    ...     INPUT_DOCUMENT_TYPES = []
+    ...     OUTPUT_DOCUMENT_TYPE = OutputDoc
+    >>>
+    >>> @pipeline_flow(config=MyFlowConfig)
     >>> async def analyze_flow(
     ...     project_name: str,
     ...     documents: DocumentList,
@@ -55,7 +59,7 @@ Quick Start:
     ... ) -> DocumentList:
     ...     # Messages accept AIMessages or str. Wrap documents: AIMessages([doc])
     ...     response = await llm.generate(
-    ...         model="gpt-5",
+    ...         "gpt-5",
     ...         messages=AIMessages([documents[0]])
     ...     )
     ...     result = OutputDoc.create(
@@ -76,8 +80,6 @@ Optional Environment Variables:
 - PREFECT_API_KEY: Prefect API authentication key
 - LMNR_PROJECT_API_KEY: Laminar (LMNR) API key for tracing
 - LMNR_DEBUG: Set to "true" to enable debug-level traces
-- LMNR_SESSION_ID: Default session ID for traces
-- LMNR_USER_ID: Default user ID for traces
 """
 
 from . import llm
@@ -99,6 +101,8 @@ from .llm import (
     ModelOptions,
     ModelResponse,
     StructuredModelResponse,
+    generate,
+    generate_structured,
 )
 from .logging import (
     LoggerMixin,
@@ -114,7 +118,7 @@ from .prompt_manager import PromptManager
 from .settings import Settings
 from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
 
-__version__ = "0.1.14"
+__version__ = "0.2.0"
 
 __all__ = [
     # Config/Settings
@@ -145,7 +149,9 @@ __all__ = [
     "prefect_test_harness",
     "disable_run_logger",
     # LLM
-    "llm",
+    "llm",  # for backward compatibility
+    "generate",
+    "generate_structured",
     "ModelName",
     "ModelOptions",
     "ModelResponse",
@@ -159,4 +165,6 @@ __all__ = [
     "set_trace_cost",
     # Utils
     "PromptManager",
+    "generate",
+    "generate_structured",
 ]
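`generate` and `generate_structured` are now importable from the package root (note the diff appends them to `__all__` twice, once in the LLM group and again under Utils). Combined with the positional model argument shown in the Quick Start hunk, a sketch of the 0.2.0 call style:

```python
from ai_pipeline_core import AIMessages, generate

async def summarize(text: str):
    # Previously spelled llm.generate(model="gpt-5", ...); the llm module
    # export is kept for backward compatibility per the __all__ comment.
    return await generate("gpt-5", messages=AIMessages([text]))
```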
{ai_pipeline_core-0.1.14 → ai_pipeline_core-0.2.0}/ai_pipeline_core/documents/document.py
@@ -61,8 +61,7 @@ class Document(BaseModel, ABC):
     Document is the fundamental data abstraction for all content flowing through
     pipelines. It provides automatic encoding, MIME type detection, serialization,
     and validation. All documents must be subclassed from FlowDocument or TaskDocument
-    based on their persistence requirements. TemporaryDocument is a special concrete
-    class that can be instantiated directly (not abstract).
+    based on their persistence requirements.
 
     VALIDATION IS AUTOMATIC - Do not add manual validation!
     Size validation, name validation, and MIME type detection are built-in.
@@ -74,7 +73,7 @@ class Document(BaseModel, ABC):
         document.validate_file_name(document.name)  # NO! Automatic
 
     Best Practices:
-    - Use create() classmethod for automatic type conversion (90% of cases)
+    - Use create() classmethod for automatic type conversion (default preferred)
     - Omit description parameter unless truly needed for metadata
     - When using LLM functions, pass AIMessages or str. Wrap any Document values
       in AIMessages([...]). Do not call .text yourself
@@ -131,10 +130,62 @@ class Document(BaseModel, ABC):
     2. Embed metadata in content (e.g., JSON with data + metadata fields)
     3. Create a separate MetadataDocument type to accompany data documents
     4. Use document naming conventions (e.g., "data_v2_2024.json")
-    5. Store metadata in flow_options or pass through TraceInfo
+    5. Store metadata in flow_options
+
+    FILES Enum Best Practice:
+        When defining a FILES enum, NEVER use magic strings to reference files.
+        Always use the enum values to maintain type safety and refactorability.
+
+        WRONG - Magic strings/numbers:
+            doc = ConfigDocument.create(name="config.yaml", content=data)  # NO!
+            doc = docs.get_by("settings.json")  # NO! Magic string
+            files = ["config.yaml", "settings.json"]  # NO! Magic strings
+
+        CORRECT - Use enum references:
+            doc = ConfigDocument.create(
+                name=ConfigDocument.FILES.CONFIG,  # YES! Type-safe
+                content=data
+            )
+            doc = docs.get_by(ConfigDocument.FILES.SETTINGS)  # YES!
+            files = [
+                ConfigDocument.FILES.CONFIG,
+                ConfigDocument.FILES.SETTINGS
+            ]  # YES! Refactorable
+
+    Pydantic Model Interaction:
+        Documents provide DIRECT support for Pydantic models. Use the built-in
+        methods instead of manual JSON conversion.
+
+        WRONG - Manual JSON conversion:
+            # Don't do this - manual JSON handling
+            json_str = doc.text
+            json_data = json.loads(json_str)
+            model = MyModel(**json_data)  # NO! Use as_pydantic_model
+
+            # Don't do this - manual serialization
+            json_str = model.model_dump_json()
+            doc = MyDocument.create(name="data.json", content=json_str)  # NO!
+
+        CORRECT - Direct Pydantic interaction:
+            # Reading Pydantic model from document
+            model = doc.as_pydantic_model(MyModel)  # Direct conversion
+            models = doc.as_pydantic_model(list[MyModel])  # List support
+
+            # Creating document from Pydantic model
+            doc = MyDocument.create(
+                name="data.json",
+                content=model  # Direct BaseModel support
+            )
+
+            # Round-trip is seamless
+            original_model = MyModel(field="value")
+            doc = MyDocument.create(name="data.json", content=original_model)
+            restored = doc.as_pydantic_model(MyModel)
+            assert restored == original_model  # Perfect round-trip
 
     Example:
         >>> from enum import StrEnum
+        >>> from pydantic import BaseModel
         >>>
         >>> # Simple document:
         >>> class MyDocument(FlowDocument):
@@ -146,10 +197,23 @@ class Document(BaseModel, ABC):
         ...     CONFIG = "config.yaml"
         ...     SETTINGS = "settings.json"
         >>>
-        >>> # RECOMMENDED: Use create for automatic conversion
-        >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-        >>> print(doc.is_text)  # True
-        >>> data = doc.as_json()  # {'key': 'value'}
+        >>> # CORRECT FILES usage - no magic strings:
+        >>> doc = ConfigDocument.create(
+        ...     name=ConfigDocument.FILES.CONFIG,  # Use enum
+        ...     content={"key": "value"}
+        ... )
+        >>>
+        >>> # CORRECT Pydantic usage:
+        >>> class Config(BaseModel):
+        ...     key: str
+        >>>
+        >>> # Direct creation from Pydantic model
+        >>> config_model = Config(key="value")
+        >>> doc = MyDocument.create(name="data.json", content=config_model)
+        >>>
+        >>> # Direct extraction to Pydantic model
+        >>> restored = doc.as_pydantic_model(Config)
+        >>> print(restored.key)  # "value"
         >>>
         >>> # Track document provenance with sources
         >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
@@ -170,6 +234,9 @@ class Document(BaseModel, ABC):
     DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
     """File extension for description files."""
 
+    SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
+    """File extension for sources metadata files."""
+
     MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
     """Separator for markdown list items."""
 
@@ -288,7 +355,7 @@ class Document(BaseModel, ABC):
         content types and automatically converts them to bytes based on the file
         extension. Use the `parse` method to reverse this conversion.
 
-        Best Practice (90% of cases):
+        Best Practice (by default, unless instructed otherwise):
             Only provide name and content. The description parameter is RARELY needed.
 
         Args:
@@ -302,8 +369,8 @@ class Document(BaseModel, ABC):
                 - bytes: Used directly without conversion
                 - str: Encoded to UTF-8 bytes
                 - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
-                - list[str]: Joined with separator for .md (validates no items
-                  contain separator), else JSON/YAML
+                - list[str]: Joined automatically for .md (validates format compatibility),
+                  else JSON/YAML
                 - list[BaseModel]: Serialized to JSON or YAML based on extension
                 - BaseModel: Serialized to JSON or YAML based on extension
             description: Optional description - USUALLY OMIT THIS (defaults to None).
@@ -319,7 +386,7 @@ class Document(BaseModel, ABC):
 
         Raises:
             ValueError: If content type is not supported for the file extension,
-                or if markdown list items contain the separator
+                or if markdown list format is incompatible
             DocumentNameError: If filename violates validation rules
             DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
 
@@ -329,7 +396,7 @@ class Document(BaseModel, ABC):
             returns the original dictionary {"key": "value"}.
 
         Example:
-            >>> # CORRECT - no description needed (90% of cases)
+            >>> # CORRECT - no description needed (by default, unless instructed otherwise)
             >>> doc = MyDocument.create(name="test.txt", content="Hello World")
             >>> doc.content  # b'Hello World'
             >>> doc.parse(str)  # "Hello World"
@@ -427,10 +494,6 @@ class Document(BaseModel, ABC):
             >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
             >>> doc = MyDocument.create(name="config.yaml", content=my_model)
             >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
-
-        See Also:
-            create: Recommended factory method with automatic type conversion
-            parse: Method to reverse the conversion done by create
         """
         if type(self) is Document:
             raise TypeError("Cannot instantiate abstract Document class directly")
@@ -467,8 +530,7 @@ class Document(BaseModel, ABC):
 
         Note:
             This method determines document persistence and lifecycle.
-            FlowDocument returns "flow", TaskDocument returns "task",
-            TemporaryDocument returns "temporary".
+            FlowDocument returns "flow", TaskDocument returns "task".
         """
         raise NotImplementedError("Subclasses must implement this method")
 
@@ -520,7 +582,7 @@ class Document(BaseModel, ABC):
         during execution.
 
         Returns:
-            True if this is a TemporaryDocument, False otherwise.
+            True if this document is temporary, False otherwise.
         """
         return self.get_base_type() == "temporary"
 
@@ -565,8 +627,6 @@ class Document(BaseModel, ABC):
     def validate_file_name(cls, name: str) -> None:
         """Validate that a file name matches allowed patterns.
 
-        @public
-
         DO NOT OVERRIDE this method if you define a FILES enum!
         The validation is automatic when FILES enum is present.
 
@@ -610,7 +670,7 @@ class Document(BaseModel, ABC):
 
         Ensures the document name is secure and follows conventions:
         - No path traversal characters (.., \\, /)
-        - Cannot end with .description.md
+        - Cannot end with .description.md or .sources.json
        - No leading/trailing whitespace
         - Must match FILES enum if defined
 
@@ -635,6 +695,9 @@ class Document(BaseModel, ABC):
                 f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
             )
 
+        if v.endswith(cls.SOURCES_EXTENSION):
+            raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
+
         if ".." in v or "\\" in v or "/" in v:
             raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
 
@@ -659,7 +722,7 @@ class Document(BaseModel, ABC):
         2. str → UTF-8 encoding
         3. dict/BaseModel + .json → JSON serialization (indented)
         4. dict/BaseModel + .yaml/.yml → YAML serialization
-        5. list[str] + .md → Join with markdown separator (validates no items contain separator)
+        5. list[str] + .md → Join with markdown sections (validates format compatibility)
         6. list[Any] + .json/.yaml → JSON/YAML array
         7. int/float/bool + .json → JSON primitive
 
@@ -1028,8 +1091,6 @@ class Document(BaseModel, ABC):
     def as_yaml(self) -> Any:
         r"""Parse document content as YAML.
 
-        @public
-
         Parses the document's text content as YAML and returns Python objects.
         Uses ruamel.yaml which is safe by default (no code execution).
 
@@ -1057,8 +1118,6 @@ class Document(BaseModel, ABC):
     def as_json(self) -> Any:
         """Parse document content as JSON.
 
-        @public
-
         Parses the document's text content as JSON and returns Python objects.
         Document must contain valid JSON text.
 
@@ -1153,7 +1212,7 @@ class Document(BaseModel, ABC):
 
         @public
 
-        Splits text content using markdown separator ("\n\n-----------------\n\n").
+        Splits text content automatically using markdown section separators.
         Designed for markdown documents with multiple sections.
 
         Returns:
@@ -1168,9 +1227,9 @@ class Document(BaseModel, ABC):
             >>> doc = MyDocument.create(name="book.md", content=sections)
             >>> doc.as_markdown_list()  # Returns original sections
 
-            >>> # Manual creation with separator
-            >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
-            >>> doc2 = MyDocument(name="parts.md", content=content.encode())
+            >>> # Round-trip conversion works automatically
+            >>> sections = ["Part 1", "Part 2", "Part 3"]
+            >>> doc2 = MyDocument.create(name="parts.md", content=sections)
             >>> doc2.as_markdown_list()  # ['Part 1', 'Part 2', 'Part 3']
         """
         return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
@@ -1207,7 +1266,7 @@ class Document(BaseModel, ABC):
         Extension Rules:
         - .json → JSON parsing for dict/list/BaseModel
         - .yaml/.yml → YAML parsing for dict/list/BaseModel
-        - .md + list → Split by markdown separator
+        - .md + list → Split automatically into sections
         - Any + str → UTF-8 decode
         - Any + bytes → Raw content
 
@@ -1223,8 +1282,7 @@ class Document(BaseModel, ABC):
 
             >>> # Markdown list
             >>> items = ["Item 1", "Item 2"]
-            >>> content = "\n\n---\n\n".join(items).encode()
-            >>> doc = MyDocument(name="list.md", content=content)
+            >>> doc = MyDocument.create(name="list.md", content=items)
             >>> doc.parse(list)
             ['Item 1', 'Item 2']
         """
@@ -1330,11 +1388,6 @@ class Document(BaseModel, ABC):
             >>> # Check if specific document is a source
             >>> if source1.sha256 in doc_refs:
             ...     print("Document derived from source1")
-
-        See Also:
-            - get_source_references: Get non-document source references (URLs, etc.)
-            - has_source: Check if a specific source is tracked
-            - Document.create: Add sources when creating documents
         """
         return [src for src in self.sources if is_document_sha256(src)]
 
@@ -1372,11 +1425,6 @@ class Document(BaseModel, ABC):
             >>> # Use for attribution or debugging
             >>> for ref in refs:
             ...     print(f"Data sourced from: {ref}")
-
-        See Also:
-            - get_source_documents: Get document SHA256 references
-            - has_source: Check if a specific source is tracked
-            - Document.create: Add sources when creating documents
         """
         return [src for src in self.sources if not is_document_sha256(src)]
 
@@ -1422,11 +1470,6 @@ class Document(BaseModel, ABC):
             >>> # Check by SHA256 directly
             >>> if derived.has_source(source_doc.sha256):
             ...     print("Has specific hash")
-
-        See Also:
-            - get_source_documents: Get all document sources
-            - get_source_references: Get all reference sources
-            - Document.create: Add sources when creating documents
         """
         if isinstance(source, str):
             # Direct string comparison