ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. ai_pipeline_core/__init__.py +64 -158
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +11 -84
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +102 -90
  37. ai_pipeline_core/llm/client.py +229 -183
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
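The renames in the list above imply the following module-path moves. The sketch below is illustrative only: it assumes imports follow the new file locations, and whether these modules are also re-exported from the top-level ai_pipeline_core package is not shown in this diff.

```python
# Hypothetical import-path mapping inferred from the file renames above.
#
# 0.3.4 locations (removed):
#   ai_pipeline_core/tracing.py
#   ai_pipeline_core/progress.py
#   ai_pipeline_core/utils/remote_deployment.py
#
# 0.4.0 locations (added):
from ai_pipeline_core.observability import tracing
from ai_pipeline_core.deployment import progress, remote
```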
@@ -1,117 +0,0 @@
-"""Task-specific document base class for temporary pipeline data.
-
-@public
-
-This module provides the TaskDocument abstract base class for documents
-that exist only during Prefect task execution and are not persisted.
-"""
-
-from typing import Literal, final
-
-from .document import Document
-
-
-class TaskDocument(Document):
-    """Abstract base class for temporary documents within task execution.
-
-    @public
-
-    TaskDocument is used for intermediate data that exists only during
-    the execution of a Prefect task and is not persisted to disk. These
-    documents are ideal for temporary processing results, transformations,
-    and data that doesn't need to survive beyond the current task.
-
-    Key characteristics:
-    - Not persisted to file system
-    - Exists only during task execution
-    - Garbage collected after task completes
-    - Used for intermediate processing results
-    - Reduces persistent I/O for temporary data
-
-    Creating TaskDocuments:
-        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
-        See Document.create() for detailed usage examples.
-
-    Use Cases:
-    - Intermediate transformation results
-    - Temporary buffers during processing
-    - Task-local cache data
-    - Processing status documents
-
-    Note:
-    - Cannot instantiate TaskDocument directly - must subclass
-    - Not saved by deployment utilities
-    - Reduces I/O overhead for temporary data
-    - No additional abstract methods to implement
-    """
-
-    def __init__(
-        self,
-        *,
-        name: str,
-        content: bytes,
-        description: str | None = None,
-        sources: list[str] | None = None,
-    ) -> None:
-        """Initialize a TaskDocument with raw bytes content.
-
-        See Document.__init__() for parameter details and usage notes.
-
-        Prevents direct instantiation of the abstract TaskDocument class.
-        TaskDocument must be subclassed for specific temporary document types.
-
-        Args:
-            name: Document filename (required, keyword-only)
-            content: Document content as raw bytes (required, keyword-only)
-            description: Optional human-readable description (keyword-only)
-            sources: Optional list of strings for provenance tracking
-
-        Raises:
-            TypeError: If attempting to instantiate TaskDocument directly
-                instead of using a concrete subclass.
-
-        Example:
-            >>> from enum import StrEnum
-            >>>
-            >>> # Simple subclass:
-            >>> class MyTaskDoc(TaskDocument):
-            ...     pass
-            >>>
-            >>> # With FILES restriction:
-            >>> class TempProcessDoc(TaskDocument):
-            ...     class FILES(StrEnum):
-            ...         BUFFER = "buffer.bin"
-            ...         STATUS = "status.json"
-            >>>
-            >>> # Direct constructor - only for bytes:
-            >>> doc = MyTaskDoc(name="temp.bin", content=b"raw data")
-            >>>
-            >>> # RECOMMENDED - use create for automatic conversion:
-            >>> doc = TempProcessDoc.create(name="status.json", content={"percent": 50})
-            >>> # This would raise DocumentNameError:
-            >>> # doc = TempProcessDoc.create(name="other.json", content={})
-        """
-        if type(self) is TaskDocument:
-            raise TypeError("Cannot instantiate abstract TaskDocument class directly")
-
-        # Only pass sources if not None to let Pydantic's default_factory handle it
-        if sources is not None:
-            super().__init__(name=name, content=content, description=description, sources=sources)
-        else:
-            super().__init__(name=name, content=content, description=description)
-
-    @final
-    def get_base_type(self) -> Literal["task"]:
-        """Return the base type identifier for task documents.
-
-        This method is final and cannot be overridden by subclasses.
-        It identifies this document as a task-scoped temporary document.
-
-        Returns:
-            "task" - Indicates this document is temporary within task execution.
-
-        Note:
-            This determines that the document will not be persisted and
-            exists only during task execution.
-        """
-        return "task"
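For context, a minimal usage sketch of the TaskDocument API removed above, assembled from its docstring. The subclass and file names are illustrative, and the import path is the 0.3.4 module shown in the file list; it does not reflect the 0.4.0 replacement.

```python
from enum import StrEnum

from ai_pipeline_core.documents.task_document import TaskDocument  # 0.3.4 module path


class TempProcessDoc(TaskDocument):
    """Task-scoped temporary document restricted to two file names."""

    class FILES(StrEnum):
        BUFFER = "buffer.bin"
        STATUS = "status.json"


# Direct constructor accepts raw bytes only.
buffer_doc = TempProcessDoc(name="buffer.bin", content=b"raw data")

# create() performs automatic conversion (here, dict -> JSON content).
status_doc = TempProcessDoc.create(name="status.json", content={"percent": 50})

assert status_doc.get_base_type() == "task"  # task documents are never persisted
```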
@@ -1,74 +0,0 @@
-"""Temporary document implementation for non-persistent data.
-
-This module provides the TemporaryDocument class for documents that
-are never persisted, regardless of context.
-"""
-
-from typing import Any, Literal, final
-
-from .document import Document
-
-
-@final
-class TemporaryDocument(Document):
-    r"""Concrete document class for data that is never persisted.
-
-    TemporaryDocument is a final (non-subclassable) document type for
-    data that should never be saved to disk, regardless of whether it's
-    used in a flow or task context. Unlike FlowDocument and TaskDocument
-    which are abstract, TemporaryDocument can be instantiated directly.
-
-    Key characteristics:
-    - Never persisted to file system
-    - Can be instantiated directly (not abstract)
-    - Cannot be subclassed (annotated with Python's @final decorator in code)
-    - Useful for transient data like API responses or intermediate calculations
-    - Ignored by deployment save operations
-    - Useful for tests and debugging
-
-    Creating TemporaryDocuments:
-        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
-        Unlike abstract document types, TemporaryDocument can be instantiated directly.
-        See Document.create() for detailed usage examples.
-
-        >>> doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
-        >>> doc.is_temporary  # Always True
-
-    Use Cases:
-    - API responses that shouldn't be cached
-    - Sensitive credentials or tokens
-    - Intermediate calculations
-    - Temporary transformations
-    - Data explicitly marked as non-persistent
-
-    Note:
-    - This is a final class and cannot be subclassed
-    - Use when you explicitly want to prevent persistence
-    - Useful for sensitive data that shouldn't be written to disk
-    """
-
-    def __init_subclass__(cls, **kwargs: Any) -> None:
-        """Disallow subclassing.
-
-        Args:
-            **kwargs: Additional keyword arguments (ignored).
-
-        Raises:
-            TypeError: Always raised to prevent subclassing of `TemporaryDocument`.
-        """
-        raise TypeError("TemporaryDocument is final and cannot be subclassed")
-
-    def get_base_type(self) -> Literal["temporary"]:
-        """Return the base type identifier for temporary documents.
-
-        Identifies this document as temporary, ensuring it will
-        never be persisted by the pipeline system.
-
-        Returns:
-            "temporary" - Indicates this document is never persisted.
-
-        Note:
-            Documents with this type are explicitly excluded from
-            all persistence operations in the pipeline system.
-        """
-        return "temporary"
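Likewise, a short sketch of the removed TemporaryDocument based on the docstring above; is_temporary and the final-class behaviour are taken directly from it, while the document name and content are made up.

```python
from ai_pipeline_core.documents.temporary_document import TemporaryDocument  # 0.3.4 module path

# Concrete and final: instantiate directly, no subclassing required.
doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
assert doc.is_temporary  # always True; skipped by deployment save operations

# __init_subclass__ blocks inheritance outright.
try:
    class CustomTempDoc(TemporaryDocument):
        pass
except TypeError as exc:
    print(exc)  # "TemporaryDocument is final and cannot be subclassed"
```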
@@ -1,9 +0,0 @@
-"""Flow configuration and options for Prefect-based pipeline flows."""
-
-from .config import FlowConfig
-from .options import FlowOptions
-
-__all__ = [
-    "FlowConfig",
-    "FlowOptions",
-]
@@ -1,494 +0,0 @@
-"""Flow configuration system for type-safe pipeline definitions.
-
-@public
-
-This module provides the FlowConfig abstract base class that enforces
-type safety for flow inputs and outputs in the pipeline system.
-
-Best Practice:
-    Always finish @pipeline_flow functions with create_and_validate_output()
-    to ensure type safety and proper validation of output documents.
-"""
-
-import json
-from abc import ABC
-from typing import Any, ClassVar, Iterable
-
-from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
-from ai_pipeline_core.exceptions import DocumentValidationError
-from ai_pipeline_core.logging import get_pipeline_logger
-from ai_pipeline_core.storage import Storage
-
-logger = get_pipeline_logger(__name__)
-
-
-class FlowConfig(ABC):
-    """Abstract base class for type-safe flow configuration.
-
-    @public
-
-    FlowConfig defines the contract for flow inputs and outputs, ensuring
-    type safety and preventing circular dependencies in pipeline flows.
-    Each flow must have a corresponding FlowConfig subclass that specifies
-    its input document types and output document type.
-
-    CRITICAL RULE: OUTPUT_DOCUMENT_TYPE must NEVER be in INPUT_DOCUMENT_TYPES!
-    This prevents circular dependencies as flows chain together.
-    Each flow transforms input types to a DIFFERENT output type.
-
-    Class Variables:
-        INPUT_DOCUMENT_TYPES: List of FlowDocument types this flow accepts
-        OUTPUT_DOCUMENT_TYPE: Single FlowDocument type this flow produces
-        WEIGHT: Weight for progress calculation (default 1.0, based on avg duration)
-
-    Validation Rules:
-    - INPUT_DOCUMENT_TYPES and OUTPUT_DOCUMENT_TYPE must be defined
-    - OUTPUT_DOCUMENT_TYPE cannot be in INPUT_DOCUMENT_TYPES (prevents cycles)
-    - Field names must be exact (common typos are detected)
-    - WEIGHT must be a positive number
-
-    Why this matters:
-        Flows connect in pipelines where one flow's output becomes another's input.
-        Same input/output types would create infinite loops or circular dependencies.
-
-    Example:
-        >>> # CORRECT - Different output type from inputs
-        >>> class ProcessingFlowConfig(FlowConfig):
-        ...     INPUT_DOCUMENT_TYPES = [RawDataDocument]
-        ...     OUTPUT_DOCUMENT_TYPE = ProcessedDocument  # Different type!
-        ...     WEIGHT = 45.0  # Average ~45 minutes
-        >>>
-        >>> # Use in @pipeline_flow - RECOMMENDED PATTERN
-        >>> @pipeline_flow(config=ProcessingFlowConfig, name="processing")
-        >>> async def process(
-        ...     project_name: str, docs: DocumentList, flow_options: FlowOptions
-        ... ) -> DocumentList:
-        ...     outputs = []
-        ...     # ... processing logic ...
-        ...     return config.create_and_validate_output(outputs)
-
-        >>> # WRONG - Will raise TypeError
-        >>> class BadConfig(FlowConfig):
-        ...     INPUT_DOCUMENT_TYPES = [DataDocument]
-        ...     OUTPUT_DOCUMENT_TYPE = DataDocument  # SAME TYPE - NOT ALLOWED!
-
-    Note:
-    - Validation happens at class definition time
-    - Helps catch configuration errors early
-    - Used by PipelineDeployment to manage document flow
-    """
-
-    INPUT_DOCUMENT_TYPES: ClassVar[list[type[FlowDocument]]]
-    OUTPUT_DOCUMENT_TYPE: ClassVar[type[FlowDocument]]
-    WEIGHT: ClassVar[float] = 1.0
-
-    def __init_subclass__(cls, **kwargs: Any):
-        """Validate flow configuration at subclass definition time.
-
-        Performs comprehensive validation when a FlowConfig subclass is defined:
-        1. Checks for common field name mistakes (typos)
-        2. Ensures required fields are defined
-        3. Prevents circular dependencies (output != input)
-
-        Args:
-            **kwargs: Additional arguments for parent __init_subclass__.
-
-        Raises:
-            TypeError: If configuration violates any validation rules:
-                - Missing required fields
-                - Incorrect field names
-                - Circular dependency detected
-
-        Note:
-            This runs at class definition time, not instantiation,
-            providing immediate feedback during development.
-        """
-        super().__init_subclass__(**kwargs)
-
-        # Skip validation for the abstract base class itself
-        if cls.__name__ == "FlowConfig":
-            return
-
-        # Check for invalid field names (common mistakes)
-        allowed_fields = {"INPUT_DOCUMENT_TYPES", "OUTPUT_DOCUMENT_TYPE", "WEIGHT"}
-        class_attrs = {name for name in dir(cls) if not name.startswith("_") and name.isupper()}
-
-        # Find fields that look like they might be mistakes
-        suspicious_fields = class_attrs - allowed_fields
-        common_mistakes = {
-            "OUTPUT_DOCUMENT_TYPES": "OUTPUT_DOCUMENT_TYPE",
-            "INPUT_DOCUMENT_TYPE": "INPUT_DOCUMENT_TYPES",
-        }
-
-        for field in suspicious_fields:
-            # Skip inherited attributes from parent classes
-            if any(hasattr(base, field) for base in cls.__bases__):
-                continue
-
-            if field in common_mistakes:
-                raise TypeError(
-                    f"FlowConfig {cls.__name__}: Found '{field}' but expected "
-                    f"'{common_mistakes[field]}'. Please use the correct field name."
-                )
-            elif "DOCUMENT" in field:
-                raise TypeError(
-                    f"FlowConfig {cls.__name__}: Invalid field '{field}'. "
-                    f"Only 'INPUT_DOCUMENT_TYPES' and 'OUTPUT_DOCUMENT_TYPE' are allowed."
-                )
-
-        # Ensure required attributes are defined
-        if not hasattr(cls, "INPUT_DOCUMENT_TYPES"):
-            raise TypeError(f"FlowConfig {cls.__name__} must define INPUT_DOCUMENT_TYPES")
-        if not hasattr(cls, "OUTPUT_DOCUMENT_TYPE"):
-            raise TypeError(f"FlowConfig {cls.__name__} must define OUTPUT_DOCUMENT_TYPE")
-
-        # Validate that output type is not in input types
-        if cls.OUTPUT_DOCUMENT_TYPE in cls.INPUT_DOCUMENT_TYPES:
-            raise TypeError(
-                f"FlowConfig {cls.__name__}: OUTPUT_DOCUMENT_TYPE "
-                f"({cls.OUTPUT_DOCUMENT_TYPE.__name__}) cannot be in INPUT_DOCUMENT_TYPES"
-            )
-
-        # Validate WEIGHT
-        weight = getattr(cls, "WEIGHT", 1.0)
-        if not isinstance(weight, (int, float)) or weight <= 0:
-            raise TypeError(
-                f"FlowConfig {cls.__name__}: WEIGHT must be a positive number, got {weight}"
-            )
-
-    @classmethod
-    def get_input_document_types(cls) -> list[type[FlowDocument]]:
-        """Get the list of input document types this flow accepts.
-
-        Returns:
-            List of FlowDocument subclasses that this flow requires
-            as input.
-
-        Example:
-            >>> types = MyFlowConfig.get_input_document_types()
-            >>> print([t.__name__ for t in types])
-            ['InputDoc', 'ConfigDoc']
-        """
-        return cls.INPUT_DOCUMENT_TYPES
-
-    @classmethod
-    def get_output_document_type(cls) -> type[FlowDocument]:
-        """Get the output document type this flow produces.
-
-        Returns:
-            Single FlowDocument subclass that this flow outputs.
-
-        Example:
-            >>> output_type = MyFlowConfig.get_output_document_type()
-            >>> print(output_type.__name__)
-            'ProcessedDataDocument'
-        """
-        return cls.OUTPUT_DOCUMENT_TYPE
-
-    @classmethod
-    def has_input_documents(cls, documents: DocumentList) -> bool:
-        """Check if all required input documents are present.
-
-        Verifies that the document list contains at least one instance
-        of each required input document type.
-
-        Args:
-            documents: DocumentList to check for required inputs.
-
-        Returns:
-            True if all required document types are present,
-            False if any are missing.
-
-        Example:
-            >>> docs = DocumentList([input_doc, config_doc])
-            >>> if MyFlowConfig.has_input_documents(docs):
-            ...     # Safe to proceed with flow
-            ...     pass
-
-        Note:
-            Use this before get_input_documents() to avoid exceptions.
-        """
-        for doc_cls in cls.INPUT_DOCUMENT_TYPES:
-            if not any(isinstance(doc, doc_cls) for doc in documents):
-                return False
-        return True
-
-    @classmethod
-    def get_input_documents(cls, documents: DocumentList) -> DocumentList:
-        """Extract and return all required input documents.
-
-        Filters the provided document list to return only documents
-        matching the required input types. Returns all matching documents,
-        not just the first of each type.
-
-        Args:
-            documents: DocumentList containing mixed document types.
-
-        Returns:
-            DocumentList containing only the required input documents.
-
-        Raises:
-            ValueError: If any required document type is missing.
-
-        Example:
-            >>> all_docs = DocumentList([input1, input2, other_doc])
-            >>> input_docs = MyFlowConfig.get_input_documents(all_docs)
-            >>> len(input_docs)  # Contains only input1 and input2
-            2
-
-        Note:
-            Call has_input_documents() first to check availability.
-        """
-        input_documents = DocumentList()
-        for doc_cls in cls.INPUT_DOCUMENT_TYPES:
-            filtered_documents = [doc for doc in documents if isinstance(doc, doc_cls)]
-            if not filtered_documents:
-                raise ValueError(f"No input document found for class {doc_cls.__name__}")
-            input_documents.extend(filtered_documents)
-        return input_documents
-
-    @classmethod
-    def validate_output_documents(cls, documents: Any) -> None:
-        """Validate that output documents match the expected type.
-
-        Ensures all documents in the list are instances of the
-        declared OUTPUT_DOCUMENT_TYPE.
-
-        Args:
-            documents: DocumentList to validate.
-
-        Raises:
-            DocumentValidationError: If documents is not a DocumentList or if any
-                document has incorrect type.
-
-        Example:
-            >>> output = DocumentList([ProcessedDoc(...)])
-            >>> MyFlowConfig.validate_output_documents(output)
-            >>> # No exception means valid
-
-        Note:
-            Used internally by create_and_validate_output().
-            Uses explicit exceptions for validation (works with python -O).
-        """
-        if not isinstance(documents, DocumentList):
-            raise DocumentValidationError("Documents must be a DocumentList")
-
-        output_document_class = cls.get_output_document_type()
-
-        for doc in documents:
-            if not isinstance(doc, output_document_class):
-                raise DocumentValidationError(
-                    f"Document '{doc.name}' has incorrect type. "
-                    f"Expected: {output_document_class.__name__}, "
-                    f"Got: {type(doc).__name__}"
-                )
-
-    @classmethod
-    def create_and_validate_output(
-        cls, output: FlowDocument | Iterable[FlowDocument] | DocumentList
-    ) -> DocumentList:
-        """Create and validate flow output documents.
-
-        @public
-
-        RECOMMENDED: Always use this method at the end of @pipeline_flow functions
-        to ensure type safety and proper output validation.
-
-        Convenience method that wraps output in a DocumentList if needed
-        and validates it matches the expected OUTPUT_DOCUMENT_TYPE.
-
-        Args:
-            output: Single document, iterable of documents, or DocumentList.
-
-        Returns:
-            Validated DocumentList containing the output documents.
-
-        Raises:
-            DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.
-
-        Example:
-            >>> @pipeline_flow(config=MyFlowConfig, name="my_flow")
-            >>> async def process_flow(
-            ...     project_name: str, documents: DocumentList, flow_options: FlowOptions
-            ... ) -> DocumentList:
-            >>>     outputs = []
-            >>>     # ... processing logic ...
-            >>>     outputs.append(OutputDoc(...))
-            >>>
-            >>>     # Always finish with this validation
-            >>>     return config.create_and_validate_output(outputs)
-
-        Note:
-            This is the recommended pattern for all @pipeline_flow functions.
-            It ensures type safety and catches output errors immediately.
-        """
-        documents: DocumentList
-        if isinstance(output, FlowDocument):
-            documents = DocumentList([output])
-        elif isinstance(output, DocumentList):
-            documents = output
-        else:
-            # Handle any iterable of FlowDocuments
-            documents = DocumentList(list(output))  # type: ignore[arg-type]
-        cls.validate_output_documents(documents)
-        return documents
-
-    @classmethod
-    async def load_documents(
-        cls,
-        uri: str,
-    ) -> DocumentList:
-        """Load documents from storage matching INPUT_DOCUMENT_TYPES.
-
-        Loads documents from a storage location based on the class's INPUT_DOCUMENT_TYPES.
-        Supports both local filesystem and Google Cloud Storage backends.
-        Automatically loads metadata (.description.md and .sources.json) when present.
-
-        Args:
-            uri: Storage URI (file://, gs://, or local path)
-
-        Returns:
-            DocumentList containing loaded documents matching INPUT_DOCUMENT_TYPES
-
-        Example:
-            >>> # Load from local filesystem
-            >>> docs = await MyFlowConfig.load_documents("./data")
-            >>>
-            >>> # Load from GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
-            >>> docs = await MyFlowConfig.load_documents("gs://bucket/data")
-        """
-        # Use INPUT_DOCUMENT_TYPES if not specified
-        storage = await Storage.from_uri(uri)
-        loaded_documents = DocumentList()
-
-        # Process each document type
-        for doc_type in cls.INPUT_DOCUMENT_TYPES:
-            canonical_name = doc_type.canonical_name()
-            doc_storage = storage.with_base(canonical_name)
-
-            # Check if subdirectory exists
-            if not await doc_storage.exists(""):
-                logger.debug(f"Subdirectory {canonical_name} not found, skipping")
-                continue
-
-            # List files in subdirectory
-            objects = await doc_storage.list("", recursive=False, include_dirs=False)
-
-            # Create lookup set for metadata files
-            object_keys = {obj.key for obj in objects}
-
-            # Filter out metadata files
-            doc_files = [
-                obj
-                for obj in objects
-                if not obj.key.endswith(Document.DESCRIPTION_EXTENSION)
-                and not obj.key.endswith(Document.SOURCES_EXTENSION)
-            ]
-
-            for obj in doc_files:
-                try:
-                    # Load document content
-                    content = await doc_storage.read_bytes(obj.key)
-
-                    # Load metadata if present
-                    description = None
-                    sources: list[str] = []
-
-                    # Check for description in objects list
-                    desc_path = f"{obj.key}{Document.DESCRIPTION_EXTENSION}"
-                    if desc_path in object_keys:
-                        try:
-                            description = await doc_storage.read_text(desc_path)
-                        except Exception as e:
-                            logger.warning(f"Failed to load description for {obj.key}: {e}")
-
-                    # Check for sources in objects list
-                    sources_path = f"{obj.key}{Document.SOURCES_EXTENSION}"
-                    if sources_path in object_keys:
-                        try:
-                            sources_text = await doc_storage.read_text(sources_path)
-                            sources = json.loads(sources_text)
-                        except Exception as e:
-                            logger.warning(f"Failed to load sources for {obj.key}: {e}")
-
-                    # Create document instance
-                    doc = doc_type(
-                        name=obj.key,
-                        content=content,
-                        description=description,
-                        sources=sources,
-                    )
-
-                    loaded_documents.append(doc)
-                    logger.debug(f"Loaded {doc_type.__name__} document: {obj.key}")
-                except Exception as e:
-                    logger.error(f"Failed to load {doc_type.__name__} document {obj.key}: {e}")
-
-        logger.info(f"Loaded {len(loaded_documents)} documents from {uri}")
-        return loaded_documents
-
-    @classmethod
-    async def save_documents(
-        cls,
-        uri: str,
-        documents: DocumentList,
-        *,
-        validate_output_type: bool = True,
-    ) -> None:
-        """Save documents to storage with metadata.
-
-        Saves FlowDocument instances to a storage location with their content
-        and metadata files (Document.DESCRIPTION_EXTENSION and Document.SOURCES_EXTENSION).
-        Non-FlowDocument instances (TaskDocument, TemporaryDocument) are skipped.
-
-        Args:
-            uri: Storage URI (file://, gs://, or local path)
-            documents: DocumentList to save
-            validate_output_type: If True, validate documents match cls.OUTPUT_DOCUMENT_TYPE
-
-        Raises:
-            DocumentValidationError: If validate_output_type=True and documents don't match
-                OUTPUT_DOCUMENT_TYPE
-
-        Example:
-            >>> # Save to local filesystem
-            >>> await MyFlowConfig.save_documents("./output", docs)
-            >>>
-            >>> # Save to GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
-            >>> await MyFlowConfig.save_documents("gs://bucket/output", docs)
-        """
-        # Validate output type if requested
-        if validate_output_type:
-            cls.validate_output_documents(documents)
-
-        storage = await Storage.from_uri(uri)
-        saved_count = 0
-
-        for doc in documents:
-            # Skip non-FlowDocument instances
-            if not isinstance(doc, FlowDocument):
-                logger.warning(f"Skipping non-FlowDocument: {type(doc).__name__}")
-                continue
-
-            # Get canonical name for subdirectory
-            canonical_name = doc.canonical_name()
-            doc_storage = storage.with_base(canonical_name)
-
-            # Save document content
-            await doc_storage.write_bytes(doc.name, doc.content)
-            saved_count += 1
-
-            # Save description if present
-            if doc.description:
-                desc_path = f"{doc.name}{Document.DESCRIPTION_EXTENSION}"
-                await doc_storage.write_text(desc_path, doc.description)
-
-            # Save sources if present
-            if doc.sources:
-                sources_path = f"{doc.name}{Document.SOURCES_EXTENSION}"
-                sources_json = json.dumps(doc.sources, indent=2)
-                await doc_storage.write_text(sources_path, sources_json)
-
-            logger.debug(f"Saved {type(doc).__name__} document: {doc.name}")
-
-        logger.info(f"Saved {saved_count} documents to {uri}")
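To close out, a sketch of how the removed FlowConfig was defined and used in 0.3.4, put together from the docstring examples above. The document subclasses and the finish() helper are illustrative placeholders, and the import paths are the 0.3.4 modules shown in this diff.

```python
from ai_pipeline_core.documents import DocumentList, FlowDocument
from ai_pipeline_core.flow import FlowConfig  # 0.3.4 module path


class RawDataDocument(FlowDocument):
    pass


class ProcessedDocument(FlowDocument):
    pass


class ProcessingFlowConfig(FlowConfig):
    INPUT_DOCUMENT_TYPES = [RawDataDocument]
    OUTPUT_DOCUMENT_TYPE = ProcessedDocument  # must differ from every input type
    WEIGHT = 45.0  # relative weight for progress calculation


# Validation runs at class-definition time, so a circular config fails immediately.
try:
    class BadConfig(FlowConfig):
        INPUT_DOCUMENT_TYPES = [RawDataDocument]
        OUTPUT_DOCUMENT_TYPE = RawDataDocument  # same as an input type -> TypeError
except TypeError as exc:
    print(exc)


def finish(outputs: list[ProcessedDocument]) -> DocumentList:
    # Recommended final step of a flow: wrap and type-check the outputs.
    return ProcessingFlowConfig.create_and_validate_output(outputs)
```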