ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +64 -158
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +11 -84
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +102 -90
- ai_pipeline_core/llm/client.py +229 -183
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/documents/task_document.py
DELETED
@@ -1,117 +0,0 @@
```python
"""Task-specific document base class for temporary pipeline data.

@public

This module provides the TaskDocument abstract base class for documents
that exist only during Prefect task execution and are not persisted.
"""

from typing import Literal, final

from .document import Document


class TaskDocument(Document):
    """Abstract base class for temporary documents within task execution.

    @public

    TaskDocument is used for intermediate data that exists only during
    the execution of a Prefect task and is not persisted to disk. These
    documents are ideal for temporary processing results, transformations,
    and data that doesn't need to survive beyond the current task.

    Key characteristics:
    - Not persisted to file system
    - Exists only during task execution
    - Garbage collected after task completes
    - Used for intermediate processing results
    - Reduces persistent I/O for temporary data

    Creating TaskDocuments:
        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
        See Document.create() for detailed usage examples.

    Use Cases:
    - Intermediate transformation results
    - Temporary buffers during processing
    - Task-local cache data
    - Processing status documents

    Note:
        - Cannot instantiate TaskDocument directly - must subclass
        - Not saved by deployment utilities
        - Reduces I/O overhead for temporary data
        - No additional abstract methods to implement
    """

    def __init__(
        self,
        *,
        name: str,
        content: bytes,
        description: str | None = None,
        sources: list[str] | None = None,
    ) -> None:
        """Initialize a TaskDocument with raw bytes content.

        See Document.__init__() for parameter details and usage notes.

        Prevents direct instantiation of the abstract TaskDocument class.
        TaskDocument must be subclassed for specific temporary document types.

        Args:
            name: Document filename (required, keyword-only)
            content: Document content as raw bytes (required, keyword-only)
            description: Optional human-readable description (keyword-only)
            sources: Optional list of strings for provenance tracking

        Raises:
            TypeError: If attempting to instantiate TaskDocument directly
                instead of using a concrete subclass.

        Example:
            >>> from enum import StrEnum
            >>>
            >>> # Simple subclass:
            >>> class MyTaskDoc(TaskDocument):
            ...     pass
            >>>
            >>> # With FILES restriction:
            >>> class TempProcessDoc(TaskDocument):
            ...     class FILES(StrEnum):
            ...         BUFFER = "buffer.bin"
            ...         STATUS = "status.json"
            >>>
            >>> # Direct constructor - only for bytes:
            >>> doc = MyTaskDoc(name="temp.bin", content=b"raw data")
            >>>
            >>> # RECOMMENDED - use create for automatic conversion:
            >>> doc = TempProcessDoc.create(name="status.json", content={"percent": 50})
            >>> # This would raise DocumentNameError:
            >>> # doc = TempProcessDoc.create(name="other.json", content={})
        """
        if type(self) is TaskDocument:
            raise TypeError("Cannot instantiate abstract TaskDocument class directly")

        # Only pass sources if not None to let Pydantic's default_factory handle it
        if sources is not None:
            super().__init__(name=name, content=content, description=description, sources=sources)
        else:
            super().__init__(name=name, content=content, description=description)

    @final
    def get_base_type(self) -> Literal["task"]:
        """Return the base type identifier for task documents.

        This method is final and cannot be overridden by subclasses.
        It identifies this document as a task-scoped temporary document.

        Returns:
            "task" - Indicates this document is temporary within task execution.

        Note:
            This determines that the document will not be persisted and
            exists only during task execution.
        """
        return "task"
```
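For orientation, a minimal sketch of how the removed `TaskDocument` was used on 0.3.x, assembled from the docstring examples above; the subclass and file names are illustrative, and the `ai_pipeline_core.documents` import path is assumed from the 0.3.x layout:

```python
from enum import StrEnum

from ai_pipeline_core.documents import TaskDocument  # assumed 0.3.x export

# Hypothetical subclass; TaskDocument itself cannot be instantiated directly.
class TempProcessDoc(TaskDocument):
    class FILES(StrEnum):  # optional whitelist of permitted filenames
        BUFFER = "buffer.bin"
        STATUS = "status.json"

# Direct constructor takes raw bytes only.
buffer_doc = TempProcessDoc(name="buffer.bin", content=b"raw data")

# create() converts structured content (here a dict) to bytes automatically.
status_doc = TempProcessDoc.create(name="status.json", content={"percent": 50})

assert status_doc.get_base_type() == "task"  # never saved by deployment utilities
```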
ai_pipeline_core/documents/temporary_document.py
DELETED
@@ -1,74 +0,0 @@
```python
"""Temporary document implementation for non-persistent data.

This module provides the TemporaryDocument class for documents that
are never persisted, regardless of context.
"""

from typing import Any, Literal, final

from .document import Document


@final
class TemporaryDocument(Document):
    r"""Concrete document class for data that is never persisted.

    TemporaryDocument is a final (non-subclassable) document type for
    data that should never be saved to disk, regardless of whether it's
    used in a flow or task context. Unlike FlowDocument and TaskDocument
    which are abstract, TemporaryDocument can be instantiated directly.

    Key characteristics:
    - Never persisted to file system
    - Can be instantiated directly (not abstract)
    - Cannot be subclassed (annotated with Python's @final decorator in code)
    - Useful for transient data like API responses or intermediate calculations
    - Ignored by deployment save operations
    - Useful for tests and debugging

    Creating TemporaryDocuments:
        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
        Unlike abstract document types, TemporaryDocument can be instantiated directly.
        See Document.create() for detailed usage examples.

    >>> doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
    >>> doc.is_temporary  # Always True

    Use Cases:
    - API responses that shouldn't be cached
    - Sensitive credentials or tokens
    - Intermediate calculations
    - Temporary transformations
    - Data explicitly marked as non-persistent

    Note:
        - This is a final class and cannot be subclassed
        - Use when you explicitly want to prevent persistence
        - Useful for sensitive data that shouldn't be written to disk
    """

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Disallow subclassing.

        Args:
            **kwargs: Additional keyword arguments (ignored).

        Raises:
            TypeError: Always raised to prevent subclassing of `TemporaryDocument`.
        """
        raise TypeError("TemporaryDocument is final and cannot be subclassed")

    def get_base_type(self) -> Literal["temporary"]:
        """Return the base type identifier for temporary documents.

        Identifies this document as temporary, ensuring it will
        never be persisted by the pipeline system.

        Returns:
            "temporary" - Indicates this document is never persisted.

        Note:
            Documents with this type are explicitly excluded from
            all persistence operations in the pipeline system.
        """
        return "temporary"
```
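A similar sketch for the removed `TemporaryDocument`, based on its docstring; the import path is again assumed:

```python
from ai_pipeline_core.documents import TemporaryDocument  # assumed 0.3.x export

# Instantiable directly, unlike the abstract FlowDocument/TaskDocument types.
token_doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
assert token_doc.is_temporary                    # always True for this type
assert token_doc.get_base_type() == "temporary"  # excluded from all persistence

# Subclassing fails at class-definition time via __init_subclass__:
try:
    class MyTempDoc(TemporaryDocument):
        pass
except TypeError as exc:
    print(exc)  # TemporaryDocument is final and cannot be subclassed
```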
ai_pipeline_core/flow/config.py
DELETED
@@ -1,494 +0,0 @@
```python
"""Flow configuration system for type-safe pipeline definitions.

@public

This module provides the FlowConfig abstract base class that enforces
type safety for flow inputs and outputs in the pipeline system.

Best Practice:
    Always finish @pipeline_flow functions with create_and_validate_output()
    to ensure type safety and proper validation of output documents.
"""

import json
from abc import ABC
from typing import Any, ClassVar, Iterable

from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
from ai_pipeline_core.exceptions import DocumentValidationError
from ai_pipeline_core.logging import get_pipeline_logger
from ai_pipeline_core.storage import Storage

logger = get_pipeline_logger(__name__)


class FlowConfig(ABC):
    """Abstract base class for type-safe flow configuration.

    @public

    FlowConfig defines the contract for flow inputs and outputs, ensuring
    type safety and preventing circular dependencies in pipeline flows.
    Each flow must have a corresponding FlowConfig subclass that specifies
    its input document types and output document type.

    CRITICAL RULE: OUTPUT_DOCUMENT_TYPE must NEVER be in INPUT_DOCUMENT_TYPES!
    This prevents circular dependencies as flows chain together.
    Each flow transforms input types to a DIFFERENT output type.

    Class Variables:
        INPUT_DOCUMENT_TYPES: List of FlowDocument types this flow accepts
        OUTPUT_DOCUMENT_TYPE: Single FlowDocument type this flow produces
        WEIGHT: Weight for progress calculation (default 1.0, based on avg duration)

    Validation Rules:
        - INPUT_DOCUMENT_TYPES and OUTPUT_DOCUMENT_TYPE must be defined
        - OUTPUT_DOCUMENT_TYPE cannot be in INPUT_DOCUMENT_TYPES (prevents cycles)
        - Field names must be exact (common typos are detected)
        - WEIGHT must be a positive number

    Why this matters:
        Flows connect in pipelines where one flow's output becomes another's input.
        Same input/output types would create infinite loops or circular dependencies.

    Example:
        >>> # CORRECT - Different output type from inputs
        >>> class ProcessingFlowConfig(FlowConfig):
        ...     INPUT_DOCUMENT_TYPES = [RawDataDocument]
        ...     OUTPUT_DOCUMENT_TYPE = ProcessedDocument  # Different type!
        ...     WEIGHT = 45.0  # Average ~45 minutes
        >>>
        >>> # Use in @pipeline_flow - RECOMMENDED PATTERN
        >>> @pipeline_flow(config=ProcessingFlowConfig, name="processing")
        >>> async def process(
        ...     project_name: str, docs: DocumentList, flow_options: FlowOptions
        ... ) -> DocumentList:
        ...     outputs = []
        ...     # ... processing logic ...
        ...     return config.create_and_validate_output(outputs)

        >>> # WRONG - Will raise TypeError
        >>> class BadConfig(FlowConfig):
        ...     INPUT_DOCUMENT_TYPES = [DataDocument]
        ...     OUTPUT_DOCUMENT_TYPE = DataDocument  # SAME TYPE - NOT ALLOWED!

    Note:
        - Validation happens at class definition time
        - Helps catch configuration errors early
        - Used by PipelineDeployment to manage document flow
    """

    INPUT_DOCUMENT_TYPES: ClassVar[list[type[FlowDocument]]]
    OUTPUT_DOCUMENT_TYPE: ClassVar[type[FlowDocument]]
    WEIGHT: ClassVar[float] = 1.0

    def __init_subclass__(cls, **kwargs: Any):
        """Validate flow configuration at subclass definition time.

        Performs comprehensive validation when a FlowConfig subclass is defined:
        1. Checks for common field name mistakes (typos)
        2. Ensures required fields are defined
        3. Prevents circular dependencies (output != input)

        Args:
            **kwargs: Additional arguments for parent __init_subclass__.

        Raises:
            TypeError: If configuration violates any validation rules:
                - Missing required fields
                - Incorrect field names
                - Circular dependency detected

        Note:
            This runs at class definition time, not instantiation,
            providing immediate feedback during development.
        """
        super().__init_subclass__(**kwargs)

        # Skip validation for the abstract base class itself
        if cls.__name__ == "FlowConfig":
            return

        # Check for invalid field names (common mistakes)
        allowed_fields = {"INPUT_DOCUMENT_TYPES", "OUTPUT_DOCUMENT_TYPE", "WEIGHT"}
        class_attrs = {name for name in dir(cls) if not name.startswith("_") and name.isupper()}

        # Find fields that look like they might be mistakes
        suspicious_fields = class_attrs - allowed_fields
        common_mistakes = {
            "OUTPUT_DOCUMENT_TYPES": "OUTPUT_DOCUMENT_TYPE",
            "INPUT_DOCUMENT_TYPE": "INPUT_DOCUMENT_TYPES",
        }

        for field in suspicious_fields:
            # Skip inherited attributes from parent classes
            if any(hasattr(base, field) for base in cls.__bases__):
                continue

            if field in common_mistakes:
                raise TypeError(
                    f"FlowConfig {cls.__name__}: Found '{field}' but expected "
                    f"'{common_mistakes[field]}'. Please use the correct field name."
                )
            elif "DOCUMENT" in field:
                raise TypeError(
                    f"FlowConfig {cls.__name__}: Invalid field '{field}'. "
                    f"Only 'INPUT_DOCUMENT_TYPES' and 'OUTPUT_DOCUMENT_TYPE' are allowed."
                )

        # Ensure required attributes are defined
        if not hasattr(cls, "INPUT_DOCUMENT_TYPES"):
            raise TypeError(f"FlowConfig {cls.__name__} must define INPUT_DOCUMENT_TYPES")
        if not hasattr(cls, "OUTPUT_DOCUMENT_TYPE"):
            raise TypeError(f"FlowConfig {cls.__name__} must define OUTPUT_DOCUMENT_TYPE")

        # Validate that output type is not in input types
        if cls.OUTPUT_DOCUMENT_TYPE in cls.INPUT_DOCUMENT_TYPES:
            raise TypeError(
                f"FlowConfig {cls.__name__}: OUTPUT_DOCUMENT_TYPE "
                f"({cls.OUTPUT_DOCUMENT_TYPE.__name__}) cannot be in INPUT_DOCUMENT_TYPES"
            )

        # Validate WEIGHT
        weight = getattr(cls, "WEIGHT", 1.0)
        if not isinstance(weight, (int, float)) or weight <= 0:
            raise TypeError(
                f"FlowConfig {cls.__name__}: WEIGHT must be a positive number, got {weight}"
            )

    @classmethod
    def get_input_document_types(cls) -> list[type[FlowDocument]]:
        """Get the list of input document types this flow accepts.

        Returns:
            List of FlowDocument subclasses that this flow requires
            as input.

        Example:
            >>> types = MyFlowConfig.get_input_document_types()
            >>> print([t.__name__ for t in types])
            ['InputDoc', 'ConfigDoc']
        """
        return cls.INPUT_DOCUMENT_TYPES

    @classmethod
    def get_output_document_type(cls) -> type[FlowDocument]:
        """Get the output document type this flow produces.

        Returns:
            Single FlowDocument subclass that this flow outputs.

        Example:
            >>> output_type = MyFlowConfig.get_output_document_type()
            >>> print(output_type.__name__)
            'ProcessedDataDocument'
        """
        return cls.OUTPUT_DOCUMENT_TYPE

    @classmethod
    def has_input_documents(cls, documents: DocumentList) -> bool:
        """Check if all required input documents are present.

        Verifies that the document list contains at least one instance
        of each required input document type.

        Args:
            documents: DocumentList to check for required inputs.

        Returns:
            True if all required document types are present,
            False if any are missing.

        Example:
            >>> docs = DocumentList([input_doc, config_doc])
            >>> if MyFlowConfig.has_input_documents(docs):
            ...     # Safe to proceed with flow
            ...     pass

        Note:
            Use this before get_input_documents() to avoid exceptions.
        """
        for doc_cls in cls.INPUT_DOCUMENT_TYPES:
            if not any(isinstance(doc, doc_cls) for doc in documents):
                return False
        return True

    @classmethod
    def get_input_documents(cls, documents: DocumentList) -> DocumentList:
        """Extract and return all required input documents.

        Filters the provided document list to return only documents
        matching the required input types. Returns all matching documents,
        not just the first of each type.

        Args:
            documents: DocumentList containing mixed document types.

        Returns:
            DocumentList containing only the required input documents.

        Raises:
            ValueError: If any required document type is missing.

        Example:
            >>> all_docs = DocumentList([input1, input2, other_doc])
            >>> input_docs = MyFlowConfig.get_input_documents(all_docs)
            >>> len(input_docs)  # Contains only input1 and input2
            2

        Note:
            Call has_input_documents() first to check availability.
        """
        input_documents = DocumentList()
        for doc_cls in cls.INPUT_DOCUMENT_TYPES:
            filtered_documents = [doc for doc in documents if isinstance(doc, doc_cls)]
            if not filtered_documents:
                raise ValueError(f"No input document found for class {doc_cls.__name__}")
            input_documents.extend(filtered_documents)
        return input_documents

    @classmethod
    def validate_output_documents(cls, documents: Any) -> None:
        """Validate that output documents match the expected type.

        Ensures all documents in the list are instances of the
        declared OUTPUT_DOCUMENT_TYPE.

        Args:
            documents: DocumentList to validate.

        Raises:
            DocumentValidationError: If documents is not a DocumentList or if any
                document has incorrect type.

        Example:
            >>> output = DocumentList([ProcessedDoc(...)])
            >>> MyFlowConfig.validate_output_documents(output)
            >>> # No exception means valid

        Note:
            Used internally by create_and_validate_output().
            Uses explicit exceptions for validation (works with python -O).
        """
        if not isinstance(documents, DocumentList):
            raise DocumentValidationError("Documents must be a DocumentList")

        output_document_class = cls.get_output_document_type()

        for doc in documents:
            if not isinstance(doc, output_document_class):
                raise DocumentValidationError(
                    f"Document '{doc.name}' has incorrect type. "
                    f"Expected: {output_document_class.__name__}, "
                    f"Got: {type(doc).__name__}"
                )

    @classmethod
    def create_and_validate_output(
        cls, output: FlowDocument | Iterable[FlowDocument] | DocumentList
    ) -> DocumentList:
        """Create and validate flow output documents.

        @public

        RECOMMENDED: Always use this method at the end of @pipeline_flow functions
        to ensure type safety and proper output validation.

        Convenience method that wraps output in a DocumentList if needed
        and validates it matches the expected OUTPUT_DOCUMENT_TYPE.

        Args:
            output: Single document, iterable of documents, or DocumentList.

        Returns:
            Validated DocumentList containing the output documents.

        Raises:
            DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.

        Example:
            >>> @pipeline_flow(config=MyFlowConfig, name="my_flow")
            >>> async def process_flow(
            ...     project_name: str, documents: DocumentList, flow_options: FlowOptions
            ... ) -> DocumentList:
            >>>     outputs = []
            >>>     # ... processing logic ...
            >>>     outputs.append(OutputDoc(...))
            >>>
            >>>     # Always finish with this validation
            >>>     return config.create_and_validate_output(outputs)

        Note:
            This is the recommended pattern for all @pipeline_flow functions.
            It ensures type safety and catches output errors immediately.
        """
        documents: DocumentList
        if isinstance(output, FlowDocument):
            documents = DocumentList([output])
        elif isinstance(output, DocumentList):
            documents = output
        else:
            # Handle any iterable of FlowDocuments
            documents = DocumentList(list(output))  # type: ignore[arg-type]
        cls.validate_output_documents(documents)
        return documents

    @classmethod
    async def load_documents(
        cls,
        uri: str,
    ) -> DocumentList:
        """Load documents from storage matching INPUT_DOCUMENT_TYPES.

        Loads documents from a storage location based on the class's INPUT_DOCUMENT_TYPES.
        Supports both local filesystem and Google Cloud Storage backends.
        Automatically loads metadata (.description.md and .sources.json) when present.

        Args:
            uri: Storage URI (file://, gs://, or local path)

        Returns:
            DocumentList containing loaded documents matching INPUT_DOCUMENT_TYPES

        Example:
            >>> # Load from local filesystem
            >>> docs = await MyFlowConfig.load_documents("./data")
            >>>
            >>> # Load from GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
            >>> docs = await MyFlowConfig.load_documents("gs://bucket/data")
        """
        # Use INPUT_DOCUMENT_TYPES if not specified
        storage = await Storage.from_uri(uri)
        loaded_documents = DocumentList()

        # Process each document type
        for doc_type in cls.INPUT_DOCUMENT_TYPES:
            canonical_name = doc_type.canonical_name()
            doc_storage = storage.with_base(canonical_name)

            # Check if subdirectory exists
            if not await doc_storage.exists(""):
                logger.debug(f"Subdirectory {canonical_name} not found, skipping")
                continue

            # List files in subdirectory
            objects = await doc_storage.list("", recursive=False, include_dirs=False)

            # Create lookup set for metadata files
            object_keys = {obj.key for obj in objects}

            # Filter out metadata files
            doc_files = [
                obj
                for obj in objects
                if not obj.key.endswith(Document.DESCRIPTION_EXTENSION)
                and not obj.key.endswith(Document.SOURCES_EXTENSION)
            ]

            for obj in doc_files:
                try:
                    # Load document content
                    content = await doc_storage.read_bytes(obj.key)

                    # Load metadata if present
                    description = None
                    sources: list[str] = []

                    # Check for description in objects list
                    desc_path = f"{obj.key}{Document.DESCRIPTION_EXTENSION}"
                    if desc_path in object_keys:
                        try:
                            description = await doc_storage.read_text(desc_path)
                        except Exception as e:
                            logger.warning(f"Failed to load description for {obj.key}: {e}")

                    # Check for sources in objects list
                    sources_path = f"{obj.key}{Document.SOURCES_EXTENSION}"
                    if sources_path in object_keys:
                        try:
                            sources_text = await doc_storage.read_text(sources_path)
                            sources = json.loads(sources_text)
                        except Exception as e:
                            logger.warning(f"Failed to load sources for {obj.key}: {e}")

                    # Create document instance
                    doc = doc_type(
                        name=obj.key,
                        content=content,
                        description=description,
                        sources=sources,
                    )

                    loaded_documents.append(doc)
                    logger.debug(f"Loaded {doc_type.__name__} document: {obj.key}")
                except Exception as e:
                    logger.error(f"Failed to load {doc_type.__name__} document {obj.key}: {e}")

        logger.info(f"Loaded {len(loaded_documents)} documents from {uri}")
        return loaded_documents

    @classmethod
    async def save_documents(
        cls,
        uri: str,
        documents: DocumentList,
        *,
        validate_output_type: bool = True,
    ) -> None:
        """Save documents to storage with metadata.

        Saves FlowDocument instances to a storage location with their content
        and metadata files (Document.DESCRIPTION_EXTENSION and Document.SOURCES_EXTENSION).
        Non-FlowDocument instances (TaskDocument, TemporaryDocument) are skipped.

        Args:
            uri: Storage URI (file://, gs://, or local path)
            documents: DocumentList to save
            validate_output_type: If True, validate documents match cls.OUTPUT_DOCUMENT_TYPE

        Raises:
            DocumentValidationError: If validate_output_type=True and documents don't match
                OUTPUT_DOCUMENT_TYPE

        Example:
            >>> # Save to local filesystem
            >>> await MyFlowConfig.save_documents("./output", docs)
            >>>
            >>> # Save to GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
            >>> await MyFlowConfig.save_documents("gs://bucket/output", docs)
        """
        # Validate output type if requested
        if validate_output_type:
            cls.validate_output_documents(documents)

        storage = await Storage.from_uri(uri)
        saved_count = 0

        for doc in documents:
            # Skip non-FlowDocument instances
            if not isinstance(doc, FlowDocument):
                logger.warning(f"Skipping non-FlowDocument: {type(doc).__name__}")
                continue

            # Get canonical name for subdirectory
            canonical_name = doc.canonical_name()
            doc_storage = storage.with_base(canonical_name)

            # Save document content
            await doc_storage.write_bytes(doc.name, doc.content)
            saved_count += 1

            # Save description if present
            if doc.description:
                desc_path = f"{doc.name}{Document.DESCRIPTION_EXTENSION}"
                await doc_storage.write_text(desc_path, doc.description)

            # Save sources if present
            if doc.sources:
                sources_path = f"{doc.name}{Document.SOURCES_EXTENSION}"
                sources_json = json.dumps(doc.sources, indent=2)
                await doc_storage.write_text(sources_path, sources_json)

            logger.debug(f"Saved {type(doc).__name__} document: {doc.name}")

        logger.info(f"Saved {saved_count} documents to {uri}")
```