aponyx-0.1.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. aponyx/__init__.py +14 -0
  2. aponyx/backtest/__init__.py +31 -0
  3. aponyx/backtest/adapters.py +77 -0
  4. aponyx/backtest/config.py +84 -0
  5. aponyx/backtest/engine.py +560 -0
  6. aponyx/backtest/protocols.py +101 -0
  7. aponyx/backtest/registry.py +334 -0
  8. aponyx/backtest/strategy_catalog.json +50 -0
  9. aponyx/cli/__init__.py +5 -0
  10. aponyx/cli/commands/__init__.py +8 -0
  11. aponyx/cli/commands/clean.py +349 -0
  12. aponyx/cli/commands/list.py +302 -0
  13. aponyx/cli/commands/report.py +167 -0
  14. aponyx/cli/commands/run.py +377 -0
  15. aponyx/cli/main.py +125 -0
  16. aponyx/config/__init__.py +82 -0
  17. aponyx/data/__init__.py +99 -0
  18. aponyx/data/bloomberg_config.py +306 -0
  19. aponyx/data/bloomberg_instruments.json +26 -0
  20. aponyx/data/bloomberg_securities.json +42 -0
  21. aponyx/data/cache.py +294 -0
  22. aponyx/data/fetch.py +659 -0
  23. aponyx/data/fetch_registry.py +135 -0
  24. aponyx/data/loaders.py +205 -0
  25. aponyx/data/providers/__init__.py +13 -0
  26. aponyx/data/providers/bloomberg.py +383 -0
  27. aponyx/data/providers/file.py +111 -0
  28. aponyx/data/registry.py +500 -0
  29. aponyx/data/requirements.py +96 -0
  30. aponyx/data/sample_data.py +415 -0
  31. aponyx/data/schemas.py +60 -0
  32. aponyx/data/sources.py +171 -0
  33. aponyx/data/synthetic_params.json +46 -0
  34. aponyx/data/transforms.py +336 -0
  35. aponyx/data/validation.py +308 -0
  36. aponyx/docs/__init__.py +24 -0
  37. aponyx/docs/adding_data_providers.md +682 -0
  38. aponyx/docs/cdx_knowledge_base.md +455 -0
  39. aponyx/docs/cdx_overlay_strategy.md +135 -0
  40. aponyx/docs/cli_guide.md +607 -0
  41. aponyx/docs/governance_design.md +551 -0
  42. aponyx/docs/logging_design.md +251 -0
  43. aponyx/docs/performance_evaluation_design.md +265 -0
  44. aponyx/docs/python_guidelines.md +786 -0
  45. aponyx/docs/signal_registry_usage.md +369 -0
  46. aponyx/docs/signal_suitability_design.md +558 -0
  47. aponyx/docs/visualization_design.md +277 -0
  48. aponyx/evaluation/__init__.py +11 -0
  49. aponyx/evaluation/performance/__init__.py +24 -0
  50. aponyx/evaluation/performance/adapters.py +109 -0
  51. aponyx/evaluation/performance/analyzer.py +384 -0
  52. aponyx/evaluation/performance/config.py +320 -0
  53. aponyx/evaluation/performance/decomposition.py +304 -0
  54. aponyx/evaluation/performance/metrics.py +761 -0
  55. aponyx/evaluation/performance/registry.py +327 -0
  56. aponyx/evaluation/performance/report.py +541 -0
  57. aponyx/evaluation/suitability/__init__.py +67 -0
  58. aponyx/evaluation/suitability/config.py +143 -0
  59. aponyx/evaluation/suitability/evaluator.py +389 -0
  60. aponyx/evaluation/suitability/registry.py +328 -0
  61. aponyx/evaluation/suitability/report.py +398 -0
  62. aponyx/evaluation/suitability/scoring.py +367 -0
  63. aponyx/evaluation/suitability/tests.py +303 -0
  64. aponyx/examples/01_generate_synthetic_data.py +53 -0
  65. aponyx/examples/02_fetch_data_file.py +82 -0
  66. aponyx/examples/03_fetch_data_bloomberg.py +104 -0
  67. aponyx/examples/04_compute_signal.py +164 -0
  68. aponyx/examples/05_evaluate_suitability.py +224 -0
  69. aponyx/examples/06_run_backtest.py +242 -0
  70. aponyx/examples/07_analyze_performance.py +214 -0
  71. aponyx/examples/08_visualize_results.py +272 -0
  72. aponyx/main.py +7 -0
  73. aponyx/models/__init__.py +45 -0
  74. aponyx/models/config.py +83 -0
  75. aponyx/models/indicator_transformation.json +52 -0
  76. aponyx/models/indicators.py +292 -0
  77. aponyx/models/metadata.py +447 -0
  78. aponyx/models/orchestrator.py +213 -0
  79. aponyx/models/registry.py +860 -0
  80. aponyx/models/score_transformation.json +42 -0
  81. aponyx/models/signal_catalog.json +29 -0
  82. aponyx/models/signal_composer.py +513 -0
  83. aponyx/models/signal_transformation.json +29 -0
  84. aponyx/persistence/__init__.py +16 -0
  85. aponyx/persistence/json_io.py +132 -0
  86. aponyx/persistence/parquet_io.py +378 -0
  87. aponyx/py.typed +0 -0
  88. aponyx/reporting/__init__.py +10 -0
  89. aponyx/reporting/generator.py +517 -0
  90. aponyx/visualization/__init__.py +20 -0
  91. aponyx/visualization/app.py +37 -0
  92. aponyx/visualization/plots.py +309 -0
  93. aponyx/visualization/visualizer.py +242 -0
  94. aponyx/workflows/__init__.py +18 -0
  95. aponyx/workflows/concrete_steps.py +720 -0
  96. aponyx/workflows/config.py +122 -0
  97. aponyx/workflows/engine.py +279 -0
  98. aponyx/workflows/registry.py +116 -0
  99. aponyx/workflows/steps.py +180 -0
  100. aponyx-0.1.18.dist-info/METADATA +552 -0
  101. aponyx-0.1.18.dist-info/RECORD +104 -0
  102. aponyx-0.1.18.dist-info/WHEEL +4 -0
  103. aponyx-0.1.18.dist-info/entry_points.txt +2 -0
  104. aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0
aponyx/workflows/config.py
@@ -0,0 +1,122 @@
+"""
+Workflow configuration management.
+
+Defines immutable configuration for workflow execution, including
+signal/strategy selection, data sources, and execution options.
+"""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+from aponyx.config import DATA_WORKFLOWS_DIR
+
+StepName = Literal[
+    "data",
+    "signal",
+    "suitability",
+    "backtest",
+    "performance",
+    "visualization",
+]
+
+# DataSource now accepts any string to support dynamic source discovery
+DataSource = str
+
+
+@dataclass(frozen=True)
+class WorkflowConfig:
+    """
+    Immutable workflow execution configuration.
+
+    Attributes
+    ----------
+    label : str
+        Workflow label (lowercase, underscores only; pattern: ^[a-z][a-z0-9_]*$).
+        Used for workflow identification and directory naming.
+    signal_name : str
+        Signal name from the signal catalog.
+    strategy_name : str
+        Strategy name from the strategy catalog.
+    product : str
+        Product identifier for backtesting (e.g., "cdx_ig_5y", "cdx_hy_5y").
+    data_source : str
+        Data source type (e.g., "synthetic", "file", "bloomberg", or custom sources).
+    security_mapping : dict[str, str] | None
+        Maps generic instrument types to specific securities.
+        Example: {"cdx": "cdx_ig_5y", "etf": "hyg", "vix": "vix"}
+        If None, uses defaults from the indicator catalog.
+    indicator_transformation_override : str | None
+        Override the catalog's indicator transformation (must exist in
+        indicator_transformation.json). If None, uses indicator_transformation
+        from the signal catalog. Example: "spread_momentum_5d" to swap the
+        indicator while keeping score/signal transformations.
+    score_transformation_override : str | None
+        Override the catalog's score transformation (must exist in
+        score_transformation.json). If None, uses score_transformation from
+        the signal catalog. Example: "z_score_60d" to swap the normalization
+        window while keeping indicator/signal transformations.
+    signal_transformation_override : str | None
+        Override the catalog's signal transformation (must exist in
+        signal_transformation.json). If None, uses signal_transformation from
+        the signal catalog. Example: "bounded_2_0" to swap trading rules while
+        keeping indicator/score transformations.
+    steps : list[StepName] | None
+        Specific steps to execute (None = all steps in order).
+    force_rerun : bool
+        Force re-execution even if cached outputs exist.
+    output_dir : Path
+        Base directory for workflow outputs.
+
+    Notes
+    -----
+    Configuration is frozen to prevent accidental mutation during execution.
+    Use dataclasses.replace() to create modified copies if needed.
+
+    Four-Stage Transformation Pipeline
+    ----------------------------------
+    Security → Indicator → Score → Signal → Position
+
+    Each signal references exactly one transformation from each stage
+    (1:1:1 relationship).
+
+    Runtime overrides allow swapping components at any stage without editing
+    catalogs:
+    - security_mapping: Override which securities to load for each instrument type
+    - indicator_transformation_override: Swap indicator while keeping score/signal transformations
+    - score_transformation_override: Swap normalization while keeping indicator/signal transformations
+    - signal_transformation_override: Swap trading rules while keeping indicator/score transformations
+    """
+
+    label: str
+    signal_name: str
+    strategy_name: str
+    product: str
+    data_source: DataSource = "synthetic"
+    security_mapping: dict[str, str] | None = None
+    indicator_transformation_override: str | None = None
+    score_transformation_override: str | None = None
+    signal_transformation_override: str | None = None
+    steps: list[StepName] | None = None
+    force_rerun: bool = False
+    output_dir: Path = field(default_factory=lambda: DATA_WORKFLOWS_DIR)
+
+    def __post_init__(self) -> None:
+        """Validate configuration on initialization."""
+        import re
+
+        # Validate label format
+        if not re.match(r"^[a-z][a-z0-9_]*$", self.label):
+            raise ValueError(
+                f"Label '{self.label}' is invalid. "
+                "Must start with a lowercase letter and contain only "
+                "lowercase letters, numbers, and underscores."
+            )
+
+        # Validate steps
+        if self.steps is not None:
+            valid_steps = {
+                "data",
+                "signal",
+                "suitability",
+                "backtest",
+                "performance",
+                "visualization",
+            }
+            invalid = set(self.steps) - valid_steps
+            if invalid:
+                raise ValueError(f"Invalid steps: {invalid}")
aponyx/workflows/engine.py
@@ -0,0 +1,279 @@
+"""
+Workflow orchestration engine.
+
+Coordinates sequential execution of workflow steps with dependency tracking,
+caching, error handling, and progress logging.
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from .config import WorkflowConfig
+from .steps import WorkflowStep
+from .registry import StepRegistry
+
+logger = logging.getLogger(__name__)
+
+
+class WorkflowEngine:
+    """
+    Workflow execution orchestrator.
+
+    Manages sequential pipeline execution with:
+    - Dependency resolution (data → signal → backtest → ...)
+    - Smart caching (skip completed steps)
+    - Error handling (save partial results)
+    - Progress tracking (structured logging)
+
+    Parameters
+    ----------
+    config : WorkflowConfig
+        Workflow execution configuration.
+
+    Examples
+    --------
+    Execute full workflow:
+    >>> config = WorkflowConfig(
+    ...     label="demo",
+    ...     signal_name="spread_momentum",
+    ...     strategy_name="balanced",
+    ...     product="cdx_ig_5y",
+    ... )
+    >>> engine = WorkflowEngine(config)
+    >>> results = engine.execute()
+
+    Execute specific steps:
+    >>> config = WorkflowConfig(
+    ...     label="demo",
+    ...     signal_name="spread_momentum",
+    ...     strategy_name="balanced",
+    ...     product="cdx_ig_5y",
+    ...     steps=["data", "signal", "backtest"],
+    ... )
+    >>> engine = WorkflowEngine(config)
+    >>> results = engine.execute()
+    """
+
+    def __init__(self, config: WorkflowConfig) -> None:
+        self.config = config
+        self._registry = StepRegistry()
+        self._steps = self._resolve_steps()
+        self._context: dict[str, Any] = {}
+        self._start_time: datetime | None = None
+
+    def execute(self) -> dict[str, Any]:
+        """
+        Execute the workflow pipeline.
+
+        Returns
+        -------
+        dict[str, Any]
+            Workflow results with keys:
+            - steps_completed: int (number of steps executed)
+            - steps_skipped: int (number of cached steps skipped)
+            - output_dir: Path (workflow output directory)
+            - duration_seconds: float (total execution time)
+            - errors: list[dict] (errors if any step failed)
+
+        Notes
+        -----
+        Steps execute in dependency order. If step N fails, steps N+1...
+        are skipped, but results from steps 1...N-1 are preserved.
+        """
+        self._start_time = datetime.now()
+
+        logger.info(
+            "Starting workflow: signal=%s, strategy=%s, source=%s, steps=%d",
+            self.config.signal_name,
+            self.config.strategy_name,
+            self.config.data_source,
+            len(self._steps),
+        )
+
+        # Create workflow output directory upfront
+        output_dir = self._create_output_directory()
+
+        # Add output_dir to context for steps to use
+        self._context["output_dir"] = output_dir
+
+        completed = 0
+        skipped = 0
+        errors = []
+
+        for idx, step in enumerate(self._steps, start=1):
+            step_num = f"{idx}/{len(self._steps)}"
+
+            # Check cache
+            if self._should_skip_step(step):
+                logger.info("Step %s: %s (cached)", step_num, step.name)
+                # Load cached output into context for downstream steps
+                try:
+                    cached_output = step.load_cached_output()
+                    self._context[step.name] = cached_output
+                except Exception as e:
+                    logger.warning(
+                        "Failed to load cached output for %s: %s. Re-running step.",
+                        step.name,
+                        str(e),
+                    )
+                    # Fall through to execute step instead
+                else:
+                    skipped += 1
+                    continue
+
+            # Execute step
+            try:
+                logger.info("Step %s: %s", step_num, step.name)
+                output = step.execute(self._context)
+                self._context[step.name] = output
+                completed += 1
+                logger.info("Step %s: %s complete", step_num, step.name)
+
+            except Exception as e:
+                logger.error("Step %s: %s failed - %s", step_num, step.name, str(e))
+                errors.append(
+                    {
+                        "step": step.name,
+                        "error": str(e),
+                        "type": type(e).__name__,
+                    }
+                )
+                break  # Stop execution on first error
+
+        duration = (datetime.now() - self._start_time).total_seconds()
+
+        result = {
+            "steps_completed": completed,
+            "steps_skipped": skipped,
+            "output_dir": output_dir,
+            "duration_seconds": duration,
+            "errors": errors,
+        }
+
+        # Save workflow metadata
+        self._save_metadata(output_dir, completed, skipped, errors, duration)
+
+        if errors:
+            logger.error(
+                "Workflow failed: completed=%d, skipped=%d, failed=%d (%.1fs)",
+                completed,
+                skipped,
+                len(errors),
+                duration,
+            )
+        else:
+            logger.info(
+                "Workflow complete: completed=%d, skipped=%d (%.1fs)",
+                completed,
+                skipped,
+                duration,
+            )
+
+        return result
+
+    def _resolve_steps(self) -> list[WorkflowStep]:
+        """
+        Resolve workflow steps from configuration.
+
+        Returns
+        -------
+        list[WorkflowStep]
+            Ordered list of step instances to execute.
+
+        Notes
+        -----
+        If config.steps is None, returns all steps in dependency order.
+        If config.steps is specified, returns the subset in correct order.
+        """
+        all_steps = self._registry.get_all_steps(self.config)
+
+        if self.config.steps is None:
+            return all_steps
+
+        # Filter to requested steps (maintain order)
+        requested = set(self.config.steps)
+        return [s for s in all_steps if s.name in requested]
+
+    def _should_skip_step(self, step: WorkflowStep) -> bool:
+        """
+        Determine if a step should be skipped (cached).
+
+        Parameters
+        ----------
+        step : WorkflowStep
+            Step to check.
+
+        Returns
+        -------
+        bool
+            True if step output exists and force_rerun is False.
+        """
+        if self.config.force_rerun:
+            return False
+        return step.output_exists()
+
+    def _create_output_directory(self) -> Path:
+        """
+        Create timestamped output directory for the workflow.
+
+        Returns
+        -------
+        Path
+            Created output directory path.
+
+        Notes
+        -----
+        Format: workflows/{label}_{timestamp}/
+        """
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        dirname = f"{self.config.label}_{timestamp}"
+        output_dir = self.config.output_dir / dirname
+        output_dir.mkdir(parents=True, exist_ok=True)
+        return output_dir
+
+    def _save_metadata(
+        self,
+        output_dir: Path,
+        completed: int,
+        skipped: int,
+        errors: list[dict[str, Any]],
+        duration: float,
+    ) -> None:
+        """
+        Save workflow metadata to metadata.json.
+
+        Parameters
+        ----------
+        output_dir : Path
+            Workflow output directory.
+        completed : int
+            Number of completed steps.
+        skipped : int
+            Number of skipped steps.
+        errors : list of dict
+            Error details, if any.
+        duration : float
+            Execution duration in seconds.
+        """
+        from ..persistence import save_json
+
+        # Extract securities_used from signal step if available
+        securities_used = self._context.get("signal", {}).get("securities_used", {})
+
+        metadata = {
+            "label": self.config.label,
+            "signal": self.config.signal_name,
+            "strategy": self.config.strategy_name,
+            "product": self.config.product,
+            "data_source": self.config.data_source,
+            "securities_used": securities_used,
+            "timestamp": self._start_time.isoformat() if self._start_time else None,
+            "duration_seconds": duration,
+            "steps_completed": completed,
+            "steps_skipped": skipped,
+            "steps_total": len(self._steps),
+            "status": "failed" if errors else "completed",
+            "errors": errors if errors else None,
+        }
+
+        metadata_path = output_dir / "metadata.json"
+        save_json(metadata, metadata_path)
+        logger.debug("Saved workflow metadata: %s", metadata_path)
aponyx/workflows/registry.py
@@ -0,0 +1,116 @@
+"""
+Workflow step registry.
+
+Central factory for creating workflow step instances.
+Decouples the engine from concrete step implementations.
+"""
+
+import logging
+from typing import TYPE_CHECKING
+
+from .config import WorkflowConfig
+from .concrete_steps import (
+    DataStep,
+    SignalStep,
+    SuitabilityStep,
+    BacktestStep,
+    PerformanceStep,
+    VisualizationStep,
+)
+
+if TYPE_CHECKING:
+    from .steps import WorkflowStep
+
+logger = logging.getLogger(__name__)
+
+
+class StepRegistry:
+    """
+    Factory for workflow step instances.
+
+    Centralizes step creation and ensures consistent dependency order.
+
+    Examples
+    --------
+    Get all steps for a workflow:
+    >>> registry = StepRegistry()
+    >>> config = WorkflowConfig(
+    ...     label="demo",
+    ...     signal_name="spread_momentum",
+    ...     strategy_name="balanced",
+    ...     product="cdx_ig_5y",
+    ... )
+    >>> steps = registry.get_all_steps(config)
+
+    Get a specific step:
+    >>> step = registry.get_step("data", config)
+    """
+
+    def __init__(self) -> None:
+        self._step_order = [
+            "data",
+            "signal",
+            "suitability",
+            "backtest",
+            "performance",
+            "visualization",
+        ]
+
+    def get_canonical_order(self) -> list[str]:
+        """
+        Get canonical workflow step order.
+
+        Returns
+        -------
+        list[str]
+            Ordered list of step names.
+        """
+        return self._step_order.copy()
+
+    def get_all_steps(self, config: WorkflowConfig) -> list["WorkflowStep"]:
+        """
+        Create all workflow steps in dependency order.
+
+        Parameters
+        ----------
+        config : WorkflowConfig
+            Workflow configuration.
+
+        Returns
+        -------
+        list[WorkflowStep]
+            Ordered list of step instances.
+        """
+        return [self._create_step(name, config) for name in self._step_order]
+
+    def get_step(self, name: str, config: WorkflowConfig) -> "WorkflowStep":
+        """
+        Create a single workflow step by name.
+
+        Parameters
+        ----------
+        name : str
+            Step name (data, signal, suitability, backtest, performance, visualization).
+        config : WorkflowConfig
+            Workflow configuration.
+
+        Returns
+        -------
+        WorkflowStep
+            Step instance.
+
+        Raises
+        ------
+        ValueError
+            If step name is invalid.
+        """
+        if name not in self._step_order:
+            raise ValueError(f"Unknown step: {name}")
+        return self._create_step(name, config)
+
+    def _create_step(self, name: str, config: WorkflowConfig) -> "WorkflowStep":
+        """Create step instance by name."""
+        step_classes = {
+            "data": DataStep,
+            "signal": SignalStep,
+            "suitability": SuitabilityStep,
+            "backtest": BacktestStep,
+            "performance": PerformanceStep,
+            "visualization": VisualizationStep,
+        }
+        return step_classes[name](config)
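The registry is the single place that knows both the canonical order and the concrete step classes; the engine only consumes `get_all_steps()` and filters against it. A small sketch of direct registry use (not part of the diff; same illustrative catalog names as above):

```python
from aponyx.workflows.config import WorkflowConfig
from aponyx.workflows.registry import StepRegistry

registry = StepRegistry()
config = WorkflowConfig(
    label="registry_demo",
    signal_name="spread_momentum",
    strategy_name="balanced",
    product="cdx_ig_5y",
)

# The canonical order drives full runs and is preserved when the
# engine filters to a subset of steps.
assert registry.get_canonical_order() == [
    "data", "signal", "suitability", "backtest", "performance", "visualization",
]

# Single steps are created by name; unknown names raise ValueError.
data_step = registry.get_step("data", config)
print(data_step.name, data_step.get_output_path())
```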
aponyx/workflows/steps.py
@@ -0,0 +1,180 @@
+"""
+Workflow step abstractions.
+
+Defines the protocol for executable workflow steps with dependency tracking,
+caching, and standardized I/O.
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Protocol
+
+from .config import WorkflowConfig
+
+logger = logging.getLogger(__name__)
+
+
+class WorkflowStep(Protocol):
+    """
+    Protocol for executable workflow steps.
+
+    All workflow steps must implement this interface for orchestration.
+
+    Attributes
+    ----------
+    name : str
+        Step identifier (used for caching and logging).
+    config : WorkflowConfig
+        Workflow configuration.
+
+    Methods
+    -------
+    execute(context)
+        Execute step logic and return output data.
+    output_exists()
+        Check if step output already exists (for caching).
+    get_output_path()
+        Return path to expected output files.
+    """
+
+    name: str
+    config: WorkflowConfig
+
+    def execute(self, context: dict[str, Any]) -> dict[str, Any]:
+        """
+        Execute the workflow step.
+
+        Parameters
+        ----------
+        context : dict[str, Any]
+            Outputs from previous steps (keyed by step name).
+
+        Returns
+        -------
+        dict[str, Any]
+            Step output data to pass to subsequent steps.
+
+        Notes
+        -----
+        Steps should be idempotent: running twice produces the same results.
+        Use context["data"] to access data from DataStep, etc.
+        """
+        ...
+
+    def output_exists(self) -> bool:
+        """
+        Check if step output files exist.
+
+        Returns
+        -------
+        bool
+            True if all required outputs exist, False otherwise.
+
+        Notes
+        -----
+        Used by caching logic to skip completed steps.
+        Should check file existence and basic validation.
+        """
+        ...
+
+    def get_output_path(self) -> Path:
+        """
+        Get the expected output directory path.
+
+        Returns
+        -------
+        Path
+            Directory where step outputs are saved.
+        """
+        ...
+
+    def load_cached_output(self) -> dict[str, Any]:
+        """
+        Load cached output from a previous execution.
+
+        Returns
+        -------
+        dict[str, Any]
+            Cached step output data.
+
+        Raises
+        ------
+        FileNotFoundError
+            If cached output files don't exist.
+        ValueError
+            If cached output is invalid or corrupted.
+
+        Notes
+        -----
+        Called when a step is skipped due to caching.
+        Must restore the same output structure as execute() would return.
+        """
+        ...
+
+
+class BaseWorkflowStep(ABC):
+    """
+    Abstract base class for workflow steps.
+
+    Provides common functionality for concrete step implementations.
+
+    Parameters
+    ----------
+    config : WorkflowConfig
+        Workflow configuration.
+    """
+
+    def __init__(self, config: WorkflowConfig) -> None:
+        self.config = config
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Step identifier."""
+        ...
+
+    @abstractmethod
+    def execute(self, context: dict[str, Any]) -> dict[str, Any]:
+        """Execute step logic."""
+        ...
+
+    @abstractmethod
+    def output_exists(self) -> bool:
+        """Check if output exists."""
+        ...
+
+    @abstractmethod
+    def get_output_path(self) -> Path:
+        """Get output directory."""
+        ...
+
+    def load_cached_output(self) -> dict[str, Any]:
+        """
+        Load cached output from a previous execution.
+
+        The default implementation raises NotImplementedError.
+        Steps that support caching must override this method.
+
+        Returns
+        -------
+        dict[str, Any]
+            Cached step output data.
+
+        Raises
+        ------
+        NotImplementedError
+            If the step doesn't support loading cached outputs.
+        """
+        raise NotImplementedError(
+            f"Step {self.name} doesn't support loading cached outputs. "
+            "Override the load_cached_output() method."
+        )
+
+    def _log_start(self) -> None:
+        """Log step start."""
+        logger.info("Starting step: %s", self.name)
+
+    def _log_complete(self, output: dict[str, Any]) -> None:
+        """Log step completion."""
+        logger.info("Completed step: %s", self.name)