photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/pipeline_graph.py
CHANGED
|
@@ -1,334 +1,334 @@
|
|
|
1
|
-
"""Pipeline dependency graph with validation and topological sorting.
|
|
2
|
-
|
|
3
|
-
Provides graph computation methods for pipeline orchestration:
|
|
4
|
-
- Cycle detection to enforce DAG property (using NetworkX)
|
|
5
|
-
- Connected component checking (using NetworkX)
|
|
6
|
-
- Topological sorting for execution order (using NetworkX)
|
|
7
|
-
- Port connectivity validation
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
from __future__ import annotations
|
|
11
|
-
|
|
12
|
-
import networkx as nx
|
|
13
|
-
|
|
14
|
-
from .base_pipeline_stage import BasePipelineStage
|
|
15
|
-
from .base_ports import BaseChannel, BaseInputPort, BaseOutputPort
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class PipelineGraph:
    """Dependency graph of pipeline stages.

    Tracks registered stages (nodes) and channels (edges), validates the
    assembled structure, and derives a topological execution order.
    Stages and channels self-register while being constructed inside a
    PipelineBuilder context.
    """

    def __init__(self) -> None:
        """Start with no stages, no channels, and no cached order."""
        self.nodes: dict[str, "BasePipelineStage"] = {}
        self.channels: list["BaseChannel"] = []
        self._execution_order: list[str] | None = None

    # --- Registration hooks (invoked from stage/channel constructors) ---

    def add_node(self, stage: "BasePipelineStage") -> None:
        """Register a stage in the graph.

        Invoked automatically from BasePipelineStage.__init__() during
        auto-registration.

        Args:
            stage: The pipeline stage to register

        Raises:
            ValueError: If a stage with this name is already present
        """
        if stage.stage_name in self.nodes:
            raise ValueError(f"Stage '{stage.stage_name}' already registered in graph")
        self.nodes[stage.stage_name] = stage

    def add_edge(self, channel: "BaseChannel") -> None:
        """Register a channel (a dependency edge carrying port references).

        Invoked automatically from Channel.__init__() during
        auto-registration.

        Args:
            channel: Channel instance holding the output/input port pair
        """
        self.channels.append(channel)

    # --- Validation (invoked from PipelineBuilder.__exit__) ---

    def validate(self) -> None:
        """Validate the overall graph structure.

        Runs the full battery of structural checks:
        - every channel references stages known to the graph
        - no cycles (DAG property)
        - a single weakly connected component (no isolated subgraphs)
        - every input port is bound and every existing output port has
          at least one consumer

        Raises:
            ValueError: On the first failed check, with a descriptive message

        Note:
            Source stages (no input ports) and sink stages (no output
            ports) are legal; only ports that actually exist are checked.
        """
        if not self.nodes:
            raise ValueError("Cannot validate empty pipeline graph")
        self._check_valid_stage_references()
        self._check_no_cycles()
        self._check_single_component()
        self._check_all_ports_connected()

    def _check_valid_stage_references(self) -> None:
        """Ensure every channel points at stages registered in the graph.

        Raises:
            ValueError: If a channel references an unknown stage
        """
        for ch in self.channels:
            producer_name = ch.output.owner.stage_name
            # FIXME: expose a base-class accessor instead of reading a private attribute.
            src = ch.input._source
            consumer_name = src.owner.stage_name if src else None
            if producer_name not in self.nodes:
                raise ValueError(f"Channel references unknown producer stage: '{producer_name}'")
            if consumer_name and consumer_name not in self.nodes:
                raise ValueError(f"Channel references unknown consumer stage: '{consumer_name}'")

    # FIXME: the nx graph could be maintained incrementally as nodes/channels
    # register instead of being rebuilt here from the separate structures.
    def _build_networkx_graph(self) -> "nx.DiGraph[str]":
        """Construct a NetworkX digraph mirroring stages and channels.

        Returns:
            DiGraph with stage names as nodes and one edge per channel
        """
        dg: "nx.DiGraph[str]" = nx.DiGraph()
        dg.add_nodes_from(self.nodes.keys())

        # Map id(input port) -> owning stage name; InputPort itself carries
        # no owner attribute. Scan __dict__ so properties are never triggered.
        owners: dict[int, str] = {
            id(value): name
            for name, st in self.nodes.items()
            for key, value in st.__dict__.items()
            if not key.startswith("_") and isinstance(value, BaseInputPort)
        }

        # One edge per channel: producer stage -> consumer stage.
        for ch in self.channels:
            consumer = owners.get(id(ch.input))
            if consumer:
                dg.add_edge(ch.output.owner.stage_name, consumer)
        return dg

    def _check_no_cycles(self) -> None:
        """Reject graphs containing a directed cycle.

        Raises:
            ValueError: If a cycle is found, naming the offending path
        """
        dg = self._build_networkx_graph()
        if nx.is_directed_acyclic_graph(dg):
            return
        try:
            found = nx.find_cycle(dg, orientation="original")
        except nx.NetworkXNoCycle:
            # Should be unreachable given the DAG test above; fallback message.
            raise ValueError("Cycle detected in pipeline (details unavailable)") from None
        # find_cycle yields (source, target, direction) triples; close the loop
        # by repeating the first node for a readable path.
        path = " → ".join([edge[0] for edge in found] + [found[0][0]])
        raise ValueError(f"Cycle detected in pipeline: {path}")

    def _check_single_component(self) -> None:
        """Reject graphs that split into several disconnected pieces.

        Connectivity is judged ignoring edge direction; isolated
        subgraphs almost always signal a configuration mistake.

        Raises:
            ValueError: If more than one weakly connected component exists
        """
        if not self.nodes:
            return
        dg = self._build_networkx_graph()
        if nx.number_weakly_connected_components(dg) <= 1:
            return
        # Largest component first so error output is deterministic.
        comps = sorted(nx.weakly_connected_components(dg), key=len, reverse=True)
        sizes = [len(c) for c in comps]
        unreached = set(self.nodes.keys()) - comps[0]
        raise ValueError(
            f"Pipeline has {len(comps)} disconnected components "
            f"with sizes {sizes}. "
            f"Unreached stages from main component: {sorted(unreached)}"
        )

    def _check_all_ports_connected(self) -> None:
        """Ensure every port on every stage participates in a channel.

        Input ports must be bound to a source; output ports need at least
        one consumer. Source stages (no inputs) and sink stages (no
        outputs) are fine — only ports that actually exist are examined.

        Raises:
            ValueError: For the first unconnected port found
        """
        # Identity sets of ports touched by some channel; base classes are
        # enough because the concrete data type is irrelevant here.
        connected_inputs: set["BaseInputPort"] = set()
        connected_outputs: set["BaseOutputPort"] = set()
        for ch in self.channels:
            connected_outputs.add(ch.output)
            connected_inputs.add(ch.input)

        for stage_name, st in self.nodes.items():
            # __dict__ iteration avoids triggering properties.
            for attr_name, attr in st.__dict__.items():
                if attr_name.startswith("_"):
                    continue

                if isinstance(attr, BaseInputPort):
                    if not attr.is_bound():
                        raise ValueError(f"Unbound input port: {stage_name}.{attr_name}")
                    if attr not in connected_inputs:
                        raise ValueError(f"Input port {stage_name}.{attr_name} bound but not connected by channel")

                if isinstance(attr, BaseOutputPort) and attr not in connected_outputs:
                    # Some outputs may legitimately dangle:
                    # - review ports (web UI only, not pipeline flow)
                    # - full-tuple outputs kept for backward compatibility
                    # - final outputs consumed by the orchestrator afterwards
                    # - photofiles (consumed only by the optional benchmarks stage)
                    optional = "review" in attr_name.lower() or any(
                        tag in attr_name
                        for tag in ("forest_template_bins", "forest_bins", "final_forest", "photofiles")
                    )
                    if not optional:
                        raise ValueError(
                            f"Unconnected output port: {stage_name}.{attr_name} (no channels consume this output)"
                        )

    # --- Graph analysis (used by the orchestrator) ---

    def compute_execution_order(self) -> list[str]:
        """Topologically sort the stages; call only after validate().

        Returns:
            Stage names in a dependency-respecting execution order

        Raises:
            ValueError: If the sort fails (a cycle slipped through)
        """
        dg = self._build_networkx_graph()
        try:
            ordered = list(nx.topological_sort(dg))
        except nx.NetworkXError as e:
            raise ValueError(f"Cannot compute topological sort (cycle may be present): {e}") from e
        self._execution_order = ordered
        # Stamp each stage with its 1-based position; 0 is reserved for
        # "not started" in the orchestrator.
        for position, name in enumerate(ordered, start=1):
            self.nodes[name].stage_id = position
        return ordered

    def get_execution_order(self) -> list[str]:
        """Return the cached execution order.

        Returns:
            The order produced by compute_execution_order()

        Raises:
            RuntimeError: If compute_execution_order() has not run yet
        """
        if self._execution_order is None:
            raise RuntimeError("Execution order not computed. Call compute_execution_order() first.")
        return self._execution_order

    def get_stages_in_order(self) -> list["BasePipelineStage"]:
        """Return stage instances in topological order.

        Returns:
            BasePipelineStage instances following the cached order

        Raises:
            RuntimeError: If compute_execution_order() has not run yet
        """
        return [self.nodes[name] for name in self.get_execution_order()]

    def get_dependencies(self, stage_name: str) -> list[str]:
        """List the upstream producers a given stage consumes from.

        Args:
            stage_name: Name of the stage to query

        Returns:
            Names of stages feeding this one via channels

        Raises:
            KeyError: If stage_name is not registered
        """
        if stage_name not in self.nodes:
            raise KeyError(f"Stage '{stage_name}' not found in graph")
        return [
            ch.output.owner.stage_name
            for ch in self.channels
            if (src := ch.input._source) and src.owner.stage_name == stage_name
        ]

    def get_all_stages(self) -> dict[str, "BasePipelineStage"]:
        """Return a shallow copy of the name -> stage mapping."""
        return self.nodes.copy()
|
|
1
|
+
"""Pipeline dependency graph with validation and topological sorting.
|
|
2
|
+
|
|
3
|
+
Provides graph computation methods for pipeline orchestration:
|
|
4
|
+
- Cycle detection to enforce DAG property (using NetworkX)
|
|
5
|
+
- Connected component checking (using NetworkX)
|
|
6
|
+
- Topological sorting for execution order (using NetworkX)
|
|
7
|
+
- Port connectivity validation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import networkx as nx
|
|
13
|
+
|
|
14
|
+
from .base_pipeline_stage import BasePipelineStage
|
|
15
|
+
from .base_ports import BaseChannel, BaseInputPort, BaseOutputPort
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class PipelineGraph:
    """Dependency graph of pipeline stages.

    Tracks registered stages (nodes) and channels (edges), validates the
    assembled structure, and derives a topological execution order.
    Stages and channels self-register while being constructed inside a
    PipelineBuilder context.
    """

    def __init__(self) -> None:
        """Start with no stages, no channels, and no cached order."""
        self.nodes: dict[str, "BasePipelineStage"] = {}
        self.channels: list["BaseChannel"] = []
        self._execution_order: list[str] | None = None

    # --- Registration hooks (invoked from stage/channel constructors) ---

    def add_node(self, stage: "BasePipelineStage") -> None:
        """Register a stage in the graph.

        Invoked automatically from BasePipelineStage.__init__() during
        auto-registration.

        Args:
            stage: The pipeline stage to register

        Raises:
            ValueError: If a stage with this name is already present
        """
        if stage.stage_name in self.nodes:
            raise ValueError(f"Stage '{stage.stage_name}' already registered in graph")
        self.nodes[stage.stage_name] = stage

    def add_edge(self, channel: "BaseChannel") -> None:
        """Register a channel (a dependency edge carrying port references).

        Invoked automatically from Channel.__init__() during
        auto-registration.

        Args:
            channel: Channel instance holding the output/input port pair
        """
        self.channels.append(channel)

    # --- Validation (invoked from PipelineBuilder.__exit__) ---

    def validate(self) -> None:
        """Validate the overall graph structure.

        Runs the full battery of structural checks:
        - every channel references stages known to the graph
        - no cycles (DAG property)
        - a single weakly connected component (no isolated subgraphs)
        - every input port is bound and every existing output port has
          at least one consumer

        Raises:
            ValueError: On the first failed check, with a descriptive message

        Note:
            Source stages (no input ports) and sink stages (no output
            ports) are legal; only ports that actually exist are checked.
        """
        if not self.nodes:
            raise ValueError("Cannot validate empty pipeline graph")
        self._check_valid_stage_references()
        self._check_no_cycles()
        self._check_single_component()
        self._check_all_ports_connected()

    def _check_valid_stage_references(self) -> None:
        """Ensure every channel points at stages registered in the graph.

        Raises:
            ValueError: If a channel references an unknown stage
        """
        for ch in self.channels:
            producer_name = ch.output.owner.stage_name
            # FIXME: expose a base-class accessor instead of reading a private attribute.
            src = ch.input._source
            consumer_name = src.owner.stage_name if src else None
            if producer_name not in self.nodes:
                raise ValueError(f"Channel references unknown producer stage: '{producer_name}'")
            if consumer_name and consumer_name not in self.nodes:
                raise ValueError(f"Channel references unknown consumer stage: '{consumer_name}'")

    # FIXME: the nx graph could be maintained incrementally as nodes/channels
    # register instead of being rebuilt here from the separate structures.
    def _build_networkx_graph(self) -> "nx.DiGraph[str]":
        """Construct a NetworkX digraph mirroring stages and channels.

        Returns:
            DiGraph with stage names as nodes and one edge per channel
        """
        dg: "nx.DiGraph[str]" = nx.DiGraph()
        dg.add_nodes_from(self.nodes.keys())

        # Map id(input port) -> owning stage name; InputPort itself carries
        # no owner attribute. Scan __dict__ so properties are never triggered.
        owners: dict[int, str] = {
            id(value): name
            for name, st in self.nodes.items()
            for key, value in st.__dict__.items()
            if not key.startswith("_") and isinstance(value, BaseInputPort)
        }

        # One edge per channel: producer stage -> consumer stage.
        for ch in self.channels:
            consumer = owners.get(id(ch.input))
            if consumer:
                dg.add_edge(ch.output.owner.stage_name, consumer)
        return dg

    def _check_no_cycles(self) -> None:
        """Reject graphs containing a directed cycle.

        Raises:
            ValueError: If a cycle is found, naming the offending path
        """
        dg = self._build_networkx_graph()
        if nx.is_directed_acyclic_graph(dg):
            return
        try:
            found = nx.find_cycle(dg, orientation="original")
        except nx.NetworkXNoCycle:
            # Should be unreachable given the DAG test above; fallback message.
            raise ValueError("Cycle detected in pipeline (details unavailable)") from None
        # find_cycle yields (source, target, direction) triples; close the loop
        # by repeating the first node for a readable path.
        path = " → ".join([edge[0] for edge in found] + [found[0][0]])
        raise ValueError(f"Cycle detected in pipeline: {path}")

    def _check_single_component(self) -> None:
        """Reject graphs that split into several disconnected pieces.

        Connectivity is judged ignoring edge direction; isolated
        subgraphs almost always signal a configuration mistake.

        Raises:
            ValueError: If more than one weakly connected component exists
        """
        if not self.nodes:
            return
        dg = self._build_networkx_graph()
        if nx.number_weakly_connected_components(dg) <= 1:
            return
        # Largest component first so error output is deterministic.
        comps = sorted(nx.weakly_connected_components(dg), key=len, reverse=True)
        sizes = [len(c) for c in comps]
        unreached = set(self.nodes.keys()) - comps[0]
        raise ValueError(
            f"Pipeline has {len(comps)} disconnected components "
            f"with sizes {sizes}. "
            f"Unreached stages from main component: {sorted(unreached)}"
        )

    def _check_all_ports_connected(self) -> None:
        """Ensure every port on every stage participates in a channel.

        Input ports must be bound to a source; output ports need at least
        one consumer. Source stages (no inputs) and sink stages (no
        outputs) are fine — only ports that actually exist are examined.

        Raises:
            ValueError: For the first unconnected port found
        """
        # Identity sets of ports touched by some channel; base classes are
        # enough because the concrete data type is irrelevant here.
        connected_inputs: set["BaseInputPort"] = set()
        connected_outputs: set["BaseOutputPort"] = set()
        for ch in self.channels:
            connected_outputs.add(ch.output)
            connected_inputs.add(ch.input)

        for stage_name, st in self.nodes.items():
            # __dict__ iteration avoids triggering properties.
            for attr_name, attr in st.__dict__.items():
                if attr_name.startswith("_"):
                    continue

                if isinstance(attr, BaseInputPort):
                    if not attr.is_bound():
                        raise ValueError(f"Unbound input port: {stage_name}.{attr_name}")
                    if attr not in connected_inputs:
                        raise ValueError(f"Input port {stage_name}.{attr_name} bound but not connected by channel")

                if isinstance(attr, BaseOutputPort) and attr not in connected_outputs:
                    # Some outputs may legitimately dangle:
                    # - review ports (web UI only, not pipeline flow)
                    # - full-tuple outputs kept for backward compatibility
                    # - final outputs consumed by the orchestrator afterwards
                    # - photofiles (consumed only by the optional benchmarks stage)
                    optional = "review" in attr_name.lower() or any(
                        tag in attr_name
                        for tag in ("forest_template_bins", "forest_bins", "final_forest", "photofiles")
                    )
                    if not optional:
                        raise ValueError(
                            f"Unconnected output port: {stage_name}.{attr_name} (no channels consume this output)"
                        )

    # --- Graph analysis (used by the orchestrator) ---

    def compute_execution_order(self) -> list[str]:
        """Topologically sort the stages; call only after validate().

        Returns:
            Stage names in a dependency-respecting execution order

        Raises:
            ValueError: If the sort fails (a cycle slipped through)
        """
        dg = self._build_networkx_graph()
        try:
            ordered = list(nx.topological_sort(dg))
        except nx.NetworkXError as e:
            raise ValueError(f"Cannot compute topological sort (cycle may be present): {e}") from e
        self._execution_order = ordered
        # Stamp each stage with its 1-based position; 0 is reserved for
        # "not started" in the orchestrator.
        for position, name in enumerate(ordered, start=1):
            self.nodes[name].stage_id = position
        return ordered

    def get_execution_order(self) -> list[str]:
        """Return the cached execution order.

        Returns:
            The order produced by compute_execution_order()

        Raises:
            RuntimeError: If compute_execution_order() has not run yet
        """
        if self._execution_order is None:
            raise RuntimeError("Execution order not computed. Call compute_execution_order() first.")
        return self._execution_order

    def get_stages_in_order(self) -> list["BasePipelineStage"]:
        """Return stage instances in topological order.

        Returns:
            BasePipelineStage instances following the cached order

        Raises:
            RuntimeError: If compute_execution_order() has not run yet
        """
        return [self.nodes[name] for name in self.get_execution_order()]

    def get_dependencies(self, stage_name: str) -> list[str]:
        """List the upstream producers a given stage consumes from.

        Args:
            stage_name: Name of the stage to query

        Returns:
            Names of stages feeding this one via channels

        Raises:
            KeyError: If stage_name is not registered
        """
        if stage_name not in self.nodes:
            raise KeyError(f"Stage '{stage_name}' not found in graph")
        return [
            ch.output.owner.stage_name
            for ch in self.channels
            if (src := ch.input._source) and src.owner.stage_name == stage_name
        ]

    def get_all_stages(self) -> dict[str, "BasePipelineStage"]:
        """Return a shallow copy of the name -> stage mapping."""
        return self.nodes.copy()
|