photo-stack-finder 0.1.7 → 0.1.8 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/pipeline_graph.py CHANGED
@@ -1,334 +1,334 @@
"""Pipeline dependency graph with validation and topological sorting.

Provides graph computation methods for pipeline orchestration:
- Cycle detection to enforce DAG property (using NetworkX)
- Connected component checking (using NetworkX)
- Topological sorting for execution order (using NetworkX)
- Port connectivity validation
"""

from __future__ import annotations

import networkx as nx

from .base_pipeline_stage import BasePipelineStage
from .base_ports import BaseChannel, BaseInputPort, BaseOutputPort


class PipelineGraph:
    """Dependency graph of pipeline stages.

    Manages registration, validation, and analysis of stage dependencies.
    Stages and channels auto-register during construction within a
    PipelineBuilder context.
    """

    def __init__(self) -> None:
        """Initialize empty pipeline graph."""
        self.nodes: dict[str, BasePipelineStage] = {}
        self.channels: list[BaseChannel] = []
        self._execution_order: list[str] | None = None

    # === Registration (called by stage/channel constructors) ===

    def add_node(self, stage: BasePipelineStage) -> None:
        """Register a stage in the graph.

        Called automatically by BasePipelineStage.__init__() during
        auto-registration.

        Args:
            stage: The pipeline stage to register

        Raises:
            ValueError: If a stage with this name already exists
        """
        if stage.stage_name in self.nodes:
            raise ValueError(f"Stage '{stage.stage_name}' already registered in graph")
        self.nodes[stage.stage_name] = stage

    def add_edge(self, channel: BaseChannel) -> None:
        """Register a channel (dependency edge with port information).

        Called automatically by Channel.__init__() during auto-registration.

        Args:
            channel: The Channel instance containing output/input port references
        """
        self.channels.append(channel)

    # === Validation (called by PipelineBuilder.__exit__) ===

    def validate(self) -> None:
        """Validate graph structure.

        Performs comprehensive validation:
        - No cycles (DAG property)
        - Single connected component (no isolated subgraphs)
        - All input ports are bound (have a source)
        - All output ports that exist are connected (at least one consumer)
        - All referenced stages exist

        Raises:
            ValueError: If validation fails with descriptive error message

        Note:
            Source stages (no input ports) and sink stages (no output ports)
            are valid. Validation only checks ports that actually exist.
        """
        if not self.nodes:
            raise ValueError("Cannot validate empty pipeline graph")

        # Check 1: All channels reference valid stages
        self._check_valid_stage_references()

        # Check 2: No cycles (DAG property)
        self._check_no_cycles()

        # Check 3: Single connected component
        self._check_single_component()

        # Check 4: All ports are properly connected
        self._check_all_ports_connected()

    def _check_valid_stage_references(self) -> None:
        """Verify all channels reference stages that exist in the graph.

        Raises:
            ValueError: If a channel references an unknown stage
        """
        for channel in self.channels:
            producer_name = channel.output.owner.stage_name
            # FIXME: Provide a base class method for this instead of using a private attribute.
            consumer_name = channel.input._source.owner.stage_name if channel.input._source else None

            if producer_name not in self.nodes:
                raise ValueError(f"Channel references unknown producer stage: '{producer_name}'")

            if consumer_name and consumer_name not in self.nodes:
                raise ValueError(f"Channel references unknown consumer stage: '{consumer_name}'")

    # FIXME: Why not do this as self.nodes and self.channels are populated? Having those structures separate seems pointless.
    def _build_networkx_graph(self) -> nx.DiGraph[str]:
        """Build NetworkX directed graph from pipeline stages and channels.

        Returns:
            NetworkX DiGraph with stage names as nodes and channels as edges
        """
        graph: nx.DiGraph[str] = nx.DiGraph()

        # Add all stage nodes
        graph.add_nodes_from(self.nodes.keys())

        # Build reverse mapping: InputPort → Stage (owner)
        # Needed because InputPort doesn't have an owner attribute
        input_port_owners: dict[int, str] = {}
        for stage_name, stage in self.nodes.items():
            # Use __dict__ to avoid triggering properties
            for attr_name, attr in stage.__dict__.items():
                if attr_name.startswith("_"):
                    continue
                if isinstance(attr, BaseInputPort):
                    input_port_owners[id(attr)] = stage_name

        # Add edges from channels (producer → consumer)
        for channel in self.channels:
            producer = channel.output.owner.stage_name
            consumer = input_port_owners.get(id(channel.input))
            if consumer:
                graph.add_edge(producer, consumer)

        return graph

    def _check_no_cycles(self) -> None:
        """Detect cycles using NetworkX DAG checking.

        Raises:
            ValueError: If a cycle is detected, with the cycle path
        """
        graph = self._build_networkx_graph()

        if not nx.is_directed_acyclic_graph(graph):
            # Find a cycle to report in error message
            try:
                cycle = nx.find_cycle(graph, orientation="original")
                # cycle is a list of (source, target, key) tuples
                cycle_nodes = [edge[0] for edge in cycle] + [cycle[0][0]]
                cycle_path = " → ".join(cycle_nodes)
                raise ValueError(f"Cycle detected in pipeline: {cycle_path}")
            except nx.NetworkXNoCycle:
                # Shouldn't happen but provide fallback
                raise ValueError("Cycle detected in pipeline (details unavailable)") from None

    def _check_single_component(self) -> None:
        """Verify graph is a single connected component.

        Treats edges as undirected for connectivity check.
        This prevents isolated subgraphs that would indicate
        configuration errors.

        Raises:
            ValueError: If graph has multiple connected components
        """
        if not self.nodes:
            return

        graph = self._build_networkx_graph()

        # Check weak connectivity (treat directed edges as undirected)
        num_components = nx.number_weakly_connected_components(graph)

        if num_components > 1:
            # Find the components to report in error
            components = list(nx.weakly_connected_components(graph))
            # Sort components by size for consistent error messages
            components_sorted = sorted(components, key=len, reverse=True)
            component_sizes = [len(comp) for comp in components_sorted]

            # Show which stages are unreachable from the largest component
            largest_component = components_sorted[0]
            unreached = set(self.nodes.keys()) - largest_component

            raise ValueError(
                f"Pipeline has {len(components_sorted)} disconnected components "
                f"with sizes {component_sizes}. "
                f"Unreached stages from main component: {sorted(unreached)}"
            )

    def _check_all_ports_connected(self) -> None:
        """Verify all ports on all stages are properly connected.

        - All input ports must be bound to a source
        - All output ports must have at least one consumer

        Source stages (no inputs) and sink stages (no outputs) are valid.

        Raises:
            ValueError: If any port is unconnected
        """
        # Build set of connected ports from channels
        # Use base classes since we check identity regardless of data type
        connected_inputs: set[BaseInputPort] = set()
        connected_outputs: set[BaseOutputPort] = set()

        for channel in self.channels:
            connected_outputs.add(channel.output)
            connected_inputs.add(channel.input)

        # Check all ports on all stages
        for stage_name, stage in self.nodes.items():
            # Use __dict__ to avoid triggering properties
            for attr_name, attr in stage.__dict__.items():
                if attr_name.startswith("_"):
                    continue

                # Check input ports are bound
                if isinstance(attr, BaseInputPort):
                    if not attr.is_bound():
                        raise ValueError(f"Unbound input port: {stage_name}.{attr_name}")
                    if attr not in connected_inputs:
                        raise ValueError(f"Input port {stage_name}.{attr_name} bound but not connected by channel")

                # Check output ports are connected
                # Exceptions: Optional ports that don't need to be connected:
                # - Review ports (for web UI, not pipeline flow)
                # - Full tuple output ports (backward compatibility, use specific ports instead)
                # - Final output ports (sink nodes - consumed by orchestrator after execution)
                # - photofiles_o (only consumed by optional benchmarks stage)
                if isinstance(attr, BaseOutputPort) and attr not in connected_outputs:
                    is_optional = "review" in attr_name.lower() or (
                        # Full tuple outputs (return complete result, not subset)
                        "forest_template_bins" in attr_name  # ComputeVersions
                        or "forest_bins" in attr_name  # ComputeTemplateSimilarity, ComputeIndices
                        or "final_forest" in attr_name  # Final pipeline output (sink node)
                        or "photofiles" in attr_name  # Only consumed by optional benchmarks stage
                    )
                    if not is_optional:
                        raise ValueError(
                            f"Unconnected output port: {stage_name}.{attr_name} (no channels consume this output)"
                        )

    # === Graph Analysis (called by orchestrator) ===

    def compute_execution_order(self) -> list[str]:
        """Compute topological sort of stages using NetworkX.

        Must be called after validate() succeeds.

        Returns:
            List of stage names in valid execution order (dependency order)

        Raises:
            ValueError: If cycle detected during sort (shouldn't happen after validate)
        """
        graph = self._build_networkx_graph()

        try:
            result = list(nx.topological_sort(graph))
            self._execution_order = result

            # Annotate stages with their execution order position (stable ID)
            # Use 1-based indexing (0 means "not started" in orchestrator)
            for i, stage_name in enumerate(result, start=1):
                self.nodes[stage_name].stage_id = i

            return result
        except nx.NetworkXError as e:
            raise ValueError(f"Cannot compute topological sort (cycle may be present): {e}") from e

    def get_execution_order(self) -> list[str]:
        """Get previously computed execution order.

        Returns:
            Cached execution order from compute_execution_order()

        Raises:
            RuntimeError: If compute_execution_order() not called yet
        """
        if self._execution_order is None:
            raise RuntimeError("Execution order not computed. Call compute_execution_order() first.")
        return self._execution_order

    def get_stages_in_order(self) -> list[BasePipelineStage]:
        """Get stage instances in execution order.

        Returns:
            List of BasePipelineStage instances in topological order

        Raises:
            RuntimeError: If compute_execution_order() not called yet
        """
        order = self.get_execution_order()
        return [self.nodes[name] for name in order]

    def get_dependencies(self, stage_name: str) -> list[str]:
        """Get names of stages that a given stage depends on.

        Args:
            stage_name: Name of the stage to query

        Returns:
            List of upstream stage names (producers that this stage consumes from)

        Raises:
            KeyError: If stage_name not in graph
        """
        if stage_name not in self.nodes:
            raise KeyError(f"Stage '{stage_name}' not found in graph")

        dependencies: list[str] = []
        for channel in self.channels:
            consumer_port = channel.input._source
            if consumer_port and consumer_port.owner.stage_name == stage_name:
                producer = channel.output.owner.stage_name
                dependencies.append(producer)

        return dependencies

    def get_all_stages(self) -> dict[str, BasePipelineStage]:
        """Get all registered stages.

        Returns:
            Dictionary mapping stage names to stage instances
        """
        return self.nodes.copy()
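For reference, the validation and ordering logic in utils/pipeline_graph.py above reduces to a handful of NetworkX calls. The following is a minimal, standalone sketch (not part of the package) of those same checks applied to a made-up stage graph; the stage names and edges are illustrative assumptions, not the package's actual pipeline.

# Standalone sketch: the NetworkX checks PipelineGraph.validate() and
# compute_execution_order() rely on, applied to a hypothetical stage graph.
import networkx as nx

# Hypothetical producer → consumer dependencies between pipeline stages.
edges = [
    ("scan_photos", "compute_sha_bins"),
    ("scan_photos", "compute_perceptual_hash"),
    ("compute_sha_bins", "compute_identical"),
    ("compute_perceptual_hash", "compute_perceptual_match"),
    ("compute_identical", "report"),
    ("compute_perceptual_match", "report"),
]

graph = nx.DiGraph()
graph.add_edges_from(edges)

# DAG check, mirroring _check_no_cycles().
if not nx.is_directed_acyclic_graph(graph):
    cycle = nx.find_cycle(graph, orientation="original")
    raise ValueError("Cycle detected: " + " → ".join(edge[0] for edge in cycle))

# Single weakly connected component, mirroring _check_single_component().
if nx.number_weakly_connected_components(graph) > 1:
    raise ValueError("Pipeline has disconnected components")

# Execution order, mirroring compute_execution_order(); 1-based stage IDs
# follow the module's "0 means not started" convention.
order = list(nx.topological_sort(graph))
stage_ids = {name: i for i, name in enumerate(order, start=1)}
print(order)
print(stage_ids)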