lsst-pipe-base 29.2025.3100-py3-none-any.whl → 29.2025.3200-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (27)
  1. lsst/pipe/base/__init__.py +0 -1
  2. lsst/pipe/base/all_dimensions_quantum_graph_builder.py +4 -42
  3. lsst/pipe/base/caching_limited_butler.py +8 -4
  4. lsst/pipe/base/graph/graphSummary.py +4 -4
  5. lsst/pipe/base/mp_graph_executor.py +21 -9
  6. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +40 -10
  7. lsst/pipe/base/pipeline_graph/_tasks.py +106 -0
  8. lsst/pipe/base/pipeline_graph/io.py +1 -1
  9. lsst/pipe/base/quantum_graph_builder.py +42 -16
  10. lsst/pipe/base/quantum_graph_skeleton.py +60 -1
  11. lsst/pipe/base/single_quantum_executor.py +10 -11
  12. lsst/pipe/base/tests/in_memory_limited_butler.py +223 -0
  13. lsst/pipe/base/tests/mocks/__init__.py +1 -0
  14. lsst/pipe/base/tests/mocks/_in_memory_repo.py +357 -0
  15. lsst/pipe/base/tests/mocks/_pipeline_task.py +19 -2
  16. lsst/pipe/base/version.py +1 -1
  17. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/METADATA +1 -1
  18. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/RECORD +26 -25
  19. lsst/pipe/base/executionButlerBuilder.py +0 -493
  20. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/WHEEL +0 -0
  21. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/entry_points.txt +0 -0
  22. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/COPYRIGHT +0 -0
  23. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/LICENSE +0 -0
  24. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/bsd_license.txt +0 -0
  25. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/gpl-v3.0.txt +0 -0
  26. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/top_level.txt +0 -0
  27. {lsst_pipe_base-29.2025.3100.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/zip-safe +0 -0
lsst/pipe/base/executionButlerBuilder.py (deleted)
@@ -1,493 +0,0 @@
- # This file is part of pipe_base.
- #
- # Developed for the LSST Data Management System.
- # This product includes software developed by the LSST Project
- # (http://www.lsst.org).
- # See the COPYRIGHT file at the top-level directory of this distribution
- # for details of code ownership.
- #
- # This software is dual licensed under the GNU General Public License and also
- # under a 3-clause BSD license. Recipients may choose which of these licenses
- # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
- # respectively. If you choose the GPL option then the following text applies
- # (but note that there is still no warranty even if you opt for BSD instead):
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- from __future__ import annotations
-
- __all__ = ("buildExecutionButler",)
-
- import io
- from collections import defaultdict
- from collections.abc import Callable, Iterable, Mapping
-
- from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
- from lsst.daf.butler.direct_butler import DirectButler
- from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
- from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
- from lsst.daf.butler.transfers import RepoExportContext
- from lsst.resources import ResourcePath, ResourcePathExpression
- from lsst.utils.introspection import get_class_of
-
- from .graph import QuantumGraph
-
- DataSetTypeRefMap = Mapping[DatasetType, set[DatasetRef]]
-
-
- def _validate_dataset_type(
-     candidate: DatasetType, previous: dict[str | DatasetType, DatasetType], registry: Registry
- ) -> DatasetType:
-     """Check the dataset types and return a consistent variant if there are
-     different compatible options.
-
-     Parameters
-     ----------
-     candidate : `lsst.daf.butler.DatasetType`
-         The candidate dataset type.
-     previous : `dict` [ `str` | `~lsst.daf.butler.DatasetType`, \
-             `~lsst.daf.butler.DatasetType`]
-         Previous dataset types found, indexed by name and also by
-         dataset type. The latter provides a quick way of returning a
-         previously checked dataset type.
-     registry : `lsst.daf.butler.Registry`
-         Main registry whose dataset type registration should override the
-         given one if it exists.
-
-     Returns
-     -------
-     datasetType : `lsst.daf.butler.DatasetType`
-         The dataset type to be used. This can be different from the
-         given ``candidate`` if a previous dataset type was encountered
-         with the same name and this one is compatible with it.
-
-     Raises
-     ------
-     ConflictingDefinitionError
-         Raised if a candidate dataset type has the same name as one
-         previously encountered but is not compatible with it.
-
-     Notes
-     -----
-     This function ensures that if a dataset type is given that has the
-     same name as a previously encountered dataset type but differs solely
-     in a way that is interchangeable (through a supported storage class)
-     then we will always return the first dataset type encountered instead
-     of the new variant. We assume that the butler will handle the
-     type conversion itself later.
-     """
-     # First check that if we have previously vetted this dataset type.
-     # Return the vetted form immediately if we have.
-     checked = previous.get(candidate)
-     if checked:
-         return checked
-
-     # Have not previously encountered this dataset type.
-     name = candidate.name
-     if prevDsType := previous.get(name):
-         # Check compatibility. For now assume both directions have to
-         # be acceptable.
-         if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
-             # Ensure that if this dataset type is used again we will return
-             # the version that we were first given with this name. Store
-             # it for next time and return the previous one.
-             previous[candidate] = prevDsType
-             return prevDsType
-         else:
-             raise ConflictingDefinitionError(
-                 f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
-             )
-
-     # We haven't seen this dataset type in this graph before, but it may
-     # already be in the registry.
-     try:
-         registryDsType = registry.getDatasetType(name)
-         previous[candidate] = registryDsType
-         return registryDsType
-     except MissingDatasetTypeError:
-         pass
-     # Dataset type is totally new. Store it by name and by dataset type so
-     # it will be validated immediately next time it comes up.
-     previous[name] = candidate
-     previous[candidate] = candidate
-     return candidate
-
-
- def _accumulate(
-     butler: Butler,
-     graph: QuantumGraph,
- ) -> tuple[set[DatasetRef], DataSetTypeRefMap]:
-     # accumulate the DatasetRefs that will be transferred to the execution
-     # registry
-
-     # exports holds all the existing data that will be migrated to the
-     # execution butler
-     exports: set[DatasetRef] = set()
-
-     # inserts is the mapping of DatasetType to dataIds for what is to be
-     # inserted into the registry. These are the products that are expected
-     # to be produced during processing of the QuantumGraph
-     inserts: DataSetTypeRefMap = defaultdict(set)
-
-     # It is possible to end up with a graph that has different storage
-     # classes attached to the same dataset type name. This is okay but
-     # must we must ensure that only a single dataset type definition is
-     # accumulated in the loop below. This data structure caches every dataset
-     # type encountered and stores the compatible alternative.
-     datasetTypes: dict[str | DatasetType, DatasetType] = {}
-
-     # Find the initOutput refs.
-     initOutputRefs = list(graph.globalInitOutputRefs())
-     for task_def in graph.iterTaskGraph():
-         task_refs = graph.initOutputRefs(task_def)
-         if task_refs:
-             initOutputRefs.extend(task_refs)
-
-     for ref in initOutputRefs:
-         dataset_type = ref.datasetType
-         if dataset_type.component() is not None:
-             dataset_type = dataset_type.makeCompositeDatasetType()
-         dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
-         inserts[dataset_type].add(ref)
-
-     # Output references may be resolved even if they do not exist. Find all
-     # actually existing refs.
-     check_refs: set[DatasetRef] = set()
-     for quantum in (n.quantum for n in graph):
-         for attrName in ("initInputs", "inputs", "outputs"):
-             attr: Mapping[DatasetType, DatasetRef | list[DatasetRef]] = getattr(quantum, attrName)
-             for refs in attr.values():
-                 # This if block is because init inputs has a different
-                 # signature for its items
-                 if not isinstance(refs, list | tuple):
-                     refs = [refs]
-                 for ref in refs:
-                     if ref.isComponent():
-                         ref = ref.makeCompositeRef()
-                     check_refs.add(ref)
-     exist_map = butler._exists_many(check_refs, full_check=False)
-     existing_ids = {ref.id for ref, exists in exist_map.items() if exists}
-     del exist_map
-
-     for quantum in (n.quantum for n in graph):
-         for attrName in ("initInputs", "inputs", "outputs"):
-             attr = getattr(quantum, attrName)
-
-             for type, refs in attr.items():
-                 if not isinstance(refs, list | tuple):
-                     refs = [refs]
-                 if type.component() is not None:
-                     type = type.makeCompositeDatasetType()
-                 type = _validate_dataset_type(type, datasetTypes, butler.registry)
-                 # iterate over all the references, if it exists and should be
-                 # exported, if not it should be inserted into the new registry
-                 for ref in refs:
-                     # Component dataset ID is the same as its parent ID, so
-                     # checking component in existing_ids works OK.
-                     if ref.id in existing_ids:
-                         # If this is a component we want the composite to be
-                         # exported.
-                         if ref.isComponent():
-                             ref = ref.makeCompositeRef()
-                         # Make sure we export this with the registry's dataset
-                         # type, since transfer_from doesn't handle storage
-                         # class differences (maybe it should, but it's not
-                         # bad to be defensive here even if that changes).
-                         if type != ref.datasetType:
-                             ref = ref.overrideStorageClass(type.storageClass)
-                             assert ref.datasetType == type, "Dataset types should not differ in other ways."
-                         exports.add(ref)
-                     else:
-                         if ref.isComponent():
-                             # We can't insert a component, and a component will
-                             # be part of some other upstream dataset, so it
-                             # should be safe to skip them here
-                             continue
-                         inserts[type].add(ref)
-
-     return exports, inserts
-
-
- def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
-     # Recurse through any discovered collections to make sure all collections
-     # are exported. This exists because I ran into a situation where some
-     # collections were not properly being discovered and exported. This
-     # method may be able to be removed in the future if collection export
-     # logic changes
-     collections = set(collections)
-     while True:
-         discoveredCollections = set(
-             butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
-         )
-         if len(discoveredCollections) > len(collections):
-             collections = discoveredCollections
-         else:
-             break
-     return collections
-
-
- def _export(
-     butler: DirectButler, collections: Iterable[str] | None, inserts: DataSetTypeRefMap
- ) -> io.StringIO:
-     # This exports relevant dimension records and collections using daf butler
-     # objects, however it reaches in deep and does not use the public methods
-     # so that it can export it to a string buffer and skip disk access. This
-     # does not export the datasets themselves, since we use transfer_from for
-     # that.
-     yamlBuffer = io.StringIO()
-     # Yaml is hard coded, since the class controls both ends of the
-     # export/import
-     BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
-     backend = BackendClass(yamlBuffer, universe=butler.dimensions)
-     exporter = RepoExportContext(butler, backend, directory=None, transfer=None)
-
-     # Need to ensure that the dimension records for outputs are
-     # transferred.
-     for _, refs in inserts.items():
-         exporter.saveDataIds([ref.dataId for ref in refs])
-
-     # Look for any defined collection, if not get the defaults
-     if collections is None:
-         collections = butler.registry.defaults.collections
-
-     # look up all collections associated with those inputs, this follows
-     # all chains to make sure everything is properly exported
-     for c in _discoverCollections(butler, collections):
-         exporter.saveCollection(c)
-     exporter._finish()
-
-     # reset the string buffer to the beginning so the read operation will
-     # actually *see* the data that was exported
-     yamlBuffer.seek(0)
-     return yamlBuffer
-
-
- def _setupNewButler(
-     butler: DirectButler,
-     outputLocation: ResourcePath,
-     dirExists: bool,
-     datastoreRoot: ResourcePath | None = None,
- ) -> Butler:
-     """Set up the execution butler
-
-     Parameters
-     ----------
-     butler : `Butler`
-         The original butler, upon which the execution butler is based.
-     outputLocation : `~lsst.resources.ResourcePath`
-         Location of the execution butler.
-     dirExists : `bool`
-         Does the ``outputLocation`` exist, and if so, should it be clobbered?
-     datastoreRoot : `~lsst.resources.ResourcePath`, optional
-         Path for the execution butler datastore. If not specified, then the
-         original butler's datastore will be used.
-
-     Returns
-     -------
-     execution_butler : `Butler`
-         Execution butler.
-     """
-     # Set up the new butler object at the specified location
-     if dirExists:
-         # Remove the existing table, if the code got this far and this exists
-         # clobber must be true
-         executionRegistry = outputLocation.join("gen3.sqlite3")
-         if executionRegistry.exists():
-             executionRegistry.remove()
-     else:
-         outputLocation.mkdir()
-
-     # Copy the existing butler config, modifying the location of the
-     # registry to the specified location.
-     # Preserve the root path from the existing butler so things like
-     # file data stores continue to look at the old location.
-     config = Config(butler._config)
-     config["root"] = outputLocation.geturl()
-     config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
-
-     # Remove any namespace that may be set in main registry.
-     config.pop(("registry", "namespace"), None)
-
-     # Obscore manager cannot be used with execution butler.
-     config.pop(("registry", "managers", "obscore"), None)
-
-     # record the current root of the datastore if it is specified relative
-     # to the butler root
-     if datastoreRoot is not None:
-         config["datastore", "root"] = datastoreRoot.geturl()
-     elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
-         config["datastore", "root"] = butler._config.configDir.geturl()
-     config["datastore", "trust_get_request"] = True
-
-     # Requires that we use the dimension configuration from the original
-     # butler and not use the defaults.
-     config = Butler.makeRepo(
-         root=outputLocation,
-         config=config,
-         dimensionConfig=butler.dimensions.dimensionConfig,
-         overwrite=True,
-         forceConfigRoot=False,
-     )
-
-     # Return a newly created butler
-     return Butler.from_config(config, writeable=True)
-
-
- def _import(
-     yamlBuffer: io.StringIO,
-     newButler: Butler,
-     inserts: DataSetTypeRefMap,
-     run: str | None,
-     butlerModifier: Callable[[Butler], Butler] | None,
- ) -> Butler:
-     # This method takes the exports from the existing butler, imports
-     # them into the newly created butler, and then inserts the datasets
-     # that are expected to be produced.
-
-     # import the existing datasets using "split" mode. "split" is safe
-     # because execution butler is assumed to be able to see all the file
-     # locations that the main datastore can see. "split" supports some
-     # absolute URIs in the datastore.
-     newButler.import_(filename=yamlBuffer, format="yaml", transfer="split")
-
-     # If there is modifier callable, run it to make necessary updates
-     # to the new butler.
-     if butlerModifier is not None:
-         newButler = butlerModifier(newButler)
-
-     # Register datasets to be produced and insert them into the registry
-     for dsType, refs in inserts.items():
-         # Storage class differences should have already been resolved by calls
-         # _validate_dataset_type in _export, resulting in the Registry dataset
-         # type whenever that exists.
-         newButler.registry.registerDatasetType(dsType)
-         newButler.registry._importDatasets(refs)
-
-     return newButler
-
-
- def buildExecutionButler(
-     butler: DirectButler,
-     graph: QuantumGraph,
-     outputLocation: ResourcePathExpression,
-     run: str | None,
-     *,
-     clobber: bool = False,
-     butlerModifier: Callable[[Butler], Butler] | None = None,
-     collections: Iterable[str] | None = None,
-     datastoreRoot: ResourcePathExpression | None = None,
-     transfer: str = "auto",
- ) -> Butler:
-     r"""Create an execution butler.
-
-     Responsible for exporting
-     input `QuantumGraph`\s into a new minimal `~lsst.daf.butler.Butler` which
-     only contains datasets specified by the `QuantumGraph`.
-
-     These datasets are both those that already exist in the input
-     `~lsst.daf.butler.Butler`, and those that are expected to be produced
-     during the execution of the `QuantumGraph`.
-
-     Parameters
-     ----------
-     butler : `lsst.daf.butler.Butler`
-         This is the existing `~lsst.daf.butler.Butler` instance from
-         which existing datasets will be exported. This should be the
-         `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
-         that will be converted with this object.
-     graph : `QuantumGraph`
-         Graph containing nodes that are to be exported into an execution
-         butler.
-     outputLocation : convertible to `~lsst.resources.ResourcePath`
-         URI Location at which the execution butler is to be exported. May be
-         specified as a string or a `~lsst.resources.ResourcePath` instance.
-     run : `str`, optional
-         The run collection that the exported datasets are to be placed in. If
-         None, the default value in registry.defaults will be used.
-     clobber : `bool`, Optional
-         By default a butler will not be created if a file or directory
-         already exists at the output location. If this is set to `True`
-         what is at the location will be deleted prior to running the
-         export. Defaults to `False`.
-     butlerModifier : `~typing.Callable`, Optional
-         If supplied this should be a callable that accepts a
-         `~lsst.daf.butler.Butler`, and returns an instantiated
-         `~lsst.daf.butler.Butler`. This callable may be used to make any
-         modifications to the `~lsst.daf.butler.Butler` desired. This
-         will be called after importing all datasets that exist in the input
-         `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
-         to be produced. Examples of what this method could do include
-         things such as creating collections/runs/ etc.
-     collections : `~typing.Iterable` of `str`, Optional
-         An iterable of collection names that will be exported from the input
-         `~lsst.daf.butler.Butler` when creating the execution butler. If not
-         supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
-         default collections will be used.
-     datastoreRoot : convertible to `~lsst.resources.ResourcePath`, Optional
-         Root directory for datastore of execution butler. If `None`, then the
-         original butler's datastore will be used.
-     transfer : `str`
-         How (and whether) the input datasets should be added to the execution
-         butler datastore. This should be a ``transfer`` string recognized by
-         :func:`lsst.resources.ResourcePath.transfer_from`.
-         ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.
-
-     Returns
-     -------
-     executionButler : `lsst.daf.butler.Butler`
-         An instance of the newly created execution butler.
-
-     Raises
-     ------
-     FileExistsError
-         Raised if something exists in the filesystem at the specified output
-         location and clobber is `False`.
-     NotADirectoryError
-         Raised if specified output URI does not correspond to a directory.
-     """
-     # Now require that if run is given it must match the graph run.
-     if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
-         raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")
-
-     # We know this must refer to a directory.
-     outputLocation = ResourcePath(outputLocation, forceDirectory=True)
-     if datastoreRoot is not None:
-         datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)
-
-     # Do this first to Fail Fast if the output exists
-     if (dirExists := outputLocation.exists()) and not clobber:
-         raise FileExistsError("Cannot create a butler at specified location, location exists")
-     if not outputLocation.isdir():
-         raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")
-
-     exports, inserts = _accumulate(butler, graph)
-     yamlBuffer = _export(butler, collections, inserts)
-
-     newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)
-
-     newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)
-
-     if transfer == "auto" and datastoreRoot is not None:
-         transfer = "copy"
-
-     # Transfer the existing datasets directly from the source butler.
-     newButler.transfer_from(
-         butler,
-         exports,
-         transfer=transfer,
-         skip_missing=False,  # Everything should exist.
-         register_dataset_types=True,
-         transfer_dimensions=True,
-     )
-
-     return newButler
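
The removed buildExecutionButler entry point (file 19 above) was the only symbol the deleted module exported via __all__. For orientation, a minimal sketch of how it was typically invoked is shown below; the repository path, graph file name, and output location are illustrative placeholders, not values taken from this release.

    from lsst.daf.butler import Butler
    from lsst.pipe.base import QuantumGraph
    from lsst.pipe.base.executionButlerBuilder import buildExecutionButler

    # Placeholder inputs: an existing local data repository and a previously
    # saved quantum graph.
    butler = Butler.from_config("/repo/main", writeable=False)
    qgraph = QuantumGraph.loadUri("pipeline.qgraph")

    # Build a standalone minimal butler containing only the datasets the graph
    # reads (already-existing inputs) or is predicted to write (future outputs).
    execution_butler = buildExecutionButler(
        butler,
        qgraph,
        outputLocation="/repo/execution",  # directory for the new butler
        run=None,  # fall back to the run recorded in the graph / registry defaults
        clobber=True,  # replace anything left over at outputLocation
        transfer="auto",
    )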