lsst-daf-butler 30.0.0rc2__py3-none-any.whl → 30.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. lsst/daf/butler/_butler.py +27 -8
  2. lsst/daf/butler/_butler_collections.py +4 -4
  3. lsst/daf/butler/_butler_metrics.py +51 -2
  4. lsst/daf/butler/_dataset_provenance.py +1 -1
  5. lsst/daf/butler/_dataset_ref.py +1 -1
  6. lsst/daf/butler/_exceptions.py +2 -2
  7. lsst/daf/butler/_file_dataset.py +2 -1
  8. lsst/daf/butler/_formatter.py +14 -7
  9. lsst/daf/butler/_labeled_butler_factory.py +28 -8
  10. lsst/daf/butler/_query_all_datasets.py +2 -0
  11. lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
  12. lsst/daf/butler/cli/cmd/_remove_runs.py +1 -12
  13. lsst/daf/butler/column_spec.py +4 -4
  14. lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
  15. lsst/daf/butler/configs/storageClasses.yaml +15 -0
  16. lsst/daf/butler/datastore/_datastore.py +21 -1
  17. lsst/daf/butler/datastore/record_data.py +1 -1
  18. lsst/daf/butler/datastore/stored_file_info.py +2 -2
  19. lsst/daf/butler/datastores/chainedDatastore.py +4 -0
  20. lsst/daf/butler/datastores/fileDatastore.py +26 -13
  21. lsst/daf/butler/datastores/file_datastore/get.py +4 -4
  22. lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +5 -1
  23. lsst/daf/butler/datastores/file_datastore/transfer.py +2 -2
  24. lsst/daf/butler/datastores/inMemoryDatastore.py +8 -0
  25. lsst/daf/butler/ddl.py +2 -2
  26. lsst/daf/butler/dimensions/_coordinate.py +11 -8
  27. lsst/daf/butler/dimensions/_record_set.py +1 -1
  28. lsst/daf/butler/dimensions/_records.py +9 -3
  29. lsst/daf/butler/direct_butler/_direct_butler.py +85 -51
  30. lsst/daf/butler/direct_query_driver/_driver.py +5 -4
  31. lsst/daf/butler/direct_query_driver/_result_page_converter.py +1 -1
  32. lsst/daf/butler/formatters/parquet.py +6 -6
  33. lsst/daf/butler/logging.py +9 -3
  34. lsst/daf/butler/nonempty_mapping.py +1 -1
  35. lsst/daf/butler/persistence_context.py +8 -5
  36. lsst/daf/butler/queries/_general_query_results.py +1 -1
  37. lsst/daf/butler/queries/driver.py +1 -1
  38. lsst/daf/butler/queries/expression_factory.py +2 -2
  39. lsst/daf/butler/queries/expressions/parser/exprTree.py +1 -1
  40. lsst/daf/butler/queries/expressions/parser/parserYacc.py +1 -1
  41. lsst/daf/butler/queries/overlaps.py +2 -2
  42. lsst/daf/butler/queries/tree/_column_set.py +1 -1
  43. lsst/daf/butler/registry/_collection_record_cache.py +1 -1
  44. lsst/daf/butler/registry/_collection_summary_cache.py +5 -4
  45. lsst/daf/butler/registry/_registry.py +4 -0
  46. lsst/daf/butler/registry/bridge/monolithic.py +17 -13
  47. lsst/daf/butler/registry/databases/postgresql.py +2 -1
  48. lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py +1 -1
  49. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +53 -47
  50. lsst/daf/butler/registry/datasets/byDimensions/summaries.py +3 -2
  51. lsst/daf/butler/registry/expand_data_ids.py +93 -0
  52. lsst/daf/butler/registry/interfaces/_database.py +6 -1
  53. lsst/daf/butler/registry/interfaces/_datasets.py +2 -1
  54. lsst/daf/butler/registry/interfaces/_obscore.py +1 -1
  55. lsst/daf/butler/registry/obscore/_records.py +1 -1
  56. lsst/daf/butler/registry/obscore/_spatial.py +2 -2
  57. lsst/daf/butler/registry/queries/_results.py +2 -2
  58. lsst/daf/butler/registry/sql_registry.py +3 -25
  59. lsst/daf/butler/registry/wildcards.py +5 -5
  60. lsst/daf/butler/remote_butler/_get.py +1 -1
  61. lsst/daf/butler/remote_butler/_remote_butler.py +6 -1
  62. lsst/daf/butler/remote_butler/_remote_file_transfer_source.py +4 -0
  63. lsst/daf/butler/remote_butler/authentication/cadc.py +4 -3
  64. lsst/daf/butler/script/_pruneDatasets.py +4 -2
  65. lsst/daf/butler/script/configValidate.py +2 -2
  66. lsst/daf/butler/script/queryCollections.py +2 -2
  67. lsst/daf/butler/script/removeCollections.py +2 -0
  68. lsst/daf/butler/script/removeRuns.py +2 -0
  69. lsst/daf/butler/tests/cliCmdTestBase.py +2 -0
  70. lsst/daf/butler/tests/cliLogTestBase.py +2 -0
  71. lsst/daf/butler/tests/hybrid_butler.py +10 -2
  72. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
  73. lsst/daf/butler/tests/registry_data/spatial.py +4 -2
  74. lsst/daf/butler/tests/testFormatters.py +2 -2
  75. lsst/daf/butler/tests/utils.py +1 -1
  76. lsst/daf/butler/timespan_database_representation.py +3 -3
  77. lsst/daf/butler/transfers/_context.py +7 -6
  78. lsst/daf/butler/version.py +1 -1
  79. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/METADATA +3 -2
  80. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/RECORD +88 -85
  81. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/WHEEL +1 -1
  82. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/entry_points.txt +0 -0
  83. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/COPYRIGHT +0 -0
  84. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/LICENSE +0 -0
  85. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/bsd_license.txt +0 -0
  86. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/gpl-v3.0.txt +0 -0
  87. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/top_level.txt +0 -0
  88. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/zip-safe +0 -0
@@ -610,15 +610,15 @@ class DirectQueryDriver(QueryDriver):
  ----------
  tree : `.queries.tree.QueryTree`
  Description of the joins and row filters in the query.
+ allow_duplicate_overlaps : `bool`, optional
+ If set to `True` then query will be allowed to generate
+ non-distinct rows for spatial overlaps.

  Returns
  -------
  tree_analysis : `QueryTreeAnalysis`
  Struct containing additional information need to build the joins
  stage of a query.
- allow_duplicate_overlaps : `bool`, optional
- If set to `True` then query will be allowed to generate
- non-distinct rows for spatial overlaps.

  Notes
  -----
@@ -1313,7 +1313,8 @@ class DirectQueryDriver(QueryDriver):
  Mapping of collection names to collection records, must contain
  records for all collections in ``collection_names`` and all their
  children collections.
- summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
+ summaries : `~collections.abc.Mapping` [`typing.Any`, \
+ `CollectionSummary`]
  Mapping of collection IDs to collection summaries, must contain
  summaries for all non-chained collections in the collection tree.

@@ -386,7 +386,7 @@ class _GeneralColumnConverter:

  Returns
  -------
- value : `Any`
+ value : `typing.Any`
  Result of the conversion.
  """
  raise NotImplementedError()
@@ -270,18 +270,18 @@ def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:


  def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
- """Convert a pyarrow table to an `astropy.Table`.
+ """Convert a pyarrow table to an `astropy.table.Table`.

  Parameters
  ----------
  arrow_table : `pyarrow.Table`
  Input arrow table to convert. If the table has astropy unit
  metadata in the schema it will be used in the construction
- of the ``astropy.Table``.
+ of the ``astropy.table.Table``.

  Returns
  -------
- table : `astropy.Table`
+ table : `astropy.table.Table`
  Converted astropy table.
  """
  from astropy.table import Table
@@ -520,7 +520,7 @@ def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:

  Parameters
  ----------
- astropy_table : `astropy.Table`
+ astropy_table : `astropy.table.Table`
  Input astropy table.

  Returns
@@ -584,7 +584,7 @@ def astropy_to_pandas(astropy_table: atable.Table, index: str | None = None) ->

  Parameters
  ----------
- astropy_table : `astropy.Table`
+ astropy_table : `astropy.table.Table`
  Input astropy table.
  index : `str`, optional
  Name of column to set as index.
@@ -640,7 +640,7 @@ def _astropy_to_numpy_dict(astropy_table: atable.Table) -> dict[str, np.ndarray]

  Parameters
  ----------
- astropy_table : `astropy.Table`
+ astropy_table : `astropy.table.Table`
  Input astropy table.

  Returns
@@ -764,11 +764,17 @@ class ButlerLogRecords(MutableSequence[ButlerLogRecord]):


  class ButlerLogRecordHandler(StreamHandler):
- """Python log handler that accumulates records."""
+ """Python log handler that accumulates records.

- def __init__(self) -> None:
+ Parameters
+ ----------
+ records : `ButlerLogRecords`, optional
+ Container to store logs in.
+ """
+
+ def __init__(self, records: ButlerLogRecords | None = None) -> None:
  super().__init__()
- self.records = ButlerLogRecords([])
+ self.records = ButlerLogRecords([]) if records is None else records

  def emit(self, record: LogRecord) -> None:
  self.records.append(record)
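
Note on the hunk above: `ButlerLogRecordHandler` now accepts an optional `records` argument so a caller can supply its own `ButlerLogRecords` container instead of always getting a fresh one. A minimal usage sketch, assuming both classes are importable from `lsst.daf.butler.logging` as the file path suggests; the logger name and message are placeholders:

import logging

from lsst.daf.butler.logging import ButlerLogRecordHandler, ButlerLogRecords

# Caller-owned container; with the new keyword argument the handler appends
# into it instead of creating its own ButlerLogRecords instance.
shared_records = ButlerLogRecords([])
handler = ButlerLogRecordHandler(records=shared_records)

log = logging.getLogger("example")
log.addHandler(handler)
log.warning("This record ends up in the caller-supplied container.")

assert len(shared_records) == 1
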
@@ -43,7 +43,7 @@ _V = TypeVar("_V", bound=Copyable, covariant=True)


  class NonemptyMapping(Mapping[_K, _V]):
- """A `Mapping` that implicitly adds values (like
+ """A `~collections.abc.Mapping` that implicitly adds values (like
  `~collections.defaultdict`) but treats any that evaluate to `False` as not
  present.

@@ -180,17 +180,20 @@ class PersistenceContextVars:

  Parameters
  ----------
- function : `Callable`
+ function : `collections.abc.Callable`
  A callable which is to be executed inside a specific context.
  *args : tuple
- Positional arguments which are to be passed to the `Callable`.
+ Positional arguments which are to be passed to the
+ `~collections.abc.Callable`.
  **kwargs : dict, optional
- Extra key word arguments which are to be passed to the `Callable`.
+ Extra key word arguments which are to be passed to the
+ `~collections.abc.Callable`.

  Returns
  -------
- result : `Any`
- The result returned by executing the supplied `Callable`.
+ result : `typing.Any`
+ The result returned by executing the supplied
+ `~collections.abc.Callable`.
  """
  self._ctx = copy_context()
  # Type checkers seem to have trouble with a second layer nesting of
@@ -93,7 +93,7 @@ class GeneralQueryResults(QueryResultsBase):

  Yields
  ------
- row_dict : `dict` [`str`, `Any`]
+ row_dict : `dict` [`str`, `typing.Any`]
  Result row as dictionary, the keys are the names of the dimensions,
  dimension fields (separated from dimension by dot) or dataset type
  fields (separated from dataset type name by dot).
@@ -245,7 +245,7 @@ class QueryDriver(AbstractContextManager[None]):
  ----------
  dimensions : `DimensionGroup`
  Dimensions of the data coordinates.
- rows : `Iterable` [ `tuple` ]
+ rows : `~collections.abc.Iterable` [ `tuple` ]
  Tuples of data coordinate values, covering just the "required"
  subset of ``dimensions``.

@@ -557,7 +557,7 @@ class ExpressionFactory:
  -------
  logical_and : `tree.Predicate`
  A boolean expression that evaluates to `True` only if all operands
- evaluate to `True.
+ evaluate to `True`.
  """
  return first.logical_and(*args)

@@ -575,7 +575,7 @@ class ExpressionFactory:
  -------
  logical_or : `tree.Predicate`
  A boolean expression that evaluates to `True` if any operand
- evaluates to `True.
+ evaluates to `True`.
  """
  return first.logical_or(*args)

@@ -561,7 +561,7 @@ class PolygonNode(Node):

  Parameters
  ----------
- vertices : `list`[`tuple`[`Node`, `Node`]]
+ vertices : `list` [`tuple` [`Node`, `Node`]]
  Node representing vertices of polygon.
  """

@@ -109,7 +109,7 @@ def _parseTimeString(time_str: str) -> astropy.time.Time:
  Returns
  -------
  time : `astropy.time.Time`
- The parsed time.
+ The parsed time.

  Raises
  ------
@@ -69,9 +69,9 @@ class _NaiveDisjointSet(Generic[_T]):

  Parameters
  ----------
- a :
+ a
  Element whose subset should be merged.
- b :
+ b
  Element whose subset should be merged.

  Returns
@@ -429,7 +429,7 @@ class ColumnOrder:

  Parameters
  ----------
- row : `Sequence` [ `DataIdValue` ]
+ row : `~collections.abc.Sequence` [ `DataIdValue` ]
  A row output by the SQL query associated with these columns.
  """
  return row[: len(self._dimension_keys)]
@@ -135,7 +135,7 @@ class CollectionRecordCache:

  Parameters
  ----------
- key : `Any`
+ key : `typing.Any`
  Collection key.

  Returns
@@ -54,7 +54,8 @@ class CollectionSummaryCache:

  Parameters
  ----------
- summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
+ summaries : `~collections.abc.Mapping` [`typing.Any`, \
+ `CollectionSummary`]
  Summary records indexed by collection key, records must include all
  dataset types.
  """
@@ -65,15 +66,15 @@ class CollectionSummaryCache:

  Parameters
  ----------
- keys : `~collections.abc.Iterable` [`Any`]
+ keys : `~collections.abc.Iterable` [`typing.Any`]
  Sequence of collection keys.

  Returns
  -------
- summaries : `dict` [`Any`, `CollectionSummary`]
+ summaries : `dict` [`typing.Any`, `CollectionSummary`]
  Dictionary of summaries indexed by collection keys, includes
  records found in the cache.
- missing_keys : `set` [`Any`]
+ missing_keys : `set` [`typing.Any`]
  Collection keys that are not present in the cache.
  """
  found = {}
@@ -437,6 +437,10 @@ class Registry(ABC):
  Name of the type to be removed or tuple containing a list of type
  names to be removed. Wildcards are allowed.

+ Returns
+ -------
+ None
+
  Raises
  ------
  lsst.daf.butler.registry.OrphanedRecordError
@@ -215,20 +215,24 @@ class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
  def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
  # Docstring inherited from DatastoreRegistryBridge
  byId = {ref.id: ref for ref in refs}
- sql = (
- sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
- .select_from(self._tables.dataset_location)
- .where(
- sqlalchemy.sql.and_(
- self._tables.dataset_location.columns.datastore_name == self.datastoreName,
- self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
+ found: list[DatasetIdRef] = []
+ with self._db.session():
+ for batch in chunk_iterable(byId.keys(), 50000):
+ sql = (
+ sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
+ .select_from(self._tables.dataset_location)
+ .where(
+ sqlalchemy.sql.and_(
+ self._tables.dataset_location.columns.datastore_name == self.datastoreName,
+ self._tables.dataset_location.columns.dataset_id.in_(batch),
+ )
+ )
  )
- )
- )
- with self._db.query(sql) as sql_result:
- sql_rows = sql_result.fetchall()
- for row in sql_rows:
- yield byId[row.dataset_id]
+ with self._db.query(sql) as sql_result:
+ sql_ids = sql_result.scalars().all()
+ found.extend(byId[id] for id in sql_ids)
+
+ return found

  @contextmanager
  def emptyTrash(
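
Note on the hunk above: both this change and the `get_dataset_refs` change later in this diff replace a single large SQL `IN` clause with fixed-size batches produced by `lsst.utils.iteration.chunk_iterable`, so each statement stays within database limits for very large inputs. An illustrative sketch of the same pattern against a generic SQLAlchemy table; `find_known_ids`, `connection`, and `table` are hypothetical names, not part of the package:

import sqlalchemy
from lsst.utils.iteration import chunk_iterable


def find_known_ids(connection, table, ids, chunk_size=50_000):
    # Query the ids in fixed-size chunks and collect the ones present in the
    # table, mirroring the batching used in check() above.
    found = []
    for batch in chunk_iterable(ids, chunk_size):
        sql = sqlalchemy.select(table.c.dataset_id).where(table.c.dataset_id.in_(batch))
        found.extend(connection.execute(sql).scalars().all())
    return found
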
@@ -92,7 +92,8 @@ class PostgresqlDatabase(Database):
  allow_temporary_tables: bool = True,
  ):
  with engine.connect() as connection:
- # `Any` to make mypy ignore the line below, can't use type: ignore
+ # `typing.Any` to make mypy ignore the line below, can't
+ # use type: ignore
  dbapi: Any = connection.connection
  try:
  dsn = dbapi.get_dsn_parameters()
@@ -155,7 +155,7 @@ class DatasetTypeCache:
  dataset_type : `DatasetType` or `None`
  Cached dataset type, `None` is returned if the name is not in the
  cache.
- extra : `Any` or `None`
+ extra : `typing.Any` or `None`
  Cached opaque data, `None` is returned if the name is not in the
  cache.
  """
@@ -12,6 +12,8 @@ from typing import TYPE_CHECKING, Any, ClassVar
  import astropy.time
  import sqlalchemy

+ from lsst.utils.iteration import chunk_iterable
+
  from .... import ddl
  from ...._collection_type import CollectionType
  from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
@@ -424,17 +426,18 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  return result

  def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
- # Look up the dataset types corresponding to the given Dataset IDs.
- id_col = self._static.dataset.columns["id"]
- sql = sqlalchemy.sql.select(
- id_col,
- self._static.dataset.columns["dataset_type_id"],
- ).where(id_col.in_(ids))
- with self._db.query(sql) as sql_result:
- dataset_rows = sql_result.mappings().all()
- dataset_type_map: dict[DatasetId, DatasetType] = {
- row["id"]: self._get_dataset_type_by_id(row["dataset_type_id"]) for row in dataset_rows
- }
+ dataset_type_map: dict[DatasetId, DatasetType] = {}
+ for batch in chunk_iterable(set(ids), 50000):
+ # Look up the dataset types corresponding to the given Dataset IDs.
+ id_col = self._static.dataset.columns["id"]
+ sql = sqlalchemy.sql.select(
+ id_col,
+ self._static.dataset.columns["dataset_type_id"],
+ ).where(id_col.in_(batch))
+ with self._db.query(sql) as sql_result:
+ dataset_rows = sql_result.mappings().all()
+ for row in dataset_rows:
+ dataset_type_map[row["id"]] = self._get_dataset_type_by_id(row["dataset_type_id"])

  # Group the given dataset IDs by the DimensionGroup of their dataset
  # types -- there is a separate tags table for each DimensionGroup.
@@ -448,40 +451,41 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  # data IDs corresponding to the UUIDs found from the dataset table.
  dynamic_tables = self._get_dynamic_tables(dimension_group)
  tags_table = self._get_tags_table(dynamic_tables)
- tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(datasets))
- # Join in the collection table to fetch the run name.
- collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
- joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
- tags_sql = joined_collections.joined_sql
- run_name_column = joined_collections.name_column
- tags_sql = tags_sql.add_columns(run_name_column)
- # Tags table includes run collections and tagged
- # collections.
- # In theory the data ID for a given dataset should be the
- # same in both, but nothing actually guarantees this.
- # So skip any tagged collections, using the run collection
- # as the definitive definition.
- tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
-
- with self._db.query(tags_sql) as sql_result:
- data_id_rows = sql_result.mappings().all()
-
- assert run_name_column.key is not None
- for data_id_row in data_id_rows:
- id = data_id_row["dataset_id"]
- dataset_type = dataset_type_map[id]
- run_name = data_id_row[run_name_column.key]
- data_id = DataCoordinate.from_required_values(
- dimension_group,
- tuple(data_id_row[dimension] for dimension in dimension_group.required),
- )
- ref = DatasetRef(
- datasetType=dataset_type,
- dataId=data_id,
- id=id,
- run=run_name,
- )
- output_refs.append(ref)
+ for batch in chunk_iterable(datasets, 50000):
+ tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(batch))
+ # Join in the collection table to fetch the run name.
+ collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
+ joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
+ tags_sql = joined_collections.joined_sql
+ run_name_column = joined_collections.name_column
+ tags_sql = tags_sql.add_columns(run_name_column)
+ # Tags table includes run collections and tagged
+ # collections.
+ # In theory the data ID for a given dataset should be the
+ # same in both, but nothing actually guarantees this.
+ # So skip any tagged collections, using the run collection
+ # as the definitive definition.
+ tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
+
+ with self._db.query(tags_sql) as sql_result:
+ data_id_rows = sql_result.mappings().all()
+
+ assert run_name_column.key is not None
+ for data_id_row in data_id_rows:
+ id = data_id_row["dataset_id"]
+ dataset_type = dataset_type_map[id]
+ run_name = data_id_row[run_name_column.key]
+ data_id = DataCoordinate.from_required_values(
+ dimension_group,
+ tuple(data_id_row[dimension] for dimension in dimension_group.required),
+ )
+ ref = DatasetRef(
+ datasetType=dataset_type,
+ dataId=data_id,
+ id=id,
+ run=run_name,
+ )
+ output_refs.append(ref)

  return output_refs

@@ -818,8 +822,10 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):

  Parameters
  ----------
- storage : `_DatasetREcordStorage`
- Struct that holds the tables and ID for a dataset type.
+ dimensions : `DimensionGroup`
+ Dimensions to validate.
+ tags : `sqlalchemy.schema.Table`
+ ???
  tmp_tags : `sqlalchemy.schema.Table`
  Temporary table with new datasets and the same schema as tags
  table.
@@ -304,13 +304,14 @@ class CollectionSummaryManager:
  dataset_type_names : `~collections.abc.Iterable` [`str`]
  Names of dataset types to include into returned summaries. If
  `None` then all dataset types will be included.
- dataset_type_factory : `Callable`
+ dataset_type_factory : `~collections.abc.Callable`
  Method that takes a table row and make `DatasetType` instance out
  of it.

  Returns
  -------
- summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
+ summaries : `~collections.abc.Mapping` [`typing.Any`, \
+ `CollectionSummary`]
  Collection summaries indexed by collection record key. This mapping
  will also contain all nested non-chained collections of the chained
  collections.
@@ -0,0 +1,93 @@
+ # This file is part of daf_butler.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from collections.abc import Iterable
+
+ from ..dimensions import (
+ DataCoordinate,
+ DimensionDataAttacher,
+ DimensionGroup,
+ DimensionUniverse,
+ )
+ from ..dimensions.record_cache import DimensionRecordCache
+ from ..queries import QueryFactoryFunction
+
+
+ def expand_data_ids(
+ data_ids: Iterable[DataCoordinate],
+ universe: DimensionUniverse,
+ query_func: QueryFactoryFunction,
+ cache: DimensionRecordCache | None,
+ ) -> list[DataCoordinate]:
+ """Expand the given data IDs to look up implied dimension values and attach
+ dimension records.
+
+ Parameters
+ ----------
+ data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ]
+ Data coordinates to be expanded.
+ universe : `DimensionUniverse`
+ Dimension universe associated with the given ``data_ids`` values.
+ query_func : QueryFactoryFunction
+ Function used to set up a Butler query context for looking up required
+ information from the database.
+ cache : `DimensionRecordCache` | None
+ Cache containing already-known dimension records. May be `None` if a
+ cache is not available.
+
+ Returns
+ -------
+ expanded : `list` [ `DataCoordinate` ]
+ List of `DataCoordinate` instances in the same order as the input
+ values. It is guaranteed that each `DataCoordinate` has
+ ``hasRecords()=True`` and ``hasFull()=True``.
+ """
+ output = list(data_ids)
+
+ grouped_by_dimensions: defaultdict[DimensionGroup, list[int]] = defaultdict(list)
+ for i, data_id in enumerate(data_ids):
+ if not data_id.hasRecords():
+ grouped_by_dimensions[data_id.dimensions].append(i)
+
+ if not grouped_by_dimensions:
+ # All given DataCoordinate values are already expanded.
+ return output
+
+ attacher = DimensionDataAttacher(
+ cache=cache,
+ dimensions=DimensionGroup.union(*grouped_by_dimensions.keys(), universe=universe),
+ )
+ for dimensions, indexes in grouped_by_dimensions.items():
+ with query_func() as query:
+ expanded = attacher.attach(dimensions, (output[index] for index in indexes), query)
+ for index, data_id in zip(indexes, expanded):
+ output[index] = data_id
+
+ return output
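
Note on the new file above: `expand_data_ids` groups un-expanded data IDs by their dimension group and uses a single `DimensionDataAttacher` to fill in implied values and records. A hypothetical call site, assuming a Butler repository exists at `repo` and that the bound `Butler.query` method satisfies the `QueryFactoryFunction` interface the helper expects; the data ID values are placeholders:

from lsst.daf.butler import Butler, DataCoordinate
from lsst.daf.butler.registry.expand_data_ids import expand_data_ids

butler = Butler("repo")  # assumed pre-existing repository
data_id = DataCoordinate.standardize(
    instrument="LSSTCam", detector=10, universe=butler.dimensions
)
# Expand through the Butler's query system; no dimension-record cache here.
[expanded] = expand_data_ids([data_id], butler.dimensions, butler.query, cache=None)
assert expanded.hasFull() and expanded.hasRecords()
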
@@ -1562,7 +1562,12 @@ class Database(ABC):
  return None
  else:
  sql = table.insert()
- return [connection.execute(sql, row).inserted_primary_key[0] for row in rows]
+ ids = []
+ for row in rows:
+ key = connection.execute(sql, row).inserted_primary_key
+ assert key is not None
+ ids.append(key[0])
+ return ids

  @abstractmethod
  def replace(self, table: sqlalchemy.schema.Table, *rows: dict) -> None:
@@ -378,7 +378,8 @@ class DatasetRecordStorageManager(VersionedExtension):

  Returns
  -------
- summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
+ summaries : `~collections.abc.Mapping` [`typing.Any`, \
+ `CollectionSummary`]
  Collection summaries indexed by collection record key. This mapping
  will also contain all nested non-chained collections of the chained
  collections.
@@ -115,7 +115,7 @@ class ObsCoreTableManager(VersionedExtension):
  implemented with this manager.
  universe : `DimensionUniverse`
  All dimensions known to the registry.
- config : `dict` [ `str`, `Any` ]
+ config : `dict` [ `str`, `typing.Any` ]
  Configuration of the obscore manager.
  datasets : `type`
  Type of dataset manager.
@@ -256,7 +256,7 @@ class RecordFactory:

  Returns
  -------
- record : `dict` [ `str`, `Any` ] or `None`
+ record : `dict` [ `str`, `typing.Any` ] or `None`
  ObsCore record represented as a dictionary. `None` is returned if
  dataset does not need to be stored in the obscore table, e.g. when
  dataset type is not in obscore configuration.
@@ -72,7 +72,7 @@ class SpatialObsCorePlugin(ABC):
  name : `str`
  Arbitrary name given to this plugin (usually key in
  configuration).
- config : `dict` [ `str`, `Any` ]
+ config : `dict` [ `str`, `typing.Any` ]
  Plugin configuration dictionary.
  db : `Database`, optional
  Interface to the underlying database engine and namespace. In some
@@ -120,7 +120,7 @@ class SpatialObsCorePlugin(ABC):

  Returns
  -------
- record : `dict` [ `str`, `Any` ] or `None`
+ record : `dict` [ `str`, `typing.Any` ] or `None`
  Data to store in the main obscore table with column values
  corresponding to a region or `None` if there is nothing to store.