lsst-felis 27.2024.4000__py3-none-any.whl → 27.2024.4200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lsst-felis might be problematic. Click here for more details.

felis/tap_schema.py ADDED
@@ -0,0 +1,644 @@
1
+ """Provides utilities for creating and populating the TAP_SCHEMA database."""
2
+
3
+ # This file is part of felis.
4
+ #
5
+ # Developed for the LSST Data Management System.
6
+ # This product includes software developed by the LSST Project
7
+ # (https://www.lsst.org).
8
+ # See the COPYRIGHT file at the top-level directory of this distribution
9
+ # for details of code ownership.
10
+ #
11
+ # This program is free software: you can redistribute it and/or modify
12
+ # it under the terms of the GNU General Public License as published by
13
+ # the Free Software Foundation, either version 3 of the License, or
14
+ # (at your option) any later version.
15
+ #
16
+ # This program is distributed in the hope that it will be useful,
17
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ # GNU General Public License for more details.
20
+ #
21
+ # You should have received a copy of the GNU General Public License
22
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
23
+
24
+ import logging
25
+ import os
26
+ import re
27
+ from typing import Any
28
+
29
+ from lsst.resources import ResourcePath
30
+ from sqlalchemy import MetaData, Table, text
31
+ from sqlalchemy.engine import Connection, Engine
32
+ from sqlalchemy.engine.mock import MockConnection
33
+ from sqlalchemy.exc import SQLAlchemyError
34
+ from sqlalchemy.schema import CreateSchema
35
+ from sqlalchemy.sql.dml import Insert
36
+
37
+ from felis import datamodel
38
+ from felis.datamodel import Schema
39
+ from felis.db.utils import is_valid_engine
40
+ from felis.metadata import MetaDataBuilder
41
+
42
+ from .types import FelisType
43
+
44
# Names exported by ``from felis.tap_schema import *``.
__all__ = ["TableManager", "DataLoader"]

# Module-level logger shared by the classes defined in this module.
logger = logging.getLogger(__name__)
47
+
48
+
49
class TableManager:
    """Manage creation of TAP_SCHEMA tables.

    Parameters
    ----------
    engine
        The SQLAlchemy engine for reflecting the TAP_SCHEMA tables from an
        existing database.
        This can be a mock connection or None, in which case the internal
        TAP_SCHEMA schema will be used by loading an internal YAML file.
    schema_name
        The name of the schema to use for the TAP_SCHEMA tables.
        Leave as None to use the standard name of "TAP_SCHEMA".
    apply_schema_to_metadata
        If True, apply the schema to the metadata as well as the tables.
        If False, these will be set to None, e.g., for sqlite.
    table_name_postfix
        A string to append to all the standard table names.
        This needs to be used in a way such that the resultant table names
        map to tables within the TAP_SCHEMA database.

    Raises
    ------
    KeyError
        If one of the standard TAP_SCHEMA tables cannot be found in the
        reflected or loaded metadata.

    Notes
    -----
    The TAP_SCHEMA schema must either have been created already, in which case
    the ``engine`` should be provided. Or the internal TAP_SCHEMA schema will
    be used if ``engine`` is None or a ``MockConnection``.
    """

    _TABLE_NAMES_STD = ["schemas", "tables", "columns", "keys", "key_columns"]
    """The standard table names for the TAP_SCHEMA tables."""

    _SCHEMA_NAME_STD = "TAP_SCHEMA"
    """The standard schema name for the TAP_SCHEMA tables."""

    def __init__(
        self,
        engine: Engine | MockConnection | None = None,
        schema_name: str | None = None,
        apply_schema_to_metadata: bool = True,
        table_name_postfix: str = "",
    ):
        """Initialize the table manager."""
        self.table_name_postfix = table_name_postfix
        self.apply_schema_to_metadata = apply_schema_to_metadata
        self.schema_name = schema_name or TableManager._SCHEMA_NAME_STD

        # Only populated when the schema is loaded from the packaged YAML
        # resource. Initialising it here makes the ``schema`` property return
        # None after reflection instead of raising AttributeError.
        self._schema: Schema | None = None

        if is_valid_engine(engine):
            # Narrow the type for static checkers.
            assert isinstance(engine, Engine)
            logger.debug(
                "Reflecting TAP_SCHEMA database from existing database at %s",
                engine.url.render_as_string(hide_password=True),
            )
            self._reflect(engine)
        else:
            self._load_yaml()

        self._create_table_map()
        self._check_tables()

    def _reflect(self, engine: Engine) -> None:
        """Reflect the TAP_SCHEMA database tables into the metadata.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to reflect the tables.

        Raises
        ------
        sqlalchemy.exc.SQLAlchemyError
            If reflection fails. The error is logged and then re-raised.
        """
        self._metadata = MetaData(schema=self.schema_name if self.apply_schema_to_metadata else None)
        try:
            self.metadata.reflect(bind=engine)
        except SQLAlchemyError as e:
            logger.error("Error reflecting TAP_SCHEMA database: %s", e)
            raise

    def _load_yaml(self) -> None:
        """Load the standard TAP_SCHEMA schema from a Felis package
        resource and build the SQLAlchemy metadata from it.
        """
        self._load_schema()
        schema = self.schema
        if self.schema_name != TableManager._SCHEMA_NAME_STD:
            # A custom name was requested: rename the loaded schema.
            schema.name = self.schema_name
        else:
            # Otherwise adopt whatever name the packaged schema declares.
            self.schema_name = schema.name

        self._metadata = MetaDataBuilder(
            schema,
            apply_schema_to_metadata=self.apply_schema_to_metadata,
            apply_schema_to_tables=self.apply_schema_to_metadata,
        ).build()

        logger.debug("Loaded TAP_SCHEMA '%s' from YAML resource", self.schema_name)

    def __getitem__(self, table_name: str) -> Table:
        """Get one of the TAP_SCHEMA tables by its standard TAP_SCHEMA name.

        Parameters
        ----------
        table_name
            The name of the table to get.

        Returns
        -------
        Table
            The table with the given name.

        Raises
        ------
        KeyError
            If the name is not a standard table name, or if the mapped table
            is not present in the metadata.

        Notes
        -----
        This implements array semantics for the table manager, allowing
        tables to be accessed by their standard TAP_SCHEMA names.
        """
        if table_name not in self._table_map:
            raise KeyError(f"Table '{table_name}' not found in table map")
        return self.metadata.tables[self._table_map[table_name]]

    @property
    def schema(self) -> Schema | None:
        """Get the TAP_SCHEMA schema.

        Returns
        -------
        Schema or None
            The TAP_SCHEMA schema, or None if it was not loaded from the
            package resource.

        Notes
        -----
        This will only be set if the TAP_SCHEMA schema was loaded from a
        Felis package resource. In the case where the TAP_SCHEMA schema was
        reflected from an existing database, this will be None.
        """
        return self._schema

    @property
    def metadata(self) -> MetaData:
        """Get the metadata for the TAP_SCHEMA tables.

        Returns
        -------
        `~sqlalchemy.sql.schema.MetaData`
            The metadata for the TAP_SCHEMA tables.

        Notes
        -----
        This will either be the metadata that was reflected from an existing
        database or the metadata that was loaded from a Felis package resource.
        """
        return self._metadata

    @classmethod
    def get_tap_schema_std_path(cls) -> str:
        """Get the path to the standard TAP_SCHEMA schema resource.

        Returns
        -------
        str
            The path to the standard TAP_SCHEMA schema resource.
        """
        return os.path.join(os.path.dirname(__file__), "schemas", "tap_schema_std.yaml")

    @classmethod
    def get_tap_schema_std_resource(cls) -> ResourcePath:
        """Get the standard TAP_SCHEMA schema resource.

        Returns
        -------
        `~lsst.resources.ResourcePath`
            The standard TAP_SCHEMA schema resource.
        """
        return ResourcePath("resource://felis/schemas/tap_schema_std.yaml")

    @classmethod
    def get_table_names_std(cls) -> list[str]:
        """Get the standard table names for the TAP_SCHEMA tables.

        Returns
        -------
        list
            A new list with the standard table names for the TAP_SCHEMA
            tables. A copy is returned so that callers cannot accidentally
            modify the class-level definition.
        """
        return list(cls._TABLE_NAMES_STD)

    @classmethod
    def get_schema_name_std(cls) -> str:
        """Get the standard schema name for the TAP_SCHEMA tables.

        Returns
        -------
        str
            The standard schema name for the TAP_SCHEMA tables.
        """
        return cls._SCHEMA_NAME_STD

    @classmethod
    def load_schema_resource(cls) -> Schema:
        """Load the standard TAP_SCHEMA schema from a Felis package
        resource into a Felis `~felis.datamodel.Schema`.

        Returns
        -------
        Schema
            The TAP_SCHEMA schema.
        """
        rp = cls.get_tap_schema_std_resource()
        return Schema.from_uri(rp, context={"id_generation": True})

    def _load_schema(self) -> None:
        """Load the TAP_SCHEMA schema from a Felis package resource."""
        self._schema = self.load_schema_resource()

    def _create_table_map(self) -> None:
        """Create a mapping of standard table names to the table names modified
        with a postfix, as well as the prepended schema name if it is set.

        Notes
        -----
        The mapping is stored on the instance as ``_table_map``; nothing is
        returned.

        This is a private method that is called during initialization, allowing
        us to use table names like ``schemas11`` such as those used by the CADC
        TAP library instead of the standard table names. It also maps between
        the standard table names and those with the schema name prepended like
        SQLAlchemy uses.
        """
        prefix = f"{self.schema_name}." if self.apply_schema_to_metadata else ""
        self._table_map = {
            table_name: f"{prefix}{table_name}{self.table_name_postfix}"
            for table_name in TableManager.get_table_names_std()
        }
        logger.debug("Created TAP_SCHEMA table map: %s", self._table_map)

    def _check_tables(self) -> None:
        """Check that there is a valid mapping to each standard table.

        Raises
        ------
        KeyError
            If a table is missing from the table map.
        """
        for table_name in TableManager.get_table_names_std():
            # The lookup itself is the check: it raises KeyError on failure.
            self[table_name]

    def _create_schema(self, engine: Engine) -> None:
        """Create the database schema for TAP_SCHEMA if it does not already
        exist.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to create the schema.

        Raises
        ------
        NotImplementedError
            If the database dialect is not SQLite, PostgreSQL or MySQL.

        Notes
        -----
        This method only creates the schema in the database. It does not create
        the tables.
        """
        create_schema_functions = {
            "postgresql": self._create_schema_postgresql,
            "mysql": self._create_schema_mysql,
        }

        dialect_name = engine.dialect.name
        if dialect_name == "sqlite":
            # SQLite doesn't have schemas.
            return

        create_function = create_schema_functions.get(dialect_name)

        if create_function:
            with engine.begin() as connection:
                create_function(connection)
        else:
            # Some other database engine we don't currently know how to handle.
            raise NotImplementedError(
                f"Database engine '{engine.dialect.name}' is not supported for schema creation"
            )

    def _create_schema_postgresql(self, connection: Connection) -> None:
        """Create the schema in a PostgreSQL database.

        Parameters
        ----------
        connection
            The SQLAlchemy connection to use to create the schema.
        """
        connection.execute(CreateSchema(self.schema_name, if_not_exists=True))

    def _create_schema_mysql(self, connection: Connection) -> None:
        """Create the schema in a MySQL database.

        Parameters
        ----------
        connection
            The SQLAlchemy connection to use to create the schema.
        """
        # Quote the identifier with the dialect's own rules rather than
        # splicing the raw name into the statement, so that names which are
        # reserved words or contain special characters are handled safely.
        quoted_name = connection.dialect.identifier_preparer.quote_schema(self.schema_name)
        connection.execute(text(f"CREATE DATABASE IF NOT EXISTS {quoted_name}"))

    def initialize_database(self, engine: Engine) -> None:
        """Initialize a database with the TAP_SCHEMA tables.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to create the tables.
        """
        # Log the configured schema name: ``metadata.schema`` is None when the
        # schema is not applied to the metadata (e.g., for SQLite).
        logger.info("Creating TAP_SCHEMA database '%s'", self.schema_name)
        self._create_schema(engine)
        self.metadata.create_all(engine)
361
+
362
+
363
class DataLoader:
    """Load data into the TAP_SCHEMA tables.

    Parameters
    ----------
    schema
        The Felis ``Schema`` to load into the TAP_SCHEMA tables.
    mgr
        The table manager that contains the TAP_SCHEMA tables.
    engine
        The SQLAlchemy engine to use to connect to the database.
    tap_schema_index
        The index of the schema in the TAP_SCHEMA database.
    output_path
        The file to write the SQL statements to. If None, no file is
        written.
    print_sql
        If True, print the SQL statements that will be executed.
    dry_run
        If True, the data will not be loaded into the database.
    """

    def __init__(
        self,
        schema: Schema,
        mgr: TableManager,
        engine: Engine | MockConnection,
        tap_schema_index: int = 0,
        output_path: str | None = None,
        print_sql: bool = False,
        dry_run: bool = False,
    ):
        self.schema = schema
        self.mgr = mgr
        self.engine = engine
        self.tap_schema_index = tap_schema_index
        # Insert statements accumulated by the ``_insert_*`` methods.
        self.inserts: list[Insert] = []
        self.output_path = output_path
        self.print_sql = print_sql
        self.dry_run = dry_run

    def load(self) -> None:
        """Load the schema data into the TAP_SCHEMA tables.

        Notes
        -----
        This will generate inserts for the data, print the SQL statements if
        requested, save the SQL statements to a file if requested, and load the
        data into the database if not in dry run mode. These are done as
        sequential operations rather than for each insert. The logic is that
        the user may still want the complete SQL output to be printed or saved
        to a file even if loading into the database causes errors. If there are
        errors when inserting into the database, the SQLAlchemy error message
        should indicate which SQL statement caused the error.
        """
        self._generate_all_inserts()
        if self.print_sql:
            # Print to stdout.
            self._print_sql()
        if self.output_path:
            # Print to an output file.
            self._write_sql_to_file()
        if not self.dry_run:
            # Execute the inserts if not in dry run mode.
            self._execute_inserts()
        else:
            logger.info("Dry run: not loading data into database")

    def _insert_schemas(self) -> None:
        """Insert the schema data into the schemas table."""
        schema_record = {
            "schema_name": self.schema.name,
            "utype": self.schema.votable_utype,
            "description": self.schema.description,
            "schema_index": self.tap_schema_index,
        }
        self._insert("schemas", schema_record)

    def _get_table_name(self, table: datamodel.Table) -> str:
        """Get the name of the table with the schema name prepended.

        Parameters
        ----------
        table
            The table to get the name for.

        Returns
        -------
        str
            The name of the table with the schema name prepended.
        """
        return f"{self.schema.name}.{table.name}"

    def _insert_tables(self) -> None:
        """Insert the table data into the tables table."""
        for table in self.schema.tables:
            table_record = {
                "schema_name": self.schema.name,
                "table_name": self._get_table_name(table),
                "table_type": "table",
                "utype": table.votable_utype,
                "description": table.description,
                # A missing table index is stored as 0.
                "table_index": 0 if table.tap_table_index is None else table.tap_table_index,
            }
            self._insert("tables", table_record)

    def _insert_columns(self) -> None:
        """Insert the column data into the columns table."""
        for table in self.schema.tables:
            table_name = self._get_table_name(table)
            for column in table.columns:
                felis_type = FelisType.felis_type(column.datatype.value)
                arraysize = str(column.votable_arraysize) if column.votable_arraysize else None

                column_record = {
                    "table_name": table_name,
                    "column_name": column.name,
                    "datatype": felis_type.votable_name,
                    "arraysize": arraysize,
                    "size": DataLoader._get_size(column),
                    "xtype": column.votable_xtype,
                    "description": column.description,
                    "utype": column.votable_utype,
                    # The IVOA unit takes precedence over the FITS unit.
                    "unit": column.ivoa_unit or column.fits_tunit,
                    "ucd": column.ivoa_ucd,
                    "indexed": DataLoader._is_indexed(column, table),
                    "principal": column.tap_principal,
                    "std": column.tap_std,
                    "column_index": column.tap_column_index,
                }
                self._insert("columns", column_record)

    def _insert_keys(self) -> None:
        """Insert the foreign keys into the keys and key_columns tables.

        Notes
        -----
        Only the first column pair of each foreign key constraint is
        recorded.
        """
        for table in self.schema.tables:
            for constraint in table.constraints:
                if not isinstance(constraint, datamodel.ForeignKeyConstraint):
                    continue

                # NOTE(review): composite foreign keys contribute only their
                # first column pair here - confirm whether additional rows
                # in key_columns are wanted for multi-column keys.
                target_column = self.schema.find_object_by_id(
                    constraint.referenced_columns[0], datamodel.Column
                )
                target_table = self.schema.get_table_by_column(target_column)

                # Handle keys table
                key_record = {
                    "key_id": constraint.name,
                    "from_table": self._get_table_name(table),
                    "target_table": self._get_table_name(target_table),
                    "description": constraint.description,
                    "utype": constraint.votable_utype,
                }
                self._insert("keys", key_record)

                # Handle key_columns table
                from_column = self.schema.find_object_by_id(constraint.columns[0], datamodel.Column)
                key_columns_record = {
                    "key_id": constraint.name,
                    "from_column": from_column.name,
                    "target_column": target_column.name,
                }
                self._insert("key_columns", key_columns_record)

    def _generate_all_inserts(self) -> None:
        """Generate the inserts for all the data."""
        # Start from a clean slate so that repeated calls do not duplicate
        # statements.
        self.inserts.clear()
        self._insert_schemas()
        self._insert_tables()
        self._insert_columns()
        self._insert_keys()
        logger.debug("Generated %d insert statements", len(self.inserts))

    def _execute_inserts(self) -> None:
        """Load the `~felis.datamodel.Schema` data into the TAP_SCHEMA
        tables.

        Raises
        ------
        Exception
            Any error raised while connecting or executing the inserts is
            logged and re-raised after the transaction has been rolled back.

        Notes
        -----
        All inserts are executed in a single transaction. If the engine is
        not a real `~sqlalchemy.engine.Engine` (e.g., a mock connection),
        nothing is executed and a warning is logged.
        """
        if not isinstance(self.engine, Engine):
            logger.warning("Engine is not connected to a database: no data was loaded")
            return

        try:
            # ``begin()`` commits on success and rolls back on any exception.
            with self.engine.begin() as connection:
                for insert in self.inserts:
                    connection.execute(insert)
        except Exception as e:
            logger.error("Error loading data into database: %s", e)
            raise

    def _compiled_inserts(self) -> list[str]:
        """Compile the inserts to SQL.

        Returns
        -------
        list
            A list of the compiled insert statements.
        """
        return [
            # ``literal_binds`` renders the values inline instead of as bind
            # parameters, producing self-contained SQL text.
            str(insert.compile(self.engine, compile_kwargs={"literal_binds": True}))
            for insert in self.inserts
        ]

    def _print_sql(self) -> None:
        """Print the generated inserts to stdout."""
        for insert_str in self._compiled_inserts():
            print(insert_str)

    def _write_sql_to_file(self) -> None:
        """Write the generated insert statements to a file.

        Raises
        ------
        ValueError
            If no output path was specified.
        """
        if not self.output_path:
            raise ValueError("No output path specified")
        # Use an explicit encoding so that non-ASCII descriptions or units are
        # written identically regardless of the platform default.
        with open(self.output_path, "w", encoding="utf-8") as outfile:
            outfile.writelines(f"{insert_str}\n" for insert_str in self._compiled_inserts())

    def _insert(self, table_name: str, record: list[Any] | dict[str, Any]) -> None:
        """Generate an insert statement for a record.

        Parameters
        ----------
        table_name
            The name of the table to insert the record into.
        record
            The record to insert into the table.
        """
        table = self.mgr[table_name]
        insert_statement = table.insert().values(record)
        self.inserts.append(insert_statement)

    @staticmethod
    def _get_size(column: datamodel.Column) -> int | None:
        """Get the size of the column.

        Parameters
        ----------
        column
            The column to get the size for.

        Returns
        -------
        int or None
            The size of the column or None if not applicable.

        Notes
        -----
        The size is taken from the column's ``votable_arraysize`` when it is
        a plain ASCII integer (``"16"``) or an integer followed by an
        asterisk (``"16*"``). Any other form yields None.
        """
        arraysize = column.votable_arraysize
        if not arraysize:
            return None

        # Match ASCII digits only: ``str.isdigit`` also accepts characters
        # such as superscripts that ``int`` cannot convert.
        match = re.fullmatch(r"([0-9]+)\*?", str(arraysize))
        return int(match.group(1)) if match else None

    @staticmethod
    def _is_indexed(column: datamodel.Column, table: datamodel.Table) -> int:
        """Check if the column is indexed in the table.

        Parameters
        ----------
        column
            The column to check.
        table
            The table to check.

        Returns
        -------
        int
            1 if the column is indexed, 0 otherwise.

        Notes
        -----
        A column counts as indexed if it is the table's single-column primary
        key or the only column of one of the table's indexes.
        """
        if isinstance(table.primary_key, str) and table.primary_key == column.id:
            return 1
        for index in table.indexes:
            if index.columns and len(index.columns) == 1 and index.columns[0] == column.id:
                return 1
        return 0