lsst-felis 26.2024.900__py3-none-any.whl → 29.2025.4500__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- felis/__init__.py +10 -24
- felis/cli.py +437 -341
- felis/config/tap_schema/columns.csv +33 -0
- felis/config/tap_schema/key_columns.csv +8 -0
- felis/config/tap_schema/keys.csv +8 -0
- felis/config/tap_schema/schemas.csv +2 -0
- felis/config/tap_schema/tables.csv +6 -0
- felis/config/tap_schema/tap_schema_std.yaml +273 -0
- felis/datamodel.py +1386 -193
- felis/db/dialects.py +116 -0
- felis/db/schema.py +62 -0
- felis/db/sqltypes.py +275 -48
- felis/db/utils.py +409 -0
- felis/db/variants.py +159 -0
- felis/diff.py +234 -0
- felis/metadata.py +385 -0
- felis/tap_schema.py +767 -0
- felis/tests/__init__.py +0 -0
- felis/tests/postgresql.py +134 -0
- felis/tests/run_cli.py +79 -0
- felis/types.py +57 -9
- lsst_felis-29.2025.4500.dist-info/METADATA +38 -0
- lsst_felis-29.2025.4500.dist-info/RECORD +31 -0
- {lsst_felis-26.2024.900.dist-info → lsst_felis-29.2025.4500.dist-info}/WHEEL +1 -1
- {lsst_felis-26.2024.900.dist-info → lsst_felis-29.2025.4500.dist-info/licenses}/COPYRIGHT +1 -1
- felis/check.py +0 -381
- felis/simple.py +0 -424
- felis/sql.py +0 -275
- felis/tap.py +0 -433
- felis/utils.py +0 -100
- felis/validation.py +0 -103
- felis/version.py +0 -2
- felis/visitor.py +0 -180
- lsst_felis-26.2024.900.dist-info/METADATA +0 -28
- lsst_felis-26.2024.900.dist-info/RECORD +0 -23
- {lsst_felis-26.2024.900.dist-info → lsst_felis-29.2025.4500.dist-info}/entry_points.txt +0 -0
- {lsst_felis-26.2024.900.dist-info → lsst_felis-29.2025.4500.dist-info/licenses}/LICENSE +0 -0
- {lsst_felis-26.2024.900.dist-info → lsst_felis-29.2025.4500.dist-info}/top_level.txt +0 -0
- {lsst_felis-26.2024.900.dist-info → lsst_felis-29.2025.4500.dist-info}/zip-safe +0 -0
felis/tap_schema.py
ADDED
|
@@ -0,0 +1,767 @@
|
|
|
1
|
+
"""Provides utilities for creating and populating the TAP_SCHEMA database."""
|
|
2
|
+
|
|
3
|
+
# This file is part of felis.
|
|
4
|
+
#
|
|
5
|
+
# Developed for the LSST Data Management System.
|
|
6
|
+
# This product includes software developed by the LSST Project
|
|
7
|
+
# (https://www.lsst.org).
|
|
8
|
+
# See the COPYRIGHT file at the top-level directory of this distribution
|
|
9
|
+
# for details of code ownership.
|
|
10
|
+
#
|
|
11
|
+
# This program is free software: you can redistribute it and/or modify
|
|
12
|
+
# it under the terms of the GNU General Public License as published by
|
|
13
|
+
# the Free Software Foundation, either version 3 of the License, or
|
|
14
|
+
# (at your option) any later version.
|
|
15
|
+
#
|
|
16
|
+
# This program is distributed in the hope that it will be useful,
|
|
17
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
+
# GNU General Public License for more details.
|
|
20
|
+
#
|
|
21
|
+
# You should have received a copy of the GNU General Public License
|
|
22
|
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
23
|
+
|
|
24
|
+
import csv
|
|
25
|
+
import io
|
|
26
|
+
import logging
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
from typing import Any
|
|
30
|
+
|
|
31
|
+
from lsst.resources import ResourcePath
|
|
32
|
+
from sqlalchemy import MetaData, Table, select, text
|
|
33
|
+
from sqlalchemy.engine import Connection, Engine
|
|
34
|
+
from sqlalchemy.engine.mock import MockConnection
|
|
35
|
+
from sqlalchemy.exc import SQLAlchemyError
|
|
36
|
+
from sqlalchemy.schema import CreateSchema
|
|
37
|
+
from sqlalchemy.sql.dml import Insert
|
|
38
|
+
|
|
39
|
+
from felis import datamodel
|
|
40
|
+
from felis.datamodel import Constraint, Schema
|
|
41
|
+
from felis.db.utils import is_valid_engine
|
|
42
|
+
from felis.metadata import MetaDataBuilder
|
|
43
|
+
|
|
44
|
+
from .types import FelisType
|
|
45
|
+
|
|
46
|
+
__all__ = ["DataLoader", "TableManager"]
|
|
47
|
+
|
|
48
|
+
logger = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class TableManager:
    """Manage creation of TAP_SCHEMA tables.

    Parameters
    ----------
    engine
        The SQLAlchemy engine for reflecting the TAP_SCHEMA tables from an
        existing database.
        This can be a mock connection or None, in which case the internal
        TAP_SCHEMA schema will be used by loading an internal YAML file.
    schema_name
        The name of the schema to use for the TAP_SCHEMA tables.
        Leave as None to use the standard name of "TAP_SCHEMA".
    apply_schema_to_metadata
        If True, apply the schema to the metadata as well as the tables.
        If False, these will be set to None, e.g., for sqlite.
    table_name_postfix
        A string to append to all the standard table names.
        This needs to be used in a way such that the resultant table names
        map to tables within the TAP_SCHEMA database.

    Notes
    -----
    The TAP_SCHEMA schema must either have been created already, in which case
    the ``engine`` should be provided. Or the internal TAP_SCHEMA schema will
    be used if ``engine`` is None or a ``MockConnection``.
    """

    _TABLE_NAMES_STD = ["schemas", "tables", "columns", "keys", "key_columns"]
    """The standard table names for the TAP_SCHEMA tables."""

    _SCHEMA_NAME_STD = "TAP_SCHEMA"
    """The standard schema name for the TAP_SCHEMA tables."""

    def __init__(
        self,
        engine: Engine | MockConnection | None = None,
        schema_name: str | None = None,
        apply_schema_to_metadata: bool = True,
        table_name_postfix: str = "",
    ):
        """Initialize the table manager."""
        self.table_name_postfix = table_name_postfix
        self.apply_schema_to_metadata = apply_schema_to_metadata
        self.schema_name = schema_name or TableManager._SCHEMA_NAME_STD

        if is_valid_engine(engine):
            assert isinstance(engine, Engine)
            if table_name_postfix != "":
                # The postfix only applies when building tables from the
                # internal YAML schema; reflected tables keep their
                # database names.
                logger.warning(
                    "Table name postfix '%s' will be ignored when reflecting TAP_SCHEMA database",
                    table_name_postfix,
                )
            logger.debug(
                "Reflecting TAP_SCHEMA database from existing database at %s",
                engine.url._replace(password="***"),
            )
            self._reflect(engine)
        else:
            self._load_yaml()

        self._create_table_map()
        self._check_tables()

    def _reflect(self, engine: Engine) -> None:
        """Reflect the TAP_SCHEMA database tables into the metadata.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to reflect the tables.

        Raises
        ------
        sqlalchemy.exc.SQLAlchemyError
            If reflection of the existing database fails.
        """
        self._metadata = MetaData(schema=self.schema_name if self.apply_schema_to_metadata else None)
        try:
            self.metadata.reflect(bind=engine)
        except SQLAlchemyError as e:
            logger.error("Error reflecting TAP_SCHEMA database: %s", e)
            raise

    def _load_yaml(self) -> None:
        """Load the standard TAP_SCHEMA schema from a Felis package
        resource.
        """
        self._load_schema()
        if self.schema_name != TableManager._SCHEMA_NAME_STD:
            # A non-standard schema name was requested; rename the schema.
            self.schema.name = self.schema_name
        else:
            self.schema_name = self.schema.name

        self._metadata = MetaDataBuilder(
            self.schema,
            apply_schema_to_metadata=self.apply_schema_to_metadata,
            table_name_postfix=self.table_name_postfix,
        ).build()

        logger.debug("Loaded TAP_SCHEMA '%s' from YAML resource", self.schema_name)

    def __getitem__(self, table_name: str) -> Table:
        """Get one of the TAP_SCHEMA tables by its standard TAP_SCHEMA name.

        Parameters
        ----------
        table_name
            The name of the table to get.

        Returns
        -------
        Table
            The table with the given name.

        Raises
        ------
        KeyError
            If the table name is not a standard TAP_SCHEMA table name.

        Notes
        -----
        This implements array semantics for the table manager, allowing
        tables to be accessed by their standard TAP_SCHEMA names.
        """
        if table_name not in self._table_map:
            raise KeyError(f"Table '{table_name}' not found in TAP_SCHEMA")
        return self.metadata.tables[self._table_map[table_name]]

    @property
    def schema(self) -> Schema:
        """Get the TAP_SCHEMA schema.

        Returns
        -------
        Schema
            The TAP_SCHEMA schema.

        Notes
        -----
        This will only be set if the TAP_SCHEMA schema was loaded from a
        Felis package resource. In the case where the TAP_SCHEMA schema was
        reflected from an existing database, this will be None.
        """
        return self._schema

    @property
    def metadata(self) -> MetaData:
        """Get the metadata for the TAP_SCHEMA tables.

        Returns
        -------
        `~sqlalchemy.sql.schema.MetaData`
            The metadata for the TAP_SCHEMA tables.

        Notes
        -----
        This will either be the metadata that was reflected from an existing
        database or the metadata that was loaded from a Felis package resource.
        """
        return self._metadata

    @classmethod
    def get_tap_schema_std_path(cls) -> str:
        """Get the path to the standard TAP_SCHEMA schema resource.

        Returns
        -------
        str
            The path to the standard TAP_SCHEMA schema resource.
        """
        return os.path.join(os.path.dirname(__file__), "config", "tap_schema", "tap_schema_std.yaml")

    @classmethod
    def get_tap_schema_std_resource(cls) -> ResourcePath:
        """Get the standard TAP_SCHEMA schema resource.

        Returns
        -------
        `~lsst.resources.ResourcePath`
            The standard TAP_SCHEMA schema resource.
        """
        return ResourcePath("resource://felis/config/tap_schema/tap_schema_std.yaml")

    @classmethod
    def get_table_names_std(cls) -> list[str]:
        """Get the standard table names for the TAP_SCHEMA tables.

        Returns
        -------
        list
            The standard table names for the TAP_SCHEMA tables.
        """
        return cls._TABLE_NAMES_STD

    @classmethod
    def get_schema_name_std(cls) -> str:
        """Get the standard schema name for the TAP_SCHEMA tables.

        Returns
        -------
        str
            The standard schema name for the TAP_SCHEMA tables.
        """
        return cls._SCHEMA_NAME_STD

    @classmethod
    def load_schema_resource(cls) -> Schema:
        """Load the standard TAP_SCHEMA schema from a Felis package
        resource into a Felis `~felis.datamodel.Schema`.

        Returns
        -------
        Schema
            The TAP_SCHEMA schema.
        """
        rp = cls.get_tap_schema_std_resource()
        return Schema.from_uri(rp, context={"id_generation": True})

    def _load_schema(self) -> None:
        """Load the TAP_SCHEMA schema from a Felis package resource."""
        self._schema = self.load_schema_resource()

    def _create_table_map(self) -> None:
        """Create a mapping of standard table names to the table names modified
        with a postfix, as well as the prepended schema name if it is set.

        Notes
        -----
        This is a private method that is called during initialization, allowing
        us to use table names like ``schemas11`` such as those used by the CADC
        TAP library instead of the standard table names. It also maps between
        the standard table names and those with the schema name prepended like
        SQLAlchemy uses.
        """
        self._table_map = {
            table_name: (
                f"{self.schema_name + '.' if self.apply_schema_to_metadata else ''}"
                f"{table_name}{self.table_name_postfix}"
            )
            for table_name in TableManager.get_table_names_std()
        }
        logger.debug("Created TAP_SCHEMA table map: %s", self._table_map)

    def _check_tables(self) -> None:
        """Check that there is a valid mapping to each standard table.

        Raises
        ------
        KeyError
            If a table is missing from the table map.
        """
        for table_name in TableManager.get_table_names_std():
            # Raises KeyError via __getitem__ if the mapping is incomplete.
            self[table_name]

    def _create_schema(self, engine: Engine) -> None:
        """Create the database schema for TAP_SCHEMA if it does not already
        exist.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to create the schema.

        Raises
        ------
        NotImplementedError
            If the database dialect does not support schema creation here.

        Notes
        -----
        This method only creates the schema in the database. It does not create
        the tables.
        """
        create_schema_functions = {
            "postgresql": self._create_schema_postgresql,
            "mysql": self._create_schema_mysql,
        }

        dialect_name = engine.dialect.name
        if dialect_name == "sqlite":
            # SQLite doesn't have schemas.
            return

        create_function = create_schema_functions.get(dialect_name)

        if create_function:
            with engine.begin() as connection:
                create_function(connection)
        else:
            # Some other database engine we don't currently know how to handle.
            raise NotImplementedError(
                f"Database engine '{engine.dialect.name}' is not supported for schema creation"
            )

    def _create_schema_postgresql(self, connection: Connection) -> None:
        """Create the schema in a PostgreSQL database.

        Parameters
        ----------
        connection
            The SQLAlchemy connection to use to create the schema.
        """
        connection.execute(CreateSchema(self.schema_name, if_not_exists=True))

    def _create_schema_mysql(self, connection: Connection) -> None:
        """Create the schema in a MySQL database.

        Parameters
        ----------
        connection
            The SQLAlchemy connection to use to create the schema.
        """
        connection.execute(text(f"CREATE DATABASE IF NOT EXISTS {self.schema_name}"))

    def initialize_database(self, engine: Engine) -> None:
        """Initialize a database with the TAP_SCHEMA tables.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to create the tables.
        """
        logger.info("Creating TAP_SCHEMA database '%s'", self.schema_name)
        self._create_schema(engine)
        self.metadata.create_all(engine)

    def select(self, engine: Engine, table_name: str, filter_condition: str = "") -> list[dict[str, Any]]:
        """Select all rows from a TAP_SCHEMA table with an optional filter
        condition.

        Parameters
        ----------
        engine
            The SQLAlchemy engine to use to connect to the database.
        table_name
            The name of the table to select from.
        filter_condition
            The filter condition as a string. If empty, no filter will be
            applied.

        Returns
        -------
        list
            A list of dictionaries containing the rows from the table.
        """
        table = self[table_name]
        query = select(table)
        if filter_condition:
            query = query.where(text(filter_condition))
        with engine.connect() as connection:
            result = connection.execute(query)
            rows = [dict(row._mapping) for row in result]
        return rows
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
class DataLoader:
    """Load data into the TAP_SCHEMA tables.

    Parameters
    ----------
    schema
        The Felis ``Schema`` to load into the TAP_SCHEMA tables.
    mgr
        The table manager that contains the TAP_SCHEMA tables.
    engine
        The SQLAlchemy engine to use to connect to the database.
    tap_schema_index
        The index of the schema in the TAP_SCHEMA database.
    output_path
        The file to write the SQL statements to. If None, printing will be
        suppressed.
    print_sql
        If True, print the SQL statements that will be executed.
    dry_run
        If True, the data will not be loaded into the database.
    unique_keys
        If True, prepend the schema name to the key name to make it unique
        when loading data into the keys and key_columns tables.
    """

    def __init__(
        self,
        schema: Schema,
        mgr: TableManager,
        engine: Engine | MockConnection,
        tap_schema_index: int = 0,
        output_path: str | None = None,
        print_sql: bool = False,
        dry_run: bool = False,
        unique_keys: bool = False,
    ):
        self.schema = schema
        self.mgr = mgr
        self.engine = engine
        self.tap_schema_index = tap_schema_index
        self.inserts: list[Insert] = []
        self.output_path = output_path
        self.print_sql = print_sql
        self.dry_run = dry_run
        self.unique_keys = unique_keys

    def load(self) -> None:
        """Load the schema data into the TAP_SCHEMA tables.

        Notes
        -----
        This will generate inserts for the data, print the SQL statements if
        requested, save the SQL statements to a file if requested, and load the
        data into the database if not in dry run mode. These are done as
        sequential operations rather than for each insert. The logic is that
        the user may still want the complete SQL output to be printed or saved
        to a file even if loading into the database causes errors. If there are
        errors when inserting into the database, the SQLAlchemy error message
        should indicate which SQL statement caused the error.
        """
        self._generate_all_inserts()
        if self.print_sql:
            # Print to stdout.
            self._print_sql()
        if self.output_path:
            # Print to an output file.
            self._write_sql_to_file()
        if not self.dry_run:
            # Execute the inserts if not in dry run mode.
            self._execute_inserts()
        else:
            logger.info("Dry run - not loading data into database")

    def _insert_schemas(self) -> None:
        """Insert the schema data into the ``schemas`` table."""
        schema_record = {
            "schema_name": self.schema.name,
            "utype": self.schema.votable_utype,
            "description": self.schema.description,
            "schema_index": self.tap_schema_index,
        }
        self._insert("schemas", schema_record)

    def _get_table_name(self, table: datamodel.Table) -> str:
        """Get the name of the table with the schema name prepended.

        Parameters
        ----------
        table
            The table to get the name for.

        Returns
        -------
        str
            The name of the table with the schema name prepended.
        """
        return f"{self.schema.name}.{table.name}"

    def _insert_tables(self) -> None:
        """Insert the table data into the ``tables`` table."""
        for table in self.schema.tables:
            table_record = {
                "schema_name": self.schema.name,
                "table_name": self._get_table_name(table),
                "table_type": "table",
                "utype": table.votable_utype,
                "description": table.description,
                "table_index": 0 if table.tap_table_index is None else table.tap_table_index,
            }
            self._insert("tables", table_record)

    def _insert_columns(self) -> None:
        """Insert the column data into the ``columns`` table."""
        for table in self.schema.tables:
            for column in table.columns:
                felis_type = FelisType.felis_type(column.datatype.value)
                arraysize = str(column.votable_arraysize) if column.votable_arraysize else None
                size = DataLoader._get_size(column)
                indexed = DataLoader._is_indexed(column, table)
                tap_column_index = column.tap_column_index
                # Prefer the IVOA unit; fall back to the FITS TUNIT value.
                unit = column.ivoa_unit or column.fits_tunit

                column_record = {
                    "table_name": self._get_table_name(table),
                    "column_name": column.name,
                    "datatype": felis_type.votable_name,
                    "arraysize": arraysize,
                    "size": size,
                    "xtype": column.votable_xtype,
                    "description": column.description,
                    "utype": column.votable_utype,
                    "unit": unit,
                    "ucd": column.ivoa_ucd,
                    "indexed": indexed,
                    "principal": column.tap_principal,
                    "std": column.tap_std,
                    "column_index": tap_column_index,
                }
                self._insert("columns", column_record)

    def _get_key(self, constraint: Constraint) -> str:
        """Get the key name for a constraint.

        Parameters
        ----------
        constraint
            The constraint to get the key name for.

        Returns
        -------
        str
            The key name for the constraint.

        Notes
        -----
        This will prepend the name of the schema to the key name if the
        `unique_keys` attribute is set to True. Otherwise, it will just return
        the name of the constraint.
        """
        if self.unique_keys:
            key_id = f"{self.schema.name}_{constraint.name}"
            logger.debug("Generated unique key_id: %s -> %s", constraint.name, key_id)
        else:
            key_id = constraint.name
        return key_id

    def _insert_keys(self) -> None:
        """Insert the foreign keys into the ``keys`` and ``key_columns``
        tables.
        """
        for table in self.schema.tables:
            for constraint in table.constraints:
                if isinstance(constraint, datamodel.ForeignKeyConstraint):
                    ###########################################################
                    # Handle keys table
                    ###########################################################
                    referenced_column = self.schema.find_object_by_id(
                        constraint.referenced_columns[0], datamodel.Column
                    )
                    referenced_table = self.schema.get_table_by_column(referenced_column)
                    key_id = self._get_key(constraint)
                    key_record = {
                        "key_id": key_id,
                        "from_table": self._get_table_name(table),
                        "target_table": self._get_table_name(referenced_table),
                        "description": constraint.description,
                        "utype": constraint.votable_utype,
                    }
                    self._insert("keys", key_record)

                    ###########################################################
                    # Handle key_columns table
                    ###########################################################
                    # Loop over the corresponding columns and referenced
                    # columns and insert a record for each pair. This is
                    # necessary for proper handling of composite keys.
                    for from_column_id, target_column_id in zip(
                        constraint.columns, constraint.referenced_columns
                    ):
                        from_column = self.schema.find_object_by_id(from_column_id, datamodel.Column)
                        target_column = self.schema.find_object_by_id(target_column_id, datamodel.Column)
                        key_columns_record = {
                            "key_id": key_id,
                            "from_column": from_column.name,
                            "target_column": target_column.name,
                        }
                        self._insert("key_columns", key_columns_record)

    def _generate_all_inserts(self) -> None:
        """Generate the inserts for all the data."""
        self.inserts.clear()
        self._insert_schemas()
        self._insert_tables()
        self._insert_columns()
        self._insert_keys()
        logger.debug("Generated %d insert statements", len(self.inserts))

    def _execute_inserts(self) -> None:
        """Load the `~felis.datamodel.Schema` data into the TAP_SCHEMA
        tables.

        Notes
        -----
        A mock connection is skipped silently; only a real ``Engine``
        executes the inserts. The ``engine.begin()`` context manager
        commits on success and rolls back automatically when an exception
        propagates, so no manual transaction bookkeeping is needed.
        """
        if isinstance(self.engine, Engine):
            try:
                with self.engine.begin() as connection:
                    for insert in self.inserts:
                        connection.execute(insert)
            except Exception as e:
                logger.error("Error loading data into database: %s", e)
                raise

    def _compiled_inserts(self) -> list[str]:
        """Compile the inserts to SQL.

        Returns
        -------
        list
            A list of the compiled insert statements.
        """
        return [
            str(insert.compile(self.engine, compile_kwargs={"literal_binds": True}))
            for insert in self.inserts
        ]

    def _print_sql(self) -> None:
        """Print the generated inserts to stdout."""
        for insert_str in self._compiled_inserts():
            print(insert_str + ";")

    def _write_sql_to_file(self) -> None:
        """Write the generated insert statements to a file.

        Raises
        ------
        ValueError
            If no output path was specified.
        """
        if not self.output_path:
            raise ValueError("No output path specified")
        # Use an explicit encoding so the output does not depend on locale.
        with open(self.output_path, "w", encoding="utf-8") as outfile:
            for insert_str in self._compiled_inserts():
                outfile.write(insert_str + ";" + "\n")

    def _insert(self, table_name: str, record: list[Any] | dict[str, Any]) -> None:
        """Generate an insert statement for a record.

        Parameters
        ----------
        table_name
            The name of the table to insert the record into.
        record
            The record to insert into the table.
        """
        table = self.mgr[table_name]
        insert_statement = table.insert().values(record)
        self.inserts.append(insert_statement)

    @staticmethod
    def _get_size(column: datamodel.Column) -> int | None:
        """Get the size of the column.

        Parameters
        ----------
        column
            The column to get the size for.

        Returns
        -------
        int or None
            The size of the column or None if not applicable.
        """
        arraysize = column.votable_arraysize

        if not arraysize:
            return None

        arraysize_str = str(arraysize)
        if arraysize_str.isdigit():
            # A plain integer arraysize, e.g. "32".
            return int(arraysize_str)

        # A bounded variable-length arraysize, e.g. "32*".
        match = re.match(r"^([0-9]+)\*$", arraysize_str)
        if match and match.group(1) is not None:
            return int(match.group(1))

        return None

    @staticmethod
    def _is_indexed(column: datamodel.Column, table: datamodel.Table) -> int:
        """Check if the column is indexed in the table.

        Parameters
        ----------
        column
            The column to check.
        table
            The table to check.

        Returns
        -------
        int
            1 if the column is indexed, 0 otherwise.

        Notes
        -----
        Only single-column indexes and a single-column primary key count;
        membership in a composite index does not mark a column as indexed.
        """
        if isinstance(table.primary_key, str) and table.primary_key == column.id:
            return 1
        for index in table.indexes:
            if index.columns and len(index.columns) == 1 and index.columns[0] == column.id:
                return 1
        return 0
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
class MetadataInserter:
    """Insert TAP_SCHEMA self-description rows into the database.

    Parameters
    ----------
    mgr
        The table manager that contains the TAP_SCHEMA tables.
    engine
        The engine for connecting to the TAP_SCHEMA database.
    """

    def __init__(self, mgr: TableManager, engine: Engine):
        """Initialize the metadata inserter.

        Parameters
        ----------
        mgr
            The table manager representing the TAP_SCHEMA tables.
        engine
            The SQLAlchemy engine for connecting to the database.
        """
        self._mgr = mgr
        self._engine = engine

    def insert_metadata(self) -> None:
        """Insert the TAP_SCHEMA metadata into the database.

        Notes
        -----
        The rows for each standard table are read from CSV resources bundled
        with the Felis package. All inserts run inside a single transaction
        so that a failure cannot leave the metadata partially loaded.
        """
        # One transaction for the whole operation: either every table's
        # metadata is inserted or none of it is.
        with self._engine.begin() as conn:
            for table_name in self._mgr.get_table_names_std():
                table = self._mgr[table_name]
                csv_bytes = ResourcePath(f"resource://felis/config/tap_schema/{table_name}.csv").read()
                text_stream = io.TextIOWrapper(io.BytesIO(csv_bytes), encoding="utf-8")
                reader = csv.reader(text_stream)
                headers = next(reader)
                # "\N" is the conventional NULL marker in these CSV resources.
                rows = [
                    {key: None if value == "\\N" else value for key, value in zip(headers, row)}
                    for row in reader
                ]
                logger.debug(
                    "Inserting %d rows into table '%s' with headers: %s",
                    len(rows),
                    table_name,
                    headers,
                )
                # Guard against an empty CSV: executemany with an empty
                # parameter list is an error in SQLAlchemy.
                if rows:
                    conn.execute(table.insert(), rows)
|