thds.tabularasa 0.13.1__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__main__.py +5 -3
- thds/tabularasa/data_dependencies/adls.py +2 -0
- thds/tabularasa/data_dependencies/sqlite.py +6 -1
- thds/tabularasa/loaders/sqlite_util.py +4 -18
- thds/tabularasa/testing/__init__.py +5 -0
- thds/tabularasa/testing/mock_sqlite.py +114 -0
- {thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/METADATA +1 -1
- {thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/RECORD +11 -9
- {thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/WHEEL +0 -0
- {thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/entry_points.txt +0 -0
- {thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/top_level.txt +0 -0
thds/tabularasa/__main__.py
CHANGED
@@ -943,16 +943,18 @@ class ReferenceDataManager:
                 return None
             raise IOError(table.name)
 
-        failed:
+        failed: list[tuple[str, Exception]] = []
         synced: List[str] = []
         for table_name, res in parallel.yield_all([(t.name, partial(inner, t)) for t in tables_to_sync]):
            if isinstance(res, parallel.Error):
-                failed.append(table_name)
+                failed.append((table_name, res.error))
            elif res is not None:
                synced.append(table_name)
 
        if failed:
-
+            first_exc = failed[0][1]
+            table_names = [name for name, _ in failed]
+            raise RuntimeError(f"Sync failed for tables {', '.join(table_names)}") from first_exc
 
        down_ = (
            f"to local build directory {pkg_resources.resource_filename(self.package, self.package_data_dir)}"
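Note: the change above keeps (table name, exception) pairs instead of bare names so the final error can both list every failed table and chain a concrete cause. A minimal, self-contained sketch of that aggregation pattern (the run_all function and its inputs are illustrative, not tabularasa code):

from typing import Callable, List, Tuple


def run_all(tasks: List[Tuple[str, Callable[[], None]]]) -> None:
    # Collect (name, exception) pairs so the failure report can name every
    # failing task and still chain one real traceback as the cause.
    failed: List[Tuple[str, Exception]] = []
    for name, task in tasks:
        try:
            task()
        except Exception as exc:
            failed.append((name, exc))

    if failed:
        first_exc = failed[0][1]
        names = [name for name, _ in failed]
        # `from first_exc` preserves the original traceback as __cause__.
        raise RuntimeError(f"{len(names)} task(s) failed: {', '.join(names)}") from first_exc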
thds/tabularasa/data_dependencies/adls.py
CHANGED
@@ -6,6 +6,7 @@ from typing import List, Optional, Protocol, Union
 import attr
 
 from thds.adls import ADLSFileSystem, fqn
+from thds.core import fretry
 from thds.tabularasa.schema.files import ADLSDataSpec, RemoteBlobStoreSpec
 
 CACHE_DIR = ".cache/"
@@ -40,6 +41,7 @@ def adls_filesystem(account: str, filesystem: str, cache_dir: Optional[Union[Pat
     return ADLSFileSystem(account, filesystem, cache_dir=cache_dir)
 
 
+@fretry.retry_regular(fretry.is_exc(Exception), fretry.n_times(3))
 def sync_adls_data(
     adls_spec: ADLSDataSpec, cache_dir: Optional[Union[Path, str]] = CACHE_DIR
 ) -> List[ADLSDownloadResult]:
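Note: the added decorator retries the ADLS sync on any exception, up to three attempts. A sketch of applying the same decorator call to an arbitrary flaky function (the function itself is a hypothetical stand-in; the exact backoff behavior is defined by thds.core.fretry):

from thds.core import fretry


# Same decorator call that the diff adds to sync_adls_data: retry on any
# Exception, for at most 3 attempts in total.
@fretry.retry_regular(fretry.is_exc(Exception), fretry.n_times(3))
def flaky_download(url: str) -> bytes:
    # hypothetical body: a transient network error raised here would be retried
    ...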
thds/tabularasa/data_dependencies/sqlite.py
CHANGED
@@ -222,7 +222,9 @@ def populate_sqlite_db(
     table_predicate: Callable[[Table], bool] = is_build_time_package_table,
     data_path_overrides: Optional[Mapping[str, Path]] = None,
 ):
-    """Populate a sqlite database with data for a set of tables from a `reference_data.schema.Schema
+    """Populate a sqlite database with data for a set of tables from a `reference_data.schema.Schema`.
+    Note that this can safely be called concurrently in multiple processes on the same database file; a file lock
+    is acquired on the database file and only released when the data insertion is complete.
 
     :param schema: the `reference_data.schema.Schema` object defining the data to be inserted
     :param db_package: name of the package where the database file is stored, if any. In case `None` is
@@ -259,6 +261,9 @@ def populate_sqlite_db(
     # gather all tables before executing any I/O
     insert_tables = [table for table in schema.filter_tables(table_predicate) if table.has_indexes]
 
+    if not insert_tables:
+        return
+
     with bulk_write_connection(db_path, db_package, close=True) as con:
         for table in insert_tables:
             table_filename: Optional[str]
thds/tabularasa/loaders/sqlite_util.py
CHANGED
@@ -22,7 +22,6 @@ from thds.tabularasa.sqlite3_compat import sqlite3
 
 DEFAULT_ATTR_SQLITE_CACHE_SIZE = 100_000
 DEFAULT_MMAP_BYTES = int(os.environ.get("TABULA_RASA_DEFAULT_MMAP_BYTES", 8_589_934_592))  # 8 GB
-DISABLE_WAL_MODE = bool(os.environ.get("REF_D_DISABLE_SQLITE_WAL_MODE", False))
 
 PARAMETERIZABLE_BUILTINS = sys.version_info >= (3, 9)
 
@@ -159,11 +158,6 @@ def set_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
     logger.debug("Setting pragmas for bulk write optimization")
     # https://www.sqlite.org/pragma.html#pragma_synchronous
     _log_exec_sql(logger, con, "PRAGMA synchronous = 0")  # OFF
-    # https://www.sqlite.org/pragma.html#pragma_journal_mode
-    if not DISABLE_WAL_MODE:
-        _log_exec_sql(logger, con, "PRAGMA journal_mode = WAL")
-    # https://www.sqlite.org/pragma.html#pragma_locking_mode
-    _log_exec_sql(logger, con, "PRAGMA locking_mode = EXCLUSIVE")
 
     return con
 
@@ -171,16 +165,7 @@ def set_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
 def unset_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
     logger = logging.getLogger(__name__)
     logger.debug("Setting pragmas for bulk write optimization")
-    # https://www.sqlite.org/pragma.html#pragma_journal_mode
-    # resetting this to the default. This is a property of the database, rather than the connection.
-    # the other settings are connection-specific.
-    # according to the docs, the WAL journal mode should be disabled before the locking mode is restored,
-    # else any attempt to do so is a no-op.
-    _log_exec_sql(logger, con, "PRAGMA journal_mode = DELETE")
-    # https://www.sqlite.org/pragma.html#pragma_synchronous
     _log_exec_sql(logger, con, "PRAGMA synchronous = 2")  # FULL (default)
-    # https://www.sqlite.org/pragma.html#pragma_locking_mode
-    _log_exec_sql(logger, con, "PRAGMA locking_mode = NORMAL")
 
     return con
 
@@ -191,9 +176,10 @@ def bulk_write_connection(
 ) -> ty.Generator[sqlite3.Connection, None, None]:
     """Context manager to set/unset bulk write mode on a sqlite connection. Sets pragmas for efficient bulk writes,
     such as loosening synchronous and locking modes. If `close` is True, the connection will be closed on exit.
-
-
-
+    To avoid bulk insert routines being run by other processes concurrently, we also acquire a file lock on the
+    database file on entry and release it on exit. Other processes attempting to perform bulk writes to the same file
+    will block until the lock is released. In the case of tabularasa init-sqlite, the semantics then imply that those
+    workers will perform no writes at all, since metadata will indicate that the data in the file is up-to-date.
     """
     db_path_ = to_local_path(db_path, db_package).absolute()
     lock_path = db_path_.with_suffix(".lock")
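Note: the updated docstring describes serializing bulk writers with a file lock on a ".lock" sibling of the database file, instead of the removed WAL/EXCLUSIVE pragmas. A rough sketch of that pattern, using the third-party filelock package as a stand-in (tabularasa's actual lock primitive and pragma handling may differ):

import contextlib
import sqlite3
from pathlib import Path

from filelock import FileLock  # stand-in lock primitive for illustration only


@contextlib.contextmanager
def bulk_write_connection_sketch(db_path: Path):
    # Serialize bulk writers across processes: the lock file sits next to the db.
    lock = FileLock(str(db_path.with_suffix(".lock")))
    with lock:  # blocks until any other bulk writer releases the lock
        con = sqlite3.connect(str(db_path))
        try:
            con.execute("PRAGMA synchronous = 0")  # loosen durability for the bulk load
            yield con
        finally:
            con.execute("PRAGMA synchronous = 2")  # restore the default
            con.close()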
thds/tabularasa/testing/mock_sqlite.py
ADDED
@@ -0,0 +1,114 @@
+import contextlib
+import inspect
+import sqlite3
+import tempfile
+import typing as ty
+from pathlib import Path
+
+import attrs
+import pyarrow as pa
+import pyarrow.parquet
+
+from thds.core import scope
+from thds.core.types import StrOrPath
+from thds.tabularasa.data_dependencies import sqlite, util
+from thds.tabularasa.schema import load_schema
+
+
+class _GeneratedSqliteLoader(ty.Protocol):
+    def __init__(
+        self,
+        package: ty.Optional[str],
+        db_path: str,
+    ) -> None: ...
+
+
+_UNTIL_EXIT_SCOPE = scope.Scope("tabularasa.testing.mock_sqlite_loader")
+# this scope is for creating temporary sqlite database files that persist until program exit, in case the caller of
+# mock_sqlite_loader doesn't want to manage the database file themselves
+
+L = ty.TypeVar("L", bound=_GeneratedSqliteLoader)
+
+
+def mock_sqlite_loader(
+    loader_cls: ty.Type[L],
+    data: ty.Mapping[str, ty.Collection[attrs.AttrsInstance]],
+    tmp_db_path: ty.Optional[StrOrPath] = None,
+    *,
+    package: ty.Optional[str] = None,
+    schema_path: str = "schema.yaml",
+    validate: bool = False,
+) -> L:
+    """Construct an instance of your custom generated sqlite loader from mocked data. Note that this is guaranteed
+    typesafe because regardless of how you define your mock records, the resulting sqlite loader will be a true instance
+    of your generated loader class, and will have all the same lookup methods and will use all the same deserialization
+    logic for reading rows from the database and returning actual instances from your library's data model.
+
+    :param loader_cls: The generated sqlite loader class to instantiate.
+    :param data: A mapping from table names to collections of attrs records representing rows.
+    :param package: The root package name containing the schema and generated loader(s). If omitted, it will be inferred
+        from the loader class's `__module__` attribute by climbing up until a schema file is found.
+    :param schema_path: The path to the schema file within the package.
+    :param tmp_db_path: Optional path to a file to use for the sqlite database. If None, a temporary file is created.
+        Note that in this case the temporary file will not be cleaned up until program exit.
+    :param validate: Whether to validate data against the schema when inserting data into the database.
+    :return: An instance of the specified sqlite loader class populated with the provided mocked data, with empty
+        tables for any table names that were not included in the `data` mapping.
+    """
+    if package is None:
+        if package_ := inspect.signature(loader_cls).parameters["package"].default:
+            package_candidates = [package_]
+        else:
+            loader_module_path = loader_cls.__module__.split(".")
+            package_candidates = [
+                ".".join(loader_module_path[:i]) for i in range(len(loader_module_path), 0, -1)
+            ]
+    else:
+        package_candidates = [package]
+
+    for package_ in package_candidates:
+        try:
+            schema = load_schema(package_, schema_path)
+        except (ModuleNotFoundError, FileNotFoundError):
+            continue
+        else:
+            break
+    else:
+        raise ValueError(
+            f"Could not infer package containing schema from loader class {loader_cls.__qualname__}; "
+            "please specify the 'package' argument explicitly."
+        )
+
+    if tmp_db_path is None:
+        tmp_db_path = _UNTIL_EXIT_SCOPE.enter(tempfile.NamedTemporaryFile(suffix=".sqlite")).name
+
+    unknown_tables = set(data.keys()).difference(schema.tables.keys())
+    if unknown_tables:
+        raise ValueError(f"Data provided for unknown tables: {sorted(unknown_tables)}")
+
+    with (
+        tempfile.TemporaryDirectory() as tmpdir,
+        contextlib.closing(sqlite3.connect(str(tmp_db_path))) as con,
+    ):
+        # this tmpdir is only for staging parquet files before loading into sqlite; it's fine that they get deleted
+        # immediately after the database is populated
+        for name, table in schema.tables.items():
+            rows = data.get(name, [])
+            pa_table = pa.Table.from_pylist(
+                [attrs.asdict(row, recurse=True) for row in rows], schema=table.parquet_schema
+            )
+            filename = name + ".parquet"
+            pyarrow.parquet.write_table(
+                pa_table, Path(tmpdir) / filename, version=util.PARQUET_FORMAT_VERSION
+            )
+            sqlite.insert_table(
+                con,
+                table,
+                package=None,
+                data_dir=tmpdir,
+                filename=filename,
+                validate=validate,
+                cast=False if validate else True,
+            )
+
+    return loader_cls(package=None, db_path=str(tmp_db_path))
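Note: a hypothetical usage sketch for the new testing helper. The PlacesSqliteLoader class, Place record type, "places" table name, and my_pkg modules are made-up stand-ins for whatever your generated loader, attrs model, and schema.yaml define:

from thds.tabularasa.testing.mock_sqlite import mock_sqlite_loader

# Hypothetical generated artifacts -- substitute your own loader class and attrs model.
from my_pkg.gen.sqlite import PlacesSqliteLoader
from my_pkg.gen.models import Place

# The package containing schema.yaml is inferred from PlacesSqliteLoader.__module__
# unless you pass package= explicitly.
loader = mock_sqlite_loader(
    PlacesSqliteLoader,
    data={
        # unlisted tables are created empty; listed rows go parquet -> sqlite
        "places": [Place(id=1, name="Springfield"), Place(id=2, name="Shelbyville")],
    },
    validate=True,
)
# The result is a real instance of PlacesSqliteLoader, so all generated lookup methods work.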
{thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thds.tabularasa
-Version: 0.
+Version: 0.14.1
 Summary: Trilliant Health reference data build system.
 Author-email: Trilliant Health <info@trillianthealth.com>
 Project-URL: Repository, https://github.com/TrilliantHealth/ds-monorepo
{thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 thds/tabularasa/__init__.py,sha256=jc6w1WD868MQ2t4wkRNYvRssojwXvPDcNyC8V5gwbl0,169
-thds/tabularasa/__main__.py,sha256=
+thds/tabularasa/__main__.py,sha256=Ryfd7YogTE_qFjp8IJA-KTeTXXD9INS5GJGmdPVvWBw,47791
 thds/tabularasa/compat.py,sha256=j0313TPIXtkbfvRI0AH4if8GLrjQSrDJ9heayCIl9w8,1037
 thds/tabularasa/git_util.py,sha256=fBFhaCPi_5W2BpG2B3WiPcAWJvuVI_pG47rt73wLO6E,1388
 thds/tabularasa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -7,9 +7,9 @@ thds/tabularasa/sqlite3_compat.py,sha256=67hldmiLFTjG0qL2TPX0USV7XNfTjEN3j8MneqN
 thds/tabularasa/sqlite_from_parquet.py,sha256=yJatUIAbgErHUOL5dhchWJwzKZCrDrx93SP0nGm7It8,1115
 thds/tabularasa/to_sqlite.py,sha256=5lcEUh38MNebxAJdLp2XGWOP_WQDIADtL1fyhOvi9UU,1715
 thds/tabularasa/data_dependencies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-thds/tabularasa/data_dependencies/adls.py,sha256=
+thds/tabularasa/data_dependencies/adls.py,sha256=vJAuc5Key-vO1N6DGo5dj9fIx_4hMALAVC17qhvkT7Y,3257
 thds/tabularasa/data_dependencies/build.py,sha256=6iYgw93sOF2Nlnb6WSmA9NvPNwOf_Yyi2wXUQpRVkJM,23382
-thds/tabularasa/data_dependencies/sqlite.py,sha256=
+thds/tabularasa/data_dependencies/sqlite.py,sha256=sMP_NInBEDoH5SScIRYxtOvcPUi9WXfE3_jCoOBduGo,12825
 thds/tabularasa/data_dependencies/tabular.py,sha256=oq9wFse235ikLEv8Zvol59ptRRojZbkbzXJyQeFfC9o,6529
 thds/tabularasa/data_dependencies/util.py,sha256=FQ9G1nIpqKh00z2lXOt0Y2R1mLQsEb-BC6Tka1z2egc,8489
 thds/tabularasa/diff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,7 +19,7 @@ thds/tabularasa/diff/summary.py,sha256=gENtDwhSrDYeN-8fWr6Ug2zgdp584b0pZF9UBYzKF
 thds/tabularasa/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/tabularasa/loaders/lazy_adls.py,sha256=jrWy5tTKDQfWEv6aHQ3UJhFzLrDPOlSGsArv9zcl1g8,1375
 thds/tabularasa/loaders/parquet_util.py,sha256=u75j3PkMSakO2zfq4zksWzXLYnaO--WizAgXTcSpXRY,13354
-thds/tabularasa/loaders/sqlite_util.py,sha256=
+thds/tabularasa/loaders/sqlite_util.py,sha256=3Gi1Y4iTVCD9FXqylQw1eyFwVuplQUrjY1J0SC5FFWg,11099
 thds/tabularasa/loaders/util.py,sha256=XmsGkDdL6O8R6B4667Iqi5HoRgq0YMs6LP3VvPIqPVU,21369
 thds/tabularasa/schema/__init__.py,sha256=bowvNXrrDrWB3TAmwDxCeEAvVEe9z7iRfqRaNg1Qmo4,440
 thds/tabularasa/schema/constraints.py,sha256=V2vh01BhYR8OVQvgdujqSi0l_fMJvFKYSlBvWExZFG0,9744
@@ -39,8 +39,10 @@ thds/tabularasa/schema/compilation/pyarrow.py,sha256=pcNQ3a6UPJT1PBj6xHOl99UvZft
 thds/tabularasa/schema/compilation/sphinx.py,sha256=we5X-ZpCk6WH-8KCXAv6Nklg1JZmnkGPT3V2EHa2_rg,17491
 thds/tabularasa/schema/compilation/sqlite.py,sha256=wSrSlVCYeuTpOf9AOHAnp6gJHkjHZhx8UkgkYgfoQVw,2368
 thds/tabularasa/schema/compilation/util.py,sha256=YXFe1_yoBobED010hstKIoq-dwLHo6SBv1v1IAw6AYU,3886
-
-
-thds_tabularasa-0.
-thds_tabularasa-0.
-thds_tabularasa-0.
+thds/tabularasa/testing/__init__.py,sha256=XoLzB-DotxFw9KHt2vfH72k7pyAAFI2bW-qqq6nww1g,85
+thds/tabularasa/testing/mock_sqlite.py,sha256=xoV4w_GaDgtZf17iUux2-LA6Va1XRJdC2FU34dysh0o,4769
+thds_tabularasa-0.14.1.dist-info/METADATA,sha256=rqg7l_iBlrh7E8-iXCeZyLcnOxJIuFDBw_5QNe1A9V0,26786
+thds_tabularasa-0.14.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_tabularasa-0.14.1.dist-info/entry_points.txt,sha256=PX4ShRonjv6lMsVjrGu8RkFzpyyvgM9EnZlNfMomd9k,61
+thds_tabularasa-0.14.1.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_tabularasa-0.14.1.dist-info/RECORD,,
{thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/WHEEL
File without changes
{thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/entry_points.txt
File without changes
{thds_tabularasa-0.13.1.dist-info → thds_tabularasa-0.14.1.dist-info}/top_level.txt
File without changes