thds.tabularasa 0.13.1__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -943,16 +943,18 @@ class ReferenceDataManager:
943
943
  return None
944
944
  raise IOError(table.name)
945
945
 
946
- failed: List[str] = []
946
+ failed: list[tuple[str, Exception]] = []
947
947
  synced: List[str] = []
948
948
  for table_name, res in parallel.yield_all([(t.name, partial(inner, t)) for t in tables_to_sync]):
949
949
  if isinstance(res, parallel.Error):
950
- failed.append(table_name)
950
+ failed.append((table_name, res.error))
951
951
  elif res is not None:
952
952
  synced.append(table_name)
953
953
 
954
954
  if failed:
955
- raise RuntimeError(f"Sync failed for tables {', '.join(failed)}")
955
+ first_exc = failed[0][1]
956
+ table_names = [name for name, _ in failed]
957
+ raise RuntimeError(f"Sync failed for tables {', '.join(table_names)}") from first_exc
956
958
 
957
959
  down_ = (
958
960
  f"to local build directory {pkg_resources.resource_filename(self.package, self.package_data_dir)}"
@@ -6,6 +6,7 @@ from typing import List, Optional, Protocol, Union
6
6
  import attr
7
7
 
8
8
  from thds.adls import ADLSFileSystem, fqn
9
+ from thds.core import fretry
9
10
  from thds.tabularasa.schema.files import ADLSDataSpec, RemoteBlobStoreSpec
10
11
 
11
12
  CACHE_DIR = ".cache/"
@@ -40,6 +41,7 @@ def adls_filesystem(account: str, filesystem: str, cache_dir: Optional[Union[Pat
40
41
  return ADLSFileSystem(account, filesystem, cache_dir=cache_dir)
41
42
 
42
43
 
44
+ @fretry.retry_regular(fretry.is_exc(Exception), fretry.n_times(3))
43
45
  def sync_adls_data(
44
46
  adls_spec: ADLSDataSpec, cache_dir: Optional[Union[Path, str]] = CACHE_DIR
45
47
  ) -> List[ADLSDownloadResult]:
@@ -222,7 +222,9 @@ def populate_sqlite_db(
222
222
  table_predicate: Callable[[Table], bool] = is_build_time_package_table,
223
223
  data_path_overrides: Optional[Mapping[str, Path]] = None,
224
224
  ):
225
- """Populate a sqlite database with data for a set of tables from a `reference_data.schema.Schema`
225
+ """Populate a sqlite database with data for a set of tables from a `reference_data.schema.Schema`.
226
+ Note that this can safely be called concurrently in multiple processes on the same database file; a file lock
227
+ is acquired on the database file and only released when the data insertion is complete.
226
228
 
227
229
  :param schema: the `reference_data.schema.Schema` object defining the data to be inserted
228
230
  :param db_package: name of the package where the database file is stored, if any. In case `None` is
@@ -259,6 +261,9 @@ def populate_sqlite_db(
259
261
  # gather all tables before executing any I/O
260
262
  insert_tables = [table for table in schema.filter_tables(table_predicate) if table.has_indexes]
261
263
 
264
+ if not insert_tables:
265
+ return
266
+
262
267
  with bulk_write_connection(db_path, db_package, close=True) as con:
263
268
  for table in insert_tables:
264
269
  table_filename: Optional[str]
@@ -22,7 +22,6 @@ from thds.tabularasa.sqlite3_compat import sqlite3
22
22
 
23
23
  DEFAULT_ATTR_SQLITE_CACHE_SIZE = 100_000
24
24
  DEFAULT_MMAP_BYTES = int(os.environ.get("TABULA_RASA_DEFAULT_MMAP_BYTES", 8_589_934_592)) # 8 GB
25
- DISABLE_WAL_MODE = bool(os.environ.get("REF_D_DISABLE_SQLITE_WAL_MODE", False))
26
25
 
27
26
  PARAMETERIZABLE_BUILTINS = sys.version_info >= (3, 9)
28
27
 
@@ -159,11 +158,6 @@ def set_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
159
158
  logger.debug("Setting pragmas for bulk write optimization")
160
159
  # https://www.sqlite.org/pragma.html#pragma_synchronous
161
160
  _log_exec_sql(logger, con, "PRAGMA synchronous = 0") # OFF
162
- # https://www.sqlite.org/pragma.html#pragma_journal_mode
163
- if not DISABLE_WAL_MODE:
164
- _log_exec_sql(logger, con, "PRAGMA journal_mode = WAL")
165
- # https://www.sqlite.org/pragma.html#pragma_locking_mode
166
- _log_exec_sql(logger, con, "PRAGMA locking_mode = EXCLUSIVE")
167
161
 
168
162
  return con
169
163
 
@@ -171,16 +165,7 @@ def set_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
171
165
  def unset_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
172
166
  logger = logging.getLogger(__name__)
173
167
  logger.debug("Setting pragmas for bulk write optimization")
174
- # https://www.sqlite.org/pragma.html#pragma_journal_mode
175
- # resetting this to the default. This is a property of the database, rather than the connection.
176
- # the other settings are connection-specific.
177
- # according to the docs, the WAL journal mode should be disabled before the locking mode is restored,
178
- # else any attempt to do so is a no-op.
179
- _log_exec_sql(logger, con, "PRAGMA journal_mode = DELETE")
180
- # https://www.sqlite.org/pragma.html#pragma_synchronous
181
168
  _log_exec_sql(logger, con, "PRAGMA synchronous = 2") # FULL (default)
182
- # https://www.sqlite.org/pragma.html#pragma_locking_mode
183
- _log_exec_sql(logger, con, "PRAGMA locking_mode = NORMAL")
184
169
 
185
170
  return con
186
171
 
@@ -191,9 +176,10 @@ def bulk_write_connection(
191
176
  ) -> ty.Generator[sqlite3.Connection, None, None]:
192
177
  """Context manager to set/unset bulk write mode on a sqlite connection. Sets pragmas for efficient bulk writes,
193
178
  such as loosening synchronous and locking modes. If `close` is True, the connection will be closed on exit.
194
- Since setting the pragmas may mutate the database file, and since by design this context manager exists to enable
195
- bulk writes which intentionally mutate the database, if a `db_path` (and optionally `db_package`, if specified as
196
- package data) is given, we also acquire a file lock on the database file on entry and release it on exit.
179
+ To avoid bulk insert routines being run by other processes concurrently, we also acquire a file lock on the
180
+ database file on entry and release it on exit. Other processes attempting to perform bulk writes to the same file
181
+ will block until the lock is released. In the case of tabularasa init-sqlite, the semantics then imply that those
182
+ workers will perform no writes at all, since metadata will indicate that the data in the file is up-to-date.
197
183
  """
198
184
  db_path_ = to_local_path(db_path, db_package).absolute()
199
185
  lock_path = db_path_.with_suffix(".lock")
@@ -0,0 +1,5 @@
1
+ __all__ = [
2
+ "mock_sqlite_loader",
3
+ ]
4
+
5
+ from .mock_sqlite import mock_sqlite_loader
@@ -0,0 +1,114 @@
1
+ import contextlib
2
+ import inspect
3
+ import sqlite3
4
+ import tempfile
5
+ import typing as ty
6
+ from pathlib import Path
7
+
8
+ import attrs
9
+ import pyarrow as pa
10
+ import pyarrow.parquet
11
+
12
+ from thds.core import scope
13
+ from thds.core.types import StrOrPath
14
+ from thds.tabularasa.data_dependencies import sqlite, util
15
+ from thds.tabularasa.schema import load_schema
16
+
17
+
18
class _GeneratedSqliteLoader(ty.Protocol):
    """Structural type describing the constructor of a generated sqlite loader class.

    Only the constructor shape is specified: a (possibly `None`) package name and the path of the
    sqlite database file, both of which may be passed by keyword.
    """

    def __init__(self, package: ty.Optional[str], db_path: str) -> None:
        """Build a loader for the database at `db_path`, optionally scoped to `package`."""
        ...
24
+
25
+
26
# This scope is for creating temporary sqlite database files that persist until program exit, in
# case the caller of mock_sqlite_loader doesn't want to manage the database file themselves (i.e.
# no `tmp_db_path` is supplied).
_UNTIL_EXIT_SCOPE = scope.Scope("tabularasa.testing.mock_sqlite_loader")

# Type variable bound to the loader protocol, so that mock_sqlite_loader is annotated as returning
# an instance of the same loader class it was given.
L = ty.TypeVar("L", bound=_GeneratedSqliteLoader)
31
+
32
+
33
def _candidate_packages(loader_cls: type, package: ty.Optional[str]) -> ty.List[str]:
    """Return the package names to try, in order, when searching for the schema file."""
    if package is not None:
        return [package]

    # Prefer a default baked into the loader's constructor, if it has one. A parameter declared
    # without a default reports `inspect.Parameter.empty`, which is truthy, so check for an actual
    # non-empty string instead of relying on truthiness.
    package_param = inspect.signature(loader_cls).parameters.get("package")
    default_package = None if package_param is None else package_param.default
    if isinstance(default_package, str) and default_package:
        return [default_package]

    # Otherwise climb from the loader's own module towards the root: "a.b.c", then "a.b", then "a".
    module_path = loader_cls.__module__.split(".")
    return [".".join(module_path[:i]) for i in range(len(module_path), 0, -1)]


def mock_sqlite_loader(
    loader_cls: ty.Type[L],
    data: ty.Mapping[str, ty.Collection[attrs.AttrsInstance]],
    tmp_db_path: ty.Optional[StrOrPath] = None,
    *,
    package: ty.Optional[str] = None,
    schema_path: str = "schema.yaml",
    validate: bool = False,
) -> L:
    """Construct an instance of your custom generated sqlite loader from mocked data.

    Regardless of how you define your mock records, the result is a true instance of your generated
    loader class, so it has all the same lookup methods and uses all the same deserialization logic
    for reading rows from the database and returning instances from your library's data model.

    :param loader_cls: The generated sqlite loader class to instantiate.
    :param data: A mapping from table names to collections of attrs records representing rows.
    :param tmp_db_path: Optional path to a file to use for the sqlite database. If None, a temporary
      file is created. Note that in this case the temporary file will not be cleaned up until
      program exit.
    :param package: The root package name containing the schema and generated loader(s). If omitted,
      the default of the loader constructor's `package` parameter is used when it has one; otherwise
      it is inferred from the loader class's `__module__` attribute by climbing up until a schema
      file is found.
    :param schema_path: The path to the schema file within the package.
    :param validate: Whether to validate data against the schema when inserting data into the
      database. Casting on insert is enabled exactly when validation is disabled.
    :return: An instance of the specified sqlite loader class populated with the provided mocked
      data, with empty tables for any table names that were not included in the `data` mapping.
    :raises ValueError: If no candidate package yields a loadable schema, or if `data` contains
      table names that are not present in the schema.
    """
    for candidate in _candidate_packages(loader_cls, package):
        try:
            schema = load_schema(candidate, schema_path)
        except (ModuleNotFoundError, FileNotFoundError):
            continue
        break
    else:
        raise ValueError(
            f"Could not infer package containing schema from loader class {loader_cls.__qualname__}; "
            "please specify the 'package' argument explicitly."
        )

    # Reject bad input before creating any until-exit temporary file, so invalid calls leave
    # nothing behind.
    unknown_tables = set(data.keys()).difference(schema.tables.keys())
    if unknown_tables:
        raise ValueError(f"Data provided for unknown tables: {sorted(unknown_tables)}")

    if tmp_db_path is None:
        tmp_db_path = _UNTIL_EXIT_SCOPE.enter(tempfile.NamedTemporaryFile(suffix=".sqlite")).name
    db_file = str(tmp_db_path)

    with (
        tempfile.TemporaryDirectory() as tmpdir,
        contextlib.closing(sqlite3.connect(db_file)) as con,
    ):
        # This tmpdir is only for staging parquet files before loading into sqlite; it's fine that
        # they get deleted immediately after the database is populated.
        staging_dir = Path(tmpdir)
        for name, table in schema.tables.items():
            # Every schema table is written, so tables absent from `data` end up empty.
            rows = data.get(name, [])
            pa_table = pa.Table.from_pylist(
                [attrs.asdict(row, recurse=True) for row in rows], schema=table.parquet_schema
            )
            filename = name + ".parquet"
            pyarrow.parquet.write_table(
                pa_table, staging_dir / filename, version=util.PARQUET_FORMAT_VERSION
            )
            sqlite.insert_table(
                con,
                table,
                package=None,
                data_dir=tmpdir,
                filename=filename,
                validate=validate,
                cast=not validate,
            )

    return loader_cls(package=None, db_path=db_file)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thds.tabularasa
3
- Version: 0.13.1
3
+ Version: 0.14.1
4
4
  Summary: Trilliant Health reference data build system.
5
5
  Author-email: Trilliant Health <info@trillianthealth.com>
6
6
  Project-URL: Repository, https://github.com/TrilliantHealth/ds-monorepo
@@ -1,5 +1,5 @@
1
1
  thds/tabularasa/__init__.py,sha256=jc6w1WD868MQ2t4wkRNYvRssojwXvPDcNyC8V5gwbl0,169
2
- thds/tabularasa/__main__.py,sha256=w10WQRwQmer4Hn3JmgHqjtVJ2WEjs9MtTiDvsAS9gog,47648
2
+ thds/tabularasa/__main__.py,sha256=Ryfd7YogTE_qFjp8IJA-KTeTXXD9INS5GJGmdPVvWBw,47791
3
3
  thds/tabularasa/compat.py,sha256=j0313TPIXtkbfvRI0AH4if8GLrjQSrDJ9heayCIl9w8,1037
4
4
  thds/tabularasa/git_util.py,sha256=fBFhaCPi_5W2BpG2B3WiPcAWJvuVI_pG47rt73wLO6E,1388
5
5
  thds/tabularasa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -7,9 +7,9 @@ thds/tabularasa/sqlite3_compat.py,sha256=67hldmiLFTjG0qL2TPX0USV7XNfTjEN3j8MneqN
7
7
  thds/tabularasa/sqlite_from_parquet.py,sha256=yJatUIAbgErHUOL5dhchWJwzKZCrDrx93SP0nGm7It8,1115
8
8
  thds/tabularasa/to_sqlite.py,sha256=5lcEUh38MNebxAJdLp2XGWOP_WQDIADtL1fyhOvi9UU,1715
9
9
  thds/tabularasa/data_dependencies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- thds/tabularasa/data_dependencies/adls.py,sha256=smL-iRYr9aXFFpW4inQBQmB-ieBKXcBVe9AdwV2zisw,3161
10
+ thds/tabularasa/data_dependencies/adls.py,sha256=vJAuc5Key-vO1N6DGo5dj9fIx_4hMALAVC17qhvkT7Y,3257
11
11
  thds/tabularasa/data_dependencies/build.py,sha256=6iYgw93sOF2Nlnb6WSmA9NvPNwOf_Yyi2wXUQpRVkJM,23382
12
- thds/tabularasa/data_dependencies/sqlite.py,sha256=eweuLdoxyGlG-PvQUANarlMe_mmZgA5cxuMbOYxcpsQ,12576
12
+ thds/tabularasa/data_dependencies/sqlite.py,sha256=sMP_NInBEDoH5SScIRYxtOvcPUi9WXfE3_jCoOBduGo,12825
13
13
  thds/tabularasa/data_dependencies/tabular.py,sha256=oq9wFse235ikLEv8Zvol59ptRRojZbkbzXJyQeFfC9o,6529
14
14
  thds/tabularasa/data_dependencies/util.py,sha256=FQ9G1nIpqKh00z2lXOt0Y2R1mLQsEb-BC6Tka1z2egc,8489
15
15
  thds/tabularasa/diff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,7 +19,7 @@ thds/tabularasa/diff/summary.py,sha256=gENtDwhSrDYeN-8fWr6Ug2zgdp584b0pZF9UBYzKF
19
19
  thds/tabularasa/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  thds/tabularasa/loaders/lazy_adls.py,sha256=jrWy5tTKDQfWEv6aHQ3UJhFzLrDPOlSGsArv9zcl1g8,1375
21
21
  thds/tabularasa/loaders/parquet_util.py,sha256=u75j3PkMSakO2zfq4zksWzXLYnaO--WizAgXTcSpXRY,13354
22
- thds/tabularasa/loaders/sqlite_util.py,sha256=GLP271SF-GVQwxaaB0p5kCG0eGay6j4mWabL00a9pz0,11954
22
+ thds/tabularasa/loaders/sqlite_util.py,sha256=3Gi1Y4iTVCD9FXqylQw1eyFwVuplQUrjY1J0SC5FFWg,11099
23
23
  thds/tabularasa/loaders/util.py,sha256=XmsGkDdL6O8R6B4667Iqi5HoRgq0YMs6LP3VvPIqPVU,21369
24
24
  thds/tabularasa/schema/__init__.py,sha256=bowvNXrrDrWB3TAmwDxCeEAvVEe9z7iRfqRaNg1Qmo4,440
25
25
  thds/tabularasa/schema/constraints.py,sha256=V2vh01BhYR8OVQvgdujqSi0l_fMJvFKYSlBvWExZFG0,9744
@@ -39,8 +39,10 @@ thds/tabularasa/schema/compilation/pyarrow.py,sha256=pcNQ3a6UPJT1PBj6xHOl99UvZft
39
39
  thds/tabularasa/schema/compilation/sphinx.py,sha256=we5X-ZpCk6WH-8KCXAv6Nklg1JZmnkGPT3V2EHa2_rg,17491
40
40
  thds/tabularasa/schema/compilation/sqlite.py,sha256=wSrSlVCYeuTpOf9AOHAnp6gJHkjHZhx8UkgkYgfoQVw,2368
41
41
  thds/tabularasa/schema/compilation/util.py,sha256=YXFe1_yoBobED010hstKIoq-dwLHo6SBv1v1IAw6AYU,3886
42
- thds_tabularasa-0.13.1.dist-info/METADATA,sha256=flLUSZeccW-NUJgBBPwUrBeB94h1-TsjwdIUaNkdB8c,26786
43
- thds_tabularasa-0.13.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- thds_tabularasa-0.13.1.dist-info/entry_points.txt,sha256=PX4ShRonjv6lMsVjrGu8RkFzpyyvgM9EnZlNfMomd9k,61
45
- thds_tabularasa-0.13.1.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
46
- thds_tabularasa-0.13.1.dist-info/RECORD,,
42
+ thds/tabularasa/testing/__init__.py,sha256=XoLzB-DotxFw9KHt2vfH72k7pyAAFI2bW-qqq6nww1g,85
43
+ thds/tabularasa/testing/mock_sqlite.py,sha256=xoV4w_GaDgtZf17iUux2-LA6Va1XRJdC2FU34dysh0o,4769
44
+ thds_tabularasa-0.14.1.dist-info/METADATA,sha256=rqg7l_iBlrh7E8-iXCeZyLcnOxJIuFDBw_5QNe1A9V0,26786
45
+ thds_tabularasa-0.14.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
+ thds_tabularasa-0.14.1.dist-info/entry_points.txt,sha256=PX4ShRonjv6lMsVjrGu8RkFzpyyvgM9EnZlNfMomd9k,61
47
+ thds_tabularasa-0.14.1.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
48
+ thds_tabularasa-0.14.1.dist-info/RECORD,,