thds.tabularasa 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import os
3
2
  import shutil
4
3
  import subprocess
5
4
  import sys
@@ -14,7 +13,7 @@ from typing import Dict, Iterable, Iterator, List, NamedTuple, Optional, Set, Tu
14
13
  import networkx as nx
15
14
  import pkg_resources
16
15
 
17
- from thds.core import parallel
16
+ from thds.core import link, parallel
18
17
  from thds.tabularasa.data_dependencies.adls import (
19
18
  ADLSFileIntegrityError,
20
19
  ADLSFileSystem,
@@ -730,7 +729,7 @@ class ReferenceDataManager:
730
729
  file_path = self.data_path_for(table)
731
730
  if file_path.exists():
732
731
  self.logger.warning(f"Removing built file for table {table.name} at {file_path}")
733
- os.remove(file_path)
732
+ file_path.unlink()
734
733
  else:
735
734
  self.logger.info(f"No file found for table {table.name}; nothing to remove")
736
735
  try:
@@ -763,7 +762,7 @@ class ReferenceDataManager:
763
762
  for table in tables_to_update:
764
763
  table_name = table.name
765
764
  table_path = self.data_path_for(table)
766
- if os.path.exists(table_path):
765
+ if table_path.exists():
767
766
  md5 = hash_file(table_path)
768
767
  old_md5 = table.md5
769
768
  if old_md5 is None:
@@ -847,10 +846,10 @@ class ReferenceDataManager:
847
846
  local_cache_path = paths[0].local_path
848
847
  if sync_data.local_file_exists:
849
848
  self.logger.warning(f"Removing existing file {sync_data.local_path}")
850
- os.remove(sync_data.local_path)
849
+ sync_data.local_path.unlink()
851
850
  self.logger.info(f"Linking downloaded file to local build file {sync_data.local_path}")
852
851
  sync_data.local_path.parent.mkdir(parents=True, exist_ok=True)
853
- os.link(local_cache_path, sync_data.local_path)
852
+ link.link(local_cache_path, sync_data.local_path)
854
853
  return True
855
854
 
856
855
  def sync_blob_store(
@@ -222,7 +222,9 @@ def populate_sqlite_db(
222
222
  table_predicate: Callable[[Table], bool] = is_build_time_package_table,
223
223
  data_path_overrides: Optional[Mapping[str, Path]] = None,
224
224
  ):
225
- """Populate a sqlite database with data for a set of tables from a `reference_data.schema.Schema`
225
+ """Populate a sqlite database with data for a set of tables from a `reference_data.schema.Schema`.
226
+ Note that this can safely be called concurrently in multiple processes on the same database file; a file lock
227
+ is acquired on the database file and only released when the data insertion is complete.
226
228
 
227
229
  :param schema: the `reference_data.schema.Schema` object defining the data to be inserted
228
230
  :param db_package: name of the package where the database file is stored, if any. In case `None` is
@@ -259,6 +261,9 @@ def populate_sqlite_db(
259
261
  # gather all tables before executing any I/O
260
262
  insert_tables = [table for table in schema.filter_tables(table_predicate) if table.has_indexes]
261
263
 
264
+ if not insert_tables:
265
+ return
266
+
262
267
  with bulk_write_connection(db_path, db_package, close=True) as con:
263
268
  for table in insert_tables:
264
269
  table_filename: Optional[str]
@@ -22,7 +22,6 @@ from thds.tabularasa.sqlite3_compat import sqlite3
22
22
 
23
23
  DEFAULT_ATTR_SQLITE_CACHE_SIZE = 100_000
24
24
  DEFAULT_MMAP_BYTES = int(os.environ.get("TABULA_RASA_DEFAULT_MMAP_BYTES", 8_589_934_592)) # 8 GB
25
- DISABLE_WAL_MODE = bool(os.environ.get("REF_D_DISABLE_SQLITE_WAL_MODE", False))
26
25
 
27
26
  PARAMETERIZABLE_BUILTINS = sys.version_info >= (3, 9)
28
27
 
@@ -159,11 +158,6 @@ def set_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
159
158
  logger.debug("Setting pragmas for bulk write optimization")
160
159
  # https://www.sqlite.org/pragma.html#pragma_synchronous
161
160
  _log_exec_sql(logger, con, "PRAGMA synchronous = 0") # OFF
162
- # https://www.sqlite.org/pragma.html#pragma_journal_mode
163
- if not DISABLE_WAL_MODE:
164
- _log_exec_sql(logger, con, "PRAGMA journal_mode = WAL")
165
- # https://www.sqlite.org/pragma.html#pragma_locking_mode
166
- _log_exec_sql(logger, con, "PRAGMA locking_mode = EXCLUSIVE")
167
161
 
168
162
  return con
169
163
 
@@ -171,16 +165,7 @@ def set_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
171
165
  def unset_bulk_write_mode(con: sqlite3.Connection) -> sqlite3.Connection:
172
166
  logger = logging.getLogger(__name__)
173
167
  logger.debug("Setting pragmas for bulk write optimization")
174
- # https://www.sqlite.org/pragma.html#pragma_journal_mode
175
- # resetting this to the default. This is a property of the database, rather than the connection.
176
- # the other settings are connection-specific.
177
- # according to the docs, the WAL journal mode should be disabled before the locking mode is restored,
178
- # else any attempt to do so is a no-op.
179
- _log_exec_sql(logger, con, "PRAGMA journal_mode = DELETE")
180
- # https://www.sqlite.org/pragma.html#pragma_synchronous
181
168
  _log_exec_sql(logger, con, "PRAGMA synchronous = 2") # FULL (default)
182
- # https://www.sqlite.org/pragma.html#pragma_locking_mode
183
- _log_exec_sql(logger, con, "PRAGMA locking_mode = NORMAL")
184
169
 
185
170
  return con
186
171
 
@@ -191,9 +176,10 @@ def bulk_write_connection(
191
176
  ) -> ty.Generator[sqlite3.Connection, None, None]:
192
177
  """Context manager to set/unset bulk write mode on a sqlite connection. Sets pragmas for efficient bulk writes,
193
178
  such as loosening synchronous and locking modes. If `close` is True, the connection will be closed on exit.
194
- Since setting the pragmas may mutate the database file, and since by design this context manager exists to enable
195
- bulk writes which intentionally mutate the database, if a `db_path` (and optionally `db_package`, if specified as
196
- package data) is given, we also acquire a file lock on the database file on entry and release it on exit.
179
+ To avoid bulk insert routines being run by other processes concurrently, we also acquire a file lock on the
180
+ database file on entry and release it on exit. Other processes attempting to perform bulk writes to the same file
181
+ will block until the lock is released. In the case of tabularasa init-sqlite, the semantics then imply that those
182
+ workers will perform no writes at all, since metadata will indicate that the data in the file is up-to-date.
197
183
  """
198
184
  db_path_ = to_local_path(db_path, db_package).absolute()
199
185
  lock_path = db_path_.with_suffix(".lock")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thds.tabularasa
3
- Version: 0.14.0
3
+ Version: 0.14.2
4
4
  Summary: Trilliant Health reference data build system.
5
5
  Author-email: Trilliant Health <info@trillianthealth.com>
6
6
  Project-URL: Repository, https://github.com/TrilliantHealth/ds-monorepo
@@ -1,5 +1,5 @@
1
1
  thds/tabularasa/__init__.py,sha256=jc6w1WD868MQ2t4wkRNYvRssojwXvPDcNyC8V5gwbl0,169
2
- thds/tabularasa/__main__.py,sha256=Ryfd7YogTE_qFjp8IJA-KTeTXXD9INS5GJGmdPVvWBw,47791
2
+ thds/tabularasa/__main__.py,sha256=DlaUfXu03tbBVucRuMw7354LeBs8d5tRCuAprZs0XYs,47778
3
3
  thds/tabularasa/compat.py,sha256=j0313TPIXtkbfvRI0AH4if8GLrjQSrDJ9heayCIl9w8,1037
4
4
  thds/tabularasa/git_util.py,sha256=fBFhaCPi_5W2BpG2B3WiPcAWJvuVI_pG47rt73wLO6E,1388
5
5
  thds/tabularasa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,7 +9,7 @@ thds/tabularasa/to_sqlite.py,sha256=5lcEUh38MNebxAJdLp2XGWOP_WQDIADtL1fyhOvi9UU,
9
9
  thds/tabularasa/data_dependencies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  thds/tabularasa/data_dependencies/adls.py,sha256=vJAuc5Key-vO1N6DGo5dj9fIx_4hMALAVC17qhvkT7Y,3257
11
11
  thds/tabularasa/data_dependencies/build.py,sha256=6iYgw93sOF2Nlnb6WSmA9NvPNwOf_Yyi2wXUQpRVkJM,23382
12
- thds/tabularasa/data_dependencies/sqlite.py,sha256=eweuLdoxyGlG-PvQUANarlMe_mmZgA5cxuMbOYxcpsQ,12576
12
+ thds/tabularasa/data_dependencies/sqlite.py,sha256=sMP_NInBEDoH5SScIRYxtOvcPUi9WXfE3_jCoOBduGo,12825
13
13
  thds/tabularasa/data_dependencies/tabular.py,sha256=oq9wFse235ikLEv8Zvol59ptRRojZbkbzXJyQeFfC9o,6529
14
14
  thds/tabularasa/data_dependencies/util.py,sha256=FQ9G1nIpqKh00z2lXOt0Y2R1mLQsEb-BC6Tka1z2egc,8489
15
15
  thds/tabularasa/diff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,7 +19,7 @@ thds/tabularasa/diff/summary.py,sha256=gENtDwhSrDYeN-8fWr6Ug2zgdp584b0pZF9UBYzKF
19
19
  thds/tabularasa/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  thds/tabularasa/loaders/lazy_adls.py,sha256=jrWy5tTKDQfWEv6aHQ3UJhFzLrDPOlSGsArv9zcl1g8,1375
21
21
  thds/tabularasa/loaders/parquet_util.py,sha256=u75j3PkMSakO2zfq4zksWzXLYnaO--WizAgXTcSpXRY,13354
22
- thds/tabularasa/loaders/sqlite_util.py,sha256=GLP271SF-GVQwxaaB0p5kCG0eGay6j4mWabL00a9pz0,11954
22
+ thds/tabularasa/loaders/sqlite_util.py,sha256=3Gi1Y4iTVCD9FXqylQw1eyFwVuplQUrjY1J0SC5FFWg,11099
23
23
  thds/tabularasa/loaders/util.py,sha256=XmsGkDdL6O8R6B4667Iqi5HoRgq0YMs6LP3VvPIqPVU,21369
24
24
  thds/tabularasa/schema/__init__.py,sha256=bowvNXrrDrWB3TAmwDxCeEAvVEe9z7iRfqRaNg1Qmo4,440
25
25
  thds/tabularasa/schema/constraints.py,sha256=V2vh01BhYR8OVQvgdujqSi0l_fMJvFKYSlBvWExZFG0,9744
@@ -41,8 +41,8 @@ thds/tabularasa/schema/compilation/sqlite.py,sha256=wSrSlVCYeuTpOf9AOHAnp6gJHkjH
41
41
  thds/tabularasa/schema/compilation/util.py,sha256=YXFe1_yoBobED010hstKIoq-dwLHo6SBv1v1IAw6AYU,3886
42
42
  thds/tabularasa/testing/__init__.py,sha256=XoLzB-DotxFw9KHt2vfH72k7pyAAFI2bW-qqq6nww1g,85
43
43
  thds/tabularasa/testing/mock_sqlite.py,sha256=xoV4w_GaDgtZf17iUux2-LA6Va1XRJdC2FU34dysh0o,4769
44
- thds_tabularasa-0.14.0.dist-info/METADATA,sha256=fzbOzf8zgv-IBEcUN_6stkhIkhccgbrmJBi_jbGdkS4,26786
45
- thds_tabularasa-0.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
- thds_tabularasa-0.14.0.dist-info/entry_points.txt,sha256=PX4ShRonjv6lMsVjrGu8RkFzpyyvgM9EnZlNfMomd9k,61
47
- thds_tabularasa-0.14.0.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
48
- thds_tabularasa-0.14.0.dist-info/RECORD,,
44
+ thds_tabularasa-0.14.2.dist-info/METADATA,sha256=H48Ilbbjrag4tb1s1IhIlh5tNbUnwjXdaQ3zSC9rejg,26786
45
+ thds_tabularasa-0.14.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
+ thds_tabularasa-0.14.2.dist-info/entry_points.txt,sha256=PX4ShRonjv6lMsVjrGu8RkFzpyyvgM9EnZlNfMomd9k,61
47
+ thds_tabularasa-0.14.2.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
48
+ thds_tabularasa-0.14.2.dist-info/RECORD,,