thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0

thds/tabularasa/sqlite3_compat.py
@@ -0,0 +1,41 @@
"""This module exists solely to preferentially use pysqlite3 over the
built-in sqlite where it is already available.

Our primary purpose for doing this is to enable larger amounts of
mem-mapped shared memory between processes consuming reference-data.

However, while pre-compiled binaries for pysqlite3 are available via
PyPi, they are only available for manylinux, and most of our
development is done on Macs.

Since this customization is mostly useful for strongly parallel usage
of reference-data, and since that does not often happen on Mac
laptops, this shim will allow us to defer doing extra work to get
compiled pysqlite3 on Macs for the time being.
"""

import os
from logging import getLogger

_DISABLE_PYSQLITE3 = bool(os.environ.get("REF_D_DISABLE_PYSQLITE3", False))

logger = getLogger(__name__)

# All of the following log statements include the string `pysqlite3` for searchability.
if not _DISABLE_PYSQLITE3:
    try:
        import pysqlite3 as sqlite3

        logger.info(f"Using pysqlite3 with SQLite version {sqlite3.sqlite_version}")
    except ImportError:
        logger.debug("Using sqlite3 module because pysqlite3 was not available")
        # this is DEBUG because it's the 'base case' for local dev and
        # there's no need to make logs for development use cases more
        # verbose. In production, one of the other two INFO logs is
        # likely to fire, and if not, this case can be inferred from
        # the lack of log.
        import sqlite3  # type: ignore
else:
    import sqlite3  # type: ignore

    logger.info("Using sqlite3 module because pysqlite3 was disabled via environment variable")

thds/tabularasa/sqlite_from_parquet.py
@@ -0,0 +1,34 @@
from pathlib import Path

from thds.core import log, source
from thds.tabularasa.data_dependencies.sqlite import insert_table
from thds.tabularasa.loaders.sqlite_util import sqlite_connection
from thds.tabularasa.schema import load_schema
from thds.tabularasa.schema.metaschema import Table

logger = log.getLogger(__name__)


def load_table_from_schema(schema_path: Path, table_name: str) -> Table:
    return load_schema(None, str(schema_path)).tables[table_name]


def sqlite_from_parquet(
    schema_src: source.Source,
    table_name: str,
    parquet_src: source.Source,
    output_db_dir: Path,
    output_db_name: str,
) -> source.Source:
    parquet_path = parquet_src.path()
    sqlite_outfile = output_db_dir / output_db_name
    with sqlite_connection(sqlite_outfile) as conn:
        insert_table(
            conn,
            load_table_from_schema(schema_src.path(), table_name),
            None,
            data_dir=str(parquet_path.parent),
            filename=str(parquet_path.name),
        )
    logger.info("Done inserting parquet into sqlite.")
    return source.from_file(sqlite_outfile)

thds/tabularasa/to_sqlite.py
@@ -0,0 +1,56 @@
"""Helpers for taking an in-memory set of Tables and writing them to a
SQLite database file without needing to go through a build process.
"""

import os
import typing as ty
from pathlib import Path

from .data_dependencies.sqlite import populate_sqlite_db
from .schema.metaschema import BuildOptions, Schema, Table

FAKE_BUILD_OPTIONS = BuildOptions(
    derived_code_submodule="thds.nope",
    attrs=False,
    sqlite_data=True,
    sqlite_interface=False,
    pandas=False,
    pyarrow=False,
    require_typing_extensions=False,
    type_constraint_comments=False,
    validate_transient_tables=False,
)


def _make_fake_schema(tables: ty.Collection[Table]) -> Schema:
    return Schema(
        tables={table.name: table for table in tables},
        types=dict(),
        # none of the build options are actually relevant to us.
        build_options=FAKE_BUILD_OPTIONS,
    )


def to_sqlite(sqlite_file_path: Path, *tables: Table, **data_paths_by_table_name: Path) -> None:
    """Create a SQLite database file from the given Tables alone.

    The data_paths_by_table_name are only required if your Table
    instances do not provide a file path to their data via the `doc`
    attribute. Tables generated by define_table_from_parquet will
    follow this convention.
    """
    populate_sqlite_db(
        _make_fake_schema(tables),
        None,
        str(sqlite_file_path),
        None,
        "",
        "",
        table_predicate=lambda table: True,  # output all tables.
        data_path_overrides={
            table.name: (
                Path(table.doc) if os.path.exists(table.doc) else data_paths_by_table_name[table.name]
            )
            for table in tables
        },
    )
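
A minimal usage sketch for `to_sqlite` (an editorial illustration, not part of the packaged files above): the `Table` objects are assumed to come from elsewhere, e.g. `define_table_from_parquet` as mentioned in the docstring, and the wrapper name and override mapping are assumptions.

```python
# Sketch only: `tables` are assumed to be built elsewhere; `overrides` supplies data
# paths for any Table whose `doc` attribute is not an existing parquet file.
import typing as ty
from pathlib import Path

from thds.tabularasa.schema.metaschema import Table
from thds.tabularasa.to_sqlite import to_sqlite


def write_reference_db(
    out: Path, tables: ty.Sequence[Table], overrides: ty.Mapping[str, Path]
) -> None:
    to_sqlite(out, *tables, **overrides)
```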

thds_tabularasa-0.13.0.dist-info/METADATA
@@ -0,0 +1,530 @@
Metadata-Version: 2.4
Name: thds.tabularasa
Version: 0.13.0
Summary: Trilliant Health reference data build system.
Author-email: Trilliant Health <info@trillianthealth.com>
Project-URL: Repository, https://github.com/TrilliantHealth/ds-monorepo
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: attrs>=22.2
Requires-Dist: cattrs>=22.2
Requires-Dist: filelock
Requires-Dist: networkx>=3.0
Requires-Dist: numpy
Requires-Dist: packaging
Requires-Dist: pandas>=1.5
Requires-Dist: pandera<0.24,>=0.20
Requires-Dist: pydantic<2.0,>=1.10.9
Requires-Dist: pyarrow>=10.0
Requires-Dist: pyyaml>=6.0.1
Requires-Dist: setuptools>=66.1.1
Requires-Dist: thds-adls
Requires-Dist: thds-core
Requires-Dist: typing-extensions
Provides-Extra: autoformat
Requires-Dist: black; extra == "autoformat"
Requires-Dist: isort; extra == "autoformat"
Provides-Extra: cli
Requires-Dist: bourbaki.application>=0.10.12; extra == "cli"
Requires-Dist: ruamel.yaml; extra == "cli"
Requires-Dist: tabulate; extra == "cli"
Requires-Dist: typing-inspect; extra == "cli"
Provides-Extra: viz
Requires-Dist: pygraphviz; extra == "viz"

## Tabula Rasa

The `thds.tabularasa` package serves to enable version control, validation, and runtime access to tabular
datasets that are required in analytic and production workflows. As such, it encompasses a build system
for generating data, documentation for its derivation process, and code for accessing it.

### The Schema File

To use `tabularasa` in your project, you will first create a single yaml file defining a tabular schema
and build process. This file should exist within your package, not somewhere else in your repo - in other
words, it should be package data. It is therefore always read and specified as any package data would be -
with a package name and a path inside said package.

The schema file includes documentation, tabular schema definitions, type information, value-level
constraints (e.g. ranges, string patterns, and nullability), column-level constraints (e.g. uniqueness),
file resource definitions, and build options controlling the output of the build system. Tables are built
from raw data files which may take any form and may be stored either in the repository under version
control or remotely in a blob store such as ADLS (versioned with md5 hashes to ensure build availability
and consistency), but are packaged with the distribution as strictly-typed parquet files and optionally
as a sqlite database archive file. Large package files may be omitted from the base distribution to be
synced with a blob store at run time.

The sections of the schema file are as follows:

- `build_options`: a set of various flags controlling your build process, including code and data
  generation
- `tables`: the schema definitions of your tabular data, plus specifications of the inputs and functions
  used to derive them
- `types`: any custom constrained column-level types you may wish to define and reference in your tables.
  These become both validation constraints expressed as `pandera` schemas, and `typing.Literal` types in
  the case of enums, or sometimes `typing.NewType`s depending on your build options.
- `local_data`: specifications of local files in your repo that will be used to build your tables. Files
  referenced here are expected to be version-controlled along with your code and so don't require hashes
  for integrity checks. Note that tabularasa assumes the file on disk is the official committed version.
  It cannot protect against builds with uncommitted local changes to these files.
- `remote_data`: specifications of remote files that will be used to build your tables. Currently only
  blob store backends like ADLS are supported. Files referenced here must be versioned with hashes to
  ensure build integrity (MD5 is used currently).
- `remote_blob_store`: optional location to store large artifacts in post-build, in case you want to set
  a size limit above which your data files will not be packaged with your distribution. They can then be
  fetched at run time as needed.
- `external_schemas`: optional specification of `tabularasa` schemas inside other packages, in case you
  are integrating with them, e.g. by sharing some types.

To get more detail on the structure of any of these sections, you may refer to the
`thds.tabularasa.schema.metaschema._RawSchema` class, which is an exact field-by-field reflection of the
schema yaml file (with a few enriched fields). Instances of this class are validated and enriched to
become instances of `thds.tabularasa.schema.metaschema.Schema`, which are then used in various build
operations.
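
For a quick programmatic look at a schema, the package's own `load_schema` helper can be pointed at the yaml file; the calling convention below mirrors how `thds.tabularasa.sqlite_from_parquet` in this release uses it, and the path is hypothetical.

```python
# Sketch: load a schema file and list what it defines. The first argument is passed as
# None here, following this package's own sqlite_from_parquet module.
from thds.tabularasa.schema import load_schema

schema = load_schema(None, "src/my_package/schema.yaml")  # hypothetical path
print(sorted(schema.tables))  # table names from the `tables` section
print(sorted(schema.types))   # custom types from the `types` section
```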

### Core Concepts: How Tabularasa Controls Your Data

Before diving into the details, it's important to understand how tabularasa controls and transforms your
data:

#### Column Ordering

**Important**: The column order in your output parquet files is **entirely controlled by the order
defined in schema.yaml**, not by the order in your preprocessor code or source data. Even if your
preprocessor returns columns in a different order, tabularasa will reorder them to match the schema
definition during the build process. This ensures consistency across all data artifacts.

#### Primary Keys and Pandas Index

When working with pandas DataFrames, be aware that **primary key columns become the DataFrame index** and
effectively "disappear" from the regular columns. If you define `primary_key: [id, date]` in your schema,
those columns will be accessible via `df.index` rather than `df['id']` or `df['date']`. This behavior is
automatic and ensures efficient indexing for data access.
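
A small, self-contained illustration of that behavior using plain pandas (not a generated loader):

```python
import pandas as pd

# Mimic a table defined with `primary_key: [id, date]`: the key columns live in the index.
df = pd.DataFrame(
    {"id": [1, 2], "date": ["2024-01-01", "2024-01-02"], "value": [10.0, 20.0]}
).set_index(["id", "date"])

print(list(df.index.names))                # ['id', 'date']
print("id" in df.columns)                  # False -- use df.index, not df['id']
print(df.loc[(1, "2024-01-01"), "value"])  # 10.0: lookup by primary key
```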

#### Transient Tables

Tables marked with `transient: true` are intermediate tables used during the build process but are not
included in the final package distribution. Use transient tables for:

- Raw input data that gets processed into final tables
- Intermediate transformation steps
- Large source data that shouldn't be shipped with the package

#### External Data Philosophy

Tabularasa follows a fundamental principle: **builds should never depend on external services**. All data
is snapshotted internally to ensure reproducible builds. This means:

- Data from external sources (APIs, remote CSVs, etc.) should be fetched and stored in version control or
  a blob store that you control (specified in the `remote_data` section)
- This ensures builds are deterministic and not affected by external service availability or consistency

### The Data Interfaces

The code generation portion of the build system can generate interfaces for loading the package parquet
data as `attrs` records or `pandas` dataframes (validated by `pandera` schemas), and for loading `attrs`
records from a `sqlite` archive via indexed queries on specific sets of fields.

The code for all modules is generated and written at [build time](#building).
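
To give a feel for what the generated pandas interface amounts to, here is a hand-written sketch, not the generated API: read the packaged parquet, restore the primary-key index, and validate against a `pandera` schema. The table name, columns, and path are hypothetical.

```python
import pandas as pd
import pandera as pa

# Hypothetical table: primary_key [id], one non-negative float column.
my_table_schema = pa.DataFrameSchema(
    columns={"value": pa.Column(float, pa.Check.ge(0))},
    index=pa.Index(int, name="id"),
)


def load_my_table(path: str = "my_package/data/my_table.parquet") -> pd.DataFrame:
    df = pd.read_parquet(path).set_index("id")
    return my_table_schema.validate(df)
```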

### Building

To build your project with `tabularasa`, just run

```bash
tabularasa codegen
tabularasa datagen
```

from the project root, followed by the invocation of your standard build tool (`poetry`, `setuptools`,
etc).

This will generate all source code interfaces and package data according to various options specified in
the `build_options` section of the [schema file](#the-schema-file). Note that no code is written unless
the [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree) of the generated python code differs from
what is found in the local source files. This allows the code generation step to avoid conflict with code
formatters such as `black`, since these change only the formatting and not the AST of the code.
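
The AST comparison is easy to see in isolation: differently formatted but semantically identical code parses to the same tree, so a formatter's changes alone never trigger a rewrite. A self-contained illustration (not tabularasa's actual check):

```python
import ast

generated = "def f(x):return {'a':1,'b':2}"
formatted = '''
def f(x):
    return {"a": 1, "b": 2}
'''

# Same AST despite different whitespace and quote style.
print(ast.dump(ast.parse(generated)) == ast.dump(ast.parse(formatted)))  # True
```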

### Adding new package data

To add a new table to the schema, place a new named entry under the `tables` section in your
[schema file](#the-schema-file). Source data for the table is specified in the table's `dependencies`
section. There are multiple ways to specify the source data, including version-controlled
repository-local files and remote files. Source data can be a standard tabular text format (CSV, TSV,
etc) which can be translated automatically into the table's typed schema, or some other data format that
requires processing using a user-defined function specified under a `preprocessor` key.

The simplest way to add new reference data to version control is to simply place a CSV in your repo, and
define the schema of that data in the `tables` section of your [schema file](#the-schema-file), pointing
the `dependencies.filename` of the table to the new CSV file.

Note that this direct file reference approach works only with files that can unambiguously be interpreted
into the table's schema. Currently this is implemented for character-delimited text files such as CSV/TSV
(with many exposed options for parsing), but could be extended to other tabular formats in the future.

#### Choosing Between Local and Remote Data

When deciding how to store your source data, consider these trade-offs:

**Local Data Storage Patterns**

Tabularasa supports two distinct patterns for managing local data files, each serving different
organizational needs. The **direct file reference pattern** allows tables to specify their data source
directly through `dependencies.filename`, providing a straightforward path to a file in the repository.
When you need to update the data, you simply overwrite the file and run
`tabularasa datagen <your-table-name>` without making any schema changes. The framework reads the file
directly using the provided path along with any parsing parameters specified in the dependencies block.
This approach works best for data files that are specific to a single table and can be parsed
unambiguously, requiring no custom code to interpret.

The **shared data pattern** using the `local_data` section provides a more structured approach for
managing data sources that multiple tables depend on. With this pattern, you define a named entry in the
`local_data` section of your schema that contains not just the filename but comprehensive metadata
including the data authority, source URL, update frequency, and documentation. Tables then reference
these entries using `dependencies.local: [entry_name]`. When the preprocessor function executes, it
receives a `LocalDataSpec` object that provides access to both the file (via the `full_path` property)
and all associated metadata. This pattern is best when multiple tables need to derive data from the same
source file, such as when several tables extract different subsets from a comprehensive dataset. This
centralized definition allows consistency across all dependent tables and makes it easier to track data
provenance and update schedules. The same metadata fields are available on all file reference types
(direct references, `local_data`, and `remote_data`) since they all inherit from the same base schema.

Both patterns store files in version control, making them ideal for smaller datasets that require
frequent updates. There is no difference in documentation level or reusability between the two
patterns: both require the same metadata and can be referenced throughout the derivation DAG (in the case
of the direct reference pattern you would reference the derived _table_ rather than the raw file). The
key difference is organizational: direct references provide a quick way to define a table from a single
file inline, while `local_data` provides centralized definitions when multiple tables derive from the
same source file. Larger files should use remote storage instead.
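
A hedged sketch of what a preprocessor for the shared pattern might look like. The only facts taken from the text above are that the function receives a `LocalDataSpec` with a `full_path` property and that column order need not match the schema; the function name, exact signature, sheet, and columns are assumptions.

```python
import pandas as pd


def state_demographics(census_data) -> pd.DataFrame:  # census_data: a LocalDataSpec (assumed)
    # Read the checked-in workbook via the spec's full_path property.
    raw = pd.read_excel(census_data.full_path, sheet_name="states")  # hypothetical sheet
    # Columns may come back in any order; the build reorders them to match schema.yaml.
    return raw.rename(columns={"STATE": "state", "POP": "population"})[["state", "population"]]
```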

**Remote Data Storage in Blob Store**

Remote data storage through a blob store (e.g., ADLS) addresses the scalability limitations of local file
storage. When source datasets are too large for version control, the `remote_data` section of the schema
file allows you to reference files stored in a blob store. Each remote data entry specifies paths to files
in the blob store along with their MD5 hashes to ensure the correct version is downloaded during builds.
While this approach keeps the repository lean, it requires a more structured workflow: you must upload
source files to the blob store, calculate their MD5 hashes, and specify them in the schema. This
additional complexity makes remote storage most suitable for stable, infrequently changing source
datasets where the overhead of managing source file hashes is justified by the benefits of centralized
storage and repository size optimization.

Note that MD5 hash management differs by context: source files in `remote_data` require manual MD5 hash
specification, while the derived parquet files underlying the tables in the schema have their MD5 hashes
automatically calculated and updated by `tabularasa datagen`. Local source files referenced through
`local_data` or `dependencies.filename` do not require MD5 hashes, since they are assumed to be versioned
by your version control system.
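
One way to compute the MD5 you paste into `remote_data` for an uploaded source file (equivalent to running `md5sum` on the command line; the file path matches the remote-data example further below):

```python
import hashlib
from pathlib import Path


def md5_hex(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file so large blob-store uploads don't need to fit in memory."""
    digest = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


print(md5_hex(Path("data/large_file_2024_01.parquet")))
```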

Example workflow for monthly updates with local data:

```yaml
# schema.yaml - Direct file reference pattern
tables:
  my_monthly_data:
    dependencies:
      filename: build_data/monthly_data.csv
      last_updated: 2024-01-15
      update_frequency: Monthly
      doc: "Monthly update: Download new CSV → overwrite file → datagen"
```

Example of shared local_data pattern:

```yaml
# schema.yaml - Shared data pattern
local_data:
  census_data: # Define once
    filename: build_data/census_2023.xlsx
    url: https://census.gov/data/...
    authority: US Census Bureau
    last_updated: 2023-07-01
    update_frequency: Yearly

tables:
  state_demographics:
    dependencies:
      local: [census_data] # Reference from multiple tables
  county_statistics:
    dependencies:
      local: [census_data] # Same source, consistent metadata
```

Example workflow for remote data:

```yaml
# schema.yaml
remote_data:
  my_large_data:
    paths:
      - name: data/large_file_2024_01.parquet
        md5: abc123... # Must update this hash for each new version

tables:
  large_table:
    dependencies:
      remote: [my_large_data] # Reference remote data
```

When changes are made to a table in `schema.yaml`, either the schema or the source data, be sure to
update the associated derived package data file by running `tabularasa datagen <table-name>`. The table's
MD5 hash, and those of any dependent derived tables downstream of it, will then be automatically updated
to reflect the new generated parquet file either during this step or during pre-commit hook execution.
See the [package data generation section](#generating-package-data) for more information on this.

To understand all the ways of defining a table or file dependency, take a look at the schema file data
model defined in the `thds.tabularasa.schema.metaschema._RawSchema` class. This represents an exact
field-by-field reflection of the contents of the schema yaml file.

### The CLI

When installed, the `thds.tabularasa` package comes with a CLI, invoked as `tabularasa` or
`python -m thds.tabularasa`. In the examples that follow, we use the `tabularasa` invocation. This CLI
supplies various utils for development tasks like building and fetching data, generating code and docs,
and checking package data integrity.

Each of these functionalities can be invoked via

```
tabularasa <subcommand-name>
```

for the subcommand that accomplishes the intended task.

The CLI can be made more verbose by repeating the `-v` flag as many times as necessary just after
`tabularasa` and before the name of the subcommand being invoked. If you should want them, the CLI can
self-install its own set of bash-compatible completions by running
`tabularasa --install-bash-completions`.

Documentation for the main CLI or any subcommand can be accessed in the standard way with `--help`:

```bash
tabularasa --help                 # main CLI args and subcommand list
tabularasa <command-name> --help  # help for command identified by <command-name> - its purpose and args
```

The CLI is by default configured by a config file (JSON or YAML) in the working directory called
`tabularasa.yaml`. This just supplies a few required pieces of information, namely the name of the
`package` that you're interacting with and the `schema_path` relative to the package root, so that you
don't have to pass them as options on the command line. Most other important information relevant to the
CLI operations is contained in the [schema file](#the-schema-file) itself, especially the `build_options`
section.
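
Concretely, those two keys are all the README says such a config must carry. Rendered here with `pyyaml` (already a dependency) so the resulting `tabularasa.yaml` content is visible in the output comment; the package name and path are hypothetical.

```python
import yaml  # pyyaml, already a dependency of thds.tabularasa

config = {"package": "my_project.reference_data", "schema_path": "schema.yaml"}
print(yaml.safe_dump(config), end="")
# package: my_project.reference_data
# schema_path: schema.yaml
```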

To use the CLI in another project as a build tool, you will need to specify `thds.tabularasa[cli]` as
your dependency. The `cli` extra comes with some dependencies that are only needed in the context of the
CLI; they are somewhat heavy and so best left out of your environment if you don't explicitly need them.

Of course, if you need the CLI as a development dependency but only need the _library_ at run time, you
may specify just `thds.tabularasa` as your main dependency and `thds.tabularasa[cli]` as your dev
dependency.

Some useful subcommands of the CLI are documented below.

#### Generating package data

If you're adding new tables or updating the data in a set of tables, especially when using a custom
preprocessor, you will likely want to repeatedly regenerate the package data parquet files for those
tables in order to confirm that the build is working as intended.

To do so, run

```bash
tabularasa datagen <table-name-1> <table-name-2> ...
```

All of the tables you specify _and_ all of their dependents downstream in the computational DAG will thus
be re-computed. This saves you from the work of keeping track of the downstream dependents, a tedious and
error-prone task. It ensures that all your package data and associated hashes are up to date, which in
turn ensures that your peers will have up-to-date data when they get a cache miss after pulling your
code changes.
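
The "downstream dependents" idea can be pictured with a tiny DAG (`networkx` is already a dependency; the table names and edges are made up, and this illustrates the concept rather than tabularasa's implementation):

```python
import networkx as nx

# Edges point from a table to the tables derived from it.
dag = nx.DiGraph([("raw_claims", "claims_clean"), ("claims_clean", "claims_by_state")])

# Asking datagen for raw_claims implies rebuilding everything reachable from it.
print(sorted(nx.descendants(dag, "raw_claims")))  # ['claims_by_state', 'claims_clean']
```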

Any derived table upstream of those you request to build with `datagen` will be auto-synced from the blob
store prior to the build running, if available, saving you the wait time of re-building them needlessly
in case they're not already in your working tree.

If you'd like to better understand what you changed after any `tabularasa datagen` invocation before you
commit the result, you can run `tabularasa data-diff`. By default, this diffs the data as versioned in
the working tree against the data as versioned in the HEAD commit. If you've already committed, you can
pass a ref to the previous commit, e.g. `tabularasa data-diff HEAD~`. This will show summary stats
describing the changes, such as the number of rows added, removed, and modified for each updated table.
With the `--verbose` flag added, you can see more detail, for instance the row counts for each row-level
pattern of updates (e.g. in 10 rows, columns 'A' and 'B' were updated, in 5 rows, column 'C' was nulled,
in 3 rows, column 'A' was filled, etc.).

If you wish to regenerate _all_ package data tables from scratch, you can run

```bash
tabularasa datagen
```

This will remove _all_ pre-existing package data files and re-generate them. This is an extreme measure
and should be used sparingly; in most cases, you will want to regenerate only those specific tables whose
source data or derivation logic you know has changed.

Note that if you have just cloned the repo or pulled a branch and wish to get your local package data
up-to-date with the state on that branch, you don't need to re-derive all the data! Just
[sync with the blob store](#syncing-with-the-blob-store) instead.

#### Inspecting auto-generated code

If you'd like to review the code changes that would result from any change to the schema or compilation
modules without over-writing the existing generated source (as a [build](#building) could do), there is a
simple CLI command for inspecting it.

To inspect e.g. the auto-generated pandas code for the current repo state, run

```bash
tabularasa compile pandas
```

The code will print to stdout. Simply replace `pandas` with `attrs`, `sqlite`, `attrs-sqlite`, or
`pyarrow` to see the code generated for those use cases.

#### Checking integrity of local built reference data

The build pipeline uses md5 hashes to prevent expensive re-builds in local runs. When the
[build](#building) finishes, you will have several parquet files and possibly a sqlite database archive
present in your file tree. Each of the parquet files should have an associated md5 checksum in
`schema.yaml`, indicating the version of the data that should result from the build.

To check the status of your local built data files with respect to the `schema.yaml` hashes, you can run

```bash
tabularasa check-hashes
```

**Important**: The following shouldn't be required in normal usage: use with care and only if you know
what you're doing!

To sync the hashes in `schema.yaml` with those of your generated data, you can run

```bash
tabularasa update-hashes
```

By default this will also update your generated data accessor source code, which has the hashes embedded
in order to enable run-time integrity checks on fetch from the blob store, if you're using one. In
general, you _should not need to do this manually_, however, since `tabularasa datagen` will update the
hashes for you as part of its normal operation.

#### Syncing with the Blob Store

**Important**: The `push`, `pull`, and `sync-blob-store` commands work **only with final parquet
tables**, not with input source data. Input data (specified in `local_data` or `remote_data`) is only
accessed during `datagen` execution.

Under the section `remote_blob_store` in [the schema file](#the-schema-file), you may optionally specify
a remote cloud storage location where built package data artifacts are stored. In case
`build_options.package_data_file_size_limit` is set, the package in question will not come with any
package data files exceeding that limit in size. These _will_ be available in the remote blob store, and
in case they are not present when one of the [data loaders](#the-data-interfaces) is invoked, will be
downloaded into the package.

Should your use case require the data to be locally available at run time, e.g. if you lack connectivity,
then you may fetch all the package data tables that were omitted in the [build](#building) by running

```bash
tabularasa sync-blob-store --down
```

or just

```bash
tabularasa pull
```

If you're using a remote blob store for large files, you will want to include the invocation

```bash
tabularasa sync-blob-store --up
```

or just

```bash
tabularasa push
```

somewhere in your CI build scripts after the [build](#building) completes and before you publish your
package, to ensure that those files are available at run time to end users when needed.

#### Initializing the SQLite Database

To initialize the SQLite database (see [interfaces](#the-data-interfaces)), should one be needed but not
shipped as package data (as specified in the `build_options` section of
[the schema file](#the-schema-file)), you may run

```bash
tabularasa init-sqlite
```

This will create the SQLite database archive in your installed package directory. For an added level of
safety you may pass `--validate` (to validate the inserted data against the constraints defined in
[the schema file](#the-schema-file), as expressed as [pandera schemas](#the-data-interfaces)), but these
will usually be statically verified once at build time and guaranteed correct before shipping.

#### Visualizing the Data Dependency DAG

The `dag` command creates a graph visualization of your project's dependency DAG and subsets thereof. The
visualization is opened in a browser (it's SVG by default), but if you pass `--format png`, for example,
it will open in an image viewer.

To visualize your data dependency DAG, from your project root run

```bash
tabularasa dag                   # generate full DAG
tabularasa dag [table-name(s)]   # generate DAG for specific tables
```

> [!NOTE]
> This requires the `graphviz` source and binaries to be available on your system (`graphviz` is a C
> library that doesn't come packaged with the python wrapper `pygraphviz`). The easiest way to ensure
> this if you have a global anaconda env is to run `conda install graphviz`. However you proceed, you can
> verify that `graphviz` is available by running `which dot` and verifying that a path to an executable
> for the `dot` CLI is found (`dot` is one layout algorithm that comes with graphviz, and the one used in
> this feature). Once you have that, you may `pip install pygraphviz` into your working dev environment.
> Refer to the [pygraphviz docs](https://pygraphviz.github.io/documentation/stable/install.html) if you
> get stuck.

## Generating documentation

To generate the documentation for your project, run:

```bash
tabularasa docgen
```

from your project root.

This generates docs in ReStructuredText (rst) format in a directory structure specified in the
`table_docs_path`, `type_docs_path`, and `source_docs_path` fields of the
[schema file](#the-schema-file)'s `build_options` section. As such, these docs are valid as input to the
`sphinx` documentation build tool.

## Memory usage

Your reference data may be fairly large, and in multiprocessing contexts it can be useful to share the
read-only data in memory between processes for the sake of performance.

`tabularasa` builds this in via mem-mapped SQLite for the most part, but the default Python installation
of SQLite [limits](https://www.sqlite.org/mmap.html) the amount of memory-mapped data to 2GB per database
file.

A project called `pysqlite3` packages the same shim code alongside the ability to provide a different
shared library for SQLite, and their built binary package
[increases](https://github.com/coleifer/pysqlite3/blob/master/setup.py?ts=4#L107) the memory cap to 1TB.
Currently, the precompiled package is only available for Linux.

The good news: if you want more reference data to be shared between processes, all you need to do is
successfully install a version of `pysqlite3` into your Python environment. If you're on Linux, you can
likely accomplish this with a simple `pip install pysqlite3-binary`. On a Mac, you'll need to follow
their [instructions](https://github.com/coleifer/pysqlite3#building-with-system-sqlite) for linking
against a system-installed SQLite, or build against a statically-linked library and then install from
source.

If `pysqlite3` is installed in your Python environment, it will be used within `tabularasa` by default.
To disable this behavior, set the `REF_D_DISABLE_PYSQLITE3` environment variable to a non-empty string
value.

By default, with `pysqlite3` installed, 8 GB of RAM will be memory-mapped per database file. With the
standard `sqlite3` module, the limit will be hard-capped at 2 GB. If you want to change this default, you
can set the `REF_D_DEFAULT_MMAP_BYTES` environment variable to an integer number of bytes.
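
Both knobs as they might be set from Python before `thds.tabularasa` is imported. The variable names come from this README and the `sqlite3_compat` shim above; import-time reading is confirmed for `REF_D_DISABLE_PYSQLITE3` and assumed for the mmap setting.

```python
import os

# With pysqlite3 installed: request a larger per-database mmap cap (16 GB here).
os.environ["REF_D_DEFAULT_MMAP_BYTES"] = str(16 * 1024**3)

# Or opt out of pysqlite3 entirely (any non-empty string disables it):
# os.environ["REF_D_DISABLE_PYSQLITE3"] = "1"
```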