thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/sqlite3_compat.py
@@ -0,0 +1,41 @@
"""This module exists solely to preferentially use pysqlite3 over the
built-in sqlite where it is already available.

Our primary purpose for doing this is to enable larger amounts of
mem-mapped shared memory between processes consuming reference-data.

However, while pre-compiled binaries for pysqlite3 are available via
PyPi, they are only available for manylinux, and most of our
development is done on Macs.

Since this customization is mostly useful for strongly parallel usage
of reference-data, and since that does not often happen on Mac
laptops, this shim will allow us to defer doing extra work to get
compiled pysqlite3 on Macs for the time being.
"""

import os
from logging import getLogger

_DISABLE_PYSQLITE3 = bool(os.environ.get("REF_D_DISABLE_PYSQLITE3", False))

logger = getLogger(__name__)

# All of the following log statements include the string `pysqlite3` for searchability.
if not _DISABLE_PYSQLITE3:
    try:
        import pysqlite3 as sqlite3

        logger.info(f"Using pysqlite3 with SQLite version {sqlite3.sqlite_version}")
    except ImportError:
        logger.debug("Using sqlite3 module because pysqlite3 was not available")
        # this is DEBUG because it's the 'base case' for local dev and
        # there's no need to make logs for development use cases more
        # verbose. In production, one of the other two INFO logs is
        # likely to fire, and if not, this case can be inferred from
        # the lack of log.
        import sqlite3  # type: ignore
else:
    import sqlite3  # type: ignore

    logger.info("Using sqlite3 module because pysqlite3 was disabled via environment variable")
thds/tabularasa/sqlite_from_parquet.py
@@ -0,0 +1,34 @@
from pathlib import Path

from thds.core import log, source
from thds.tabularasa.data_dependencies.sqlite import insert_table
from thds.tabularasa.loaders.sqlite_util import sqlite_connection
from thds.tabularasa.schema import load_schema
from thds.tabularasa.schema.metaschema import Table

logger = log.getLogger(__name__)


def load_table_from_schema(schema_path: Path, table_name: str) -> Table:
    return load_schema(None, str(schema_path)).tables[table_name]


def sqlite_from_parquet(
    schema_src: source.Source,
    table_name: str,
    parquet_src: source.Source,
    output_db_dir: Path,
    output_db_name: str,
) -> source.Source:
    parquet_path = parquet_src.path()
    sqlite_outfile = output_db_dir / output_db_name
    with sqlite_connection(sqlite_outfile) as conn:
        insert_table(
            conn,
            load_table_from_schema(schema_src.path(), table_name),
            None,
            data_dir=str(parquet_path.parent),
            filename=str(parquet_path.name),
        )
    logger.info("Done inserting parquet into sqlite.")
    return source.from_file(sqlite_outfile)
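
A hedged usage sketch for the function above; the schema path, table name, parquet path, and output
names are all placeholders rather than values taken from the package.

```python
from pathlib import Path

from thds.core import source
from thds.tabularasa.sqlite_from_parquet import sqlite_from_parquet

# Placeholders throughout -- substitute your own schema, table, and parquet file.
db_src = sqlite_from_parquet(
    schema_src=source.from_file(Path("src/mypackage/schema.yaml")),
    table_name="my_table",
    parquet_src=source.from_file(Path("build_data/my_table.parquet")),
    output_db_dir=Path("build"),
    output_db_name="reference.sqlite",
)
print(db_src.path())  # local path of the newly written database
```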
thds/tabularasa/to_sqlite.py
@@ -0,0 +1,56 @@
"""Helpers for taking an in-memory set of Tables and writing them to a
SQLite database file without needing to go through a build process.
"""

import os
import typing as ty
from pathlib import Path

from .data_dependencies.sqlite import populate_sqlite_db
from .schema.metaschema import BuildOptions, Schema, Table

FAKE_BUILD_OPTIONS = BuildOptions(
    derived_code_submodule="thds.nope",
    attrs=False,
    sqlite_data=True,
    sqlite_interface=False,
    pandas=False,
    pyarrow=False,
    require_typing_extensions=False,
    type_constraint_comments=False,
    validate_transient_tables=False,
)


def _make_fake_schema(tables: ty.Collection[Table]) -> Schema:
    return Schema(
        tables={table.name: table for table in tables},
        types=dict(),
        # none of the build options are actually relevant to us.
        build_options=FAKE_BUILD_OPTIONS,
    )


def to_sqlite(sqlite_file_path: Path, *tables: Table, **data_paths_by_table_name: Path) -> None:
    """Create a SQLite database file from the given Tables alone.

    The data_paths_by_table_name are only required if your Table
    instances do not provide a file path to their data via the `doc`
    attribute. Tables generated by define_table_from_parquet will
    follow this convention.
    """
    populate_sqlite_db(
        _make_fake_schema(tables),
        None,
        str(sqlite_file_path),
        None,
        "",
        "",
        table_predicate=lambda table: True,  # output all tables.
        data_path_overrides={
            table.name: (
                Path(table.doc) if os.path.exists(table.doc) else data_paths_by_table_name[table.name]
            )
            for table in tables
        },
    )
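
A hedged sketch of calling the helper above. The `tables` collection is assumed to exist already
(e.g. built via `define_table_from_parquet`, per the docstring); it is not constructed here.

```python
from pathlib import Path

from thds.tabularasa.to_sqlite import to_sqlite

# `tables` is assumed to be an existing collection of metaschema.Table objects.
# If each table's `doc` attribute is a path to its parquet data, this is enough:
to_sqlite(Path("reference.sqlite"), *tables)

# Otherwise, supply a data path per table, keyed by the table's name:
to_sqlite(Path("reference.sqlite"), *tables, my_table=Path("build_data/my_table.parquet"))
```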
thds_tabularasa-0.13.0.dist-info/METADATA
@@ -0,0 +1,530 @@
Metadata-Version: 2.4
Name: thds.tabularasa
Version: 0.13.0
Summary: Trilliant Health reference data build system.
Author-email: Trilliant Health <info@trillianthealth.com>
Project-URL: Repository, https://github.com/TrilliantHealth/ds-monorepo
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: attrs>=22.2
Requires-Dist: cattrs>=22.2
Requires-Dist: filelock
Requires-Dist: networkx>=3.0
Requires-Dist: numpy
Requires-Dist: packaging
Requires-Dist: pandas>=1.5
Requires-Dist: pandera<0.24,>=0.20
Requires-Dist: pydantic<2.0,>=1.10.9
Requires-Dist: pyarrow>=10.0
Requires-Dist: pyyaml>=6.0.1
Requires-Dist: setuptools>=66.1.1
Requires-Dist: thds-adls
Requires-Dist: thds-core
Requires-Dist: typing-extensions
Provides-Extra: autoformat
Requires-Dist: black; extra == "autoformat"
Requires-Dist: isort; extra == "autoformat"
Provides-Extra: cli
Requires-Dist: bourbaki.application>=0.10.12; extra == "cli"
Requires-Dist: ruamel.yaml; extra == "cli"
Requires-Dist: tabulate; extra == "cli"
Requires-Dist: typing-inspect; extra == "cli"
Provides-Extra: viz
Requires-Dist: pygraphviz; extra == "viz"

## Tabula Rasa

The `thds.tabularasa` package serves to enable version control, validation, and runtime access to tabular
datasets that are required in analytic and production workflows. As such, it encompasses a build system
for generating data, documentation for its derivation process, and code for accessing it.

### The Schema File

To use `tabularasa` in your project, you will first create a single yaml file defining a tabular schema
and build process. This file should live inside your package, not elsewhere in your repo - in other
words, it should be package data. It is therefore always read and specified the way any package data
would be: with a package name and a path inside that package.

The schema file includes documentation, tabular schema definitions, type information, value-level
constraints (e.g. ranges, string patterns, and nullability), column-level constraints (e.g. uniqueness),
file resource definitions, and build options controlling the output of the build system. Tables are built
from raw data files, which may take any form and may be stored either in the repository under version
control or remotely in a blob store such as ADLS (versioned with md5 hashes to ensure build availability
and consistency). The tables themselves are packaged with the distribution as strictly-typed parquet
files and optionally as a sqlite database archive file. Large package files may be omitted from the base
distribution and synced with a blob store at run time.

The sections of the schema file are as follows:

- `build_options`: a set of various flags controlling your build process, including code and data
  generation
- `tables`: the schema definitions of your tabular data, plus specifications of the inputs and functions
  used to derive them
- `types`: any custom constrained column-level types you may wish to define and reference in your tables.
  These become both validation constraints expressed as `pandera` schemas, and `typing.Literal` types in
  the case of enums, or sometimes `typing.NewType`s depending on your build options.
- `local_data`: specifications of local files in your repo that will be used to build your tables. Files
  referenced here are expected to be version-controlled along with your code and so don't require hashes
  for integrity checks. Note that tabularasa assumes the file on disk is the official committed version;
  it cannot protect against builds with uncommitted local changes to these files.
- `remote_data`: specifications of remote files that will be used to build your tables. Currently only
  blob store backends like ADLS are supported. Files referenced here must be versioned with hashes to
  ensure build integrity (MD5 is used currently).
- `remote_blob_store`: an optional location to store large artifacts post-build, in case you want to set
  a size limit above which your data files will not be packaged with your distribution. They can then be
  fetched at run time as needed.
- `external_schemas`: optional specification of `tabularasa` schemas inside other packages, in case you
  are integrating with them, e.g. by sharing some types.

To get more detail on the structure of any of these sections, you may refer to the
`thds.tabularasa.schema.metaschema._RawSchema` class, which is an exact field-by-field reflection of the
schema yaml file (with a few enriched fields). Instances of this class are validated and enriched to
become instances of `thds.tabularasa.schema.metaschema.Schema`, which are then used in various build
operations.

### Core Concepts: How Tabularasa Controls Your Data

Before diving into the details, it's important to understand how tabularasa controls and transforms your
data:

#### Column Ordering

**Important**: The column order in your output parquet files is **entirely controlled by the order
defined in schema.yaml**, not by the order in your preprocessor code or source data. Even if your
preprocessor returns columns in a different order, tabularasa will reorder them to match the schema
definition during the build process. This ensures consistency across all data artifacts.

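The snippet below only illustrates the principle; it is not tabularasa's internal code, and the column
names are invented.

```python
import pandas as pd

schema_column_order = ["id", "name", "value"]  # the order declared in schema.yaml

# A preprocessor may return columns in any order...
df = pd.DataFrame({"value": [1.0], "id": [7], "name": ["a"]})

# ...but the packaged parquet ends up in the schema-declared order.
df = df[schema_column_order]
print(list(df.columns))  # ['id', 'name', 'value']
```
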
#### Primary Keys and Pandas Index

When working with pandas DataFrames, be aware that **primary key columns become the DataFrame index** and
effectively "disappear" from the regular columns. If you define `primary_key: [id, date]` in your schema,
those columns will be accessible via `df.index` rather than `df['id']` or `df['date']`. This behavior is
automatic and ensures efficient indexing for data access.

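A self-contained illustration (not generated tabularasa code) of what this looks like for a table with
`primary_key: [id, date]`:

```python
import pandas as pd

df = pd.DataFrame(
    {"id": [1, 2], "date": ["2024-01-01", "2024-02-01"], "value": [3.5, 4.0]}
).set_index(["id", "date"])  # what the generated pandas loader does for you

print(df.index.names)                      # ['id', 'date']
print(df.loc[(1, "2024-01-01"), "value"])  # 3.5
# df["id"] would raise a KeyError: the primary key columns live in the index.
```
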
#### Transient Tables

Tables marked with `transient: true` are intermediate tables used during the build process but not
included in the final package distribution (a minimal example follows the list below). Use transient
tables for:

- Raw input data that gets processed into final tables
- Intermediate transformation steps
- Large source data that shouldn't be shipped with the package

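A minimal sketch of the flag in context; only `transient: true` is taken from the text above, and the
rest of the table definition is elided.

```yaml
tables:
  raw_provider_download:   # built during datagen, but never shipped in the package
    transient: true
    # ... schema definition and dependencies elided ...
```
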
#### External Data Philosophy

Tabularasa follows a fundamental principle: **builds should never depend on external services**. All data
is snapshotted internally to ensure reproducible builds. This means:

- Data from external sources (APIs, remote CSVs, etc.) should be fetched and stored in version control or
  a blob store that you control (specified in the `remote_data` section)
- This ensures builds are deterministic and not affected by external service availability or consistency

### The Data Interfaces

The code generation portion of the build system can generate interfaces for loading the package parquet
data as `attrs` records or `pandas` dataframes (validated by `pandera` schemas), and for loading `attrs`
records from a `sqlite` archive via indexed queries on specific sets of fields.

The code for all modules is generated and written at [build time](#building).

### Building

To build your project with `tabularasa`, just run

```bash
tabularasa codegen
tabularasa datagen
```

from the project root, followed by the invocation of your standard build tool (`poetry`, `setuptools`,
etc).

This will generate all source code interfaces and package data according to various options specified in
the `build_options` section of the [schema file](#the-schema-file). Note that no code is written unless
the [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree) of the generated python code differs from
what is found in the local source files. This allows the code generation step to avoid conflict with code
formatters such as `black`, since these change only the formatting and not the AST of the code.

### Adding new package data

To add a new table to the schema, place a new named entry under the `tables` section in your
[schema file](#the-schema-file). Source data for the table is specified in the table's `dependencies`
section. There are multiple ways to specify the source data, including version-controlled
repository-local files and remote files. Source data can be a standard tabular text format (CSV, TSV,
etc.) that can be translated automatically into the table's typed schema, or some other data format that
requires processing by a user-defined function specified under a `preprocessor` key.

The simplest way to add new reference data to version control is to place a CSV in your repo, define the
schema of that data in the `tables` section of your [schema file](#the-schema-file), and point the
table's `dependencies.filename` at the new CSV file.

Note that this direct file reference approach works only with files that can unambiguously be interpreted
into the table's schema. Currently this is implemented for character-delimited text files such as CSV/TSV
(with many exposed options for parsing), but it could be extended to other tabular formats in the future.

#### Choosing Between Local and Remote Data

When deciding how to store your source data, consider these trade-offs:

**Local Data Storage Patterns**

Tabularasa supports two distinct patterns for managing local data files, each serving different
organizational needs. The **direct file reference pattern** allows tables to specify their data source
directly through `dependencies.filename`, providing a straightforward path to a file in the repository.
When you need to update the data, you simply overwrite the file and run
`tabularasa datagen <your-table-name>` without making any schema changes. The framework reads the file
directly using the provided path along with any parsing parameters specified in the dependencies block.
This approach works best for data files that are specific to a single table and can be parsed
unambiguously, requiring no custom code to interpret.

The **shared data pattern** using the `local_data` section provides a more structured approach for
managing data sources that multiple tables depend on. With this pattern, you define a named entry in the
`local_data` section of your schema that contains not just the filename but comprehensive metadata
including the data authority, source URL, update frequency, and documentation. Tables then reference
these entries using `dependencies.local: [entry_name]`. When the preprocessor function executes, it
receives a `LocalDataSpec` object that provides access to both the file (via the `full_path` property)
and all associated metadata. This pattern is best when multiple tables need to derive data from the same
source file, such as when several tables extract different subsets from a comprehensive dataset. This
centralized definition allows consistency across all dependent tables and makes it easier to track data
provenance and update schedules. The same metadata fields are available on all file reference types
(direct references, `local_data`, and `remote_data`) since they all inherit from the same base schema.

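A hedged sketch of a preprocessor using the shared data pattern. The signature and the pandas return
type are assumptions; what the paragraph above guarantees is only that the function receives a
`LocalDataSpec` whose `full_path` points at the shared file.

```python
import pandas as pd


def state_demographics(census_data) -> pd.DataFrame:
    # `census_data` is the LocalDataSpec for the `census_data` entry; read only the
    # subset of the shared source file that this table needs.
    return pd.read_excel(census_data.full_path, sheet_name="states")
```
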
Both patterns store files in version control, making them ideal for smaller datasets that require
frequent updates. There is no difference in documentation level or reusability between the two patterns:
both require the same metadata and can be referenced throughout the derivation DAG (in the case of the
direct reference pattern you would reference the derived _table_ rather than the raw file). The key
difference is organizational: direct references provide a quick way to define a table from a single file
inline, while `local_data` provides centralized definitions when multiple tables derive from the same
source file. Larger files should use remote storage instead.

**Remote Data Storage in Blob Store**

Remote data storage through a blob store (e.g., ADLS) addresses the scalability limitations of local file
storage. When source datasets are too large for version control, the `remote_data` section of the schema
file allows you to reference files stored in a blob store. Each remote data entry specifies paths to
files in the blob store along with their MD5 hashes to ensure the correct version is downloaded during
builds. While this approach keeps the repository lean, it requires a more structured workflow: you must
upload source files to the blob store, calculate their MD5 hashes, and specify them in the schema. This
additional complexity makes remote storage most suitable for stable, infrequently changing source
datasets where the overhead of managing source file hashes is justified by the benefits of centralized
storage and repository size optimization.

Note that MD5 hash management differs by context: source files in `remote_data` require manual MD5 hash
specification, while the derived parquet files underlying the tables in the schema have their MD5 hashes
automatically calculated and updated by `tabularasa datagen`. Local source files referenced through
`local_data` or `dependencies.filename` do not require MD5 hashes since they are assumed to be versioned
by your version control system.

Example workflow for monthly updates with local data:

```yaml
# schema.yaml - Direct file reference pattern
tables:
  my_monthly_data:
    dependencies:
      filename: build_data/monthly_data.csv
      last_updated: 2024-01-15
      update_frequency: Monthly
      doc: "Monthly update: Download new CSV → overwrite file → datagen"
```

Example of shared local_data pattern:

```yaml
# schema.yaml - Shared data pattern
local_data:
  census_data: # Define once
    filename: build_data/census_2023.xlsx
    url: https://census.gov/data/...
    authority: US Census Bureau
    last_updated: 2023-07-01
    update_frequency: Yearly

tables:
  state_demographics:
    dependencies:
      local: [census_data] # Reference from multiple tables
  county_statistics:
    dependencies:
      local: [census_data] # Same source, consistent metadata
```

Example workflow for remote data:

```yaml
# schema.yaml
remote_data:
  my_large_data:
    paths:
      - name: data/large_file_2024_01.parquet
        md5: abc123... # Must update this hash for each new version

tables:
  large_table:
    dependencies:
      remote: [my_large_data] # Reference remote data
```

When changes are made to a table in `schema.yaml`, whether to its schema or to its source data, be sure
to update the associated derived package data file by running `tabularasa datagen <table-name>`. The
table's MD5 hash, and those of any dependent derived tables downstream of it, will then be automatically
updated to reflect the new generated parquet file, either during this step or during pre-commit hook
execution. See the [package data generation section](#generating-package-data) for more information on
this.

To understand all the ways of defining a table or file dependency, take a look at the schema file data
model defined in the `thds.tabularasa.schema.metaschema._RawSchema` class. This represents an exact
field-by-field reflection of the contents of the schema yaml file.

### The CLI

When installed, the `thds.tabularasa` package comes with a CLI, invoked as `tabularasa` or
`python -m thds.tabularasa`. In the examples that follow, we use the `tabularasa` invocation. This CLI
supplies various utilities for development tasks like building and fetching data, generating code and
docs, and checking package data integrity.

Each of these functionalities can be invoked via

```
tabularasa <subcommand-name>
```

for the subcommand that accomplishes the intended task.

The CLI can be made more verbose by repeating the `-v` flag as many times as necessary just after
`tabularasa` and before the name of the subcommand being invoked. If you want them, the CLI can
self-install its own set of bash-compatible completions by running
`tabularasa --install-bash-completions`.

Documentation for the main CLI or any subcommand can be accessed in the standard way with `--help`:

```bash
tabularasa --help                 # main CLI args and subcommand list
tabularasa <command-name> --help  # help for the command identified by <command-name> - its purpose and args
```

The CLI is by default configured by a config file (JSON or YAML) in the working directory called
`tabularasa.yaml`. This just supplies a few required pieces of information, namely the name of the
`package` that you're interacting with and the `schema_path` relative to the package root, so that you
don't have to pass them as options on the command line. Most other important information relevant to the
CLI operations is contained in the [schema file](#the-schema-file) itself, especially the `build_options`
section.

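For example, a minimal `tabularasa.yaml` might look like the following (the two keys are those described
above; the values are placeholders for your own project):

```yaml
package: mypackage.reference_data
schema_path: schema.yaml
```
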
To use the CLI in another project as a build tool, you will need to specify `thds.tabularasa[cli]` as
your dependency. The `cli` extra brings in some dependencies that are only needed by the CLI; they are
somewhat heavy and so are best left out of your environment if you don't explicitly need them.

Of course, if you need the CLI as a development dependency but only need the _library_ at run time, you
may specify just `thds.tabularasa` as your main dependency and `thds.tabularasa[cli]` as your dev
dependency.

Some useful subcommands of the CLI are documented below.

#### Generating package data

If you're adding new tables or updating the data in a set of tables, especially when using a custom
preprocessor, you will likely want to repeatedly regenerate the package data parquet files for those
tables in order to confirm that the build is working as intended.

To do so, run

```bash
tabularasa datagen <table-name-1> <table-name-2> ...
```

All of the tables you specify _and_ all of their dependents downstream in the computational DAG will thus
be re-computed. This saves you from the work of keeping track of the downstream dependents, a tedious and
error-prone task. It ensures that all your package data and associated hashes are up to date, which in
turn ensures that your peers will have up-to-date data when they get a cache miss after pulling your code
changes.

Any derived table upstream of those you request to build with `datagen` will be auto-synced from the blob
store prior to the build running, if available, saving you the wait time of re-building them needlessly
in case they're not already in your working tree.

If you'd like to better understand what you changed after any `tabularasa datagen` invocation before you
commit the result, you can run `tabularasa data-diff`. By default, this diffs the data as versioned in
the working tree against the data as versioned in the HEAD commit. If you've already committed, you can
pass a ref to the previous commit, e.g. `tabularasa data-diff HEAD~`. This will show summary stats
describing the changes, such as the number of rows added, removed, and modified for each updated table.
With the `--verbose` flag added, you can see more detail, for instance the row counts for each row-level
pattern of updates (e.g. in 10 rows, columns 'A' and 'B' were updated; in 5 rows, column 'C' was nulled;
in 3 rows, column 'A' was filled; etc.).

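The invocations described above, side by side (the flag placement shown here is assumed to follow the
usual pattern):

```bash
tabularasa data-diff                  # working tree vs. HEAD
tabularasa data-diff HEAD~            # vs. the commit before HEAD
tabularasa data-diff HEAD~ --verbose  # include row-level update patterns
```
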
If you wish to regenerate _all_ package data tables from scratch, you can run

```bash
tabularasa datagen
```

This will remove _all_ pre-existing package data files and re-generate them. This is an extreme measure
and should be used sparingly; in most cases, you will want to regenerate only those specific tables whose
source data or derivation logic you know has changed.

Note that if you have just cloned the repo or pulled a branch and wish to get your local package data
up-to-date with the state on that branch, you don't need to re-derive all the data! Just
[sync with the blob store](#syncing-with-the-blob-store) instead.

#### Inspecting auto-generated code

If you'd like to review the code changes that would result from any change to the schema or compilation
modules without over-writing the existing generated source (as a [build](#building) could do), there is a
simple CLI command for inspecting it.

To inspect e.g. the auto-generated pandas code for the current repo state, run

```bash
tabularasa compile pandas
```

The code will print to stdout. Simply replace `pandas` with `attrs`, `sqlite`, `attrs-sqlite`, or
`pyarrow` to see the code generated for those use cases.

#### Checking integrity of local built reference data

The build pipeline uses md5 hashes to prevent expensive re-builds in local runs. When the
[build](#building) finishes, you will have several parquet files and possibly a sqlite database archive
present in your file tree. Each of the parquet files should have an associated md5 checksum in
`schema.yaml`, indicating the version of the data that should result from the build.

To check the status of your local built data files with respect to the `schema.yaml` hashes, you can run

```bash
tabularasa check-hashes
```

**Important**: The following shouldn't be required in normal usage: use with care and only if you know
what you're doing!

To sync the hashes in `schema.yaml` with those of your generated data you can run

```bash
tabularasa update-hashes
```

By default this will also update your generated data accessor source code, which has the hashes embedded
in order to enable run-time integrity checks on fetch from the blob store, if you're using one. In
general, you _should not need to do this manually_, however, since `tabularasa datagen` will update the
hashes for you as part of its normal operation.

#### Syncing with the Blob Store

**Important**: The `push`, `pull`, and `sync-blob-store` commands work **only with final parquet
tables**, not with input source data. Input data (specified in `local_data` or `remote_data`) is only
accessed during `datagen` execution.

Under the section `remote_blob_store` in [the schema file](#the-schema-file), you may optionally specify
a remote cloud storage location where built package data artifacts are stored. In case
`build_options.package_data_file_size_limit` is set, the package in question will not come with any
package data files exceeding that limit in size. These _will_ be available in the remote blob store, and
in case they are not present when one of the [data loaders](#the-data-interfaces) is invoked, will be
downloaded into the package.

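A hedged fragment showing where that option lives; only the option name comes from the text above, and
the numeric value and its interpretation as a byte count are assumptions to check against your own
schema.

```yaml
build_options:
  package_data_file_size_limit: 50000000  # larger artifacts are shipped via the blob store only
```
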
Should your use case require the data to be locally available at run time, e.g. if you lack connectivity,
then you may fetch all the package data tables that were omitted in the [build](#building) by running

```bash
tabularasa sync-blob-store --down
```

or just

```bash
tabularasa pull
```

If you're using a remote blob store for large files, you will want to include the invocation

```bash
tabularasa sync-blob-store --up
```

or just

```bash
tabularasa push
```

somewhere in your CI build scripts after the [build](#building) completes and before you publish your
package, to ensure that those files are available at run time to end users when needed.

#### Initializing the SQLite Database

To initialize the SQLite database (see [interfaces](#the-data-interfaces)), should one be needed but not
shipped as package data (as specified in the `build_options` section of
[the schema file](#the-schema-file)), you may run

```bash
tabularasa init-sqlite
```

This will create the SQLite database archive in your installed package directory. For an added level of
safety you may pass `--validate` (to validate the inserted data against the constraints defined in
[the schema file](#the-schema-file), expressed as [pandera schemas](#the-data-interfaces)), but these
will usually be statically verified once at build time and guaranteed correct before shipping.

#### Visualizing the Data Dependency DAG

The `dag` command creates a graph visualization of your project's dependency DAG and subsets thereof. The
visualization is opened in a browser (it's SVG by default), but if you pass `--format png`, for example,
it will open in an image viewer.

To visualize your data dependency DAG, from your project root run

```bash
tabularasa dag                   # generate full DAG
tabularasa dag [table-name(s)]   # generate DAG for specific tables
```

> [!NOTE]
> This requires the `graphviz` source and binaries to be available on your system (`graphviz` is a C
> library that doesn't come packaged with the python wrapper `pygraphviz`). The easiest way to ensure
> this if you have a global anaconda env is to run `conda install graphviz`. However you proceed, you can
> verify that `graphviz` is available by running `which dot` and verifying that a path to an executable
> for the `dot` CLI is found (`dot` is one layout algorithm that comes with graphviz, and the one used in
> this feature). Once you have that, you may `pip install pygraphviz` into your working dev environment.
> Refer to the [pygraphviz docs](https://pygraphviz.github.io/documentation/stable/install.html) if you
> get stuck.

## Generating documentation

To generate the documentation for your project, run:

```bash
tabularasa docgen
```

from your project root.

This generates docs in ReStructuredText (rst) format in a directory structure specified in the
`table_docs_path`, `type_docs_path`, and `source_docs_path` fields of the
[schema file](#the-schema-file)'s `build_options` section. As such, these docs are valid as input to the
`sphinx` documentation build tool.

## Memory usage

Your reference data may be fairly large, and in multiprocessing contexts it can be useful to share the
read-only data in memory between processes for the sake of performance.

`tabularasa` builds this in via mem-mapped SQLite for the most part, but the default Python installation
of SQLite [limits](https://www.sqlite.org/mmap.html) the amount of memory-mapped data to 2 GB per
database file.

A project called `pysqlite3` packages the same module code as the standard library's `sqlite3`, alongside
the ability to provide a different shared library for SQLite, and their built binary package
[increases](https://github.com/coleifer/pysqlite3/blob/master/setup.py?ts=4#L107) the memory cap to 1 TB.
Currently, the precompiled package is only available for Linux.

The good news: if you want more reference data to be shared between processes, all you need to do is
successfully install a version of `pysqlite3` into your Python environment. If you're on Linux, you can
likely accomplish this with a simple `pip install pysqlite3-binary`. On a Mac, you'll need to follow
their [instructions](https://github.com/coleifer/pysqlite3#building-with-system-sqlite) for linking
against a system-installed SQLite, or build against a statically-linked library and then install from
source.

If `pysqlite3` is installed in your Python environment, it will be used within `tabularasa` by default.
To disable this behavior, set the `REF_D_DISABLE_PYSQLITE3` environment variable to a non-empty string
value.

By default, with `pysqlite3` installed, 8 GB of RAM will be memory-mapped per database file. With the
standard `sqlite3` module, the limit will be hard-capped at 2 GB. If you want to change this default, you
can set the `REF_D_DEFAULT_MMAP_BYTES` environment variable to an integer number of bytes.
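
For example (both variable names come from the docs above; the values are illustrative):

```bash
export REF_D_DEFAULT_MMAP_BYTES=4294967296   # 4 GiB memory-mapped per database file
export REF_D_DISABLE_PYSQLITE3=1             # or: force the standard-library sqlite3 module
```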