thds.tabularasa 0.13.0 (py3-none-any.whl)
This diff represents the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
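The only file expanded below is thds/tabularasa/__main__.py (+1122 lines), which defines the tabularasa command-line interface. As a minimal sketch of how that module is reached at runtime (assuming the package is installed together with the optional CLI dependencies that provide bourbaki; without them the module sets cli = None and main() raises), running it as a module looks like this; the console script listed in entry_points.txt most likely calls the same main():

# Sketch only: invoking the CLI module defined in the diff below.
from thds.tabularasa.__main__ import main

if __name__ == "__main__":
    main()  # dispatches to the bourbaki CLI; equivalent to `python -m thds.tabularasa ...`

The full diff of the module follows.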
@@ -0,0 +1,1122 @@
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from copy import copy
from enum import Enum
from functools import partial
from itertools import repeat
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, NamedTuple, Optional, Set, Tuple, Type, Union, cast

import networkx as nx
import pkg_resources

from thds.core import parallel
from thds.tabularasa.data_dependencies.adls import (
    ADLSFileIntegrityError,
    ADLSFileSystem,
    adls_filesystem,
    sync_adls_data,
)
from thds.tabularasa.data_dependencies.build import ReferenceDataBuildCommand, populate_sqlite_db
from thds.tabularasa.diff import data as data_diff
from thds.tabularasa.diff import schema as schema_diff
from thds.tabularasa.diff import summary as diff_summary
from thds.tabularasa.loaders import parquet_util
from thds.tabularasa.loaders.util import (
    PandasParquetLoader,
    default_parquet_package_data_path,
    hash_file,
)
from thds.tabularasa.schema import load_schema, metaschema
from thds.tabularasa.schema.compilation import (
    render_attrs_module,
    render_attrs_sqlite_schema,
    render_pandera_module,
    render_pyarrow_schema,
    render_sphinx_docs,
    render_sql_schema,
)
from thds.tabularasa.schema.util import all_predecessors, all_successors

try:
    from bourbaki.application.cli import CommandLineInterface, cli_spec
except ImportError:

    # stand-in decorators
    def noop_decorator(obj):
        return obj

    def noop_decorator_factory(obj):
        return noop_decorator

    config_top_level = define_cli = noop_decorator
    output_handler = noop_decorator_factory
    noncommand = noop_decorator
    cli = None
else:
    # increase default log verbosity
    # this ensures all log messages at INFO level or greater are rendered,
    # and that tracebacks are always shown
    import bourbaki.application.cli.main as _bourbaki

    _bourbaki.MIN_VERBOSITY = _bourbaki.TRACEBACK_VERBOSITY = _bourbaki.LOG_LEVEL_NAMES.index("INFO")

    cli = CommandLineInterface(
        prog="tabularasa",
        require_options=False,
        require_subcommand=True,
        implicit_flags=True,
        use_verbose_flag=True,
        require_config=False,
        add_init_config_command=True,
        use_config_file="tabularasa.yaml",
        package="thds.tabularasa",
    )
    # decorators
    define_cli = cli.definition
    output_handler = cli_spec.output_handler
    config_top_level = cli_spec.config_top_level
    noncommand = cli_spec.noncommand

try:
    from ruamel.yaml import YAML
except ImportError:

    import yaml

    load_yaml = yaml.safe_load
    dump_yaml = yaml.safe_dump
else:

    def _yaml():
        yaml = YAML()
        yaml.preserve_quotes = True  # type: ignore[assignment]
        yaml.width = 100  # type: ignore[assignment]
        return yaml

    def load_yaml(stream):
        return _yaml().load(stream)

    def dump_yaml(data, stream):  # type: ignore
        _yaml().dump(data, stream)


DEFAULT_GRAPHVIZ_FORMAT = "svg"
RED, GREEN, YELLOW, BLUE = "#FFAB99", "#99FFDE", "#EDFF99", "#b3f0ff"
DAG_NODE_COLORS: Dict[Type, str] = {
    metaschema.ADLSRef: RED,
    metaschema.LocalRef: YELLOW,
    metaschema.TabularTextFileRef: YELLOW,
    metaschema.TransientReferenceDataRef: BLUE,
    metaschema.ReferenceDataRef: GREEN,
}

class CompilationTarget(Enum):
    pandas = "pandas"
    sqlite = "sqlite"
    pyarrow = "pyarrow"
    attrs = "attrs"
    attrs_sqlite = "attrs_sqlite"


class DataFileHashes(NamedTuple):
    actual: Optional[str]
    expected: Optional[str]


class TableSyncData(NamedTuple):
    local_path: Path
    blob_store: metaschema.RemoteBlobStoreSpec
    md5: str

    @property
    def remote_path(self) -> str:
        return self.remote_data_spec.paths[0].name

    @property
    def remote_data_spec(self) -> metaschema.ADLSDataSpec:
        data_spec = self.blob_store.data_spec(self.md5)
        return data_spec

    @property
    def local_file_exists(self) -> bool:
        return self.local_path.exists()

    @property
    def remote_file_system(self) -> ADLSFileSystem:
        return adls_filesystem(self.blob_store.adls_account, self.blob_store.adls_filesystem)

    def local_file_md5(self) -> Optional[str]:
        return hash_file(self.local_path) if self.local_file_exists else None

    def remote_file_exists(self) -> bool:
        return self.remote_file_system.file_exists(self.remote_path)


def print_source(source, *, output: Optional[Path] = None):
    if output is None:
        outfile = sys.stdout
    else:
        outfile = open(output, "w")

    print(source, file=outfile)

    if output is not None:
        outfile.close()


def print_file_hashes_status(hashes: Dict[str, DataFileHashes]):
    ready_for_packaging = True
    for name, hs in sorted(hashes.items(), key=lambda kv: kv[0]):
        if hs.actual != hs.expected:
            if hs.actual:
                if hs.expected:
                    print(f"{name}: actual md5 {hs.actual} != expected md5 {hs.expected}")
                    ready_for_packaging = False
                else:
                    print(f"{name}: actual md5 {hs.actual}; NO md5 IN SCHEMA")
            else:
                print(f"{name}: NO FILE")
                ready_for_packaging = False
        else:
            print(f"{name}: ✔")

    if not ready_for_packaging:
        raise Exception("package data files or schema are not ready for packaging")


def print_list(it: Iterable):
    for i in it:
        print(i)

def print_schema_diff_summary(
    diff: schema_diff.SchemaDiff,
    *,
    exit_code: bool = False,
    heading_level: int = 0,
    tablefmt: str = diff_summary.DEFAULT_TABLEFMT,
):
    """Print the schema diff summary to stdout, optionally exiting with code 1 if there are positive diffs

    :param diff: the schema diff to summarize
    :param exit_code: if passed, exit with code 1 if there is a positive diff (similar to `git diff --exit-code`)
    :param heading_level: increase this to render smaller headings on the markdown sections
    :param tablefmt: the table format to use for the markdown tables, as understood by `tabulate`
    """
    positive_diff = False
    for section in diff_summary.markdown_schema_diff_summary(
        diff,
        heading_level=heading_level,
        tablefmt=tablefmt,
    ):
        print(section, end="\n\n")
        positive_diff = True
    if positive_diff and exit_code:
        exit(1)


def print_data_diff_summaries(
    data_diffs: Iterator[Tuple[metaschema.Identifier, data_diff.DataFrameDiff]],
    *,
    exit_code: bool = False,
    verbose: bool = False,
    value_detail: bool = False,
    value_detail_min_count: int = 0,
    heading_level: int = 0,
    tablefmt: str = diff_summary.DEFAULT_TABLEFMT,
    floatfmt: str = diff_summary.DEFAULT_FLOATFMT,
):
    """Print summaries of data diffs for a sequence of updated tables

    :param data_diffs: an iterator of tuples of table names and their corresponding data diffs
    :param exit_code: if True, exit with code 1 if there is a positive diff (similar to `git diff --exit-code`)
    :param verbose: if True, show detailed row change status counts; otherwise show only single-column
        change counts
    :param value_detail: if True, show detailed value change counts; otherwise show only statistics of the
        types of changes as determined by the `verbose` flag
    :param value_detail_min_count: minimum number of instances of a specific value update to show value-level
        detail for. No effect when `value_detail` is False
    :param heading_level: increase this to render smaller headings on the markdown sections
    :param tablefmt: the table format to use for the markdown tables, as understood by `tabulate`
    :param floatfmt: the float format to use for the markdown tables, as understood by `tabulate`
    """
    positive_diff = False
    for table_name, d_diff in data_diffs:
        for section in diff_summary.markdown_dataframe_diff_summary(
            d_diff,
            table_name,
            verbose,
            value_detail=value_detail,
            value_detail_min_count=value_detail_min_count,
            heading_level=heading_level,
            tablefmt=tablefmt,
            floatfmt=floatfmt,
        ):
            positive_diff = True
            print(section, end="\n\n")

    if positive_diff and exit_code:
        exit(1)

def to_graphviz(
    dag: nx.DiGraph,
    vertical: bool = False,
    ranksep: float = 1.0,
    nodesep: float = 1.0,
    fontsize: int = 12,
    fontname: str = "Courier",
    compact: bool = False,
):
    try:
        from pygraphviz import AGraph
    except ImportError:
        raise RuntimeError("dag visualization requires `pygraphviz`")

    title = "Reference Data dependency DAG"
    g = AGraph(
        directed=True,
        name=title,
        rankdir="TB" if vertical else "LR",
        fontsize=fontsize * 3,
        fontname=fontname,
        label=title,
        labelloc="t",
        ranksep=str(ranksep) + " equally",
        nodesep=str(nodesep),
    )
    g.node_attr["shape"] = "box"
    g.node_attr["fontname"] = fontname
    g.node_attr["fontsize"] = fontsize

    for node, attrs in dag.nodes(data=True):
        color = DAG_NODE_COLORS.get(type(node))
        name = repr(node)
        g.add_node(
            name,
            label=name,
            fillcolor=color,
            style="bold" if attrs.get("initial") else "filled",
        )

    g.add_edges_from((repr(head), repr(tail)) for head, tail in dag.edges)

    if compact:
        # add invisible edges between components to put them on separate levels
        def terminal_nodes(nodes: Iterable[metaschema.CustomStr], initial: bool) -> Iterable[str]:
            lookup = dag.pred if initial else dag.succ
            return (repr(node) for node in nodes if not len(lookup[node]))

        def balanced_layers(dag: nx.DiGraph) -> List[Set[metaschema.CustomStr]]:
            components = cast(
                List[Set[metaschema.CustomStr]],
                sorted(nx.connected_components(dag.to_undirected()), key=len),
            )
            target_size = len(components[-1])
            layers = [components[0]]
            for nodes in components[1:]:
                component_to_merge = layers[-1]
                if len(component_to_merge) + len(nodes) <= target_size:
                    component_to_merge.update(nodes)
                else:
                    layers.append(nodes)
            return layers

        layers = balanced_layers(dag)
        for i, (layer1, layer2) in enumerate(zip(layers, layers[1:]), 1):
            sep_node = f"Layer({i})"
            g.add_node(sep_node, style="invis")
            g.add_edges_from(zip(terminal_nodes(layer1, False), repeat(sep_node)), style="invis")
            g.add_edges_from(zip(repeat(sep_node), terminal_nodes(layer2, True)), style="invis")

    g.layout(prog="dot")
    return g


def write_dependency_dag(
    dag: nx.DiGraph,
    *,
    output: Optional[Path] = None,
    format: Optional[str] = None,
    vertical: bool = False,
    fontsize: int = 12,
    compact: bool = False,
):
    """Save a visualization of the dependency DAG using pygraphviz

    :param dag: networkx graph representing the DAG
    :param output: the file to write the visualization to. If not passed, a temp file will be created.
    :param format: the format to save the image as (e.g. svg, png); if not passed, it will be inferred
        from the output path name. When that is not passed, svg will be used.
    :param vertical: orient the DAG visualization from top to bottom? (default is left to right)
    :param fontsize: font size of text (e.g. table and resource names) in the visualization
    :param compact: if True, put separate connected components of the DAG on separate levels (vertical
        or horizontal depending on the orientation). For wide DAGs with many components this can result in
        a much more compact representation.
    """
    graphviz = to_graphviz(dag, vertical=vertical, fontsize=fontsize, compact=compact)
    if output is None:
        format = format.lower() if format else DEFAULT_GRAPHVIZ_FORMAT
        output = Path(tempfile.mkstemp(suffix=f".{format}")[1])
        print(f"Created temporary file at {output}; will save in {format} format")

    print(f"Saving DAG visualization to {output}", file=sys.stderr)
    graphviz.draw(str(output), format=format, prog="dot")
    try:
        subprocess.run(["open", str(output)])
    except Exception as e:
        print(
            f"Couldn't run `open {output}` ({e}); open the file manually",
            file=sys.stderr,
        )

@define_cli
@config_top_level
class ReferenceDataManager:
    def __init__(
        self,
        *,
        package: str,
        schema_path: str,
        repo_root: Optional[Path] = None,
        require_editable_install: bool = False,
    ):
        """Utilities for managing, installing, validating, and inspecting reference data

        :param package: name of the package where the data is to be defined and stored and where the
            schema should be read from
        :param schema_path: path to the schema relative to the package root; should be a YAML file
            compatible with `reference_data.schema.metaschema.Schema`
        :param repo_root: path to the root of the local repository. If not supplied, it will be set to
            the current working directory
        :param require_editable_install: Fail if the package is not installed in editable mode? This is
            generally what you want when developing/publishing. However there are use cases where one may
            wish to edit a hard install of the package in place, e.g. when syncing data files to the
            installed package data directory, in which case this may remain as the default `False` value.
        """
        self.build_command = ReferenceDataBuildCommand.with_options(
            package_name=package,
            schema_path=schema_path,
            for_setup_py_build=False,
        )()
        self.logger = logging.getLogger(__name__)
        if repo_root is None:
            self.repo_root = Path.cwd().resolve()
        else:
            self.repo_root = repo_root.resolve()

        if require_editable_install:
            self.check_editable_install()

    def check_editable_install(self):
        """Ensure that the package being built is installed in an editable mode; otherwise the operations
        defined in this interface may not have the intended effects."""
        local_data_dir = Path(pkg_resources.resource_filename(self.package, ""))
        if not str(local_data_dir).startswith(str(self.repo_root)):
            msg = (
                f"Package {self.package} appears not to be installed in editable mode; this could result"
                " for example in incorrect file hashes or a corrupted package installation"
            )
            self.logger.exception(msg)
            raise RuntimeError(msg)
        else:
            self.logger.info(f"Check passed - package {self.package} is installed in editable mode")

    def load_raw_schema(self):
        """Round-trippable load of the schema YAML file, for development operations where the file needs
        to be edited while preserving style and comments"""
        self.logger.info("Loading round-trippable raw schema")
        with pkg_resources.resource_stream(self.package, self.schema_path) as f:
            return load_yaml(f)

    @property
    def schema(self) -> metaschema.Schema:
        return self.build_command.schema

    @property
    def build_options(self) -> metaschema.BuildOptions:
        options = self.schema.build_options
        assert options is not None
        return options

    @property
    def package(self) -> str:
        return self.build_command.package_name

    @property
    def schema_path(self) -> str:
        return self.build_command.schema_path

    @property
    def package_data_dir(self) -> Optional[str]:
        return self.build_options.package_data_dir

    @property
    def transient_data_dir(self) -> Optional[str]:
        return self.build_options.transient_data_dir

    @property
    def sqlite_db_path(self) -> Optional[str]:
        return self.build_options.sqlite_db_path

    @property
    def repo_url(self):
        return self.build_options.repo_url

    @property
    def table_docs_dir(self):
        return self.build_options.table_docs_dir

    @property
    def type_docs_path(self):
        return self.build_options.type_docs_path

    @property
    def source_docs_path(self):
        return self.build_options.source_docs_path

    @property
    def curation_badge_path(self):
        return self.build_options.curation_badge_path

    def data_path_for(self, table: Union[str, metaschema.Table]) -> Path:
        table_ = self.schema.tables[table] if isinstance(table, str) else table
        data_dir = self.transient_data_dir if table_.transient else self.package_data_dir
        assert data_dir is not None
        return Path(
            pkg_resources.resource_filename(
                self.package,
                default_parquet_package_data_path(table_.name, data_dir),
            )
        )

    @output_handler(print_list)
    def dependent_tables(self, tables: Optional[Set[str]] = None) -> Set[str]:
        """Compute the set of tables downstream from a set of tables in the computational DAG,
        including the original tables"""
        tables = tables or set()
        unknown_tables = {t for t in tables if t not in self.schema.tables}
        if unknown_tables:
            raise KeyError(f"Unknown tables: {','.join(unknown_tables)}")
        dag = self.schema.dependency_dag()
        downstream = all_successors(dag, [self.schema.tables[t].graph_ref for t in tables])
        return {str(t) for t in downstream if isinstance(t, metaschema.ReferenceDataRef)}

    @output_handler(write_dependency_dag)
    def dag(
        self,
        tables: Optional[Set[str]] = None,
        *,
        upstream: bool = True,
        downstream: bool = True,
        build: bool = False,
    ):
        """Compute the dependency DAG for a set of tables and their dependencies and/or dependents
        (or the whole DAG if tables are not passed)

        :param tables: tables to treat as root nodes in the DAG; if passed, only these tables and their
            dependencies/dependents will be in the DAG, otherwise the entire DAG will be returned
        :param upstream: Should the DAG include upstream dependencies? (by default it does)
        :param downstream: Should the DAG include downstream dependencies? (by default it does)
        :param build: Should the DAG include all dependencies that would be included in a build of the
            specified tables? Overrides upstream and downstream specification. False by default.
        :return: networkx.DiGraph representing the computational DAG of data derivations
        """
        if tables:
            if not upstream and not downstream and not build:
                raise ValueError("one of `upstream`, `downstream`, `build` must be True")

            full_dag = self.schema.dependency_dag()
            table_refs = {self.schema.tables[t].graph_ref for t in tables}
            if build:
                tables_ = self.dependent_tables(tables)
                dag = self.schema.dependency_dag(lambda table: table.name in tables_)
            else:
                downstream_refs = all_successors(full_dag, table_refs) if downstream else set()
                upstream_refs = all_predecessors(full_dag, table_refs) if upstream else set()
                refs = downstream_refs.union(upstream_refs)
                dag = nx.DiGraph(nx.induced_subgraph(full_dag, refs))

            for table in table_refs:
                dag.add_node(table, initial=True)
            return dag
        else:
            return self.schema.dependency_dag()

    @output_handler(print_source)
    def compile(self, target: CompilationTarget) -> str:
        """Compile a schema YAML file to a specific target language/library

        :param target: The target language/library to compile the YAML schema to
        """
        if target == CompilationTarget.sqlite:

            def sql_renderer(schema):
                return "\n".join(render_sql_schema(schema))

            renderer = sql_renderer
        elif target == CompilationTarget.pandas:
            renderer = partial(
                render_pandera_module,
                package=self.package,
            )
        elif target == CompilationTarget.pyarrow:
            renderer = render_pyarrow_schema
        elif target == CompilationTarget.attrs:
            renderer = partial(
                render_attrs_module,
                package=self.package,
            )
        elif target == CompilationTarget.attrs_sqlite:
            assert (
                self.sqlite_db_path is not None
            ), "Must specify sqlite db path in build options to generate sqlite interface"
            renderer = partial(
                render_attrs_sqlite_schema,
                package=self.package,
                db_path=self.sqlite_db_path,
            )
        else:
            raise NotImplementedError(f"Compilation hasn't been implemented for target {target.value}")

        source = renderer(self.schema)
        return source

    @output_handler(print_file_hashes_status)
    def check_hashes(self) -> Dict[str, DataFileHashes]:
        """Check actual hashes of on-disk built data files against those documented in the schema"""
        assert (
            self.package_data_dir is not None and self.transient_data_dir is not None
        ), "Can't check hashes without package data dirs"
        hashes = {}
        for table in self.schema.build_time_package_tables:
            name = table.name
            loader = PandasParquetLoader.from_schema_table(
                table,
                package=self.package,
                data_dir=self.transient_data_dir if table.transient else self.package_data_dir,
            )
            if Path(pkg_resources.resource_filename(self.package, loader.data_path)).exists():
                hashes[name] = DataFileHashes(actual=loader.file_hash(), expected=table.md5)
            else:
                hashes[name] = DataFileHashes(actual=None, expected=table.md5)

        return hashes

    def init_sqlite(self, *, validate: bool = False, check_hash: bool = True):
        """Populate a sqlite database with the package's tabular data

        :param validate: Validate data using pandera schemas before inserting?
        :param check_hash: Check hashes in db metadata table and skip inserting tables that are
            up-to-date with current package data files?
        """
        assert (
            self.package_data_dir is not None
            and self.transient_data_dir is not None
            and self.sqlite_db_path is not None
        ), "Can't init sqlite db without package data dirs and sqlite db path"
        populate_sqlite_db(
            self.schema,
            db_package=self.package,
            db_path=self.sqlite_db_path,
            data_package=self.package,
            data_dir=self.package_data_dir,
            transient_data_dir=self.transient_data_dir,
            validate=validate,
            check_hash=check_hash,
        )

    def codegen(self):
        """Generate all derived accessor code and save to specified files"""
        self.build_command.write_derived_source_code()

    def docgen(self):
        if self.table_docs_dir is None:
            raise ValueError("Can't write table docs without table_docs_dir")
        elif self.type_docs_path is None:
            raise ValueError("Can't write type doc without type_docs_path")
        elif self.source_docs_path is None:
            self.logger.warning("Can't write source doc without source_docs_path")

        table_output_dir = Path(self.table_docs_dir)

        if table_output_dir.is_dir():
            self.logger.info(f"Clearing existing table docs directory at {table_output_dir}")
            shutil.rmtree(table_output_dir)

        self.logger.info(f"Creating table docs directory at {table_output_dir}")
        table_output_dir.mkdir(parents=True)

        self.logger.info("Rendering markdown for package tables")
        types_doc, source_doc, table_docs = render_sphinx_docs(
            self.schema, self.repo_root, self.repo_url
        )
        for table_name, markdown in table_docs.items():
            path = table_output_dir / f"{table_name}.rst"
            self.logger.info(f"Writing markdown docs for table {table_name} to {path}")
            with open(path, "w") as f:
                f.write(markdown)

        type_docs_path = Path(self.type_docs_path)
        self.logger.info(f"Writing markdown for package types to {type_docs_path}")
        with open(type_docs_path, "w") as f:
            f.write(types_doc)

        if self.source_docs_path:
            source_docs_path = Path(self.source_docs_path)
            self.logger.info(f"Writing markdown for package source data to {source_docs_path}")
            with open(source_docs_path, "w") as f:
                f.write(source_doc)

    def datagen(
        self, tables: Optional[Set[str]] = None, *, update_hashes: bool = True, no_sync: bool = False
    ):
        """Re-generate package data, optionally skipping files with hashes matching those in the schema

        :param tables: names of the specific tables to build. If not passed, all tables will be built
        :param update_hashes: Should hashes be updated for all tables regenerated at the end of the
            build? This is done by default but can be disabled if you are just experimenting.
        :param no_sync: when passed, don't pull the latest data from the remote blob store before building.
            Useful only if you really know what you're doing and are in an intermediate state with
            "uncommitted" data files whose md5s don't match what's in the schema - e.g. as a result of
            running `datagen` with `update_hashes=False`.
        """
        data_dir = self.package_data_dir
        transient_data_dir = self.transient_data_dir
        if data_dir is None or transient_data_dir is None:
            raise ValueError("Can't build data files without specification of data dirs in the schema")

        if tables:
            self.logger.info(
                f"Computing all tables downstream of {tables} in the dependency DAG and removing built "
                f"files to force re-computation"
            )
            # force re-computation of the specified tables *and* all their downstream dependents
            tables_to_recompute = self.dependent_tables(tables)
        else:
            # build all tables
            tables_to_recompute = set(t.name for t in self.schema.computable_tables)

        # update hashes for all upstream tables in the DAG as well, since any of them may be recomputed
        # in this build on a hash mismatch
        tables_to_update_hashes = {
            str(t)
            for t in self.schema.dependency_dag(lambda table: table.name in tables_to_recompute)
            if isinstance(t, metaschema.ReferenceDataRef)
            and not ((table := self.schema.tables[str(t)]).transient and table.md5 is None)
            # don't update hashes for transient tables with explicitly no hash
        }
        run_hash_update = bool(tables_to_update_hashes) and update_hashes

        if not no_sync:
            # ensure local blobs are up-to-date before building, but don't fail if a remote blob is absent;
            # we'll just regenerate it if it's needed for computing the current DAG
            self.sync_blob_store(down=True, no_fail_if_absent=True)

        for table_name in tables_to_recompute:
            table = self.schema.tables[table_name]
            file_path = self.data_path_for(table)
            if file_path.exists():
                self.logger.warning(f"Removing built file for table {table.name} at {file_path}")
                os.remove(file_path)
            else:
                self.logger.info(f"No file found for table {table.name}; nothing to remove")
        try:
            self.build_command.build_package_data(tables=tables_to_recompute or None)
        except Exception as e:
            raise e
        finally:
            if run_hash_update:
                self.update_hashes(tables_to_update_hashes, codegen=True)

    def update_hashes(self, tables: Optional[Set[str]] = None, *, codegen: bool = True):
        """Update package data hashes in schema YAML to match the actual hashes of package data files as
        currently present in the file tree (or as recomputed when specified)

        :param tables: if passed, only update hashes for these tables' package data; otherwise update for
            all tables
        :param codegen: indicates whether to run the `codegen` command after updating the hashes to
            ensure hashes embedded in source code are up-to-date. By default, this runs when any hashes are
            updated in the config file.
        """
        assert (
            self.package_data_dir is not None and self.transient_data_dir is not None
        ), "Can't update hashes without package data dirs"
        hashes_updated = []
        tables_to_update = (
            [self.schema.tables[t] for t in tables] if tables else self.schema.build_time_package_tables
        )
        raw_schema = self.load_raw_schema()
        self.logger.info("Updating data hashes")
        for table in tables_to_update:
            table_name = table.name
            table_path = self.data_path_for(table)
            if os.path.exists(table_path):
                md5 = hash_file(table_path)
                old_md5 = table.md5
                if old_md5 is None:
                    self.logger.warning(
                        f"no md5 hash previously defined for table {table_name}; updating to {md5!r}"
                    )
                elif md5 != old_md5:
                    self.logger.warning(
                        f"md5 hashes did not match for table {table_name}; updating to {md5!r}"
                    )
                else:
                    continue

                table.md5 = md5
                raw_schema["tables"][table_name]["md5"] = md5
                hashes_updated.append(table_name)
            else:
                self.logger.warning(
                    f"package data file doesn't exist for table {table_name!r}; can't update md5 hash"
                )

        schema_path = pkg_resources.resource_filename(self.package, self.schema_path)
        if hashes_updated:
            self.logger.warning(
                f"updated hashes for tables {hashes_updated!r}; writing new schema to {schema_path}"
            )
            with open(schema_path, "w") as f:
                dump_yaml(raw_schema, f)

            if codegen:
                self.logger.info("regenerating source code to update embedded hashes")
                self.codegen()

    @noncommand
    def table_sync_data(self, table: metaschema.Table) -> TableSyncData:
        blob_store = self.schema.remote_blob_store
        assert blob_store is not None, "No blob store defined in schema"
        assert table.md5 is not None, f"No md5 defined for table {table.name}"
        assert self.package_data_dir is not None, "No package data dir to sync"
        local_build_path = Path(
            pkg_resources.resource_filename(
                self.package,
                default_parquet_package_data_path(table.name, self.package_data_dir),
            )
        )
        return TableSyncData(local_build_path, blob_store, md5=table.md5)

    @noncommand
    def sync_up(self, sync_data: TableSyncData) -> bool:
        remote_path = sync_data.remote_path
        local_build_path = sync_data.local_path
        if sync_data.remote_file_exists():
            self.logger.info(f"Found existing file in remote blob store at {remote_path}; not syncing")
            return True
        else:
            self.logger.info(f"Syncing to path {remote_path} in remote blob store")
            try:
                sync_data.remote_file_system.put_file(local_build_path, remote_path)
            except Exception as e:
                self.logger.exception(
                    f"Failed to put file {local_build_path} at {remote_path} in blob store: {e}"
                )
                return False
            else:
                return True

    @noncommand
    def sync_down(self, sync_data: TableSyncData, link_build: bool) -> bool:
        self.logger.info(f"Fetching file from remote blob store at {sync_data.remote_path}")
        try:
            paths = sync_adls_data(sync_data.remote_data_spec)
        except ADLSFileIntegrityError as e:
            self.logger.exception(str(e))
            return False
        except Exception as e:
            self.logger.exception(f"Failed to fetch file {sync_data.remote_path} from blob store: {e}")
            return False
        else:
            assert len(paths) == 1
            if link_build:
                local_cache_path = paths[0].local_path
                if sync_data.local_file_exists:
                    self.logger.warning(f"Removing existing file {sync_data.local_path}")
                    os.remove(sync_data.local_path)
                self.logger.info(f"Linking downloaded file to local build file {sync_data.local_path}")
                sync_data.local_path.parent.mkdir(parents=True, exist_ok=True)
                os.link(local_cache_path, sync_data.local_path)
            return True

    def sync_blob_store(
        self,
        *,
        up: bool = False,
        down: bool = False,
        no_fail_if_absent: bool = False,
        tables: Optional[Set[str]] = None,
    ) -> List[str]:
        """Sync the local built files to the remote blob store, if one is defined.
        It is assumed that the hashes in the schema file are the source of truth rather than the hashes
        of the on-disk built files; if these should be taken as authoritative instead, run the
        `update_hashes` command first. At the end of this operation, all files in the local build folder
        and the remote blob store are guaranteed to match the hashes in the schema file, unless a file
        with the correct hash was unavailable.

        :param up: Upload local files to the blob store if they're available?
        :param down: Download remote blobs to the local build directory if they're available?
        :param no_fail_if_absent: when passed, don't fail an upload for lack of a local file being
            present with the expected hash for a version-controlled table. This is useful in development
            workflows where you just want to regenerate/sync a particular table that you've updated.
        :param tables: optional collection of table names to sync; all will be synced if not passed.
        :return: list of table names that were synced successfully
        :raises RuntimeError: if a local or remote file was not available for sync
        """
        assert self.package_data_dir is not None, "Can't sync blob store without package data dir"
        blob_store = self.schema.remote_blob_store
        if blob_store is None:
            self.logger.warning("No remote blob store defined; not syncing files")
            return []

        if not (down or up):
            raise ValueError("Must indicate syncing either down, up, or both from blob store")

        tables_to_sync = []
        for table in self.schema.build_time_package_tables:
            if table.md5 is None:
                self.logger.warning(
                    f"No md5 hash defined for package table {table.name}; no remote blob to sync to or from"
                )
            else:
                tables_to_sync.append(table)

        if tables is not None:
            known_tables = {t.name for t in tables_to_sync}
            if unknown_tables := tables.difference(known_tables):
                msg = f"Can't sync unknown or non-version-controlled tables: {', '.join(unknown_tables)}"
                self.logger.error(msg)
                raise KeyError(msg)
            tables_to_sync = [t for t in tables_to_sync if t.name in tables]

        self.logger.info(
            f"Syncing with remote blob store {blob_store.adls_account}/{blob_store.adls_filesystem}"
        )

        def inner(table: metaschema.Table) -> Optional[str]:
            sync_data = self.table_sync_data(table)
            local_file_md5 = sync_data.local_file_md5()
            if local_file_md5 == table.md5:
                # good local file; we can sync up
                self.logger.info(
                    f"Found local file for table {table.name} matching expected hash {table.md5}"
                )
                if up:
                    if self.sync_up(sync_data):
                        return table.name
                    else:
                        raise IOError(table.name)
                else:
                    # file is present locally with expected hash; no need to sync down
                    return table.name
            else:
                # check remote; download to get hash and link if good
                addendum = "" if local_file_md5 is None else f" matching expected hash {table.md5}"
                self.logger.info(f"No local file found for table {table.name}{addendum}")
                if up and no_fail_if_absent:
                    self.logger.info(
                        f"Skipping sync to remote blob store of local file for table {table.name}"
                    )
                    return None

                # only link the downloaded file into the build dir if we're syncing down; else just download
                # the file to check that it has the correct hash
                success = self.sync_down(sync_data, link_build=down)
                if success:
                    return table.name
                else:
                    if down and no_fail_if_absent:
                        return None
                    raise IOError(table.name)

        failed: List[str] = []
        synced: List[str] = []
        for table_name, res in parallel.yield_all([(t.name, partial(inner, t)) for t in tables_to_sync]):
            if isinstance(res, parallel.Error):
                failed.append(table_name)
            elif res is not None:
                synced.append(table_name)

        if failed:
            raise RuntimeError(f"Sync failed for tables {', '.join(failed)}")

        down_ = (
            f"to local build directory {pkg_resources.resource_filename(self.package, self.package_data_dir)}"
            if down
            else ""
        )
        up_ = (
            f"to remote blob store {blob_store.adls_account}/{blob_store.adls_filesystem}/{blob_store.path}"
            if up
            else ""
        )
        addendum = f"{down_} and {up_}" if down and up else down_ or up_
        tables_ = f" {', '.join(tables)}" if tables else ""
        self.logger.info(f"Success - build-time package data tables{tables_} synced {addendum}")
        return synced

    def pull(self, tables: Optional[Set[str]] = None, *, no_fail_if_absent: bool = False):
        """Download all remote blobs to the local data directory, with integrity checks.

        :param tables: optional collection of table names to sync; all will be synced if not passed.
        :param no_fail_if_absent: when passed, don't fail a download for lack of a remote blob being
            present in the blob store with the expected hash for a version-controlled table. This is useful
            in development workflows where you just want to regenerate/sync a particular table that you
            generated once and then removed, but didn't push yet (leaving a dangling hash reference).
        """
        self.sync_blob_store(down=True, no_fail_if_absent=no_fail_if_absent, tables=tables)

    def push(self, tables: Optional[Set[str]] = None, *, no_fail_if_absent: bool = False):
        """Upload all local data files to the remote blob store, with integrity checks.

        :param tables: optional collection of table names to sync; all will be synced if not passed.
        :param no_fail_if_absent: when passed, don't fail an upload for lack of a local file being
            present with the expected hash for a version-controlled table. This is useful in development
            workflows where you just want to regenerate/sync a particular table that you've updated.
        """
        self.sync_blob_store(up=True, no_fail_if_absent=no_fail_if_absent, tables=tables)

    @output_handler(print_schema_diff_summary)
    def schema_diff(
        self,
        base_ref: str = "HEAD",
        tables: Optional[Set[str]] = None,
        *,
        include_transient: bool = False,
        base_schema_path: Optional[str] = None,
    ):
        """Compute a diff between the current schema and a historical version of the schema.

        :param base_ref: the base git ref to compare against
        :param tables: a set of specific tables to inspect; if not passed the full schemas will be diffed
        :param include_transient: if passed, include transient tables in the analysis. These are usually
            implementation details of a derivation process and so are excluded by default (unless
            a transient table is specifically included in the `tables` argument)
        :param base_schema_path: path to the schema file to compare against; if not passed, the schema
            will be assumed to be present at the same location in the filesystem as the current schema.
            This enables loading of a historical schema even if the schema file or containing package have
            been moved or renamed.
        :return: a `SchemaDiff` object representing the differences between the two schemas
        """

        if base_schema_path is None:
            base_schema = load_schema(self.package, self.schema_path, git_ref=base_ref)
        else:
            base_schema = load_schema(None, base_schema_path, git_ref=base_ref)

        if tables is None and include_transient:
            this_schema = self.schema
        else:

            def table_pred(t: metaschema.Table) -> bool:
                return (include_transient or not t.transient) if tables is None else (t.name in tables)

            base_schema.tables = {name: t for name, t in base_schema.tables.items() if table_pred(t)}
            this_schema = copy(self.schema)
            this_schema.tables = {name: t for name, t in this_schema.tables.items() if table_pred(t)}

        return schema_diff.SchemaDiff(base_schema, this_schema)

    @output_handler(print_data_diff_summaries)
    def data_diff(
        self,
        base_ref: str = "HEAD",
        tables: Optional[Set[str]] = None,
        *,
        base_schema_path: Optional[str] = None,
        debug: bool = False,
    ) -> Iterator[Tuple[metaschema.Identifier, data_diff.DataFrameDiff]]:
        """Compute a diff between the current version-controlled data and the version-controlled data
        present at a historical point in time.

        :param base_ref: the base git ref to compare against
        :param tables: a set of specific tables to inspect; if not passed the full set of tables will be
            diffed
        :param base_schema_path: path to the schema file to compare against; if not passed, the schema
            will be assumed to be present at the same location in the filesystem as the current schema.
            This enables loading of a historical schema even if the schema file or containing package have
            been moved or renamed.
        :param debug: if True, pause execution at the first positive diff and drop into a debugger.
            The local `d_diff` object will be available in the debugger context.
        :return: an iterator of tuples of table names and their corresponding `DataFrameDiff`s. These
            may be consumed lazily, allowing for memory-efficient processing of large data diffs.
        """
        if tables:
            unknown = set(tables).difference(self.schema.tables.keys())
            if unknown:
                raise KeyError(f"Unknown tables: {', '.join(unknown)}")

        s_diff = self.schema_diff(base_ref, base_schema_path=base_schema_path)
        before_blob_store = s_diff.before.remote_blob_store
        after_blob_store = s_diff.after.remote_blob_store
        if before_blob_store is None or after_blob_store is None:
            raise ValueError("Can't diff data without remote blob stores defined in both schemas")
        for table_name, table_diff in sorted(s_diff.table_diffs.items(), key=lambda t: t[0]):
            if tables and table_name not in tables:
                continue
            if (not table_diff.before.md5) or (not table_diff.after.md5):
                if table_diff.after.build_time_installed and not table_diff.after.transient:
                    self.logger.warning(f"{table_name}: Can't diff without versioned data (md5 hashes)")
                continue
            if table_diff.before.md5 == table_diff.after.md5:
                self.logger.info(f"{table_name}: Matching md5 hashes; no data diff detected")
                continue

            if not (pkb := table_diff.before.primary_key) or not (pka := table_diff.after.primary_key):
                self.logger.warning(f"{table_name}: Can't diff without primary keys")
                continue
            if len(pka) != len(pkb):
                self.logger.warning(
                    f"{table_name}: Can't diff with different primary key lengths ({len(pkb)} vs {len(pka)})"
                )
                continue

            before_pk_cols = [next(c for c in table_diff.before.columns if c.name == k) for k in pkb]
            after_pk_cols = [next(c for c in table_diff.after.columns if c.name == k) for k in pka]
            incomparable = [
                (c1.name, c2.name)
                for c1, c2 in zip(before_pk_cols, after_pk_cols)
                if not parquet_util.pyarrow_type_compatible(
                    c1.type.parquet, c2.type.parquet, parquet_util.TypeCheckLevel.compatible
                )
            ]
            if incomparable:
                _incomparable = ", ".join(f"{a} <-> {b}" for a, b in incomparable)
                self.logger.warning(
                    f"{table_name}: Can't diff with incompatibly typed primary key columns {_incomparable}"
                )
                continue

            d_diff = data_diff.DataFrameDiff.from_tables(
                table_diff.before, table_diff.after, before_blob_store, after_blob_store
            )
            if debug and d_diff:
                breakpoint()
            yield table_name, d_diff


def main():
    if cli is None:
        raise RuntimeError(
            "CLI requirements not installed; include the 'cli' extra to use the tabularasa CLI"
        )

    cli.run()


if __name__ == "__main__":
    main()
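For orientation, a minimal programmatic sketch of the manager defined above. The package name and schema path are hypothetical placeholders; the ReferenceDataManager and CompilationTarget APIs are as shown in the diff, and a real call requires a schema YAML that the build command can load.

# Sketch with hypothetical names: "my_org.reference_data" and "schema.yaml" are
# placeholders for a consuming package and the schema file inside it.
from thds.tabularasa.__main__ import CompilationTarget, ReferenceDataManager

manager = ReferenceDataManager(
    package="my_org.reference_data",
    schema_path="schema.yaml",
)
# Render and print the pandera (pandas) validation module generated from the schema.
print(manager.compile(CompilationTarget.pandas))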