thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/__main__.py
@@ -0,0 +1,1122 @@
1
+ import logging
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ import sys
6
+ import tempfile
7
+ from copy import copy
8
+ from enum import Enum
9
+ from functools import partial
10
+ from itertools import repeat
11
+ from pathlib import Path
12
+ from typing import Dict, Iterable, Iterator, List, NamedTuple, Optional, Set, Tuple, Type, Union, cast
13
+
14
+ import networkx as nx
15
+ import pkg_resources
16
+
17
+ from thds.core import parallel
18
+ from thds.tabularasa.data_dependencies.adls import (
19
+ ADLSFileIntegrityError,
20
+ ADLSFileSystem,
21
+ adls_filesystem,
22
+ sync_adls_data,
23
+ )
24
+ from thds.tabularasa.data_dependencies.build import ReferenceDataBuildCommand, populate_sqlite_db
25
+ from thds.tabularasa.diff import data as data_diff
26
+ from thds.tabularasa.diff import schema as schema_diff
27
+ from thds.tabularasa.diff import summary as diff_summary
28
+ from thds.tabularasa.loaders import parquet_util
29
+ from thds.tabularasa.loaders.util import (
30
+ PandasParquetLoader,
31
+ default_parquet_package_data_path,
32
+ hash_file,
33
+ )
34
+ from thds.tabularasa.schema import load_schema, metaschema
35
+ from thds.tabularasa.schema.compilation import (
36
+ render_attrs_module,
37
+ render_attrs_sqlite_schema,
38
+ render_pandera_module,
39
+ render_pyarrow_schema,
40
+ render_sphinx_docs,
41
+ render_sql_schema,
42
+ )
43
+ from thds.tabularasa.schema.util import all_predecessors, all_successors
44
+
45
+ try:
46
+ from bourbaki.application.cli import CommandLineInterface, cli_spec
47
+ except ImportError:
48
+
49
+ # stand-in decorators
50
+ def noop_decorator(obj):
51
+ return obj
52
+
53
+ def noop_decorator_factory(obj):
54
+ return noop_decorator
55
+
56
+ config_top_level = define_cli = noop_decorator
57
+ output_handler = noop_decorator_factory
58
+ noncommand = noop_decorator
59
+ cli = None
60
+ else:
61
+ # increase default log verbosity
62
+ # this ensures all log messages at INFO level or greater are rendered,
63
+ # and that tracebacks are always shown
64
+ import bourbaki.application.cli.main as _bourbaki
65
+
66
+ _bourbaki.MIN_VERBOSITY = _bourbaki.TRACEBACK_VERBOSITY = _bourbaki.LOG_LEVEL_NAMES.index("INFO")
67
+
68
+ cli = CommandLineInterface(
69
+ prog="tabularasa",
70
+ require_options=False,
71
+ require_subcommand=True,
72
+ implicit_flags=True,
73
+ use_verbose_flag=True,
74
+ require_config=False,
75
+ add_init_config_command=True,
76
+ use_config_file="tabularasa.yaml",
77
+ package="thds.tabularasa",
78
+ )
79
+ # decorators
80
+ define_cli = cli.definition
81
+ output_handler = cli_spec.output_handler
82
+ config_top_level = cli_spec.config_top_level
83
+ noncommand = cli_spec.noncommand
84
+
85
+ try:
86
+ from ruamel.yaml import YAML
87
+ except ImportError:
88
+
89
+ import yaml
90
+
91
+ load_yaml = yaml.safe_load
92
+ dump_yaml = yaml.safe_dump
93
+ else:
94
+
95
+ def _yaml():
96
+ yaml = YAML()
97
+ yaml.preserve_quotes = True # type: ignore[assignment]
98
+ yaml.width = 100 # type: ignore[assignment]
99
+ return yaml
100
+
101
+ def load_yaml(stream):
102
+ return _yaml().load(stream)
103
+
104
+ def dump_yaml(data, stream): # type: ignore
105
+ _yaml().dump(data, stream)
106
+
107
+
108
+ DEFAULT_GRAPHVIZ_FORMAT = "svg"
109
+ RED, GREEN, YELLOW, BLUE = "#FFAB99", "#99FFDE", "#EDFF99", "#b3f0ff"
110
+ DAG_NODE_COLORS: Dict[Type, str] = {
111
+ metaschema.ADLSRef: RED,
112
+ metaschema.LocalRef: YELLOW,
113
+ metaschema.TabularTextFileRef: YELLOW,
114
+ metaschema.TransientReferenceDataRef: BLUE,
115
+ metaschema.ReferenceDataRef: GREEN,
116
+ }
117
+
118
+
119
+ class CompilationTarget(Enum):
120
+ pandas = "pandas"
121
+ sqlite = "sqlite"
122
+ pyarrow = "pyarrow"
123
+ attrs = "attrs"
124
+ attrs_sqlite = "attrs_sqlite"
125
+
126
+
127
+ class DataFileHashes(NamedTuple):
128
+ actual: Optional[str]
129
+ expected: Optional[str]
130
+
131
+
132
+ class TableSyncData(NamedTuple):
133
+ local_path: Path
134
+ blob_store: metaschema.RemoteBlobStoreSpec
135
+ md5: str
136
+
137
+ @property
138
+ def remote_path(self) -> str:
139
+ return self.remote_data_spec.paths[0].name
140
+
141
+ @property
142
+ def remote_data_spec(self) -> metaschema.ADLSDataSpec:
143
+ data_spec = self.blob_store.data_spec(self.md5)
144
+ return data_spec
145
+
146
+ @property
147
+ def local_file_exists(self) -> bool:
148
+ return self.local_path.exists()
149
+
150
+ @property
151
+ def remote_file_system(self) -> ADLSFileSystem:
152
+ return adls_filesystem(self.blob_store.adls_account, self.blob_store.adls_filesystem)
153
+
154
+ def local_file_md5(self) -> Optional[str]:
155
+ return hash_file(self.local_path) if self.local_file_exists else None
156
+
157
+ def remote_file_exists(self) -> bool:
158
+ return self.remote_file_system.file_exists(self.remote_path)
159
+
160
+
161
+ def print_source(source, *, output: Optional[Path] = None):
162
+ if output is None:
163
+ outfile = sys.stdout
164
+ else:
165
+ outfile = open(output, "w")
166
+
167
+ print(source, file=outfile)
168
+
169
+ if output is not None:
170
+ outfile.close()
171
+
172
+
173
+ def print_file_hashes_status(hashes: Dict[str, DataFileHashes]):
174
+ ready_for_packaging = True
175
+ for name, hs in sorted(hashes.items(), key=lambda kv: kv[0]):
176
+ if hs.actual != hs.expected:
177
+ if hs.actual:
178
+ if hs.expected:
179
+ print(f"{name}: actual md5 {hs.actual} != expected md5 {hs.expected}")
180
+ ready_for_packaging = False
181
+ else:
182
+ print(f"{name}: actual md5 {hs.actual}; NO md5 IN SCHEMA")
183
+ else:
184
+ print(f"{name}: NO FILE")
185
+ ready_for_packaging = False
186
+ else:
187
+ print(f"{name}: ✔")
188
+
189
+ if not ready_for_packaging:
190
+ raise Exception("package data files or schema are not ready for packaging")
191
+
192
+
193
+ def print_list(it: Iterable):
194
+ for i in it:
195
+ print(i)
196
+
197
+
198
+ def print_schema_diff_summary(
199
+ diff: schema_diff.SchemaDiff,
200
+ *,
201
+ exit_code: bool = False,
202
+ heading_level: int = 0,
203
+ tablefmt: str = diff_summary.DEFAULT_TABLEFMT,
204
+ ):
205
+ """Print the schema diff summary to stdout and raise an exception if there are positive diffs
206
+
207
+ :param diff: the schema diff to summarize
208
+ :param exit_code: if passed, exit with code 1 if there is a positive diff (similar to `git diff --exit-code`)
212
+ :param heading_level: increase this to render smaller headings on the markdown sections
213
+ :param tablefmt: the table format to use for the markdown tables, as understood by `tabulate`
214
+ """
215
+ positive_diff = False
216
+ for section in diff_summary.markdown_schema_diff_summary(
217
+ diff,
218
+ heading_level=heading_level,
219
+ tablefmt=tablefmt,
220
+ ):
221
+ print(section, end="\n\n")
222
+ positive_diff = True
223
+ if positive_diff and exit_code:
224
+ exit(1)
225
+
226
+
227
+ def print_data_diff_summaries(
228
+ data_diffs: Iterator[Tuple[metaschema.Identifier, data_diff.DataFrameDiff]],
229
+ *,
230
+ exit_code: bool = False,
231
+ verbose: bool = False,
232
+ value_detail: bool = False,
233
+ value_detail_min_count: int = 0,
234
+ heading_level: int = 0,
235
+ tablefmt: str = diff_summary.DEFAULT_TABLEFMT,
236
+ floatfmt: str = diff_summary.DEFAULT_FLOATFMT,
237
+ ):
238
+ """Print summaries of data diffs for a sequence of updated tables
239
+
240
+ :param data_diffs: an iterator of tuples of table names and their corresponding data diffs
241
+ :param exit_code: if True, exit with code 1 if there is a positive diff (similar to `git diff --exit-code`)
242
+ :param verbose: if True, show detailed row change status counts; otherwise show only single-column
243
+ change counts
244
+ :param value_detail: if True, show detailed value change counts; otherwise show only statistics of the
245
+ types of changes as determined by the `verbose` flag
246
+ :param value_detail_min_count: minimum number of instances of a specific value update to show value-level
247
+ detail for. No effect when `value_detail` is False
248
+ :param heading_level: increase this to render smaller headings on the markdown sections
249
+ :param tablefmt: the table format to use for the markdown tables, as understood by `tabulate`
250
+ :param floatfmt: the float format to use for the markdown tables, as understood by `tabulate`
251
+ """
252
+ positive_diff = False
253
+ for table_name, d_diff in data_diffs:
254
+ for section in diff_summary.markdown_dataframe_diff_summary(
255
+ d_diff,
256
+ table_name,
257
+ verbose,
258
+ value_detail=value_detail,
259
+ value_detail_min_count=value_detail_min_count,
260
+ heading_level=heading_level,
261
+ tablefmt=tablefmt,
262
+ floatfmt=floatfmt,
263
+ ):
264
+ positive_diff = True
265
+ print(section, end="\n\n")
266
+
267
+ if positive_diff and exit_code:
268
+ exit(1)
269
+
270
+
271
+ def to_graphviz(
272
+ dag: nx.DiGraph,
273
+ vertical: bool = False,
274
+ ranksep: float = 1.0,
275
+ nodesep: float = 1.0,
276
+ fontsize: int = 12,
277
+ fontname: str = "Courier",
278
+ compact: bool = False,
279
+ ):
280
+ try:
281
+ from pygraphviz import AGraph
282
+ except ImportError:
283
+ raise RuntimeError("dag visulization requires `pygraphviz`")
284
+
285
+ title = "Reference Data dependency DAG"
286
+ g = AGraph(
287
+ directed=True,
288
+ name=title,
289
+ rankdir="TB" if vertical else "LR",
290
+ fontsize=fontsize * 3,
291
+ fontname=fontname,
292
+ label=title,
293
+ labelloc="t",
294
+ ranksep=str(ranksep) + " equally",
295
+ nodesep=str(nodesep),
296
+ )
297
+ g.node_attr["shape"] = "box"
298
+ g.node_attr["fontname"] = fontname
299
+ g.node_attr["fontsize"] = fontsize
300
+
301
+ for node, attrs in dag.nodes(data=True):
302
+ color = DAG_NODE_COLORS.get(type(node))
303
+ name = repr(node)
304
+ g.add_node(
305
+ name,
306
+ label=name,
307
+ fillcolor=color,
308
+ style="bold" if attrs.get("initial") else "filled",
309
+ )
310
+
311
+ g.add_edges_from((repr(head), repr(tail)) for head, tail in dag.edges)
312
+
313
+ if compact:
314
+ # add invisible edges between components to put them on separate levels
315
+ def terminal_nodes(nodes: Iterable[metaschema.CustomStr], initial: bool) -> Iterable[str]:
316
+ lookup = dag.pred if initial else dag.succ
317
+ return (repr(node) for node in nodes if not len(lookup[node]))
318
+
319
+ def balanced_layers(dag: nx.DiGraph) -> List[Set[metaschema.CustomStr]]:
320
+ components = cast(
321
+ List[Set[metaschema.CustomStr]],
322
+ sorted(nx.connected_components(dag.to_undirected()), key=len),
323
+ )
324
+ target_size = len(components[-1])
325
+ layers = [components[0]]
326
+ for nodes in components[1:]:
327
+ component_to_merge = layers[-1]
328
+ if len(component_to_merge) + len(nodes) <= target_size:
329
+ component_to_merge.update(nodes)
330
+ else:
331
+ layers.append(nodes)
332
+ return layers
333
+
334
+ layers = balanced_layers(dag)
335
+ for i, (layer1, layer2) in enumerate(zip(layers, layers[1:]), 1):
336
+ sep_node = f"Layer({i})"
337
+ g.add_node(sep_node, style="invis")
338
+ g.add_edges_from(zip(terminal_nodes(layer1, False), repeat(sep_node)), style="invis")
339
+ g.add_edges_from(zip(repeat(sep_node), terminal_nodes(layer2, True)), style="invis")
340
+
341
+ g.layout(prog="dot")
342
+ return g
343
+
344
+
345
+ def write_dependency_dag(
346
+ dag: nx.DiGraph,
347
+ *,
348
+ output: Optional[Path] = None,
349
+ format: Optional[str] = None,
350
+ vertical: bool = False,
351
+ fontsize: int = 12,
352
+ compact: bool = False,
353
+ ):
354
+ """Save a visualization of the dependency DAG using pygraphviz
355
+
356
+ :param dag: networkx graph representing the DAG
357
+ :param output: the file to write the visualization to. If not passed, a temp file will be created.
358
+ :param format: the format to save the image as (e.g. svg, png); if not passed, it will be inferred
359
+ from the output path name. When that is not passed, svg will be used.
360
+ :param vertical: orient the DAG visualization from top to bottom? (default is left to right)
361
+ :param fontsize: font size of text (e.g. table and resource names) in the visualization
362
+ :param compact: if True, put separate connected components of the DAG on separate levels (vertical
363
+ or horizontal depending on the orientation). For wide DAGs with many components this can result in
364
+ a much more compact representation.
365
+ """
366
+ graphviz = to_graphviz(dag, vertical=vertical, fontsize=fontsize, compact=compact)
367
+ if output is None:
368
+ format = format.lower() if format else DEFAULT_GRAPHVIZ_FORMAT
369
+ output = Path(tempfile.mkstemp(suffix=f".{format}")[1])
370
+ print(f"Created teporary file at {output}; will save in PNG format")
371
+
372
+ print(f"Saving DAG visualization to {output}", file=sys.stderr)
373
+ graphviz.draw(str(output), format=format, prog="dot")
374
+ try:
375
+ subprocess.run(["open", str(output)])
376
+ except Exception as e:
377
+ print(
378
+ f"Couldn't run `open {output}` ({e}); open the file manually",
379
+ file=sys.stderr,
380
+ )
381
+
382
+
383
+ @define_cli
384
+ @config_top_level
385
+ class ReferenceDataManager:
386
+ def __init__(
387
+ self,
388
+ *,
389
+ package: str,
390
+ schema_path: str,
391
+ repo_root: Optional[Path] = None,
392
+ require_editable_install: bool = False,
393
+ ):
394
+ """Utilities for managing, installing, validating, and inspecting reference data
395
+
396
+ :param package: name of the package where the data is to be defined and stored and where the
397
+ schema should be read from
398
+ :param schema_path: path to the schema relative to the package root; should be a YAML file
399
+ compatible with `thds.tabularasa.schema.metaschema.Schema`
400
+ :param repo_root: path to the root of the local repository. If not supplied, it will be set to
401
+ the current working directory
402
+ :param require_editable_install: Fail if the package is not installed in editable mode? This is
403
+ generally what you want when developing/publishing. However, there are use cases where one may
404
+ wish to edit a hard install of the package in place, e.g. when syncing data files to the
405
+ installed package data directory, in which case this may remain as the default `False` value.
407
+ """
408
+ self.build_command = ReferenceDataBuildCommand.with_options(
409
+ package_name=package,
410
+ schema_path=schema_path,
411
+ for_setup_py_build=False,
412
+ )()
413
+ self.logger = logging.getLogger(__name__)
414
+ if repo_root is None:
415
+ self.repo_root = Path.cwd().resolve()
416
+ else:
417
+ self.repo_root = repo_root.resolve()
418
+
419
+ if require_editable_install:
420
+ self.check_editable_install()
421
+
422
+ def check_editable_install(self):
423
+ """Ensure that the package being built is installed in an editable mode; otherwise the operations
424
+ defined in this interface may not have the intended effects."""
425
+ local_data_dir = Path(pkg_resources.resource_filename(self.package, ""))
426
+ if not str(local_data_dir).startswith(str(self.repo_root)):
427
+ msg = (
428
+ f"Package {self.package} appears not to be installed in editable mode; this could result"
429
+ "for example in incorrect file hashes or a corrupted package installation"
430
+ )
431
+ self.logger.exception(msg)
432
+ raise RuntimeError(msg)
433
+ else:
434
+ self.logger.info(f"Check passed - package {self.package} is installed in editable mode")
435
+
436
+ def load_raw_schema(self):
437
+ """Round-trippable load of the schema YAML file, for development operations where the file needs
438
+ to be edited while preserving style and comments"""
439
+ self.logger.info("Loading round-trippable raw schema")
440
+ with pkg_resources.resource_stream(self.package, self.schema_path) as f:
441
+ return load_yaml(f)
442
+
443
+ @property
444
+ def schema(self) -> metaschema.Schema:
445
+ return self.build_command.schema
446
+
447
+ @property
448
+ def build_options(self) -> metaschema.BuildOptions:
449
+ options = self.schema.build_options
450
+ assert options is not None
451
+ return options
452
+
453
+ @property
454
+ def package(self) -> str:
455
+ return self.build_command.package_name
456
+
457
+ @property
458
+ def schema_path(self) -> str:
459
+ return self.build_command.schema_path
460
+
461
+ @property
462
+ def package_data_dir(self) -> Optional[str]:
463
+ return self.build_options.package_data_dir
464
+
465
+ @property
466
+ def transient_data_dir(self) -> Optional[str]:
467
+ return self.build_options.transient_data_dir
468
+
469
+ @property
470
+ def sqlite_db_path(self) -> Optional[str]:
471
+ return self.build_options.sqlite_db_path
472
+
473
+ @property
474
+ def repo_url(self):
475
+ return self.build_options.repo_url
476
+
477
+ @property
478
+ def table_docs_dir(self):
479
+ return self.build_options.table_docs_dir
480
+
481
+ @property
482
+ def type_docs_path(self):
483
+ return self.build_options.type_docs_path
484
+
485
+ @property
486
+ def source_docs_path(self):
487
+ return self.build_options.source_docs_path
488
+
489
+ @property
490
+ def curation_badge_path(self):
491
+ return self.build_options.curation_badge_path
492
+
493
+ def data_path_for(self, table: Union[str, metaschema.Table]) -> Path:
494
+ table_ = self.schema.tables[table] if isinstance(table, str) else table
495
+ data_dir = self.transient_data_dir if table_.transient else self.package_data_dir
496
+ assert data_dir is not None
497
+ return Path(
498
+ pkg_resources.resource_filename(
499
+ self.package,
500
+ default_parquet_package_data_path(table_.name, data_dir),
501
+ )
502
+ )
503
+
504
+ @output_handler(print_list)
505
+ def dependent_tables(self, tables: Optional[Set[str]] = None) -> Set[str]:
506
+ """Compute the set of tables downstream from a set of tables in the computational DAG,
507
+ including the original tables"""
508
+ tables = tables or set()
509
+ unknown_tables = {t for t in tables if t not in self.schema.tables}
510
+ if unknown_tables:
511
+ raise KeyError(f"Unknown tables: {','.join(unknown_tables)}")
512
+ dag = self.schema.dependency_dag()
513
+ downstream = all_successors(dag, [self.schema.tables[t].graph_ref for t in tables])
514
+ return {str(t) for t in downstream if isinstance(t, metaschema.ReferenceDataRef)}
515
+
516
+ @output_handler(write_dependency_dag)
517
+ def dag(
518
+ self,
519
+ tables: Optional[Set[str]] = None,
520
+ *,
521
+ upstream: bool = True,
522
+ downstream: bool = True,
523
+ build: bool = False,
524
+ ):
525
+ """Compute the dependency DAG for a set of tables and their dependencies and/or dependents.
526
+ (or the whole DAG if no tables are passed).
527
+
528
+ :param tables: tables to treat as root nodes in the DAG; if passed, only these tables and their
529
+ dependencies/dependents will be in the DAG, otherwise the entire DAG will be returned
530
+ :param upstream: Should the DAG include upstream dependencies? (by default it does)
531
+ :param downstream: Should the DAG include downstream dependencies? (by default it does)
532
+ :param build: Should the DAG include all dependencies that would be included in a build of the
533
+ specified tables? Overrides upstream and downstream specification. False by default.
534
+ :return: networkx.DiGraph representing the computational DAG of data derivations
535
+ """
536
+ if tables:
537
+ if not upstream and not downstream and not build:
538
+ raise ValueError("one of `upstream`, `downstream`, `connected` must be True")
539
+
540
+ full_dag = self.schema.dependency_dag()
541
+ table_refs = {self.schema.tables[t].graph_ref for t in tables}
542
+ if build:
543
+ tables_ = self.dependent_tables(tables)
544
+ dag = self.schema.dependency_dag(lambda table: table.name in tables_)
545
+ else:
546
+ downstream_refs = all_successors(full_dag, table_refs) if downstream else set()
547
+ upstream_refs = all_predecessors(full_dag, table_refs) if upstream else set()
548
+ refs = downstream_refs.union(upstream_refs)
549
+ dag = nx.DiGraph(nx.induced_subgraph(full_dag, refs))
550
+
551
+ for table in table_refs:
552
+ dag.add_node(table, initial=True)
553
+ return dag
554
+ else:
555
+ return self.schema.dependency_dag()
556
+
557
+ @output_handler(print_source)
558
+ def compile(self, target: CompilationTarget) -> str:
559
+ """Compile a schema YAML file to a specific target language/library
560
+
561
+ :param target: The target language/library to compile the YAML schema to
562
+ """
563
+ if target == CompilationTarget.sqlite:
564
+
565
+ def sql_renderer(schema):
566
+ return "\n".join(render_sql_schema(schema))
567
+
568
+ renderer = sql_renderer
569
+ elif target == CompilationTarget.pandas:
570
+ renderer = partial(
571
+ render_pandera_module,
572
+ package=self.package,
573
+ )
574
+ elif target == CompilationTarget.pyarrow:
575
+ renderer = render_pyarrow_schema
576
+ elif target == CompilationTarget.attrs:
577
+ renderer = partial(
578
+ render_attrs_module,
579
+ package=self.package,
580
+ )
581
+ elif target == CompilationTarget.attrs_sqlite:
582
+ assert (
583
+ self.sqlite_db_path is not None
584
+ ), "Must specify sqlite db path in build options to generate sqlite interface"
585
+ renderer = partial(
586
+ render_attrs_sqlite_schema,
587
+ package=self.package,
588
+ db_path=self.sqlite_db_path,
589
+ )
590
+ else:
591
+ raise NotImplementedError(f"Compilation hasn't been implemented for target {target.value}")
592
+
593
+ source = renderer(self.schema)
594
+ return source
595
+
596
+ @output_handler(print_file_hashes_status)
597
+ def check_hashes(self) -> Dict[str, DataFileHashes]:
598
+ """Check actual hashes of on-disk built data files against those documented in the schema"""
599
+ assert (
600
+ self.package_data_dir is not None and self.transient_data_dir is not None
601
+ ), "Can't check hashes without package data dirs"
602
+ hashes = {}
603
+ for table in self.schema.build_time_package_tables:
604
+ name = table.name
605
+ loader = PandasParquetLoader.from_schema_table(
606
+ table,
607
+ package=self.package,
608
+ data_dir=self.transient_data_dir if table.transient else self.package_data_dir,
609
+ )
610
+ if Path(pkg_resources.resource_filename(self.package, loader.data_path)).exists():
611
+ hashes[name] = DataFileHashes(actual=loader.file_hash(), expected=table.md5)
612
+ else:
613
+ hashes[name] = DataFileHashes(actual=None, expected=table.md5)
614
+
615
+ return hashes
616
+
617
+ def init_sqlite(self, *, validate: bool = False, check_hash: bool = True):
618
+ """Populate a sqlite database with the package's tabular data
619
+
622
+ :param validate: Validate data using pandera schemas before inserting?
623
+ :param check_hash: Check hashes in db metadata table and skip inserting tables that are
624
+ up-to-date with current package data files?
625
+ """
626
+ assert (
627
+ self.package_data_dir is not None
628
+ and self.transient_data_dir is not None
629
+ and self.sqlite_db_path is not None
630
+ ), "Can't init sqlite db without package data dirs and sqlite db path"
631
+ populate_sqlite_db(
632
+ self.schema,
633
+ db_package=self.package,
634
+ db_path=self.sqlite_db_path,
635
+ data_package=self.package,
636
+ data_dir=self.package_data_dir,
637
+ transient_data_dir=self.transient_data_dir,
638
+ validate=validate,
639
+ check_hash=check_hash,
640
+ )
641
+
642
+ def codegen(self):
643
+ """Generate all derived accessor code and save to specified files"""
644
+ self.build_command.write_derived_source_code()
645
+
646
+ def docgen(self):
647
+ if self.table_docs_dir is None:
648
+ raise ValueError("Can't write table docs without table_docs_dir")
649
+ elif self.type_docs_path is None:
650
+ raise ValueError("Can't write type doc without type_docs_path")
651
+ elif self.source_docs_path is None:
652
+ self.logger.warning("Can't write source doc without source_docs_path")
653
+
654
+ table_output_dir = Path(self.table_docs_dir)
655
+
656
+ if table_output_dir.is_dir():
657
+ self.logger.info(f"Clearing existing table docs directory at {table_output_dir}")
658
+ shutil.rmtree(table_output_dir)
659
+
660
+ self.logger.info(f"Creating table docs directory at {table_output_dir}")
661
+ table_output_dir.mkdir(parents=True)
662
+
663
+ self.logger.info("Rendering markdown for package tables")
664
+ types_doc, source_doc, table_docs = render_sphinx_docs(
665
+ self.schema, self.repo_root, self.repo_url
666
+ )
667
+ for table_name, markdown in table_docs.items():
668
+ path = table_output_dir / f"{table_name}.rst"
669
+ self.logger.info(f"Writing markdown docs for table {table_name} to {path}")
670
+ with open(path, "w") as f:
671
+ f.write(markdown)
672
+
673
+ type_docs_path = Path(self.type_docs_path)
674
+ self.logger.info(f"Writing markdown for package types to {type_docs_path}")
675
+ with open(type_docs_path, "w") as f:
676
+ f.write(types_doc)
677
+
678
+ if self.source_docs_path:
679
+ source_docs_path = Path(self.source_docs_path)
680
+ self.logger.info(f"Writing markdown for package source data to {source_docs_path}")
681
+ with open(source_docs_path, "w") as f:
682
+ f.write(source_doc)
683
+
684
+ def datagen(
685
+ self, tables: Optional[Set[str]] = None, *, update_hashes: bool = True, no_sync: bool = False
686
+ ):
687
+ """Re-generate package data, optionally skipping files with hashes matching those in the schema
688
+ :param tables: names of the specific tables to build. If not passed, all tables will be built
689
+ :param update_hashes: Should hashes be updated for all tables regenerated at the end of the
690
+ build? This is done by default but can be disabled if you are just experimenting.
691
+ :param no_sync: when passed, don't pull the latest data from the remote blob store before building.
692
+ Useful only if you really know what you're doing and are in an intermediate state with
693
+ "uncommitted" data files whose md5s don't match what's in the schema - e.g. as a result of
694
+ running `datagen` with `update_hashes=False`.
695
+ """
696
+ data_dir = self.package_data_dir
697
+ transient_data_dir = self.transient_data_dir
698
+ if data_dir is None or transient_data_dir is None:
699
+ raise ValueError("Can't build data files without specification of data dirs in the schema")
700
+
701
+ if tables:
702
+ self.logger.info(
703
+ f"Computing all tables downstream of {tables} in the dependency DAG and removing built "
704
+ f"files to force re-computation"
705
+ )
706
+ # force re-computation of the specified tables *and* all their downstream dependents
707
+ tables_to_recompute = self.dependent_tables(tables)
708
+ else:
709
+ # build all tables
710
+ tables_to_recompute = set(t.name for t in self.schema.computable_tables)
711
+
712
+ # update hashes for all upstream tables in the DAG as well, since any of them may be recomputed
713
+ # in this build on a hash mismatch
714
+ tables_to_update_hashes = {
715
+ str(t)
716
+ for t in self.schema.dependency_dag(lambda table: table.name in tables_to_recompute)
717
+ if isinstance(t, metaschema.ReferenceDataRef)
718
+ and not ((table := self.schema.tables[str(t)]).transient and table.md5 is None)
719
+ # don't update hashes for transient tables with explicitly no hash
720
+ }
721
+ run_hash_update = bool(tables_to_update_hashes) and update_hashes
722
+
723
+ if not no_sync:
724
+ # ensure local blobs are up-to-date before building, but don't fail if a remote blob is absent;
725
+ # we'll just regenerate it if it's needed for computing the current DAG
726
+ self.sync_blob_store(down=True, no_fail_if_absent=True)
727
+
728
+ for table_name in tables_to_recompute:
729
+ table = self.schema.tables[table_name]
730
+ file_path = self.data_path_for(table)
731
+ if file_path.exists():
732
+ self.logger.warning(f"Removing built file for table {table.name} at {file_path}")
733
+ os.remove(file_path)
734
+ else:
735
+ self.logger.info(f"No file found for table {table.name}; nothing to remove")
736
+ try:
737
+ self.build_command.build_package_data(tables=tables_to_recompute or None)
738
+ except Exception as e:
739
+ raise
740
+ finally:
741
+ if run_hash_update:
742
+ self.update_hashes(tables_to_update_hashes, codegen=True)
743
+
744
+ def update_hashes(self, tables: Optional[Set[str]] = None, *, codegen: bool = True):
745
+ """Update package data hashes in schema YAML to match the actual hashes of package data files as
746
+ currently present in the file tree (or as recomputed when specified)
747
+
748
+ :param tables: if passed, only update hashes for these tables' package data; otherwise update for
749
+ all tables
750
+ :param codegen: indicates whether to run the `codegen` command after updating the hashes to
751
+ ensure hashes embedded in source code are up-to-date. By default, this runs when any hashes are
752
+ updated in the config file.
753
+ """
754
+ assert (
755
+ self.package_data_dir is not None and self.transient_data_dir is not None
756
+ ), "Can't update hashes without package data dirs"
757
+ hashes_updated = []
758
+ tables_to_update = (
759
+ [self.schema.tables[t] for t in tables] if tables else self.schema.build_time_package_tables
760
+ )
761
+ raw_schema = self.load_raw_schema()
762
+ self.logger.info("Updating data hashes")
763
+ for table in tables_to_update:
764
+ table_name = table.name
765
+ table_path = self.data_path_for(table)
766
+ if os.path.exists(table_path):
767
+ md5 = hash_file(table_path)
768
+ old_md5 = table.md5
769
+ if old_md5 is None:
770
+ self.logger.warning(
771
+ f"no md5 hash previously defined for table {table_name}; updating to {md5!r}"
772
+ )
773
+ elif md5 != old_md5:
774
+ self.logger.warning(
775
+ f"md5 hashes did not match for table {table_name}; updating to {md5!r}"
776
+ )
777
+ else:
778
+ continue
779
+
780
+ table.md5 = md5
781
+ raw_schema["tables"][table_name]["md5"] = md5
782
+ hashes_updated.append(table_name)
783
+ else:
784
+ self.logger.warning(
785
+ f"package data file doesn't exist for table {table_name!r}; can't update md5 hash"
786
+ )
787
+
788
+ schema_path = pkg_resources.resource_filename(self.package, self.schema_path)
789
+ if hashes_updated:
790
+ self.logger.warning(
791
+ f"updated hashes for tables {hashes_updated!r}; writing new schema to {schema_path}"
792
+ )
793
+ with open(schema_path, "w") as f:
794
+ dump_yaml(raw_schema, f)
795
+
796
+ if codegen:
797
+ self.logger.info("regenerating source code to update embedded hashes")
798
+ self.codegen()
799
+
800
+ @noncommand
801
+ def table_sync_data(self, table: metaschema.Table) -> TableSyncData:
802
+ blob_store = self.schema.remote_blob_store
803
+ assert blob_store is not None, "No blob store defined in schema"
804
+ assert table.md5 is not None, f"No md5 defined for table {table.name}"
805
+ assert self.package_data_dir is not None, "No package data dir to sync"
806
+ local_build_path = Path(
807
+ pkg_resources.resource_filename(
808
+ self.package,
809
+ default_parquet_package_data_path(table.name, self.package_data_dir),
810
+ )
811
+ )
812
+ return TableSyncData(local_build_path, blob_store, md5=table.md5)
813
+
814
+ @noncommand
815
+ def sync_up(self, sync_data: TableSyncData) -> bool:
816
+ remote_path = sync_data.remote_path
817
+ local_build_path = sync_data.local_path
818
+ if sync_data.remote_file_exists():
819
+ self.logger.info(f"Found existing file in remote blob store at {remote_path}; not syncing")
820
+ return True
821
+ else:
822
+ self.logger.info(f"Syncing to path {remote_path} in remote blob store")
823
+ try:
824
+ sync_data.remote_file_system.put_file(local_build_path, remote_path)
825
+ except Exception as e:
826
+ self.logger.exception(
827
+ f"Failed to put file {local_build_path} at {remote_path} in blob store: {e}"
828
+ )
829
+ return False
830
+ else:
831
+ return True
832
+
833
+ @noncommand
834
+ def sync_down(self, sync_data: TableSyncData, link_build: bool) -> bool:
835
+ self.logger.info(f"Fetching file from remote blob store at {sync_data.remote_path}")
836
+ try:
837
+ paths = sync_adls_data(sync_data.remote_data_spec)
838
+ except ADLSFileIntegrityError as e:
839
+ self.logger.exception(str(e))
840
+ return False
841
+ except Exception as e:
842
+ self.logger.exception(f"Failed to fetch file {sync_data.remote_path} from blob store: {e}")
843
+ return False
844
+ else:
845
+ assert len(paths) == 1
846
+ if link_build:
847
+ local_cache_path = paths[0].local_path
848
+ if sync_data.local_file_exists:
849
+ self.logger.warning(f"Removing existing file {sync_data.local_path}")
850
+ os.remove(sync_data.local_path)
851
+ self.logger.info(f"Linking downloaded file to local build file {sync_data.local_path}")
852
+ sync_data.local_path.parent.mkdir(parents=True, exist_ok=True)
853
+ os.link(local_cache_path, sync_data.local_path)
854
+ return True
855
+
856
+ def sync_blob_store(
857
+ self,
858
+ *,
859
+ up: bool = False,
860
+ down: bool = False,
861
+ no_fail_if_absent: bool = False,
862
+ tables: Optional[Set[str]] = None,
863
+ ) -> List[str]:
864
+ """Sync the local built files to the remote blob store, if one is defined.
865
+ It is assumed that the hashes in the schema file are the source of truth rather than the hashes
866
+ of the on-disk built files; if these should be taken as authoritative instead, run the
867
+ `update_hashes` command first. At the end of this operation, all files in the local build folder
868
+ and the remote blob store are guaranteed to match the hashes in the schema file, unless a file
869
+ with the correct hash was unavailable.
870
+
871
+ :param up: Upload local files to the blob store if they're available?
872
+ :param down: Download remote blobs to the local build directory if they're available?
873
+ :param no_fail_if_absent: when passed, don't fail an upload (or download) for lack of a local file
874
+ (or remote blob) with the expected hash for a version-controlled table. This is useful in
875
+ development workflows where you just want to regenerate/sync a particular table that you've updated.
876
+ :param tables: optional collection of table names to sync; all will be synced if not passed.
877
+ :return: list of table names that were synced successfully
878
+ :raises RuntimeError: if a local or remote file was not available for sync
879
+ """
880
+ assert self.package_data_dir is not None, "Can't sync blob store without package data dir"
881
+ blob_store = self.schema.remote_blob_store
882
+ if blob_store is None:
883
+ self.logger.warning("No remote blob store defined; not syncing files")
884
+ return []
885
+
886
+ if not (down or up):
887
+ raise ValueError("Must indicate syncing either down, up, or both from blob store")
888
+
889
+ tables_to_sync = []
890
+ for table in self.schema.build_time_package_tables:
891
+ if table.md5 is None:
892
+ self.logger.warning(
893
+ f"No md5 hash defined for package table {table.name}; no remote blob to sync to or from"
894
+ )
895
+ else:
896
+ tables_to_sync.append(table)
897
+
898
+ if tables is not None:
899
+ known_tables = {t.name for t in tables_to_sync}
900
+ if unknown_tables := tables.difference(known_tables):
901
+ msg = f"Can't sync unknown or non-version-controlled tables: {', '.join(unknown_tables)}"
902
+ self.logger.error(msg)
903
+ raise KeyError(msg)
904
+ tables_to_sync = [t for t in tables_to_sync if t.name in tables]
905
+
906
+ self.logger.info(
907
+ f"Syncing with remote blob store {blob_store.adls_account}/{blob_store.adls_filesystem}"
908
+ )
909
+
910
+ def inner(table: metaschema.Table) -> Optional[str]:
911
+ sync_data = self.table_sync_data(table)
912
+ local_file_md5 = sync_data.local_file_md5()
913
+ if local_file_md5 == table.md5:
914
+ # good local file; we can sync up
915
+ self.logger.info(
916
+ f"Found local file for table {table.name} matching expected hash {table.md5}"
917
+ )
918
+ if up:
919
+ if self.sync_up(sync_data):
920
+ return table.name
921
+ else:
922
+ raise IOError(table.name)
923
+ else:
924
+ # file is present locally with expected hash; no need to sync down
925
+ return table.name
926
+ else:
927
+ # check remote; download to get hash and link if good
928
+ addendum = "" if local_file_md5 is None else f" matching expected hash {table.md5}"
929
+ self.logger.info(f"No local file found for table {table.name}{addendum}")
930
+ if up and no_fail_if_absent:
931
+ self.logger.info(
932
+ f"Skipping sync to remote blob store of local file for table {table.name}"
933
+ )
934
+ return None
935
+
936
+ # only link the downloaded file into the build dir if we're syncing down; else just download
937
+ # the file to check that it has the correct hash
938
+ success = self.sync_down(sync_data, link_build=down)
939
+ if success:
940
+ return table.name
941
+ else:
942
+ if down and no_fail_if_absent:
943
+ return None
944
+ raise IOError(table.name)
945
+
946
+ failed: List[str] = []
947
+ synced: List[str] = []
948
+ for table_name, res in parallel.yield_all([(t.name, partial(inner, t)) for t in tables_to_sync]):
949
+ if isinstance(res, parallel.Error):
950
+ failed.append(table_name)
951
+ elif res is not None:
952
+ synced.append(table_name)
953
+
954
+ if failed:
955
+ raise RuntimeError(f"Sync failed for tables {', '.join(failed)}")
956
+
957
+ down_ = (
958
+ f"to local build directory {pkg_resources.resource_filename(self.package, self.package_data_dir)}"
959
+ if down
960
+ else ""
961
+ )
962
+ up_ = (
963
+ f"to remote blob store {blob_store.adls_account}/{blob_store.adls_filesystem}/{blob_store.path}"
964
+ if up
965
+ else ""
966
+ )
967
+ addendum = f"{down_} and {up_}" if down and up else down_ or up_
968
+ tables_ = f" {', '.join(tables)}" if tables else ""
969
+ self.logger.info(f"Success - build-time package data tables{tables_} synced {addendum}")
970
+ return synced
971
+
972
+ def pull(self, tables: Optional[Set[str]] = None, *, no_fail_if_absent: bool = False):
973
+ """Download all remote blobs to the local data directory, with integrity checks.
974
+
975
+ :param tables: optional collection of table names to sync; all will be synced if not passed.
976
+ :param no_fail_if_absent: when passed, don't fail a download for lack of a remote blob being
977
+ present in the blob store with the expected hash for a version-controlled table. This is useful
978
+ in development workflows where you just want to regenerate/sync a particular table that you
979
+ generated once and then removed, but didn't push yet (leaving a dangling hash reference).
980
+ """
981
+ self.sync_blob_store(down=True, no_fail_if_absent=no_fail_if_absent, tables=tables)
982
+
983
+ def push(self, tables: Optional[Set[str]] = None, *, no_fail_if_absent: bool = False):
984
+ """Upload all local data files to the remote blob store, with integrity checks.
985
+
986
+ :param tables: optional collection of table names to sync; all will be synced if not passed.
987
+ :param no_fail_if_absent: when passed, don't fail an upload for lack of a local file being
988
+ present with the expected hash for a version-controlled table. This is useful in development
989
+ workflows where you just want to regenerate/sync a particular table that you've updated.
990
+ """
991
+ self.sync_blob_store(up=True, no_fail_if_absent=no_fail_if_absent, tables=tables)
992
+
993
+ @output_handler(print_schema_diff_summary)
994
+ def schema_diff(
995
+ self,
996
+ base_ref: str = "HEAD",
997
+ tables: Optional[Set[str]] = None,
998
+ *,
999
+ include_transient: bool = False,
1000
+ base_schema_path: Optional[str] = None,
1001
+ ):
1002
+ """Compute a diff between the current schema and a historical version of the schema.
1003
+
1004
+ :param base_ref: the base git ref to compare against
1005
+ :param tables: a set of specific tables to inspect; if not passed the full schemas will be diffed
1006
+ :param include_transient: if passed, include transient tables in the analysis. These are usually
1007
+ implementation details of a derivation process and so are excluded by default (unless
1008
+ a transient table is specifically included in the `tables` argument)
1009
+ :param base_schema_path: path to the schema file to compare against; if not passed, the schema
1010
+ will be assumed to be present at the same location in the filesystem as the current schema.
1011
+ This enables loading of a historical schema even if the schema file or containing package have
1012
+ been moved or renamed.
1013
+ :return: a `SchemaDiff` object representing the differences between the two schemas
1014
+ """
1015
+
1016
+ if base_schema_path is None:
1017
+ base_schema = load_schema(self.package, self.schema_path, git_ref=base_ref)
1018
+ else:
1019
+ base_schema = load_schema(None, base_schema_path, git_ref=base_ref)
1020
+
1021
+ if tables is None and include_transient:
1022
+ this_schema = self.schema
1023
+ else:
1024
+
1025
+ def table_pred(t: metaschema.Table) -> bool:
1026
+ return (include_transient or not t.transient) if tables is None else (t.name in tables)
1027
+
1028
+ base_schema.tables = {name: t for name, t in base_schema.tables.items() if table_pred(t)}
1029
+ this_schema = copy(self.schema)
1030
+ this_schema.tables = {name: t for name, t in this_schema.tables.items() if table_pred(t)}
1031
+
1032
+ return schema_diff.SchemaDiff(base_schema, this_schema)
1033
+
1034
+ @output_handler(print_data_diff_summaries)
1035
+ def data_diff(
1036
+ self,
1037
+ base_ref: str = "HEAD",
1038
+ tables: Optional[Set[str]] = None,
1039
+ *,
1040
+ base_schema_path: Optional[str] = None,
1041
+ debug: bool = False,
1042
+ ) -> Iterator[Tuple[metaschema.Identifier, data_diff.DataFrameDiff]]:
1043
+ """Compute a diff between the current version-controlled data and the version-controlled data
1044
+ present at a historical point in time.
1045
+
1046
+ :param base_ref: the base git ref to compare against
1047
+ :param tables: a set of specific tables to inspect; if not passed the full set of tables will be
1048
+ diffed
1049
+ :param base_schema_path: path to the schema file to compare against; if not passed, the schema
1050
+ will be assumed to be present at the same location in the filesystem as the current schema.
1051
+ This enables loading of a historical schema even if the schema file or containing package have
1052
+ been moved or renamed.
1053
+ :param debug: if True, pause execution at the first positive diff and drop into a debugger.
1054
+ The local `d_diff` object will be available in the debugger context.
1055
+ :return: an iterator of tuples of table names and their corresponding `DataFrameDiff`s. These
1056
+ may be consumed lazily, allowing for memory-efficient processing of large data diffs.
1057
+ """
1058
+ if tables:
1059
+ unknown = set(tables).difference(self.schema.tables.keys())
1060
+ if unknown:
1061
+ raise KeyError(f"Unknown tables: {', '.join(unknown)}")
1062
+
1063
+ s_diff = self.schema_diff(base_ref, base_schema_path=base_schema_path)
1064
+ before_blob_store = s_diff.before.remote_blob_store
1065
+ after_blob_store = s_diff.after.remote_blob_store
1066
+ if before_blob_store is None or after_blob_store is None:
1067
+ raise ValueError("Can't diff data without remote blob stores defined in both schemas")
1068
+ for table_name, table_diff in sorted(s_diff.table_diffs.items(), key=lambda t: t[0]):
1069
+ if tables and table_name not in tables:
1070
+ continue
1071
+ if (not table_diff.before.md5) or (not table_diff.after.md5):
1072
+ if table_diff.after.build_time_installed and not table_diff.after.transient:
1073
+ self.logger.warning(f"{table_name}: Can't diff without versioned data (md5 hashes)")
1074
+ continue
1075
+ if table_diff.before.md5 == table_diff.after.md5:
1076
+ self.logger.info(f"{table_name}: Matching md5 hashes; no data diff detected")
1077
+ continue
1078
+
1079
+ if not (pkb := table_diff.before.primary_key) or not (pka := table_diff.after.primary_key):
1080
+ self.logger.warning(f"{table_name}: Can't diff without primary keys")
1081
+ continue
1082
+ if len(pka) != len(pkb):
1083
+ self.logger.warning(
1084
+ f"{table_name}: Can't diff with different primary key lengths ({len(pkb)} vs {len(pka)})"
1085
+ )
1086
+ continue
1087
+
1088
+ before_pk_cols = [next(c for c in table_diff.before.columns if c.name == k) for k in pkb]
1089
+ after_pk_cols = [next(c for c in table_diff.after.columns if c.name == k) for k in pka]
1090
+ incomparable = [
1091
+ (c1.name, c2.name)
1092
+ for c1, c2 in zip(before_pk_cols, after_pk_cols)
1093
+ if not parquet_util.pyarrow_type_compatible(
1094
+ c1.type.parquet, c2.type.parquet, parquet_util.TypeCheckLevel.compatible
1095
+ )
1096
+ ]
1097
+ if incomparable:
1098
+ _incomparable = ", ".join(f"{a} <-> {b}" for a, b in incomparable)
1099
+ self.logger.warning(
1100
+ f"{table_name}: Can't diff with incompatibly typed primary key columns {_incomparable}"
1101
+ )
1102
+ continue
1103
+
1104
+ d_diff = data_diff.DataFrameDiff.from_tables(
1105
+ table_diff.before, table_diff.after, before_blob_store, after_blob_store
1106
+ )
1107
+ if debug and d_diff:
1108
+ breakpoint()
1109
+ yield table_name, d_diff
1110
+
1111
+
1112
+ def main():
1113
+ if cli is None:
1114
+ raise RuntimeError(
1115
+ "CLI requirements not installed; include the 'cli' extra to use the tabularasa CLI"
1116
+ )
1117
+
1118
+ cli.run()
1119
+
1120
+
1121
+ if __name__ == "__main__":
1122
+ main()