thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import typing as ty
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from ..schema import metaschema
|
|
7
|
+
from ..schema.compilation.sphinx import render_table
|
|
8
|
+
from ..schema.constraints import ColumnConstraint, EnumConstraint
|
|
9
|
+
from . import data as data_diff
|
|
10
|
+
from . import schema as schema_diff
|
|
11
|
+
|
|
12
|
+
# Default table format name passed through to tabulate / DataFrame.to_markdown
# ("pipe" renders GitHub-flavored Markdown tables).
DEFAULT_TABLEFMT = "pipe"
# Default float rendering for numeric columns: up to 6 significant digits.
DEFAULT_FLOATFMT = ".6g"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def markdown_list(items: ty.Iterable) -> str:
    """Render *items* as a Markdown bulleted list, one ``- item`` per line."""
    return "\n".join(f"- {item}" for item in items)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def markdown_heading(level: int, text: str) -> str:
    """Return *text* as a Markdown heading of the given *level* (count of '#')."""
    return " ".join(("#" * level, text))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def code_literal(text: str) -> str:
    """Wrap *text* in backticks so Markdown renders it as inline code."""
    return "`{}`".format(text)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _dropped_and_added(
    kind: str, dropped: ty.Iterable, added: ty.Iterable, heading_level: int = 0
) -> ty.Iterator[str]:
    """Yield a "<kind> Dropped:"/"<kind> Added:" heading plus a bulleted list
    for each of the two collections that is non-empty (truthy)."""
    sections = [
        (f"{kind} Dropped:", dropped),
        (f"{kind} Added:", added),
    ]
    for title, items in sections:
        if not items:
            continue
        yield markdown_heading(heading_level + 1, title)
        yield markdown_list(items)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _py_type(dtype: metaschema.ResolvedDType):
    """Return the Python type object associated with a resolved dtype.

    Custom dtypes carry their Python type on an inner ``.type`` attribute;
    all other dtypes expose ``.python`` directly.
    """
    if isinstance(dtype, schema_diff._CUSTOM_DTYPES):
        # custom dtype wrapper: the python type lives on the wrapped inner type
        return dtype.type.python
    return dtype.python
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _constraint_expr(constraint: ColumnConstraint) -> ty.Optional[str]:
    """Human-readable expression describing a column constraint.

    Prefers the constraint's own ``comment_expr()``; for enum constraints
    that have none, falls back to a cardinality summary. Returns None when
    no description is available.
    """
    expr = constraint.comment_expr()
    if expr is not None:
        return expr
    if isinstance(constraint, EnumConstraint):
        prefix = "ordered " if constraint.ordered else ""
        return f"{prefix}enum of cardinality {len(constraint.enum)}"
    return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _prepend_sparse_col(
|
|
52
|
+
value: ty.Any, rows: ty.Iterable[ty.Tuple[ty.Any, ...]]
|
|
53
|
+
) -> ty.List[ty.Tuple[ty.Any, ...]]:
|
|
54
|
+
return [(v, *row) for v, row in zip(itertools.chain([value], itertools.repeat("")), rows)]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def markdown_column_diff_table(column_diff: schema_diff.ColumnDiff) -> ty.List[ty.Tuple[ty.Any, ...]]:
    """Build (change, before, after) rows describing one column's schema diff.

    Covers renames, nullability flips, arrow/python type changes (annotated
    with "(INCOMPATIBLE)" when the diff is not compatible), constraint
    additions/removals, and enum value/orderability/order changes.
    Returns an empty list when nothing changed.
    """
    rows = []
    # suffix appended to type-change labels when the change is breaking
    compat_note = "" if column_diff.compatible else " (INCOMPATIBLE)"
    if column_diff.before.name != column_diff.after.name:
        rows.append(("Name", column_diff.before.name, column_diff.after.name))
    if nullability := column_diff.nullability_diff:
        # ~nullability names the prior state, nullability the new one
        rows.append(("Nullability", (~nullability).name, nullability.name))
    if dtype_diff := column_diff.dtype_diff:
        if (bt := dtype_diff.before.parquet) != (at := dtype_diff.after.parquet):
            rows.append((f"Arrow Type{compat_note}", str(bt), str(at)))
        if (bt_ := _py_type(dtype_diff.before)) != (at_ := _py_type(dtype_diff.after)):
            rows.append((f"Python Type{compat_note}", str(bt_), str(at_)))
        if dtype_diff.constraints_dropped:
            rows.extend(
                _prepend_sparse_col(
                    "Constraint Dropped", ((_constraint_expr(c), "") for c in dtype_diff.constraints_dropped)
                )
            )
        if dtype_diff.constraints_added:
            rows.extend(
                _prepend_sparse_col(
                    "Constraint Added", (("", _constraint_expr(c)) for c in dtype_diff.constraints_added)
                )
            )
        if enum_diff := dtype_diff.enum_diff:
            if enum_diff.values_dropped:
                rows.extend(
                    _prepend_sparse_col("Enum Value Dropped", ((v, "") for v in enum_diff.values_dropped))
                )
            if enum_diff.values_added:
                rows.extend(
                    _prepend_sparse_col("Enum Value Added", (("", v) for v in enum_diff.values_added))
                )
            if enum_diff.ordered_diff:
                rows.append(
                    (
                        "Enum Orderability",
                        (~enum_diff.ordered_diff).name,
                        enum_diff.ordered_diff.name,
                    )
                )
            if enum_diff.order_changed:
                # order changed but membership didn't: flag with empty before/after
                rows.append(("Enum Value Order", "", ""))
    return rows
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def markdown_table_table_diff_table(table_diff: schema_diff.TableDiff) -> ty.List[ty.Tuple[ty.Any, ...]]:
    """Build (change, before, after) rows for table-level schema changes:
    dropped/added columns, primary-key changes, and dropped/added indexes."""
    dropped_cols = _prepend_sparse_col(
        "Column Dropped", ((name, "") for name in table_diff.columns_dropped)
    )
    added_cols = _prepend_sparse_col(
        "Column Added", (("", name) for name in table_diff.columns_added)
    )
    rows = [*dropped_cols, *added_cols]
    old_pk = table_diff.before.primary_key
    new_pk = table_diff.after.primary_key
    if old_pk != new_pk:
        # primary keys may be None; render as an empty string in that case
        rows.append(("Primary Key", ", ".join(old_pk or ()), ", ".join(new_pk or ())))
    rows += _prepend_sparse_col(
        "Index Dropped", ((", ".join(ix), "") for ix in table_diff.indexes_dropped)
    )
    rows += _prepend_sparse_col(
        "Index Added", (("", ", ".join(ix)) for ix in table_diff.indexes_added)
    )
    return rows
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def markdown_table_diff_summary(
    table_diff: schema_diff.TableDiff,
    heading_level: int = 0,
    tablefmt: str = DEFAULT_TABLEFMT,
) -> ty.Iterator[str]:
    """Yield Markdown chunks summarizing one table's schema diff.

    Emits a "Table Modifications:" section for table-level changes and a
    "Columns Modified:" section aggregating per-column diffs; either section
    is omitted when it has no rows.
    """
    if rows := markdown_table_table_diff_table(table_diff):
        yield markdown_heading(heading_level + 1, "Table Modifications:")
        yield render_table(("Change", "Before", "After"), rows, tablefmt=tablefmt)

    rows = []
    for column_name, column_diff in table_diff.column_diffs.items():
        # each column's rows share one sparse "Column Name" label cell
        rows.extend(_prepend_sparse_col(column_name, markdown_column_diff_table(column_diff)))
    if rows:
        yield markdown_heading(heading_level + 1, "Columns Modified:")
        yield render_table(("Column Name", "Change", "Before", "After"), rows, tablefmt=tablefmt)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def markdown_schema_diff_summary(
    schema_diff: schema_diff.SchemaDiff,
    table_predicate: ty.Optional[ty.Callable[[metaschema.Table], bool]] = None,
    heading_level: int = 0,
    tablefmt: str = DEFAULT_TABLEFMT,
) -> ty.Iterator[str]:
    """Yield Markdown chunks summarizing a whole-schema diff.

    First lists tables dropped/added (optionally filtered by
    *table_predicate*), then a "Tables Modified:" section with one subsection
    per modified table, sorted by table name.
    """
    yield from _dropped_and_added(
        "Tables",
        (
            schema_diff.tables_dropped
            if table_predicate is None
            else {n: t for n, t in schema_diff.tables_dropped.items() if table_predicate(t)}
        ),
        (
            schema_diff.tables_added
            if table_predicate is None
            # BUG FIX: this branch previously filtered tables_dropped, so the
            # "Tables Added" list was wrong whenever a predicate was supplied.
            else {n: t for n, t in schema_diff.tables_added.items() if table_predicate(t)}
        ),
        heading_level,
    )
    heading = False
    for table_name, table_diff in sorted(schema_diff.table_diffs.items(), key=lambda x: x[0]):
        # only emit tables that pass the predicate AND actually changed
        if (table_predicate is None or table_predicate(table_diff.after)) and table_diff:
            if not heading:
                # emit the section heading once, lazily, so it only appears
                # when at least one table has modifications
                yield markdown_heading(heading_level + 1, "Tables Modified:")
                heading = True
            yield markdown_heading(heading_level + 2, table_name)
            yield from markdown_table_diff_summary(table_diff, heading_level + 2, tablefmt=tablefmt)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _floatfmt_from_df(df: pd.DataFrame, floatfmt: str) -> ty.List[ty.Optional[str]]:
|
|
166
|
+
return [floatfmt if dt.kind == "f" else None for dt in df.dtypes.values]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def markdown_dataframe_diff_summary(
    dataframe_diff: data_diff.DataFrameDiff,
    table_name: ty.Optional[str] = None,
    verbose: bool = False,
    value_detail: bool = False,
    value_detail_min_count: int = 0,
    heading_level: int = 0,
    tablefmt: str = DEFAULT_TABLEFMT,
    floatfmt: str = DEFAULT_FLOATFMT,
) -> ty.Iterator[str]:
    """Yield Markdown chunks summarizing a data-level (row/value) diff.

    Sections, each emitted only when non-empty: "Key Changes:" (row counts by
    change kind), "Metadata Changes:", "Value Changes:" (full row diff
    patterns when *verbose*, otherwise a per-column summary with all-zero
    columns dropped), and optional per-column change detail. When *table_name*
    is given, a table heading is emitted once before the first section.
    """
    # tracks whether the table_name heading has been emitted yet
    heading = False
    table_changes = dataframe_diff.summary()
    if table_changes:
        if table_name:
            yield markdown_heading(heading_level + 1, code_literal(table_name))
            heading = True
        yield markdown_heading(heading_level + 2, "Key Changes:")
        table = table_changes.table().reset_index()
        # only show change kinds that actually occurred
        yield table[table["count"] > 0].to_markdown(
            index=False, tablefmt=tablefmt, floatfmt=_floatfmt_from_df(table, floatfmt)
        )

    meta_changes = dataframe_diff.meta_diff
    if meta_changes is not None and len(meta_changes):
        if table_name and not heading:
            yield markdown_heading(heading_level + 1, code_literal(table_name))
            heading = True
        yield markdown_heading(heading_level + 2, "Metadata Changes:")
        yield meta_changes.to_markdown(
            index=True, tablefmt=tablefmt, floatfmt=_floatfmt_from_df(meta_changes, floatfmt)
        )

    def _drop_zero_cols(df: ty.Optional[pd.DataFrame]) -> ty.Optional[pd.DataFrame]:
        # keep only columns with at least one truthy value (passes None through)
        if df is None:
            return None
        nonzero_cols = df.any()
        return df[nonzero_cols.index[nonzero_cols]]

    value_changes = (
        dataframe_diff.row_diff_patterns()
        if verbose
        else _drop_zero_cols(dataframe_diff.column_diff_summary())
    )
    if value_changes is not None and len(value_changes):
        if table_name and not heading:
            yield markdown_heading(heading_level + 1, code_literal(table_name))
        yield markdown_heading(heading_level + 2, "Value Changes:")
        value_changes = value_changes.reset_index()
        yield ty.cast(
            str,
            value_changes.to_markdown(
                index=False, tablefmt=tablefmt, floatfmt=_floatfmt_from_df(value_changes, floatfmt)
            ),
        )
    if value_detail:
        # generator of (name, diff) pairs for columns with any changes;
        # evaluated lazily so output can render as details are computed
        pos_col_diffs = (
            (col_name, col_diff)
            for col_name, col_diff in dataframe_diff.column_diffs.items()
            if col_diff
        )
        for col_name, col_diff in pos_col_diffs:
            col_heading = False
            for kind, prop in (
                ("Nulled", data_diff.ColumnDiff.nulled_counts),
                ("Filled", data_diff.ColumnDiff.filled_counts),
                ("Updated", data_diff.ColumnDiff.updated_counts),
            ):
                # invoke the property descriptor explicitly on this instance
                counts = prop.__get__(col_diff)
                # evaluate these lazily to allow for rendering as they're computed
                if value_detail_min_count:
                    counts = counts[counts >= value_detail_min_count]
                if len(counts):
                    if not col_heading:
                        yield markdown_heading(
                            heading_level + 2, f"Column {code_literal(col_name)} Changes Detail:"
                        )
                        col_heading = True
                    yield markdown_heading(heading_level + 3, f"{kind}:")
                    yield counts.to_frame("count").reset_index().to_markdown(
                        index=False, tablefmt=tablefmt
                    )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import typing as ty
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from thds.core import git
|
|
8
|
+
|
|
9
|
+
# Anything acceptable where a filesystem path is expected.
StrOrPath = ty.Union[str, os.PathLike]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def relative_to_root(path: StrOrPath) -> Path:
    """Path relative to the repo root. Can be given either as an absolute path or relative path, in
    which case it's assumed to be relative to the current working directory. Note: paths which have
    already been relativized to the repo root should *not* be passed here - that will only be correct
    if the working directory *is* the repo root.

    Raises ValueError (from Path.relative_to) if the resolved path does not
    live under the repo root reported by thds.core.git.
    """
    return Path(path).resolve().relative_to(git.get_repo_root())
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@contextlib.contextmanager
|
|
21
|
+
def _subcap(cmd: list, **kwargs) -> ty.Iterator:
|
|
22
|
+
try:
|
|
23
|
+
yield
|
|
24
|
+
except subprocess.CalledProcessError as cpe:
|
|
25
|
+
print("stdout:", cpe.stdout)
|
|
26
|
+
print("stderr:", cpe.stderr)
|
|
27
|
+
print("Failed; retrying: " + " ".join(cmd))
|
|
28
|
+
subprocess.run(cmd, check=True)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def blob_contents(path: StrOrPath, ref: str) -> bytes:
    """Read the raw bytes of a specific file at a specific git ref.

    git requires ``show`` paths to be relative to the repo root, so *path* is
    relativized via relative_to_root before being passed to git.
    """
    rel = relative_to_root(path)
    cmd = ["git", "show", f"{ref}:{rel}"]
    with _subcap(cmd):
        completed = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, check=True
        )
    return completed.stdout
|
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Make SQLite Loaders that lazily load their source from pre-built ADLS paths.
|
|
2
|
+
|
|
3
|
+
The download will only occur once, and each thread will get its own
|
|
4
|
+
SQLite connection, as is proper.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing as ty
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from thds.core import source
|
|
11
|
+
from thds.core.lazy import Lazy, ThreadLocalLazy
|
|
12
|
+
|
|
13
|
+
from .sqlite_util import AttrsSQLiteDatabase
|
|
14
|
+
|
|
15
|
+
# Loader type produced by a user-supplied loader factory.
L = ty.TypeVar("L")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _make_lazy_attrs_sqlite_loader(
    mk_loader: ty.Callable[[AttrsSQLiteDatabase], L],
    db_installer: ty.Callable[[], Path],
    mmap_size: int = 2**24,
) -> ThreadLocalLazy[L]:
    """Wrap *mk_loader* so the database is installed once and each thread
    builds its own loader (and hence its own SQLite connection).

    *db_installer* is a zero-arg callable returning the local DB Path; it is
    wrapped in Lazy so it runs at most once across all threads.
    """
    one_time_db_install = Lazy(db_installer)

    def make_loader():
        # the DB installer is made lazy so that multiple threads
        # competing to install it will only install it once.
        return mk_loader(AttrsSQLiteDatabase(None, one_time_db_install(), mmap_size=mmap_size))

    return ThreadLocalLazy(make_loader)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def lazy_attrs_sqlite_loader_maker(
    mk_loader: ty.Callable[[AttrsSQLiteDatabase], L],
    default_mmap_size: int = 2**24,
) -> ty.Callable[[source.Source], ThreadLocalLazy[L]]:
    """Build a factory turning a thds.core Source into a thread-local lazy loader.

    The returned factory accepts an optional mmap_size; any value <= -1 (the
    default sentinel) falls back to *default_mmap_size*.
    """
    def make_loader(source: source.Source, mmap_size: int = -1) -> ThreadLocalLazy[L]:
        # NOTE(review): source.path is passed where a zero-arg callable
        # returning Path is expected — assumes Source.path is a bound method
        # that downloads/installs and returns the local path; confirm against
        # thds.core.source.
        return _make_lazy_attrs_sqlite_loader(
            mk_loader,
            source.path,
            mmap_size if mmap_size > -1 else default_mmap_size,
        )

    return make_loader  # type: ignore
|