thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,249 @@
1
+ import itertools
2
+ import typing as ty
3
+
4
+ import pandas as pd
5
+
6
+ from ..schema import metaschema
7
+ from ..schema.compilation.sphinx import render_table
8
+ from ..schema.constraints import ColumnConstraint, EnumConstraint
9
+ from . import data as data_diff
10
+ from . import schema as schema_diff
11
+
12
+ DEFAULT_TABLEFMT = "pipe"
13
+ DEFAULT_FLOATFMT = ".6g"
14
+
15
+
16
def markdown_list(items: ty.Iterable) -> str:
    """Render *items* as a markdown bullet list, one ``- item`` line per entry."""
    return "\n".join(f"- {item}" for item in items)
18
+
19
+
20
def markdown_heading(level: int, text: str) -> str:
    """Render *text* as a markdown heading of the given *level* (number of ``#``)."""
    hashes = "#" * level
    return f"{hashes} {text}"
22
+
23
+
24
def code_literal(text: str) -> str:
    """Wrap *text* in backticks so it renders as inline markdown code."""
    return "`" + text + "`"
26
+
27
+
28
def _dropped_and_added(
    kind: str, dropped: ty.Iterable, added: ty.Iterable, heading_level: int = 0
) -> ty.Iterator[str]:
    """Yield markdown sections listing dropped then added items of a given *kind*.

    Each non-empty collection produces a heading one level below *heading_level*,
    followed by a bullet list of its items; empty collections yield nothing.
    """
    for title, items in ((f"{kind} Dropped:", dropped), (f"{kind} Added:", added)):
        if not items:
            continue
        # Inline heading/list rendering keeps this helper self-contained.
        yield f"{'#' * (heading_level + 1)} {title}"
        yield "\n".join(f"- {item}" for item in items)
35
+
36
+
37
def _py_type(dtype: metaschema.ResolvedDType):
    """Return the Python type for a resolved dtype, unwrapping custom dtypes.

    Custom dtypes wrap an inner ``.type``; plain dtypes expose ``.python`` directly.
    """
    inner = dtype.type if isinstance(dtype, schema_diff._CUSTOM_DTYPES) else dtype
    return inner.python
41
+
42
+
43
def _constraint_expr(constraint: ColumnConstraint) -> ty.Optional[str]:
    """Return a human-readable expression for *constraint*, or ``None``.

    Falls back to a cardinality/orderedness summary for enum constraints that
    provide no comment expression, rather than listing every enum value.
    """
    expr = constraint.comment_expr()
    if expr is not None:
        return expr
    if isinstance(constraint, EnumConstraint):
        prefix = "ordered " if constraint.ordered else ""
        return f"{prefix}enum of cardinality {len(constraint.enum)}"
    return None
49
+
50
+
51
+ def _prepend_sparse_col(
52
+ value: ty.Any, rows: ty.Iterable[ty.Tuple[ty.Any, ...]]
53
+ ) -> ty.List[ty.Tuple[ty.Any, ...]]:
54
+ return [(v, *row) for v, row in zip(itertools.chain([value], itertools.repeat("")), rows)]
55
+
56
+
57
def markdown_column_diff_table(column_diff: schema_diff.ColumnDiff) -> ty.List[ty.Tuple[ty.Any, ...]]:
    """Build ``(change, before, after)`` rows describing a single column's diff.

    Rows cover, in order: name change, nullability, Arrow/Python type changes
    (annotated when incompatible), constraints dropped/added, and enum value,
    orderability, and ordering changes.
    """
    rows: ty.List[ty.Tuple[ty.Any, ...]] = []
    compat_note = "" if column_diff.compatible else " (INCOMPATIBLE)"

    if column_diff.before.name != column_diff.after.name:
        rows.append(("Name", column_diff.before.name, column_diff.after.name))

    nullability = column_diff.nullability_diff
    if nullability:
        # `~` flips the diff to recover the "before" state's name.
        rows.append(("Nullability", (~nullability).name, nullability.name))

    dtype_diff = column_diff.dtype_diff
    if not dtype_diff:
        return rows

    arrow_before = dtype_diff.before.parquet
    arrow_after = dtype_diff.after.parquet
    if arrow_before != arrow_after:
        rows.append((f"Arrow Type{compat_note}", str(arrow_before), str(arrow_after)))

    py_before = _py_type(dtype_diff.before)
    py_after = _py_type(dtype_diff.after)
    if py_before != py_after:
        rows.append((f"Python Type{compat_note}", str(py_before), str(py_after)))

    if dtype_diff.constraints_dropped:
        dropped = ((_constraint_expr(c), "") for c in dtype_diff.constraints_dropped)
        rows.extend(_prepend_sparse_col("Constraint Dropped", dropped))
    if dtype_diff.constraints_added:
        added = (("", _constraint_expr(c)) for c in dtype_diff.constraints_added)
        rows.extend(_prepend_sparse_col("Constraint Added", added))

    enum_diff = dtype_diff.enum_diff
    if enum_diff:
        if enum_diff.values_dropped:
            rows.extend(
                _prepend_sparse_col("Enum Value Dropped", ((v, "") for v in enum_diff.values_dropped))
            )
        if enum_diff.values_added:
            rows.extend(
                _prepend_sparse_col("Enum Value Added", (("", v) for v in enum_diff.values_added))
            )
        if enum_diff.ordered_diff:
            rows.append(
                ("Enum Orderability", (~enum_diff.ordered_diff).name, enum_diff.ordered_diff.name)
            )
        if enum_diff.order_changed:
            # The value *set* is unchanged, only their order: nothing to show
            # in the before/after cells.
            rows.append(("Enum Value Order", "", ""))
    return rows
101
+
102
+
103
def markdown_table_table_diff_table(table_diff: schema_diff.TableDiff) -> ty.List[ty.Tuple[ty.Any, ...]]:
    """Build ``(change, before, after)`` rows for table-level schema changes.

    Covers dropped/added columns, a primary-key change, and dropped/added
    indexes (multi-column keys/indexes rendered comma-separated).
    """
    rows: ty.List[ty.Tuple[ty.Any, ...]] = []
    rows += _prepend_sparse_col("Column Dropped", ((name, "") for name in table_diff.columns_dropped))
    rows += _prepend_sparse_col("Column Added", (("", name) for name in table_diff.columns_added))
    pk_before = table_diff.before.primary_key
    pk_after = table_diff.after.primary_key
    if pk_before != pk_after:
        # primary_key may be None; `or ()` keeps the join well-defined.
        rows.append(("Primary Key", ", ".join(pk_before or ()), ", ".join(pk_after or ())))
    rows += _prepend_sparse_col(
        "Index Dropped", ((", ".join(ix), "") for ix in table_diff.indexes_dropped)
    )
    rows += _prepend_sparse_col("Index Added", (("", ", ".join(ix)) for ix in table_diff.indexes_added))
    return rows
116
+
117
+
118
def markdown_table_diff_summary(
    table_diff: schema_diff.TableDiff,
    heading_level: int = 0,
    tablefmt: str = DEFAULT_TABLEFMT,
) -> ty.Iterator[str]:
    """Yield markdown fragments summarizing one table's schema diff.

    Emits a table-level "Table Modifications" section and then a per-column
    "Columns Modified" section; either is omitted when it has no rows.
    """
    table_rows = markdown_table_table_diff_table(table_diff)
    if table_rows:
        yield markdown_heading(heading_level + 1, "Table Modifications:")
        yield render_table(("Change", "Before", "After"), table_rows, tablefmt=tablefmt)

    column_rows: ty.List[ty.Tuple[ty.Any, ...]] = []
    for column_name, column_diff in table_diff.column_diffs.items():
        # Label each group of per-column rows with the column name only once.
        column_rows.extend(_prepend_sparse_col(column_name, markdown_column_diff_table(column_diff)))
    if column_rows:
        yield markdown_heading(heading_level + 1, "Columns Modified:")
        yield render_table(("Column Name", "Change", "Before", "After"), column_rows, tablefmt=tablefmt)
133
+
134
+
135
def markdown_schema_diff_summary(
    schema_diff: schema_diff.SchemaDiff,
    table_predicate: ty.Optional[ty.Callable[[metaschema.Table], bool]] = None,
    heading_level: int = 0,
    tablefmt: str = DEFAULT_TABLEFMT,
) -> ty.Iterator[str]:
    """Yield markdown fragments summarizing a whole schema diff.

    Emits lists of dropped and added tables, then a "Tables Modified" section
    with one sub-heading per changed table (sorted by name). When
    *table_predicate* is given, only tables it accepts are reported.

    NOTE: the ``schema_diff`` parameter shadows the module import of the same
    name inside this function; the module itself is not needed here.
    """

    def filter_tables(
        tables: ty.Mapping[str, metaschema.Table]
    ) -> ty.Mapping[str, metaschema.Table]:
        # No predicate means report everything unchanged.
        if table_predicate is None:
            return tables
        return {name: table for name, table in tables.items() if table_predicate(table)}

    yield from _dropped_and_added(
        "Tables",
        filter_tables(schema_diff.tables_dropped),
        # BUG FIX: this previously filtered `tables_dropped` again, so the
        # "Tables Added" list reported dropped tables whenever a predicate
        # was supplied.
        filter_tables(schema_diff.tables_added),
        heading_level,
    )

    heading_emitted = False
    for table_name, table_diff in sorted(schema_diff.table_diffs.items(), key=lambda item: item[0]):
        if (table_predicate is None or table_predicate(table_diff.after)) and table_diff:
            if not heading_emitted:
                # Emit the section heading lazily, only once any table survives
                # the predicate and actually has changes.
                yield markdown_heading(heading_level + 1, "Tables Modified:")
                heading_emitted = True
            yield markdown_heading(heading_level + 2, table_name)
            yield from markdown_table_diff_summary(table_diff, heading_level + 2, tablefmt=tablefmt)
163
+
164
+
165
+ def _floatfmt_from_df(df: pd.DataFrame, floatfmt: str) -> ty.List[ty.Optional[str]]:
166
+ return [floatfmt if dt.kind == "f" else None for dt in df.dtypes.values]
167
+
168
+
169
def markdown_dataframe_diff_summary(
    dataframe_diff: data_diff.DataFrameDiff,
    table_name: ty.Optional[str] = None,
    verbose: bool = False,
    value_detail: bool = False,
    value_detail_min_count: int = 0,
    heading_level: int = 0,
    tablefmt: str = DEFAULT_TABLEFMT,
    floatfmt: str = DEFAULT_FLOATFMT,
) -> ty.Iterator[str]:
    """Yield markdown fragments summarizing a data-level diff of one table.

    Sections, each omitted when empty:
      - "Key Changes": summary counts with zero-count rows filtered out;
      - "Metadata Changes": the diff's ``meta_diff`` frame, if any;
      - "Value Changes": per-row diff patterns when *verbose*, otherwise a
        per-column summary with all-zero columns dropped;
      - per-column "Changes Detail" (only when *value_detail*): Nulled/Filled/
        Updated value counts, filtered to counts >= *value_detail_min_count*.

    If *table_name* is given, a heading for it is emitted once, before the
    first non-empty section.
    """
    # Tracks whether the optional table_name heading has been emitted yet.
    heading = False
    table_changes = dataframe_diff.summary()
    if table_changes:
        if table_name:
            yield markdown_heading(heading_level + 1, code_literal(table_name))
            heading = True
        yield markdown_heading(heading_level + 2, "Key Changes:")
        table = table_changes.table().reset_index()
        # Only show change kinds that actually occurred.
        yield table[table["count"] > 0].to_markdown(
            index=False, tablefmt=tablefmt, floatfmt=_floatfmt_from_df(table, floatfmt)
        )

    meta_changes = dataframe_diff.meta_diff
    if meta_changes is not None and len(meta_changes):
        if table_name and not heading:
            yield markdown_heading(heading_level + 1, code_literal(table_name))
            heading = True
        yield markdown_heading(heading_level + 2, "Metadata Changes:")
        yield meta_changes.to_markdown(
            index=True, tablefmt=tablefmt, floatfmt=_floatfmt_from_df(meta_changes, floatfmt)
        )

    def _drop_zero_cols(df: ty.Optional[pd.DataFrame]) -> ty.Optional[pd.DataFrame]:
        # Keep only columns with at least one truthy (nonzero) entry.
        if df is None:
            return None
        nonzero_cols = df.any()
        return df[nonzero_cols.index[nonzero_cols]]

    value_changes = (
        dataframe_diff.row_diff_patterns()
        if verbose
        else _drop_zero_cols(dataframe_diff.column_diff_summary())
    )
    if value_changes is not None and len(value_changes):
        if table_name and not heading:
            yield markdown_heading(heading_level + 1, code_literal(table_name))
        # NOTE(review): unlike the sections above, `heading` is not set to True
        # here; harmless since it is never read again, but inconsistent.
        yield markdown_heading(heading_level + 2, "Value Changes:")
        value_changes = value_changes.reset_index()
        yield ty.cast(
            str,
            value_changes.to_markdown(
                index=False, tablefmt=tablefmt, floatfmt=_floatfmt_from_df(value_changes, floatfmt)
            ),
        )
    if value_detail:
        # Only columns whose diff is truthy (i.e. actually changed).
        pos_col_diffs = (
            (col_name, col_diff)
            for col_name, col_diff in dataframe_diff.column_diffs.items()
            if col_diff
        )
        for col_name, col_diff in pos_col_diffs:
            col_heading = False
            # Access the count properties via the descriptor protocol so each
            # is looked up per-column.
            for kind, prop in (
                ("Nulled", data_diff.ColumnDiff.nulled_counts),
                ("Filled", data_diff.ColumnDiff.filled_counts),
                ("Updated", data_diff.ColumnDiff.updated_counts),
            ):
                counts = prop.__get__(col_diff)
                # evaluate these lazily to allow for rendering as they're computed
                if value_detail_min_count:
                    counts = counts[counts >= value_detail_min_count]
                if len(counts):
                    if not col_heading:
                        # One heading per column, emitted only if any kind has counts.
                        yield markdown_heading(
                            heading_level + 2, f"Column {code_literal(col_name)} Changes Detail:"
                        )
                        col_heading = True
                    yield markdown_heading(heading_level + 3, f"{kind}:")
                    yield counts.to_frame("count").reset_index().to_markdown(
                        index=False, tablefmt=tablefmt
                    )
@@ -0,0 +1,37 @@
1
+ import contextlib
2
+ import os
3
+ import subprocess
4
+ import typing as ty
5
+ from pathlib import Path
6
+
7
+ from thds.core import git
8
+
9
+ StrOrPath = ty.Union[str, os.PathLike]
10
+
11
+
12
def relative_to_root(path: StrOrPath) -> Path:
    """Return *path* relative to the git repo root.

    *path* may be absolute, or relative to the current working directory.
    Note: paths that have already been relativized to the repo root should
    *not* be passed here - that is only correct when the working directory
    *is* the repo root.
    """
    absolute = Path(path).resolve()
    return absolute.relative_to(git.get_repo_root())
18
+
19
+
20
+ @contextlib.contextmanager
21
+ def _subcap(cmd: list, **kwargs) -> ty.Iterator:
22
+ try:
23
+ yield
24
+ except subprocess.CalledProcessError as cpe:
25
+ print("stdout:", cpe.stdout)
26
+ print("stderr:", cpe.stderr)
27
+ print("Failed; retrying: " + " ".join(cmd))
28
+ subprocess.run(cmd, check=True)
29
+
30
+
31
def blob_contents(path: StrOrPath, ref: str) -> bytes:
    """Read the raw contents of a file at a specific git *ref*.

    git *requires* the path given to ``git show`` to be relative to the repo
    root, so *path* (absolute or CWD-relative) is relativized first.
    """
    repo_path = relative_to_root(path)
    cmd = ["git", "show", f"{ref}:{repo_path}"]
    with _subcap(cmd):
        completed = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, check=True
        )
        return completed.stdout
File without changes
@@ -0,0 +1,44 @@
1
+ """Make SQLite Loaders that lazily load their source from pre-built ADLS paths.
2
+
3
+ The download will only occur once, and each thread will get its own
4
+ SQLite connection, as is proper.
5
+ """
6
+
7
+ import typing as ty
8
+ from pathlib import Path
9
+
10
+ from thds.core import source
11
+ from thds.core.lazy import Lazy, ThreadLocalLazy
12
+
13
+ from .sqlite_util import AttrsSQLiteDatabase
14
+
15
+ L = ty.TypeVar("L")
16
+
17
+
18
def _make_lazy_attrs_sqlite_loader(
    mk_loader: ty.Callable[[AttrsSQLiteDatabase], L],
    db_installer: ty.Callable[[], Path],
    mmap_size: int = 2**24,
) -> ThreadLocalLazy[L]:
    """Build a thread-local lazy loader over a once-installed SQLite database.

    Wrapping *db_installer* in a single ``Lazy`` means threads competing to
    install the database will only install it once; each thread then lazily
    constructs its own loader (and hence its own SQLite connection).
    """
    install_once = Lazy(db_installer)

    def _build_loader() -> L:
        database = AttrsSQLiteDatabase(None, install_once(), mmap_size=mmap_size)
        return mk_loader(database)

    return ThreadLocalLazy(_build_loader)
31
+
32
+
33
def lazy_attrs_sqlite_loader_maker(
    mk_loader: ty.Callable[[AttrsSQLiteDatabase], L],
    default_mmap_size: int = 2**24,
) -> ty.Callable[[source.Source], ThreadLocalLazy[L]]:
    """Create a factory turning a ``Source`` into a thread-local lazy loader.

    The returned callable also accepts an optional ``mmap_size``; any value
    below zero (the default, -1) falls back to *default_mmap_size*.
    """

    def make_loader(source: source.Source, mmap_size: int = -1) -> ThreadLocalLazy[L]:
        effective_mmap_size = default_mmap_size if mmap_size <= -1 else mmap_size
        # source.path is handed over as the (deferred) database installer.
        return _make_lazy_attrs_sqlite_loader(mk_loader, source.path, effective_mmap_size)

    return make_loader  # type: ignore