thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,346 @@
1
+ import dataclasses
2
+ import typing as ty
3
+ from functools import cached_property
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import pyarrow.parquet as pq
8
+
9
+ from ..data_dependencies.adls import sync_adls_data
10
+ from ..loaders.util import PandasParquetLoader
11
+ from ..schema.files import RemoteBlobStoreSpec
12
+ from ..schema.metaschema import Table
13
+
14
+
15
def load_historical_data(table: Table, blob_store: RemoteBlobStoreSpec):
    """Fetch the stored parquet file for ``table`` from ``blob_store`` and load it.

    The remote data spec is derived from ``table.md5``, synced with ``sync_adls_data``, and the
    resulting local file is read with a ``PandasParquetLoader`` built from ``table``.

    :param table: schema table whose data should be loaded; ``table.md5`` must be truthy
    :param blob_store: blob store spec used to build the remote data spec for ``table.md5``
    :return: a 2-tuple of (whatever the loader returns for the local file, parquet file metadata)
    :raises AssertionError: if ``table.md5`` is falsy, or the sync does not produce exactly one result
    """
    assert table.md5
    # The loader is built before anything is downloaded, so loader construction errors surface first.
    read_parquet = PandasParquetLoader.from_schema_table(
        table,
        package=None,
        data_dir="",
        filename=None,
        derive_schema=False,
    )
    synced = sync_adls_data(blob_store.data_spec(table.md5))
    assert len(synced) == 1
    parquet_path = synced[0].local_path
    file_meta = pq.read_metadata(parquet_path)
    return read_parquet(parquet_path), file_meta
26
+
27
+
28
+ T_Tabular = ty.TypeVar("T_Tabular", pd.Series, pd.DataFrame)
29
+
30
+
31
+ def _uncategorify_index(data: T_Tabular) -> T_Tabular:
32
+ index = data.index
33
+ if isinstance(index.dtype, pd.CategoricalDtype):
34
+ return data.set_axis(index.astype(index.dtype.categories.dtype), axis=0, copy=False)
35
+ return data
36
+
37
+
38
def _uncategorify_series(series: pd.Series) -> pd.Series:
    """Strip categorical dtypes from both the index and the values of ``series``.

    Categorical values are cast to the dtype of their categories; anything else is left as is.
    """
    series = _uncategorify_index(series)
    values_dtype = series.dtype
    if not isinstance(values_dtype, pd.CategoricalDtype):
        return series
    return series.astype(values_dtype.categories.dtype)
43
+
44
+
45
def _uncategorify_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Strip categorical dtypes from the index and from every categorical column of ``df``.

    Each categorical column is cast to the dtype of its categories. When no column is
    categorical, the (index-converted) frame is returned without a further cast.
    """
    df = _uncategorify_index(df)
    update_dtypes = {}
    for column, dtype in df.dtypes.items():
        if isinstance(dtype, pd.CategoricalDtype):
            update_dtypes[column] = dtype.categories.dtype
    if update_dtypes:
        return df.astype(update_dtypes, copy=False)
    return df
53
+
54
+
55
+ def _percent(numerator: int, denominator: int) -> float:
56
+ return numerator * 100 / denominator if denominator else 0.0
57
+
58
+
59
class ColumnDiffSummary(ty.NamedTuple):
    """Per-column counts of changed cells; falsy when every count is zero."""

    nulled: int
    filled: int
    updated: int

    def __bool__(self):
        return bool(self.nulled or self.filled or self.updated)
66
+
67
+
68
class DataFrameDiffSummary(ty.NamedTuple):
    """Row and column counts describing how a data frame's shape changed.

    Falsy when no rows or columns were dropped or added; the ``*_before``/``*_after`` totals
    do not influence truthiness.
    """

    rows_before: int
    rows_after: int
    columns_before: int
    columns_after: int
    dropped_rows: int
    added_rows: int
    dropped_columns: int
    added_columns: int

    def __bool__(self):
        change_counts = (self.dropped_rows, self.dropped_columns, self.added_rows, self.added_columns)
        return any(change_counts)

    def table(self):
        """Return a frame with one row per kind of change and ``count``/``percent`` columns.

        Dropped items are expressed as a percentage of the "before" total, added items as a
        percentage of the "after" total.
        """
        stats = (
            ("dropped_rows", self.dropped_rows, self.rows_before),
            ("added_rows", self.added_rows, self.rows_after),
            ("dropped_columns", self.dropped_columns, self.columns_before),
            ("added_columns", self.added_columns, self.columns_after),
        )
        records = [(label, count, _percent(count, total)) for label, count, total in stats]
        frame = pd.DataFrame.from_records(records, columns=["", "count", "percent"])
        return frame.set_index("")
93
+
94
+
95
@dataclasses.dataclass
class ColumnDiff:
    """Cell-level comparison of a single column before and after a change.

    The two series are compared elementwise, so they are expected to be aligned
    (NOTE(review): confirm every caller passes identically-labeled series). Each cell falls
    into at most one of three categories:

    * ``nulled``: had a value before, is null after
    * ``filled``: was null before, has a value after
    * ``updated``: non-null on both sides, but the values differ

    The instance is truthy when at least one cell falls into any of these categories.
    """

    before: pd.Series
    after: pd.Series

    def __post_init__(self):
        # to facilitate hassle-free comparison
        self.before = _uncategorify_series(self.before)
        self.after = _uncategorify_series(self.after)

    @cached_property
    def was_null(self) -> pd.Series:
        """Boolean mask of cells that are null in ``before``."""
        return self.before.isna()

    @cached_property
    def is_null(self) -> pd.Series:
        """Boolean mask of cells that are null in ``after``."""
        return self.after.isna()

    @cached_property
    def nulled(self) -> pd.Series:
        """Boolean mask of cells that went from a value to null."""
        return ~self.was_null & self.is_null

    @cached_property
    def filled(self) -> pd.Series:
        """Boolean mask of cells that went from null to a value."""
        return self.was_null & ~self.is_null

    @cached_property
    def updated(self) -> pd.Series:
        """Boolean mask of cells that are non-null on both sides and whose values differ."""
        # `fillna(False)` treats a missing comparison result as "not different"; the null masks
        # then exclude every cell that is null on either side.
        differs = (self.before != self.after).fillna(False)
        return differs & ~self.is_null & ~self.was_null

    @cached_property
    def n_nulled(self) -> int:
        """Number of nulled cells, as a plain ``int``."""
        # `Series.sum()` yields a numpy integer; coerce so the value matches the annotation.
        return int(self.nulled.sum())

    @cached_property
    def n_filled(self) -> int:
        """Number of filled cells, as a plain ``int``."""
        return int(self.filled.sum())

    @cached_property
    def n_updated(self) -> int:
        """Number of updated cells, as a plain ``int``."""
        return int(self.updated.sum())

    def __bool__(self):
        return bool(self.nulled.any() or self.filled.any() or self.updated.any())

    @cached_property
    def updated_counts(self) -> pd.Series:
        """Counts of each distinct (before, after) value pair among updated cells."""
        updated = self.updated
        return (
            pd.DataFrame(dict(before=self.before[updated], after=self.after[updated]))
            .value_counts()
            .rename("count", copy=False)
        )

    @cached_property
    def nulled_counts(self) -> pd.Series:
        """Counts of each distinct ``before`` value among nulled cells."""
        return (
            self.before[self.nulled]
            .value_counts()
            .rename("count", copy=False)
            .rename_axis(index="before")
        )

    @cached_property
    def filled_counts(self) -> pd.Series:
        """Counts of each distinct ``after`` value among filled cells."""
        return (
            self.after[self.filled].value_counts().rename("count", copy=False).rename_axis(index="after")
        )

    def summary(self):
        """Bundle the three change counts into a ``ColumnDiffSummary``."""
        return ColumnDiffSummary(nulled=self.n_nulled, filled=self.n_filled, updated=self.n_updated)
166
+
167
+
168
@dataclasses.dataclass
class DataFrameDiff:
    """Row-, column- and cell-level comparison of two versions of a data frame.

    Rows are matched on index labels and columns on column names. Optional parquet file
    metadata for both versions can be supplied and is compared in ``meta_diff``. The instance
    is truthy when any metadata, row, column or cell difference is found.
    """

    before: pd.DataFrame
    after: pd.DataFrame
    before_meta: ty.Optional[pq.FileMetaData] = None
    after_meta: ty.Optional[pq.FileMetaData] = None

    def __post_init__(self):
        # memo for `column_diff`; assigned here rather than declared as a dataclass field
        self._column_diffs: ty.Dict[str, ColumnDiff] = dict()

    @cached_property
    def dropped_columns(self) -> ty.List[str]:
        """Columns present in ``before`` but not in ``after``."""
        return self.before.columns.difference(self.after.columns).tolist()

    @cached_property
    def added_columns(self) -> ty.List[str]:
        """Columns present in ``after`` but not in ``before``."""
        return self.after.columns.difference(self.before.columns).tolist()

    @cached_property
    def common_columns(self) -> ty.List[str]:
        """Columns present in both frames."""
        return self.after.columns.intersection(self.before.columns).tolist()

    @cached_property
    def dropped_keys(self) -> pd.Index:
        """Index labels present in ``before`` but not in ``after``."""
        return self.before.index.difference(self.after.index)

    @cached_property
    def added_keys(self) -> pd.Index:
        """Index labels present in ``after`` but not in ``before``."""
        return self.after.index.difference(self.before.index)

    @cached_property
    def common_keys(self) -> list:
        """Index labels present in both frames, in arbitrary (set) order.

        Don't use `.index.intersection` here because it does not work with different types of nulls.

        For a single Index, the returned keys are based on the following True statements:
        * None is None
        * pandas.NA is pandas.NA
        * float("nan") is not float("nan")

        For a MultiIndex, `float("nan")` behaves differently; please check the tests for all null
        equality checks. E.g.,
        ```
        In [47]: pd.MultiIndex.from_tuples([float("nan")]) == pd.MultiIndex.from_tuples([float("nan")])
        Out[47]: array([ True])

        In [48]: pd.Index([float("nan")]) == pd.Index([float("nan")])
        Out[48]: array([False])
        ```
        """
        return list(set(self.after.index).intersection(self.before.index))

    @cached_property
    def dropped_rows(self) -> pd.DataFrame:
        """Rows of ``before`` whose keys are absent from ``after``."""
        return self.before.loc[self.dropped_keys]

    @cached_property
    def added_rows(self) -> pd.DataFrame:
        """Rows of ``after`` whose keys are absent from ``before``."""
        return self.after.loc[self.added_keys]

    @cached_property
    def common_rows_before(self) -> pd.DataFrame:
        """Rows of ``before`` for the common keys, in ``common_keys`` order."""
        return self.before.loc[self.common_keys]

    @cached_property
    def common_rows_after(self) -> pd.DataFrame:
        """Rows of ``after`` for the common keys, in ``common_keys`` order."""
        return self.after.loc[self.common_keys]

    def column_diff(self, column: str) -> ColumnDiff:
        """Return the (memoized) cell-level diff of ``column`` over the common rows."""
        cached = self._column_diffs.get(column)
        if cached is None:
            cached = self._column_diffs[column] = ColumnDiff(
                self.common_rows_before[column], self.common_rows_after[column]
            )
        return cached

    @property
    def column_diffs(self) -> ty.Dict[str, ColumnDiff]:
        """Cell-level diffs for every common column, keyed by column name."""
        return {c: self.column_diff(c) for c in self.common_columns}

    def column_diff_summary(self) -> ty.Optional[pd.DataFrame]:
        """Summarize changed columns as counts and percentages of the common rows.

        :return: a frame indexed by column name with ``nulled``/``filled``/``updated`` counts and
          matching ``*_percent`` columns, or ``None`` when no common column has any change
        """
        summaries = {name: diff.summary() for name, diff in self.column_diffs.items() if diff}
        if not summaries:
            return None
        df = pd.DataFrame.from_dict(summaries, orient="index", columns=ColumnDiffSummary._fields)
        df.index.name = "column"
        # Vectorized percentage computation; avoids the deprecated `DataFrame.applymap`.
        n_common = len(self.common_keys)
        counts = df.rename(columns="{}_percent".format)  # type: ignore[operator]
        # same zero-denominator convention as `_percent`
        percent_df = counts * 100 / n_common if n_common else counts * 0.0
        return pd.concat([df, percent_df], axis=1)

    def row_diff_patterns(self, detailed: bool = True) -> ty.Optional[pd.DataFrame]:
        """Count how often each combination of per-column changes occurs across common rows.

        :param detailed: when true, each changed column is labeled ``"updated"``, ``"nulled"``,
          ``"filled"`` or ``""`` per row; otherwise only a boolean changed/unchanged flag is used
        :return: a frame with ``count`` and ``percent`` columns indexed by change pattern, or
          ``None`` when no common column has any change
        """
        before = _uncategorify_dataframe(self.common_rows_before[self.common_columns])
        after = _uncategorify_dataframe(self.common_rows_after[self.common_columns])
        was_null = before.isna()
        is_null = after.isna()
        filled = was_null & ~is_null
        nulled = ~was_null & is_null
        updated = (before != after).fillna(False) & ~is_null & ~was_null  # type: ignore[attr-defined]
        changed_cols_ = updated.any(axis=0) | nulled.any(axis=0) | filled.any(axis=0)
        changed_cols = changed_cols_.index[changed_cols_].tolist()
        if not changed_cols:
            return None
        if detailed:
            changes = pd.DataFrame(
                np.where(
                    updated[changed_cols].values,
                    "updated",
                    np.where(
                        nulled[changed_cols].values,
                        "nulled",
                        np.where(filled[changed_cols].values, "filled", ""),
                    ),
                ),
                index=updated.index,
                columns=changed_cols,
            ).astype("category")
        else:
            changes = updated[changed_cols] | nulled[changed_cols] | filled[changed_cols]
        changes_df = changes.value_counts(dropna=False).to_frame("count")
        n_common = len(self.common_keys)  # invariant across rows, so computed once
        changes_df["percent"] = changes_df["count"].apply(lambda c: _percent(c, n_common))
        return changes_df

    def summary(self) -> DataFrameDiffSummary:
        """Return row/column totals and dropped/added counts as a ``DataFrameDiffSummary``."""
        return DataFrameDiffSummary(
            rows_before=len(self.before),
            rows_after=len(self.after),
            columns_before=len(self.before.columns),
            columns_after=len(self.after.columns),
            dropped_rows=len(self.dropped_keys),
            added_rows=len(self.added_keys),
            dropped_columns=len(self.dropped_columns),
            added_columns=len(self.added_columns),
        )

    @cached_property
    def meta_diff(self):
        """Frame of metadata entries that differ, with ``before`` and ``after`` columns.

        Empty when either metadata object is missing. The ``"row_groups"`` entry is never
        compared, and only keys of the "before" metadata are considered.
        """
        if self.before_meta is None or self.after_meta is None:
            return pd.DataFrame(columns=["before", "after"], dtype=object)

        before = self.before_meta.to_dict()
        after = self.after_meta.to_dict()
        return pd.DataFrame.from_dict(
            {
                name: [before[name], after[name]]
                for name in before
                if (name != "row_groups") and (before[name] != after[name])
            },
            orient="index",
            columns=["before", "after"],
            dtype=object,
        )

    def __bool__(self) -> bool:
        # cheap structural checks first; the per-column cell comparison only runs if they all pass
        return bool(
            len(self.meta_diff)
            or len(self.dropped_keys)
            or len(self.added_keys)
            or len(self.dropped_columns)
            or len(self.added_columns)
            or any(map(bool, map(self.column_diff, self.common_columns)))
        )

    @staticmethod
    def from_tables(
        before: Table,
        after: Table,
        before_blob_store: RemoteBlobStoreSpec,
        after_blob_store: RemoteBlobStoreSpec,
    ) -> "DataFrameDiff":
        """Load both table versions with ``load_historical_data`` and wrap them in a diff."""
        before_df, before_meta = load_historical_data(before, before_blob_store)
        after_df, after_meta = load_historical_data(after, after_blob_store)
        return DataFrameDiff(
            before=before_df,
            after=after_df,
            before_meta=before_meta,
            after_meta=after_meta,
        )
@@ -0,0 +1,254 @@
1
+ """Diffs for schema objects"""
2
+
3
+ import dataclasses
4
+ import enum
5
+ import typing as ty
6
+ from functools import cached_property, singledispatch
7
+
8
+ from ..loaders import parquet_util
9
+ from ..schema import metaschema
10
+ from ..schema.constraints import AnyColumnConstraint
11
+ from ..schema.metaschema import Column, Identifier, Schema, Table
12
+
13
# The two custom-type classes exposed by `metaschema`, grouped as a tuple.
# NOTE(review): not referenced anywhere in the visible part of this module.
_CUSTOM_DTYPES = (
    metaschema.AnonCustomType,
    metaschema.CustomType,
)
14
+
15
+
16
class NullabilityDiff(enum.IntEnum):
    """Direction of a nullability change.

    Works as expected with `bool`: bool(NullabilityDiff.NO_CHANGE) == False
    """

    NULL = -1
    NO_CHANGE = 0
    NOT_NULL = 1

    def __invert__(self):
        # negating the value swaps NULL <-> NOT_NULL and leaves NO_CHANGE alone
        return NullabilityDiff(-self.value)

    @staticmethod
    def from_nullability(nullable_before: bool, nullable_after: bool):
        """Classify the change from ``nullable_before`` to ``nullable_after``.

        ``NOT_NULL`` means the column stopped being nullable; ``NULL`` means it became nullable.
        """
        if nullable_before == nullable_after:
            return NullabilityDiff.NO_CHANGE
        if nullable_before:
            return NullabilityDiff.NOT_NULL
        return NullabilityDiff.NULL
33
+
34
+
35
class OrderedDiff(enum.IntEnum):
    """Direction of a change in whether an enum is ordered.

    Works as expected with `bool`: bool(OrderedDiff.NO_CHANGE) == False
    """

    UNORDERED = -1
    NO_CHANGE = 0
    ORDERED = 1

    def __invert__(self):
        # negating the value swaps UNORDERED <-> ORDERED and leaves NO_CHANGE alone
        return OrderedDiff(-self.value)

    @staticmethod
    def from_ordered(ordered_before: bool, ordered_after: bool):
        """Classify the change from ``ordered_before`` to ``ordered_after``.

        ``UNORDERED`` means the enum stopped being ordered; ``ORDERED`` means it became ordered.
        """
        if ordered_before == ordered_after:
            return OrderedDiff.NO_CHANGE
        if ordered_before:
            return OrderedDiff.UNORDERED
        return OrderedDiff.ORDERED
52
+
53
+
54
@dataclasses.dataclass
class EnumDiff:
    """Comparison of two enum constraints.

    Truthy when the ordered flag changed, the relative order of shared values changed, or any
    value was dropped or added.
    """

    before: metaschema.EnumConstraint
    after: metaschema.EnumConstraint

    @cached_property
    def ordered_diff(self) -> OrderedDiff:
        """How the ``ordered`` flag changed between the two constraints."""
        return OrderedDiff.from_ordered(self.before.ordered, self.after.ordered)

    @cached_property
    def order_changed(self) -> bool:
        """Whether the values shared by both enums appear in a different relative order.

        Only meaningful when both enums are ordered; otherwise always ``False``.
        """
        if not (self.before.ordered and self.after.ordered):
            return False
        old_values = self.before.enum
        new_values = self.after.enum
        shared_in_old_order = [v for v in old_values if v in new_values]
        shared_in_new_order = [v for v in new_values if v in old_values]
        return shared_in_old_order != shared_in_new_order

    @cached_property
    def values_dropped(self) -> metaschema.EnumList:
        """Values of the old enum that are missing from the new one, in old-enum order."""
        # Note that this uses python comparison semantics; changing dtype from int to float e.g.
        # with enum values [1, 2] -> [1.0, 2.0] will not be considered a change. This change would be
        # picked up as a compatibility change in DtypeDiff.
        new_values = self.after.enum
        removed = [v for v in self.before.enum if v not in new_values]
        return ty.cast(metaschema.EnumList, removed)

    @cached_property
    def values_added(self) -> metaschema.EnumList:
        """Values of the new enum that are missing from the old one, in new-enum order."""
        old_values = self.before.enum
        introduced = [v for v in self.after.enum if v not in old_values]
        return ty.cast(metaschema.EnumList, introduced)

    def __bool__(self):
        if self.ordered_diff:
            return True
        if self.order_changed:
            return True
        return bool(self.values_dropped or self.values_added)
88
+
89
+
90
@singledispatch
def _constraints(dtype: metaschema.ResolvedDType) -> ty.List[AnyColumnConstraint]:
    """Return the constraints attached to ``dtype``.

    This is the fallback of a single-dispatch function: any dtype without a registered
    implementation is treated as having no constraints.
    """
    no_constraints: ty.List[AnyColumnConstraint] = []
    return no_constraints
93
+
94
+
95
@_constraints.register(metaschema.AnonCustomType)
@_constraints.register(metaschema.CustomType)
def _constraints_custom(
    dtype: ty.Union[metaschema.AnonCustomType, metaschema.CustomType],
) -> ty.List[AnyColumnConstraint]:
    """`_constraints` implementation for custom types: the type's own ``constraints`` list.

    The list object is returned as-is, not copied.
    """
    declared = dtype.constraints
    return declared
101
+
102
+
103
@dataclasses.dataclass
class DtypeDiff:
    """Comparison of a column's resolved dtype before and after a schema change.

    Truthy when the parquet types differ or any constraint was dropped or added.
    """

    before: metaschema.ResolvedDType
    after: metaschema.ResolvedDType

    def _type_compatible(self, level: parquet_util.TypeCheckLevel) -> bool:
        # The check is asymmetric: the `after` type plays the role of the `actual` type (it is
        # what you get when you load the data) and the `before` type is the `expected` one. In
        # other words: should pre-existing code written against the `before` type be expected to
        # keep working after the change?
        actual_type = self.after.parquet
        expected_type = self.before.parquet
        return parquet_util.pyarrow_type_compatible(actual_type, expected=expected_type, level=level)

    @cached_property
    def compatible(self) -> bool:
        """Whether code expecting the old dtype should keep working with the new one."""
        types_ok = self._type_compatible(parquet_util.TypeCheckLevel.compatible)
        # new values are a potential compatibility change for any code that is only expecting the old values
        return types_ok and (self.enum_diff is None or not self.enum_diff.values_added)

    @cached_property
    def same_kind(self) -> bool:
        """Whether the new type passes the ``same_kind`` level of the type check."""
        return self._type_compatible(parquet_util.TypeCheckLevel.same_kind)

    @cached_property
    def constraints_dropped(self) -> ty.List[AnyColumnConstraint]:
        """Constraints of the old dtype that the new dtype no longer has."""
        old_constraints = _constraints(self.before)
        new_constraints = _constraints(self.after)
        return [constraint for constraint in old_constraints if constraint not in new_constraints]

    @cached_property
    def constraints_added(self) -> ty.List[AnyColumnConstraint]:
        """Constraints of the new dtype that the old dtype did not have."""
        old_constraints = _constraints(self.before)
        new_constraints = _constraints(self.after)
        return [constraint for constraint in new_constraints if constraint not in old_constraints]

    @cached_property
    def enum_diff(self) -> ty.Optional[EnumDiff]:
        """Diff of the enum definitions, or ``None`` unless both dtypes have an enum."""
        old_enum = self.before.enum
        new_enum = self.after.enum
        if old_enum is None or new_enum is None:
            return None
        return EnumDiff(old_enum, new_enum)

    def __bool__(self):
        # we don't consider type changes that don't change the kind of the type to be a meaningful change;
        # usually this is just a storage optimization, e.g. going from int64 to int32
        # NOTE(review): the check below compares the parquet types for plain inequality and does
        # not consult `same_kind`, so it looks like any parquet type difference counts as a
        # change here - confirm whether the comment above or the code reflects the intent.
        if self.before.parquet != self.after.parquet:
            return True
        return bool(self.constraints_dropped or self.constraints_added)
157
+
158
+
159
@dataclasses.dataclass
class ColumnDiff:
    """Comparison of one schema column before and after a change.

    Truthy when either the nullability or the dtype of the column changed.
    """

    before: Column
    after: Column

    @cached_property
    def nullability_diff(self) -> NullabilityDiff:
        """How the column's ``nullable`` flag changed."""
        return NullabilityDiff.from_nullability(self.before.nullable, self.after.nullable)

    @cached_property
    def dtype_diff(self) -> DtypeDiff:
        """Diff of the column's ``type`` before and after."""
        return DtypeDiff(self.before.type, self.after.type)

    @cached_property
    def compatible(self) -> bool:
        """Whether the change is compatible: not ``NullabilityDiff.NULL`` and a compatible dtype."""
        if self.nullability_diff == NullabilityDiff.NULL:
            return False
        return self.dtype_diff.compatible

    def __bool__(self):
        if self.nullability_diff:
            return True
        return bool(self.dtype_diff)
178
+
179
+
180
@dataclasses.dataclass
class TableDiff:
    """Comparison of one schema table before and after a change.

    Truthy when columns or indexes were dropped or added, the primary key changed, or any
    column present on both sides has a non-empty diff.
    """

    before: Table
    after: Table

    @cached_property
    def before_columns(self) -> ty.Dict[Identifier, Column]:
        """Columns of the old table keyed by name."""
        return {c.name: c for c in self.before.columns}

    @cached_property
    def after_columns(self) -> ty.Dict[Identifier, Column]:
        """Columns of the new table keyed by name."""
        return {c.name: c for c in self.after.columns}

    @cached_property
    def columns_dropped(self) -> ty.Dict[Identifier, Column]:
        """Columns of the old table whose names are absent from the new table."""
        after_names = self.after_columns
        return {col.name: col for col in self.before.columns if col.name not in after_names}

    @cached_property
    def columns_added(self) -> ty.Dict[Identifier, Column]:
        """Columns of the new table whose names are absent from the old table."""
        before_names = self.before_columns
        return {col.name: col for col in self.after.columns if col.name not in before_names}

    @cached_property
    def column_diffs(self) -> ty.Dict[Identifier, ColumnDiff]:
        """Diffs for every column present on both sides, in old-table column order."""
        # Iterate the `before` mapping rather than a set intersection so the resulting dict has
        # a deterministic order instead of depending on set iteration order.
        after_columns = self.after_columns
        return {
            name: ColumnDiff(column, after_columns[name])
            for name, column in self.before_columns.items()
            if name in after_columns
        }

    @cached_property
    def indexes_dropped(self) -> ty.List[metaschema.IdTuple]:
        """Indexes of the old table that the new table does not declare."""
        return [ix for ix in self.before.indexes if ix not in self.after.indexes]

    @cached_property
    def indexes_added(self) -> ty.List[metaschema.IdTuple]:
        """Indexes of the new table that the old table did not declare."""
        return [ix for ix in self.after.indexes if ix not in self.before.indexes]

    def __bool__(self):
        # structural checks first; per-column diffs are only evaluated when those are all empty
        return bool(
            self.columns_dropped
            or self.columns_added
            or self.indexes_dropped
            or self.indexes_added
            or self.before.primary_key != self.after.primary_key
            or any(self.column_diffs.values())
        )
229
+
230
+
231
@dataclasses.dataclass
class SchemaDiff:
    """Comparison of two schemas at the table level.

    Truthy when tables were dropped or added, or any table present on both sides has a
    non-empty diff.
    """

    before: Schema
    after: Schema

    @cached_property
    def tables_dropped(self) -> ty.Dict[Identifier, Table]:
        """Tables of the old schema whose names are absent from the new schema."""
        return {name: t for name, t in self.before.tables.items() if name not in self.after.tables}

    @cached_property
    def tables_added(self) -> ty.Dict[Identifier, Table]:
        """Tables of the new schema whose names are absent from the old schema."""
        return {name: t for name, t in self.after.tables.items() if name not in self.before.tables}

    @cached_property
    def table_diffs(self) -> ty.Dict[Identifier, TableDiff]:
        """Diffs for every table present on both sides, in old-schema table order."""
        # Iterate the `before` mapping rather than a set intersection so the resulting dict has
        # a deterministic order instead of depending on set iteration order.
        after_tables = self.after.tables
        return {
            name: TableDiff(table, after_tables[name])
            for name, table in self.before.tables.items()
            if name in after_tables
        }

    def __bool__(self):
        return bool(self.tables_dropped or self.tables_added or any(self.table_diffs.values()))