PyPI - thds.tabularasa - Versions diffs - 0.13.0__py3-none-any.whl - Mend

thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

thds/tabularasa/__init__.py +6 -0
thds/tabularasa/__main__.py +1122 -0
thds/tabularasa/compat.py +33 -0
thds/tabularasa/data_dependencies/__init__.py +0 -0
thds/tabularasa/data_dependencies/adls.py +97 -0
thds/tabularasa/data_dependencies/build.py +573 -0
thds/tabularasa/data_dependencies/sqlite.py +286 -0
thds/tabularasa/data_dependencies/tabular.py +167 -0
thds/tabularasa/data_dependencies/util.py +209 -0
thds/tabularasa/diff/__init__.py +0 -0
thds/tabularasa/diff/data.py +346 -0
thds/tabularasa/diff/schema.py +254 -0
thds/tabularasa/diff/summary.py +249 -0
thds/tabularasa/git_util.py +37 -0
thds/tabularasa/loaders/__init__.py +0 -0
thds/tabularasa/loaders/lazy_adls.py +44 -0
thds/tabularasa/loaders/parquet_util.py +385 -0
thds/tabularasa/loaders/sqlite_util.py +346 -0
thds/tabularasa/loaders/util.py +532 -0
thds/tabularasa/py.typed +0 -0
thds/tabularasa/schema/__init__.py +7 -0
thds/tabularasa/schema/compilation/__init__.py +20 -0
thds/tabularasa/schema/compilation/_format.py +50 -0
thds/tabularasa/schema/compilation/attrs.py +257 -0
thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
thds/tabularasa/schema/compilation/io.py +96 -0
thds/tabularasa/schema/compilation/pandas.py +252 -0
thds/tabularasa/schema/compilation/pyarrow.py +93 -0
thds/tabularasa/schema/compilation/sphinx.py +550 -0
thds/tabularasa/schema/compilation/sqlite.py +69 -0
thds/tabularasa/schema/compilation/util.py +117 -0
thds/tabularasa/schema/constraints.py +327 -0
thds/tabularasa/schema/dtypes.py +153 -0
thds/tabularasa/schema/extract_from_parquet.py +132 -0
thds/tabularasa/schema/files.py +215 -0
thds/tabularasa/schema/metaschema.py +1007 -0
thds/tabularasa/schema/util.py +123 -0
thds/tabularasa/schema/validation.py +878 -0
thds/tabularasa/sqlite3_compat.py +41 -0
thds/tabularasa/sqlite_from_parquet.py +34 -0
thds/tabularasa/to_sqlite.py +56 -0
thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0

thds/tabularasa/diff/data.py ADDED Viewed

@@ -0,0 +1,346 @@
+import dataclasses
+import typing as ty
+from functools import cached_property
+import numpy as np
+import pandas as pd
+import pyarrow.parquet as pq
+from ..data_dependencies.adls import sync_adls_data
+from ..loaders.util import PandasParquetLoader
+from ..schema.files import RemoteBlobStoreSpec
+from ..schema.metaschema import Table
+def load_historical_data(table: Table, blob_store: RemoteBlobStoreSpec):
+    """Load the stored parquet data for ``table`` from ``blob_store``.
+    The table's ``md5`` selects the remote data spec, which is passed to ``sync_adls_data``; exactly
+    one synced result is expected. Returns a ``(data, metadata)`` pair: the output of the table's
+    ``PandasParquetLoader`` applied to the synced local file, and that file's parquet metadata.
+    """
+    assert table.md5
+    parquet_loader = PandasParquetLoader.from_schema_table(
+        table,
+        package=None,
+        data_dir="",
+        filename=None,
+        derive_schema=False,
+    )
+    synced = sync_adls_data(blob_store.data_spec(table.md5))
+    assert len(synced) == 1
+    parquet_path = synced[0].local_path
+    # metadata is read before the data is loaded, as in the original statement order
+    file_meta = pq.read_metadata(parquet_path)
+    data = parquet_loader(parquet_path)
+    return data, file_meta
+T_Tabular = ty.TypeVar("T_Tabular", pd.Series, pd.DataFrame)
+def _uncategorify_index(data: T_Tabular) -> T_Tabular:
+    """Return ``data`` with a categorical index cast to the dtype of its categories.
+    Data whose index is not categorical is returned as-is (the same object).
+    """
+    idx = data.index
+    if not isinstance(idx.dtype, pd.CategoricalDtype):
+        return data
+    plain_index = idx.astype(idx.dtype.categories.dtype)
+    return data.set_axis(plain_index, axis=0, copy=False)
+def _uncategorify_series(series: pd.Series) -> pd.Series:
+    """Strip categorical dtypes from both the index and the values of ``series``."""
+    result = _uncategorify_index(series)
+    values_dtype = result.dtype
+    if isinstance(values_dtype, pd.CategoricalDtype):
+        # cast the values to the dtype of the underlying categories
+        result = result.astype(values_dtype.categories.dtype)
+    return result
+def _uncategorify_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Strip categorical dtypes from the index and from every categorical column of ``df``."""
+    frame = _uncategorify_index(df)
+    plain_dtypes = {}
+    for column, dtype in frame.dtypes.items():
+        if isinstance(dtype, pd.CategoricalDtype):
+            plain_dtypes[column] = dtype.categories.dtype
+    if plain_dtypes:
+        return frame.astype(plain_dtypes, copy=False)
+    # no categorical columns: nothing to cast
+    return frame
+def _percent(numerator: int, denominator: int) -> float:
+    """Express ``numerator`` as a percentage of ``denominator``; a zero denominator gives ``0.0``."""
+    if not denominator:
+        return 0.0
+    return numerator * 100 / denominator
+class ColumnDiffSummary(ty.NamedTuple):
+    """Three change counts for one column; truthy when at least one count is nonzero."""
+    nulled: int
+    filled: int
+    updated: int
+    def __bool__(self):
+        return bool(self.nulled or self.filled or self.updated)
+class DataFrameDiffSummary(ty.NamedTuple):
+    """Row/column totals and added/dropped counts for a pair of data frames.
+    Truthy when any row or column was added or dropped.
+    """
+    rows_before: int
+    rows_after: int
+    columns_before: int
+    columns_after: int
+    dropped_rows: int
+    added_rows: int
+    dropped_columns: int
+    added_columns: int
+    def __bool__(self):
+        change_counts = (self.dropped_rows, self.dropped_columns, self.added_rows, self.added_columns)
+        return any(change_counts)
+    def table(self):
+        """Return a frame indexed by change name with ``count`` and ``percent`` columns.
+        Dropped counts are taken relative to the ``before`` totals, added counts to the ``after`` totals.
+        """
+        spec = [
+            ("dropped_rows", self.dropped_rows, self.rows_before),
+            ("added_rows", self.added_rows, self.rows_after),
+            ("dropped_columns", self.dropped_columns, self.columns_before),
+            ("added_columns", self.added_columns, self.columns_after),
+        ]
+        records = [(label, count, _percent(count, total)) for label, count, total in spec]
+        frame = pd.DataFrame.from_records(records, columns=["", "count", "percent"])
+        return frame.set_index("")
+@dataclasses.dataclass
+class ColumnDiff:
+    """Cell-level comparison of one column before and after a change.
+    ``before`` and ``after`` are combined elementwise by the boolean-mask properties below, so they
+    are expected to share the same index (NOTE(review): not enforced here - confirm callers align them).
+    Truthy when at least one cell was nulled, filled or updated.
+    """
+    before: pd.Series
+    after: pd.Series
+    def __post_init__(self):
+        # to facilitate hassle-free comparison
+        self.before = _uncategorify_series(self.before)
+        self.after = _uncategorify_series(self.after)
+    @cached_property
+    def was_null(self) -> pd.Series:
+        """Mask of cells that are null in ``before``."""
+        return self.before.isna()
+    @cached_property
+    def is_null(self) -> pd.Series:
+        """Mask of cells that are null in ``after``."""
+        return self.after.isna()
+    @cached_property
+    def nulled(self) -> pd.Series:
+        """Mask of cells that had a value in ``before`` and are null in ``after``."""
+        return ~self.was_null & self.is_null
+    @cached_property
+    def filled(self) -> pd.Series:
+        """Mask of cells that were null in ``before`` and have a value in ``after``."""
+        return self.was_null & ~self.is_null
+    @cached_property
+    def updated(self) -> pd.Series:
+        """Mask of cells that are non-null on both sides and whose values differ."""
+        return (self.before != self.after).fillna(False) & ~self.is_null & ~self.was_null
+    @cached_property
+    def n_nulled(self) -> int:
+        # ``Series.sum`` yields a numpy integer; convert so callers get the declared builtin ``int``
+        return int(self.nulled.sum())
+    @cached_property
+    def n_filled(self) -> int:
+        return int(self.filled.sum())
+    @cached_property
+    def n_updated(self) -> int:
+        return int(self.updated.sum())
+    def __bool__(self):
+        return bool(self.nulled.any() or self.filled.any() or self.updated.any())
+    @cached_property
+    def updated_counts(self) -> pd.Series:
+        """Counts of each distinct ``(before, after)`` value pair among updated cells."""
+        updated = self.updated
+        return (
+            pd.DataFrame(dict(before=self.before[updated], after=self.after[updated]))
+            .value_counts()
+            .rename("count", copy=False)
+        )
+    @cached_property
+    def nulled_counts(self) -> pd.Series:
+        """Counts of each distinct ``before`` value among nulled cells."""
+        return (
+            self.before[self.nulled]
+            .value_counts()
+            .rename("count", copy=False)
+            .rename_axis(index="before")
+        )
+    @cached_property
+    def filled_counts(self) -> pd.Series:
+        """Counts of each distinct ``after`` value among filled cells."""
+        return (
+            self.after[self.filled].value_counts().rename("count", copy=False).rename_axis(index="after")
+        )
+    def summary(self):
+        """Return the nulled/filled/updated cell counts as a ``ColumnDiffSummary``."""
+        return ColumnDiffSummary(nulled=self.n_nulled, filled=self.n_filled, updated=self.n_updated)
+@dataclasses.dataclass
+class DataFrameDiff:
+    """Comparison of two versions of a data frame, keyed by index.
+    Exposes dropped/added/common columns and index keys, per-column cell diffs for the common part,
+    and a diff of the optional parquet file metadata. Truthy when any of these show a difference.
+    """
+    before: pd.DataFrame
+    after: pd.DataFrame
+    before_meta: ty.Optional[pq.FileMetaData] = None
+    after_meta: ty.Optional[pq.FileMetaData] = None
+    def __post_init__(self):
+        # per-column cache filled lazily by `column_diff`
+        self._column_diffs: ty.Dict[str, ColumnDiff] = dict()
+    @cached_property
+    def dropped_columns(self) -> ty.List[str]:
+        """Columns present in `before` but not in `after`."""
+        return self.before.columns.difference(self.after.columns).tolist()
+    @cached_property
+    def added_columns(self) -> ty.List[str]:
+        """Columns present in `after` but not in `before`."""
+        return self.after.columns.difference(self.before.columns).tolist()
+    @cached_property
+    def common_columns(self) -> ty.List[str]:
+        """Columns present in both frames."""
+        return self.after.columns.intersection(self.before.columns).tolist()
+    @cached_property
+    def dropped_keys(self) -> pd.Index:
+        """Index keys present in `before` but not in `after`."""
+        return self.before.index.difference(self.after.index)
+    @cached_property
+    def added_keys(self) -> pd.Index:
+        """Index keys present in `after` but not in `before`."""
+        return self.after.index.difference(self.before.index)
+    @cached_property
+    def common_keys(self) -> list:
+        """Index keys present in both frames, computed via a Python `set` (so in no particular order).
+        Don't use `.index.intersection` here because it does not work with different types of nulls.
+        For a single Index, the returned keys are based on the following True statements:
+            * None is None
+            * pandas.NA is pandas.NA
+            * float("nan") is not float("nan")
+        For MultiIndex, `float("nan")` behaves differently, please check the test for all null equality checks
+        E.g.,
+        ```
+        In [47]: pd.MultiIndex.from_tuples([float("nan")]) == pd.MultiIndex.from_tuples([float("nan")])
+        Out[47]: array([ True])
+        In [48]: pd.Index([float("nan")]) == pd.Index([float("nan")])
+        Out[48]: array([False])
+        ```
+        """
+        return list(set(self.after.index).intersection(self.before.index))
+    @cached_property
+    def dropped_rows(self) -> pd.DataFrame:
+        """Rows of `before` whose keys are in `dropped_keys`."""
+        return self.before.loc[self.dropped_keys]
+    @cached_property
+    def added_rows(self) -> pd.DataFrame:
+        """Rows of `after` whose keys are in `added_keys`."""
+        return self.after.loc[self.added_keys]
+    @cached_property
+    def common_rows_before(self) -> pd.DataFrame:
+        """Rows of `before` selected by `common_keys` (same key list as `common_rows_after`)."""
+        return self.before.loc[self.common_keys]
+    @cached_property
+    def common_rows_after(self) -> pd.DataFrame:
+        """Rows of `after` selected by `common_keys` (same key list as `common_rows_before`)."""
+        return self.after.loc[self.common_keys]
+    def column_diff(self, column: str) -> ColumnDiff:
+        """Return the `ColumnDiff` for `column` over the common rows, building and caching it on first use."""
+        if (maybe_diff := self._column_diffs.get(column)) is None:
+            diff = self._column_diffs[column] = ColumnDiff(
+                self.common_rows_before[column], self.common_rows_after[column]
+            )
+            return diff
+        return maybe_diff
+    @property
+    def column_diffs(self) -> ty.Dict[str, ColumnDiff]:
+        """Mapping of every common column name to its `ColumnDiff`."""
+        return {c: self.column_diff(c) for c in self.common_columns}
+    def column_diff_summary(self) -> ty.Optional[pd.DataFrame]:
+        """Summarize cell changes per column.
+        One row per common column that has at least one change, with the nulled/filled/updated counts
+        and matching `*_percent` columns computed relative to the number of common keys.
+        Returns None when no common column has changes.
+        """
+        df = pd.DataFrame.from_dict(
+            {name: diff.summary() for name, diff in self.column_diffs.items() if diff},
+            orient="index",
+            columns=ColumnDiffSummary._fields,
+        )
+        df.index.name = "column"
+        percent_df = df.rename(columns="{}_percent".format, copy=False).applymap(  # type: ignore[operator]
+            lambda v: _percent(v, len(self.common_keys))
+        )
+        df = pd.concat([df, percent_df], axis=1)
+        return None if not len(df) else df
+    def row_diff_patterns(self, detailed: bool = True) -> ty.Optional[pd.DataFrame]:
+        """Count the distinct per-row change patterns across the changed common columns.
+        With `detailed=True` every cell is labelled "updated", "nulled", "filled" or "" (unchanged);
+        otherwise each cell is a boolean "changed" flag. The result has one row per distinct pattern
+        with `count` and `percent` columns (percent relative to the number of common keys).
+        Returns None when no common column has any change.
+        """
+        before = _uncategorify_dataframe(self.common_rows_before[self.common_columns])
+        after = _uncategorify_dataframe(self.common_rows_after[self.common_columns])
+        was_null = before.isna()
+        is_null = after.isna()
+        filled = was_null & ~is_null
+        nulled = ~was_null & is_null
+        updated = (before != after).fillna(False) & ~is_null & ~was_null  # type: ignore[attr-defined]
+        # boolean Series over columns: True where the column has at least one change of any kind
+        changed_cols_ = updated.any(axis=0) | nulled.any(axis=0) | filled.any(axis=0)
+        changed_cols = changed_cols_.index[changed_cols_].tolist()
+        if not changed_cols:
+            return None
+        if detailed:
+            # nested `np.where` picks the first matching label per cell: updated, then nulled, then filled
+            changes = pd.DataFrame(
+                np.where(
+                    updated[changed_cols].values,
+                    "updated",
+                    np.where(
+                        nulled[changed_cols].values,
+                        "nulled",
+                        np.where(filled[changed_cols].values, "filled", ""),
+                    ),
+                ),
+                index=updated.index,
+                columns=changed_cols,
+            ).astype("category")
+        else:
+            changes = updated[changed_cols] | nulled[changed_cols] | filled[changed_cols]
+        changes_df = changes.value_counts(dropna=False).to_frame("count")
+        changes_df["percent"] = changes_df["count"].apply(lambda c: _percent(c, len(self.common_keys)))
+        return changes_df
+    def summary(self) -> DataFrameDiffSummary:
+        """Return row/column totals and added/dropped counts as a `DataFrameDiffSummary`."""
+        return DataFrameDiffSummary(
+            rows_before=len(self.before),
+            rows_after=len(self.after),
+            columns_before=len(self.before.columns),
+            columns_after=len(self.after.columns),
+            dropped_rows=len(self.dropped_keys),
+            added_rows=len(self.added_keys),
+            dropped_columns=len(self.dropped_columns),
+            added_columns=len(self.added_columns),
+        )
+    @cached_property
+    def meta_diff(self):
+        """Frame of parquet metadata fields whose values differ, with `before` and `after` columns.
+        The `row_groups` entry is skipped. Empty when either side has no metadata.
+        """
+        if self.before_meta is None or self.after_meta is None:
+            return pd.DataFrame(columns=["before", "after"], dtype=object)
+        before = self.before_meta.to_dict()
+        after = self.after_meta.to_dict()
+        # only keys of `before` are inspected; assumes `after` has the same keys - TODO confirm
+        return pd.DataFrame.from_dict(
+            {
+                name: [before[name], after[name]]
+                for name in before
+                if (name != "row_groups") and (before[name] != after[name])
+            },
+            orient="index",
+            columns=["before", "after"],
+            dtype=object,
+        )
+    def __bool__(self) -> bool:
+        # cheap checks first; per-column cell diffs are only evaluated if nothing else differs
+        return bool(
+            len(self.meta_diff)
+            or len(self.dropped_keys)
+            or len(self.added_keys)
+            or len(self.dropped_columns)
+            or len(self.added_columns)
+            or any(map(bool, map(self.column_diff, self.common_columns)))
+        )
+    @staticmethod
+    def from_tables(
+        before: Table,
+        after: Table,
+        before_blob_store: RemoteBlobStoreSpec,
+        after_blob_store: RemoteBlobStoreSpec,
+    ) -> "DataFrameDiff":
+        """Build a diff by loading both table versions (data and parquet metadata) with `load_historical_data`."""
+        before_df, before_meta = load_historical_data(before, before_blob_store)
+        after_df, after_meta = load_historical_data(after, after_blob_store)
+        return DataFrameDiff(
+            before=before_df,
+            after=after_df,
+            before_meta=before_meta,
+            after_meta=after_meta,
+        )

thds/tabularasa/diff/schema.py ADDED Viewed

@@ -0,0 +1,254 @@
+"""Diffs for schema objects"""
+import dataclasses
+import enum
+import typing as ty
+from functools import cached_property, singledispatch
+from ..loaders import parquet_util
+from ..schema import metaschema
+from ..schema.constraints import AnyColumnConstraint
+from ..schema.metaschema import Column, Identifier, Schema, Table
+# The two metaschema custom-type classes (anonymous and named) grouped in one tuple.
+# NOTE(review): not referenced in the visible part of this module - presumably for isinstance checks; confirm.
+_CUSTOM_DTYPES = (metaschema.AnonCustomType, metaschema.CustomType)
+class NullabilityDiff(enum.IntEnum):
+    """Direction of a nullability change. Falsy only for `NO_CHANGE`, so it works as expected
+    with `bool`: bool(NullabilityDiff.NO_CHANGE) == False"""
+    NULL = -1
+    NO_CHANGE = 0
+    NOT_NULL = 1
+    def __invert__(self):
+        # negating the value swaps NULL and NOT_NULL and leaves NO_CHANGE as it is
+        return type(self)(-self.value)
+    @staticmethod
+    def from_nullability(nullable_before: bool, nullable_after: bool):
+        """Classify a transition: nullable -> not nullable is `NOT_NULL`, the reverse is `NULL`."""
+        if nullable_before == nullable_after:
+            return NullabilityDiff.NO_CHANGE
+        if nullable_before:
+            return NullabilityDiff.NOT_NULL
+        return NullabilityDiff.NULL
+class OrderedDiff(enum.IntEnum):
+    """Direction of a change in ordered-ness. Falsy only for `NO_CHANGE`, so it works as expected
+    with `bool`: bool(OrderedDiff.NO_CHANGE) == False"""
+    UNORDERED = -1
+    NO_CHANGE = 0
+    ORDERED = 1
+    def __invert__(self):
+        # negating the value swaps UNORDERED and ORDERED and leaves NO_CHANGE as it is
+        return type(self)(-self.value)
+    @staticmethod
+    def from_ordered(ordered_before: bool, ordered_after: bool):
+        """Classify a transition: ordered -> unordered is `UNORDERED`, the reverse is `ORDERED`."""
+        if ordered_before == ordered_after:
+            return OrderedDiff.NO_CHANGE
+        if ordered_before:
+            return OrderedDiff.UNORDERED
+        return OrderedDiff.ORDERED
+@dataclasses.dataclass
+class EnumDiff:
+    """Differences between two enum constraints: ordered-ness, relative order and value sets.
+    Truthy when any of those differ.
+    """
+    before: metaschema.EnumConstraint
+    after: metaschema.EnumConstraint
+    @cached_property
+    def ordered_diff(self) -> OrderedDiff:
+        was_ordered, is_ordered = self.before.ordered, self.after.ordered
+        return OrderedDiff.from_ordered(was_ordered, is_ordered)
+    @cached_property
+    def order_changed(self) -> bool:
+        """True when both enums are ordered and their shared values appear in a different order."""
+        if not (self.before.ordered and self.after.ordered):
+            return False
+        old_values, new_values = self.before.enum, self.after.enum
+        shared_in_old_order = [v for v in old_values if v in new_values]
+        shared_in_new_order = [v for v in new_values if v in old_values]
+        return shared_in_old_order != shared_in_new_order
+    @cached_property
+    def values_dropped(self) -> metaschema.EnumList:
+        # Membership follows python comparison semantics, so a dtype change from int to float with
+        # enum values [1, 2] -> [1.0, 2.0] is not reported here. Such a change would be picked up
+        # as a compatibility change in DtypeDiff.
+        new_values = self.after.enum
+        dropped = [v for v in self.before.enum if v not in new_values]
+        return ty.cast(metaschema.EnumList, dropped)
+    @cached_property
+    def values_added(self) -> metaschema.EnumList:
+        old_values = self.before.enum
+        added = [v for v in self.after.enum if v not in old_values]
+        return ty.cast(metaschema.EnumList, added)
+    def __bool__(self):
+        return bool(
+            self.ordered_diff or self.order_changed or self.values_dropped or self.values_added
+        )
+@singledispatch
+def _constraints(dtype: metaschema.ResolvedDType) -> ty.List[AnyColumnConstraint]:
+    """Return the column constraints carried by `dtype`.
+    Default `singledispatch` case: types without a registered overload yield no constraints.
+    """
+    return []
+@_constraints.register(metaschema.AnonCustomType)
+@_constraints.register(metaschema.CustomType)
+def _constraints_custom(
+    dtype: ty.Union[metaschema.AnonCustomType, metaschema.CustomType],
+) -> ty.List[AnyColumnConstraint]:
+    """Overload for custom types (anonymous or named): return the type's own `constraints` list."""
+    return dtype.constraints
+@dataclasses.dataclass
+class DtypeDiff:
+    """Differences between two resolved dtypes: parquet type, constraints and enum values."""
+    before: metaschema.ResolvedDType
+    after: metaschema.ResolvedDType
+    def _type_compatible(self, level: parquet_util.TypeCheckLevel) -> bool:
+        """Check the `after` parquet type against the `before` parquet type at the given check level."""
+        # The compatibility check is asymmetric; we use the `after` type as the `actual` type
+        # (since that's what you'll get when you load the data) and the `before` type as the `expected`
+        # type. Hence we're checking whether any pre-existing code expecting the `before` type should be
+        # expected to still work after the change.
+        return parquet_util.pyarrow_type_compatible(
+            self.after.parquet,
+            expected=self.before.parquet,
+            level=level,
+        )
+    @cached_property
+    def compatible(self) -> bool:
+        """True when the types pass the `compatible` level check and no enum values were added."""
+        return (
+            self._type_compatible(parquet_util.TypeCheckLevel.compatible)
+            and (self.enum_diff is None or not self.enum_diff.values_added)
+            # new values are a potential compatibility change for any code that is only expecting the old values
+        )
+    @cached_property
+    def same_kind(self) -> bool:
+        """True when the types pass the `same_kind` level check."""
+        return self._type_compatible(parquet_util.TypeCheckLevel.same_kind)
+    @cached_property
+    def constraints_dropped(self) -> ty.List[AnyColumnConstraint]:
+        """Constraints of the `before` type that are not among the `after` type's constraints."""
+        before_constraints = _constraints(self.before)
+        after_constraints = _constraints(self.after)
+        return [c for c in before_constraints if c not in after_constraints]
+    @cached_property
+    def constraints_added(self) -> ty.List[AnyColumnConstraint]:
+        """Constraints of the `after` type that are not among the `before` type's constraints."""
+        before_constraints = _constraints(self.before)
+        after_constraints = _constraints(self.after)
+        return [c for c in after_constraints if c not in before_constraints]
+    @cached_property
+    def enum_diff(self) -> ty.Optional[EnumDiff]:
+        """An `EnumDiff` when both types have an enum, otherwise None."""
+        before = self.before.enum
+        after = self.after.enum
+        if (before is not None) and (after is not None):
+            return EnumDiff(before, after)
+        return None
+    def __bool__(self):
+        # we don't consider type changes that don't change the kind of the type to be a meaningful change;
+        # usually this is just a storage optimization, e.g. going from int64 to int32
+        # NOTE(review): the expression below is truthy for ANY difference between the parquet types
+        # (it does not consult `same_kind`), which appears to contradict the comment above - confirm
+        # which behavior is intended before relying on either.
+        return (self.before.parquet != self.after.parquet) or bool(
+            self.constraints_dropped or self.constraints_added
+        )
+@dataclasses.dataclass
+class ColumnDiff:
+    """Differences between two versions of a column definition: nullability and dtype.
+    Truthy when either of them changed.
+    """
+    before: Column
+    after: Column
+    @cached_property
+    def nullability_diff(self) -> NullabilityDiff:
+        was_nullable, is_nullable = self.before.nullable, self.after.nullable
+        return NullabilityDiff.from_nullability(was_nullable, is_nullable)
+    @cached_property
+    def dtype_diff(self) -> DtypeDiff:
+        return DtypeDiff(before=self.before.type, after=self.after.type)
+    @cached_property
+    def compatible(self) -> bool:
+        """False if the column became nullable; otherwise whatever the dtype diff reports."""
+        if self.nullability_diff == NullabilityDiff.NULL:
+            return False
+        return self.dtype_diff.compatible
+    def __bool__(self):
+        return bool(self.nullability_diff or self.dtype_diff)
+@dataclasses.dataclass
+class TableDiff:
+    """Structural differences between two versions of a table definition.
+    Truthy when columns or indexes were added or dropped, the primary key changed, or any column
+    present in both versions differs.
+    """
+    before: Table
+    after: Table
+    @cached_property
+    def before_columns(self) -> ty.Dict[Identifier, Column]:
+        """Columns of `before`, keyed by name."""
+        return {c.name: c for c in self.before.columns}
+    @cached_property
+    def after_columns(self) -> ty.Dict[Identifier, Column]:
+        """Columns of `after`, keyed by name."""
+        return {c.name: c for c in self.after.columns}
+    @cached_property
+    def columns_dropped(self) -> ty.Dict[Identifier, Column]:
+        """Columns of `before` whose name does not occur in `after`, keyed by name."""
+        after_names = self.after_columns
+        return {col.name: col for col in self.before.columns if col.name not in after_names}
+    @cached_property
+    def columns_added(self) -> ty.Dict[Identifier, Column]:
+        """Columns of `after` whose name does not occur in `before`, keyed by name."""
+        before_names = self.before_columns
+        return {col.name: col for col in self.after.columns if col.name not in before_names}
+    @cached_property
+    def column_diffs(self) -> ty.Dict[Identifier, ColumnDiff]:
+        """Diffs for the columns present in both versions, keyed by name, in `before` column order."""
+        # Walk the `before` mapping instead of a set intersection: set iteration order is unrelated
+        # to column order (and for str keys changes with hash randomization), which would make the
+        # order of this dict - and anything rendered from it - unstable between runs.
+        after_columns = self.after_columns
+        return {
+            name: ColumnDiff(before_column, after_columns[name])
+            for name, before_column in self.before_columns.items()
+            if name in after_columns
+        }
+    @cached_property
+    def indexes_dropped(self) -> ty.List[metaschema.IdTuple]:
+        """Indexes of `before` that are not among the indexes of `after`."""
+        return [ix for ix in self.before.indexes if ix not in self.after.indexes]
+    @cached_property
+    def indexes_added(self) -> ty.List[metaschema.IdTuple]:
+        """Indexes of `after` that are not among the indexes of `before`."""
+        return [ix for ix in self.after.indexes if ix not in self.before.indexes]
+    def __bool__(self):
+        # per-column diffs are evaluated last, only if none of the cheaper checks found a change
+        return bool(
+            self.columns_dropped
+            or self.columns_added
+            or self.indexes_dropped
+            or self.indexes_added
+            or self.before.primary_key != self.after.primary_key
+            or any(self.column_diffs.values())
+        )
+@dataclasses.dataclass
+class SchemaDiff:
+    """Differences between two versions of a schema at the table level.
+    Truthy when tables were added or dropped, or any table present in both versions differs.
+    """
+    before: Schema
+    after: Schema
+    @cached_property
+    def tables_dropped(self) -> ty.Dict[Identifier, Table]:
+        """Tables of `before` whose name does not occur in `after`, keyed by name."""
+        return {name: t for name, t in self.before.tables.items() if name not in self.after.tables}
+    @cached_property
+    def tables_added(self) -> ty.Dict[Identifier, Table]:
+        """Tables of `after` whose name does not occur in `before`, keyed by name."""
+        return {name: t for name, t in self.after.tables.items() if name not in self.before.tables}
+    @cached_property
+    def table_diffs(self) -> ty.Dict[Identifier, TableDiff]:
+        """Diffs for the tables present in both versions, keyed by name, in `before` table order."""
+        # Walk the `before` mapping instead of a set intersection: set iteration order is unrelated
+        # to the order of the tables (and for str keys changes with hash randomization), which
+        # would make the order of this dict unstable between runs.
+        after_tables = self.after.tables
+        return {
+            name: TableDiff(before_table, after_tables[name])
+            for name, before_table in self.before.tables.items()
+            if name in after_tables
+        }
+    def __bool__(self):
+        return bool(self.tables_dropped or self.tables_added or any(self.table_diffs.values()))