thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in that public registry.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/schema/compilation/util.py
@@ -0,0 +1,117 @@
import itertools
import re
import textwrap
from inspect import Signature, signature
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union

import thds.tabularasa.schema.metaschema as metaschema

AUTOGEN_DISCLAIMER = "This code is auto-generated; do not edit!"


def sorted_class_names_for_import(names: Iterable[str]) -> List[str]:
    all_names = set(names)
    names_upper = [name for name in all_names if name.isupper()]
    class_names = all_names.difference(names_upper)
    return sorted(names_upper) + sorted(class_names, key=str.lower)


def _list_literal(exprs: Iterable[str], linebreak: bool = True) -> str:
    sep = ",\n " if linebreak else ", "
    start = "\n " if linebreak else ""
    end = ",\n" if linebreak else ""

    exprs = iter(exprs)
    try:
        peek = next(exprs)
    except StopIteration:
        return "[]"
    else:
        return f"[{start}{sep.join(itertools.chain((peek,), exprs))}{end}]"


def _dict_literal(named_exprs: Iterable[Tuple[str, str]], linebreak: bool = True):
    sep = ",\n " if linebreak else ", "
    start = "\n " if linebreak else ""
    end = ",\n" if linebreak else ""

    keyval = "%s=%s".__mod__
    named_exprs = iter(named_exprs)
    try:
        peek = next(named_exprs)
    except StopIteration:
        return "{}"
    else:
        return f"dict({start}{sep.join(map(keyval, itertools.chain((peek,), named_exprs)))}{end})"


def _indent(expr: str, level: int = 1, first_line: bool = False) -> str:
    ws = " " * level
    indented = textwrap.indent(expr, ws)
    return indented if first_line else indented.lstrip()


def _wrap_lines_with_prefix(
    text: str,
    line_width: int,
    first_line_prefix_len: int,
    trailing_line_indent: int = 0,
) -> str:
    text_ = re.sub(r"\s+", " ", text).strip()
    first_line = textwrap.shorten(text_, line_width - first_line_prefix_len, placeholder="")
    tail = text_[len(first_line) :].lstrip()
    if tail:
        tail_lines = textwrap.wrap(tail, line_width)
        if trailing_line_indent:
            prefix = " " * trailing_line_indent
            tail_lines = [prefix + line for line in tail_lines]

        return "\n".join([first_line, *tail_lines])
    else:
        return first_line


def constructor_template(
    type_: Union[Type, Callable],
    module_name: Optional[str] = None,
    sig: Optional[Signature] = None,
    exclude: Optional[List[str]] = None,
    type_params: Optional[List[str]] = None,
) -> str:
    module = module_name or type_.__module__
    name = type_.__name__
    if sig is None:
        if isinstance(type_, type):
            sig = signature(type_.__init__)  # type: ignore
            is_method = True
        else:
            sig = signature(type_)
            is_method = False
        params = list(sig.parameters)[1:] if is_method else list(sig.parameters)
    else:
        params = list(sig.parameters)
    exclude_ = exclude or []
    args = ",\n ".join(f"{name}={{{name}}}" for name in params if name not in exclude_)
    type_params_ = f"[{', '.join(type_params)}]" if type_params else ""
    template = f"{module}.{name}{type_params_}(\n {args},\n)"
    return template


def render_constructor(template: str, kwargs: Dict[str, Any], var_name: Optional[str] = None) -> str:
    kwarg_strs = {name: repr(value) for name, value in kwargs.items()}
    rendered = template.format(**kwarg_strs)
    return rendered if var_name is None else f"{var_name} = {rendered}"


class VarName(str):
    def __repr__(self):
        return self


BLOB_STORE_SPEC_TEMPLATE = constructor_template(
    metaschema.RemoteBlobStoreSpec, sig=signature(metaschema.RemoteBlobStoreSpec)
)


def render_blob_store_def(blob_store: metaschema.RemoteBlobStoreSpec, var_name: str) -> str:
    return render_constructor(BLOB_STORE_SPEC_TEMPLATE, kwargs=blob_store.dict(), var_name=var_name)
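The 117-line hunk above matches thds/tabularasa/schema/compilation/util.py in the file list: small helpers for rendering constructor calls in auto-generated code. A minimal usage sketch follows, assuming that module path; Point and geometry are hypothetical stand-ins, and the trailing comment loosely describes the output rather than quoting it.

from thds.tabularasa.schema.compilation.util import VarName, constructor_template, render_constructor


class Point:
    def __init__(self, x: int, y: int):
        self.x = x
        self.y = y


# Build a "geometry.Point(x={x}, y={y})"-style template from Point.__init__'s signature.
template = constructor_template(Point, module_name="geometry")

# Literal kwargs are repr()'d into the template; VarName repr()'s to itself,
# so it renders unquoted and can reference a variable in the generated module.
print(render_constructor(template, kwargs={"x": 1, "y": VarName("DEFAULT_Y")}, var_name="ORIGIN"))
# prints a multi-line assignment of the form: ORIGIN = geometry.Point(x=1, y=DEFAULT_Y)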
thds/tabularasa/schema/constraints.py
@@ -0,0 +1,327 @@
import re
from enum import Enum
from typing import Dict, List, Optional, Pattern, Tuple, Union

import pandas as pd
import pandera as pa
from pydantic import BaseModel, Extra, StrictFloat, StrictInt

from .dtypes import DType
from .util import EnumList

Numeric = Union[StrictInt, StrictFloat]


class ColumnConstraint(BaseModel, extra=Extra.forbid):
    __dtypes__: Tuple[DType, ...] = ()

    def applies_to(self, dtype: DType) -> bool:
        return dtype in self.__dtypes__

    def pandera_check_expr(self) -> str:
        raise NotImplementedError(
            f"{type(self).__name__} must implement pandera_check producing an expression "
            "equivalent to the expression produced by the `.pandera_check` method "
            "(with pandera aliased to 'pa')"
        )

    def pandera_check(self) -> pa.Check:
        raise NotImplementedError(
            f"{type(self).__name__} must implement pandera_check producing a pandera.Check object "
            "equivalent to the expression produced by the `.pandera_check_expr` method"
        )

    def sqlite_check_expr(self, colname: str) -> str:
        raise NotImplementedError(f"sqlite check constraint not implemented for constraint {self}")

    def required_modules(self) -> List[str]:
        """list of stdlib modules required for constraint checks"""
        return []

    def comment_expr(self) -> Optional[str]:
        return None


class StrConstraint(ColumnConstraint):
    __dtypes__ = (DType.STR,)


class LenConstraint(StrConstraint):
    __operator__: str
    __value_attr__: str

    def pandera_check_expr(self) -> str:
        kwargs_ = ", ".join(f"{k}={v!r}" for k, v in self._pandera_check_kwargs().items())
        return f"pa.{pa.Check.__name__}.{pa.Check.str_length.__name__}({kwargs_})"

    def pandera_check(self) -> pa.Check:
        return pa.Check.str_length(**self._pandera_check_kwargs())

    def _pandera_check_kwargs(self) -> Dict[str, int]:
        kw = "max_value" if "<" in self.__operator__ else "min_value"
        value = getattr(self, self.__value_attr__)
        if "=" not in self.__operator__:
            # non-inclusive bound
            if "<" in self.__operator__:
                value -= 1
            else:
                value += 1
        return {kw: value}

    def sqlite_check_expr(self, colname: str) -> str:
        return f"length({colname}) {self.__operator__} {getattr(self, self.__value_attr__)!r}"

    def comment_expr(self) -> Optional[str]:
        return f"length {self.__operator__} {getattr(self, self.__value_attr__)!r}"


class NumConstraint(ColumnConstraint):
    __dtypes__ = tuple(t for t in DType if t.is_float_type or t.is_int_type)


class OrderConstraint(NumConstraint):
    __operator__: str
    __value_attr__: str

    def applies_to(self, dtype: DType) -> bool:
        value = getattr(self, self.__value_attr__)
        if dtype.is_int_type:
            return isinstance(value, int)
        elif dtype.is_float_type:
            return isinstance(value, float)
        return False

    def pandera_check_expr(self) -> str:
        return f"pa.{pa.Check.__name__}.{self.__value_attr__}({getattr(self, self.__value_attr__)!r})"

    def pandera_check(self) -> pa.Check:
        return getattr(pa.Check, self.__value_attr__)(getattr(self, self.__value_attr__))

    def sqlite_check_expr(self, colname) -> str:
        return f"{colname} {self.__operator__} {getattr(self, self.__value_attr__)!r}"

    def comment_expr(self) -> Optional[str]:
        return f"{self.__operator__} {getattr(self, self.__value_attr__)!r}"


class LessThanOrEqual(OrderConstraint):
    le: Numeric
    __operator__ = "<="
    __value_attr__ = "le"


class GreaterThanOrEqual(OrderConstraint):
    ge: Numeric
    __operator__ = ">="
    __value_attr__ = "ge"


class LessThan(OrderConstraint):
    lt: Numeric
    __operator__ = "<"
    __value_attr__ = "lt"


class GreaterThan(OrderConstraint):
    gt: Numeric
    __operator__ = ">"
    __value_attr__ = "gt"


class EqualModulo(NumConstraint):
    eq: Numeric
    mod: Numeric

    def sqlite_check_expr(self, colname: str) -> str:
        return f"{colname} % {self.mod} = {self.eq}"

    def pandera_check_expr(self) -> str:
        return f"pa.{pa.Check.__name__}(lambda s: (s % {self.mod} == {self.eq}), name={repr(self)!r})"

    def pandera_check(self) -> pa.Check:
        return pa.Check(lambda s: (s % self.mod == self.eq), name=repr(self))

    def comment_expr(self) -> str:
        return f"equals {self.eq!r} modulo {self.mod!r}"


class LenLessThanOrEqual(LenConstraint):
    len_le: StrictInt
    __operator__ = "<="
    __value_attr__ = "len_le"


class LenGreaterThanOrEqual(LenConstraint):
    len_ge: StrictInt
    __operator__ = ">="
    __value_attr__ = "len_ge"


class LenLessThan(LenConstraint):
    len_lt: StrictInt
    __operator__ = "<"
    __value_attr__ = "len_lt"


class LenGreaterThan(LenConstraint):
    len_gt: StrictInt
    __operator__ = ">"
    __value_attr__ = "len_gt"


class StrCase(Enum):
    lower = "lower"
    upper = "upper"


class StrChars(Enum):
    alpha = "alpha"
    alphanumeric = "alnum"
    digit = "digit"
    decimal = "decimal"
    title = "title"


class CaseConstraint(StrConstraint):
    case: StrCase

    def pandera_check_expr(self) -> str:
        method_name = self._pandas_str_method_name()
        check_name = self._check_name()
        return (
            f"pa.{pa.Check.__name__}(lambda s: s.str.{method_name}().fillna(True), name={check_name!r})"
        )

    def pandera_check(self) -> pa.Check:
        method_name = self._pandas_str_method_name()
        check_name = self._check_name()
        return pa.Check(lambda s: getattr(s.str, method_name)().fillna(True), name=check_name)

    def _check_name(self) -> str:
        return f"case={self.case.value}"

    def _pandas_str_method_name(self) -> str:
        return f"is{self.case.value}"

    def sqlite_check_expr(self, colname: str) -> str:
        return f"{colname} = {self.case.value}({colname})"

    def comment_expr(self) -> Optional[str]:
        return self.case.value + "case"


class CharsConstraint(StrConstraint):
    chars: StrChars

    def pandera_check_expr(self) -> str:
        method_name = self._pandas_str_method_name()
        check_name = self._check_name()
        return (
            f"pa.{pa.Check.__name__}(lambda s: s.str.{method_name}().fillna(True), name={check_name!r})"
        )

    def pandera_check(self) -> pa.Check:
        method_name = self._pandas_str_method_name()
        check_name = self._check_name()
        return pa.Check(lambda s: getattr(s.str, method_name)().fillna(True), name=check_name)

    def _check_name(self) -> str:
        return f"chars={self.chars.value}"

    def _pandas_str_method_name(self) -> str:
        return f"is{self.chars.value}"

    def comment_expr(self) -> Optional[str]:
        return self.chars.name + " pattern"


class MatchesRegex(StrConstraint):
    matches: Pattern
    fullmatch: bool = True
    case_sensitive: bool = True

    def pandera_check_expr(self) -> str:
        method_name = "fullmatch" if self.fullmatch else "match"
        check_name = self._check_name()
        return (
            f"pa.{pa.Check.__name__}(lambda s: s.str.{method_name}"
            f"(re.compile({self.matches.pattern!r}), "
            f"case={self.case_sensitive}, na=True), name={check_name!r})"
        )

    def pandera_check(self) -> pa.Check:
        check_name = self._check_name()
        if self.fullmatch:

            def check_fn(s: pd.Series):
                return s.str.fullmatch(
                    re.compile(self.matches.pattern), case=self.case_sensitive, na=True  # type: ignore[arg-type]
                )

        else:

            def check_fn(s: pd.Series):
                return s.str.match(
                    re.compile(self.matches.pattern),  # type: ignore[arg-type]
                    case=self.case_sensitive,
                    na=True,
                )

        # TODO - check above type ignores

        return pa.Check(check_fn, name=check_name)

    def _check_name(self) -> str:
        return f"{type(self).__name__}(fullmatch={self.fullmatch}, case_sensitive={self.case_sensitive})"

    def required_modules(self) -> List[str]:
        return ["re"]

    def comment_expr(self) -> Optional[str]:
        extras = [
            "full match" if self.fullmatch else "prefix match",
            "case sensitive" if self.case_sensitive else "case insensitive",
        ]
        extra = f" ({', '.join(extras)})"
        return f"matches ``{self.matches.pattern}``{extra}"


class EnumConstraint(ColumnConstraint):
    __dtypes__ = (DType.STR, *(t for t in DType if t.is_int_type or t.is_float_type))

    enum: EnumList
    ordered: bool = False

    def pandera_check_expr(self) -> str:
        # pandera doesn't support checking specific categoricals natively, only
        # that a column has a categorical dtype, so we check the values here
        return f"pa.{pa.Check.__name__}.isin({self.enum!r})"

    def pandera_check(self) -> pa.Check:
        return pa.Check.isin(self.enum)

    def sqlite_check_expr(self, colname: str) -> str:
        return f'{colname} IN ({", ".join(map(repr, self.enum))})'

    def applies_to(self, dtype: DType) -> bool:
        if not self.enum:
            return True
        value_type = type(self.enum[0])
        return issubclass(value_type, dtype.python)


AnyColumnConstraint = Union[
    LessThanOrEqual,
    GreaterThanOrEqual,
    LessThan,
    GreaterThan,
    EqualModulo,
    LenLessThanOrEqual,
    LenGreaterThanOrEqual,
    LenLessThan,
    LenGreaterThan,
    EnumConstraint,
    CaseConstraint,
    CharsConstraint,
    MatchesRegex,
]
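The 327-line hunk above matches thds/tabularasa/schema/constraints.py in the file list: each constraint can render itself as a pandera.Check object, as the equivalent source expression for code generation, as a SQLite CHECK fragment, and as a human-readable comment. A minimal sketch of that fan-out, assuming the module path; the comments describe the strings the definitions above produce and are not captured output.

from thds.tabularasa.schema.constraints import GreaterThanOrEqual, MatchesRegex

ge = GreaterThanOrEqual(ge=0)
ge.pandera_check_expr()       # "pa.Check.ge(0)", the source form of ge.pandera_check()
ge.sqlite_check_expr("age")   # "age >= 0", usable inside a CHECK constraint
ge.comment_expr()             # ">= 0", for generated documentation

# pydantic compiles the `matches` string into an re.Pattern field.
rx = MatchesRegex(matches=r"[A-Z]{2}\d{5}", fullmatch=True)
rx.comment_expr()             # "matches ``[A-Z]{2}\d{5}`` (full match, case sensitive)"
check = rx.pandera_check()    # a pa.Check wrapping Series.str.fullmatch(..., na=True)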
thds/tabularasa/schema/dtypes.py
@@ -0,0 +1,153 @@
import builtins
import datetime
from enum import Enum
from typing import Any, Callable, Iterator, Optional, Set, Type, Union

import numpy as np
import pandas as pd
import pyarrow
from pandas.core.dtypes import base as pd_dtypes

from thds.tabularasa.compat import PANDAS_VERSION_LT_2_0

from .util import EnumList, Identifier

AnyDtype = Union[pd_dtypes.ExtensionDtype, np.dtype]
PyType = Union[int, float, bool, str, datetime.date, datetime.datetime]


_dtype_name_to_pd_dtype = dict(
    int8=pd.Int8Dtype(),
    int16=pd.Int16Dtype(),
    int32=pd.Int32Dtype(),
    int64=pd.Int64Dtype(),
    uint8=pd.UInt8Dtype(),
    uint16=pd.UInt16Dtype(),
    uint32=pd.UInt32Dtype(),
    uint64=pd.UInt64Dtype(),
    bool=pd.BooleanDtype(),
    str=pd.StringDtype(),
)


class DType(Enum):
    INT8 = "int8"
    INT16 = "int16"
    INT32 = "int32"
    INT64 = "int64"
    UINT8 = "uint8"
    UINT16 = "uint16"
    UINT32 = "uint32"
    UINT64 = "uint64"
    FLOAT32 = "float32"
    FLOAT64 = "float64"
    STR = "str"
    DATE = "date"
    DATETIME = "datetime"
    BOOL = "bool"

    @property
    def enum(self) -> None:
        return None

    @property
    def is_int_type(self):
        return self.value.startswith("int") or self.value.startswith("uint")

    @property
    def is_float_type(self):
        return self.value.startswith("float")

    def pandas(
        self,
        nullable: bool = False,
        index: bool = False,
        enum: Optional[EnumList] = None,
        ordered: bool = False,
    ) -> Union[np.dtype, pd_dtypes.ExtensionDtype]:
        if enum:
            return pd.CategoricalDtype(enum, ordered=ordered)
        elif self == DType.DATE or self == DType.DATETIME:
            # pandas <2.0 *only* supports nanosecond datetimes so we're safe to use this
            return np.dtype("datetime64[ns]")
        elif self == DType.BOOL:
            return pd.BooleanDtype() if nullable else np.dtype("bool")
        elif self == DType.STR:
            # StringDtype is better than dtype('O') for explicitness and null handling,
            # but note that pandas<1.4 coerces to dtype('O') for indexes - we used to handle that case
            # but it is no longer necessary in pandas>=1.4
            return pd.StringDtype()
        else:
            # int and float types
            if nullable and self.is_int_type:
                # nullable int extension types
                return _dtype_name_to_pd_dtype[self.value]
            else:
                # non-nullable ints or floats
                if index and PANDAS_VERSION_LT_2_0:
                    # no low-resolution types on indexes with pandas<2.0
                    if self.is_float_type:
                        return np.dtype("float")
                    else:
                        return np.dtype("int") if self.value.startswith("int") else np.dtype("uint")
                else:
                    return np.dtype(self.value)

    @property
    def sqlite(self) -> str:
        if self == DType.BOOL:
            return "BOOLEAN"
        elif self == DType.STR:
            return "TEXT"
        else:
            # only one true float and int type in sqlite, but all aliases accepted; we keep them as-is
            # for explicitness
            return self.name

    @property
    def python(self) -> Type[PyType]:
        if self.is_int_type:
            return int
        elif self.is_float_type:
            return float
        elif self == DType.STR:
            return str
        elif self == DType.DATE:
            return datetime.date
        elif self == DType.DATETIME:
            return datetime.datetime
        elif self == DType.BOOL:
            return bool
        else:
            raise TypeError(f"No python type registered for {self}")

    def python_type_literal(self, build_options: Any = None, builtin: bool = False) -> str:
        cls = self.python
        if cls.__module__ == builtins.__name__:
            # int, str, bool, etc
            return cls.__name__
        else:
            return f"{cls.__module__}.{cls.__name__}"

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield from ()

    @property
    def parquet(self) -> pyarrow.DataType:
        if self == DType.STR:
            return pyarrow.string()
        elif self == DType.BOOL:
            return pyarrow.bool_()
        elif self is DType.DATETIME:
            return pyarrow.timestamp("ns")
        elif self is DType.DATE:
            return pyarrow.date32()
        else:
            dtype: Callable[[], pyarrow.DataType] = getattr(pyarrow, self.name.lower())
            return dtype()

    def attrs_required_imports(self, build_options: Any = None) -> Set[str]:
        if self in (DType.DATE, DType.DATETIME):
            return {"datetime"}
        return set()
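The final 153-line hunk matches thds/tabularasa/schema/dtypes.py in the file list: a single DType enum that maps each logical column type onto its pandas, SQLite, Python, and pyarrow representations. A small sketch of that mapping, assuming the module path; the comments describe the values the branches above return and are not captured output.

from thds.tabularasa.schema.dtypes import DType

dt = DType.INT32
dt.pandas()                  # numpy dtype('int32') (non-nullable)
dt.pandas(nullable=True)     # pandas Int32Dtype() (nullable extension type)
dt.pandas(index=True)        # platform-width int dtype on pandas<2.0, dtype('int32') otherwise
dt.sqlite                    # "INT32" (accepted by SQLite as an INTEGER alias)
dt.python                    # int
dt.parquet                   # pyarrow.int32()

DType.STR.pandas(enum=["a", "b"])  # pandas CategoricalDtype over ["a", "b"]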