thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,123 @@
+import functools
+import importlib
+import itertools
+import typing
+from typing import Callable, Iterable, List, Optional, Set, Union
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+from pandas.core.dtypes import base as pd_dtypes
+from pydantic import BaseModel, Extra, StrictFloat, StrictInt, StrictStr, constr
+
+EnumList = Union[List[StrictInt], List[StrictFloat], List[StrictStr]]
+
+_identifier_pattern = r"[a-zA-Z]\w*"
+_dunder_identifier_pattern = r"[a-zA-Z_]\w*"
+_dashed_identifier_pattern = rf"{_identifier_pattern}(-{_identifier_pattern})*"
+_dotted_identifier_pattern = rf"{_identifier_pattern}(\.{_dunder_identifier_pattern})*"
+_rel_path_pattern = r"[^/].*"  # paths are validated by the filesystem, not us.
+_md5_hex_pattern = r"[0-9a-f]{32}"
+
+if not typing.TYPE_CHECKING:
+    # pydantic (hilariously) uses match instead of fullmatch, so we
+    # have to anchor the regexes, but only at the end, since re.match
+    # requires the match to be found at the beginning of the string.
+    Identifier = constr(regex=_identifier_pattern + "$")
+    DottedIdentifier = constr(regex=_dotted_identifier_pattern + "$")
+    DashedIdentifier = constr(regex=_dashed_identifier_pattern + "$")
+    PathStr = constr(regex=_rel_path_pattern + "$")
+    HexStr = constr(regex=_md5_hex_pattern + "$")
+    NonEmptyStr = constr(min_length=1)
+else:
+    Identifier = str
+    DottedIdentifier = str
+    DashedIdentifier = str
+    PathStr = str
+    HexStr = str
+    NonEmptyStr = str
+
+
+def snake_to_title(schema_name: str, separator: str = ""):
+    """Turn a snake-case name from the schema into a title-case name separated by `separator`."""
+    parts = str(schema_name).split("_")
+    return separator.join(part if part.isupper() else part.title() for part in parts)
+
+
+def snake_case(schema_name: str) -> str:
+    """Alias for `str.lower`, but defined here as a single source of truth in case we change that.
+    Names for tables, columns, and types in the schema should be underscore-separated, but tokens may be
+    capitalized to indicate that in class names they should remain as such (e.g. acronyms - see
+    `snake_to_title`)."""
+    return schema_name.lower()
+
+
+@functools.singledispatch
+def render_dtype(dt: Union[np.dtype, pd_dtypes.ExtensionDtype]) -> str:
+    raise NotImplementedError(f"Can't interpret {dt} as a pandas dtype")
+
+
+@render_dtype.register(pd_dtypes.ExtensionDtype)
+def render_pandas_dtype(dt: pd_dtypes.ExtensionDtype) -> str:
+    return f"pd.{type(dt).__name__}()"
+
+
+@render_dtype.register(pd.CategoricalDtype)
+def render_pandas_categorical_dtype(dt: pd.CategoricalDtype) -> str:
+    return f"pd.{pd.CategoricalDtype.__name__}({list(dt.categories)!r}, ordered={dt.ordered})"
+
+
+@render_dtype.register(np.dtype)
+def render_numpy_dtype(dt: np.dtype) -> str:
+    return f'np.dtype("{dt.name}")'
+
+
+def all_predecessors(g: nx.DiGraph, nodes: Iterable) -> Set:
+    frontier = set(nodes)
+    predecessors = set()
+    while frontier:
+        predecessors.update(frontier)
+        frontier = set(itertools.chain.from_iterable(map(g.predecessors, frontier)))
+    return predecessors
+
+
+def all_successors(g: nx.DiGraph, nodes: Iterable) -> Set:
+    return all_predecessors(nx.reverse(g), nodes)
+
+
+def predecessor_graph(g: nx.DiGraph, nodes: Iterable) -> nx.DiGraph:
+    predecessors = all_predecessors(g, nodes)
+    return nx.induced_subgraph(g, predecessors)
+
+
+def successor_graph(g: nx.DiGraph, nodes: Iterable) -> nx.DiGraph:
+    successors = all_successors(g, nodes)
+    return nx.induced_subgraph(g, successors)
+
+
+def import_func(path: str) -> Callable:
+    parts = path.split(".")
+    module = ".".join(parts[:-1])
+    name = parts[-1]
+    mod = importlib.import_module(module)
+    func = getattr(mod, name)
+    if not callable(func):
+        raise TypeError(f"value {func} of type {type(func)} is not callable")
+
+    return func
+
+
+class DocumentedMixin(BaseModel, extra=Extra.forbid):
+    doc: Optional[str] = None
+    markup: Optional[PathStr] = None
+
+    @property
+    def docstring(self) -> Optional[str]:
+        if self.doc is None:
+            if self.markup is None:
+                return None
+            else:
+                with open(self.markup, "r") as f:
+                    return f.read()
+        else:
+            return self.doc