thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,123 @@
1
+ import functools
2
+ import importlib
3
+ import itertools
4
+ import typing
5
+ from typing import Callable, Iterable, List, Optional, Set, Union
6
+
7
+ import networkx as nx
8
+ import numpy as np
9
+ import pandas as pd
10
+ from pandas.core.dtypes import base as pd_dtypes
11
+ from pydantic import BaseModel, Extra, StrictFloat, StrictInt, StrictStr, constr
12
+
13
# A list of enumerated values: all strict ints, all strict floats, or all strict strings.
EnumList = Union[List[StrictInt], List[StrictFloat], List[StrictStr]]
14
+
15
# Regex building blocks for the constrained string types defined below.
# A letter followed by any number of word characters.
_identifier_pattern = r"[a-zA-Z]\w*"
# Same as above, but a leading underscore is also allowed.
_dunder_identifier_pattern = r"[a-zA-Z_]\w*"
# One or more plain identifiers joined by "-".
_dashed_identifier_pattern = rf"{_identifier_pattern}(-{_identifier_pattern})*"
# A plain identifier followed by zero or more "."-separated segments; only the segments after
# the first may start with an underscore.
_dotted_identifier_pattern = rf"{_identifier_pattern}(\.{_dunder_identifier_pattern})*"
# Anything that does not begin with "/".
_rel_path_pattern = r"[^/].*"  # paths are validated by the filesystem, not us.
# Exactly 32 lowercase hexadecimal characters.
_md5_hex_pattern = r"[0-9a-f]{32}"
21
+
22
if not typing.TYPE_CHECKING:
    # At runtime these names are pydantic constrained-string types built with `constr`.
    # pydantic (hilariously) uses match instead of fullmatch, so we
    # have to anchor the regexes, but only at the end, since re.match
    # requires the match to be found at the beginning of the string.
    Identifier = constr(regex=_identifier_pattern + "$")
    DottedIdentifier = constr(regex=_dotted_identifier_pattern + "$")
    DashedIdentifier = constr(regex=_dashed_identifier_pattern + "$")
    PathStr = constr(regex=_rel_path_pattern + "$")
    HexStr = constr(regex=_md5_hex_pattern + "$")
    # No pattern here: the only constraint is a minimum length of 1.
    NonEmptyStr = constr(min_length=1)
else:
    # Under static type checking every one of these names is a plain alias for `str`.
    Identifier = str
    DottedIdentifier = str
    DashedIdentifier = str
    PathStr = str
    HexStr = str
    NonEmptyStr = str
39
+
40
+
41
def snake_to_title(schema_name: str, separator: str = ""):
    """Convert an underscore-separated schema name to title case, joining tokens with `separator`.

    Tokens that are already entirely upper case (e.g. acronyms) are kept verbatim; every other
    token is passed through `str.title`."""
    titled_tokens = []
    for token in str(schema_name).split("_"):
        if token.isupper():
            titled_tokens.append(token)
        else:
            titled_tokens.append(token.title())
    return separator.join(titled_tokens)
45
+
46
+
47
def snake_case(schema_name: str) -> str:
    """Return the lower-cased form of a schema name.

    This is just `str.lower`, kept behind a named function so there is a single source of truth
    should the rule ever change. Names of tables, columns, and types in the schema should be
    underscore-separated, but individual tokens may be capitalized to indicate that they should stay
    capitalized in class names (e.g. acronyms - see `snake_to_title`)."""
    lowered = schema_name.lower()
    return lowered
53
+
54
+
55
@functools.singledispatch
def render_dtype(dt: Union[np.dtype, pd_dtypes.ExtensionDtype]) -> str:
    """Render a dtype as a string of Python source; dispatches on the type of `dt`.

    This is the fallback used when no implementation is registered for `type(dt)`.

    :raises NotImplementedError: always, since reaching the fallback means `dt` is unsupported."""
    message = f"Can't interpret {dt} as a pandas dtype"
    raise NotImplementedError(message)
58
+
59
+
60
@render_dtype.register(pd_dtypes.ExtensionDtype)
def render_pandas_dtype(dt: pd_dtypes.ExtensionDtype) -> str:
    """Render a pandas extension dtype as a no-argument constructor call on the `pd` namespace."""
    class_name = type(dt).__name__
    return "pd." + class_name + "()"
63
+
64
+
65
@render_dtype.register(pd.CategoricalDtype)
def render_pandas_categorical_dtype(dt: pd.CategoricalDtype) -> str:
    """Render a categorical dtype as a `pd.CategoricalDtype(...)` call that spells out its
    categories (as a list literal) and its `ordered` flag."""
    class_name = pd.CategoricalDtype.__name__
    categories_literal = repr(list(dt.categories))
    return f"pd.{class_name}({categories_literal}, ordered={dt.ordered})"
68
+
69
+
70
@render_dtype.register(np.dtype)
def render_numpy_dtype(dt: np.dtype) -> str:
    """Render a numpy dtype as an `np.dtype("<name>")` call using the dtype's `name`."""
    return 'np.dtype("{}")'.format(dt.name)
73
+
74
+
75
def all_predecessors(g: nx.DiGraph, nodes: Iterable) -> Set:
    """Collect `nodes` together with every node from which one of them can be reached in `g`.

    :param g: the directed graph to walk; only its `predecessors` method is used.
    :param nodes: the starting nodes. They are always included in the result.
    :return: the set of starting nodes plus all of their direct and transitive predecessors.
    """
    visited: Set = set()
    frontier = set(nodes)
    while frontier:
        visited.update(frontier)
        # Drop nodes that were already expanded: without this, a cycle in the graph would keep the
        # frontier non-empty forever, and shared ancestors would be re-expanded on every level.
        frontier = set(itertools.chain.from_iterable(map(g.predecessors, frontier))) - visited
    return visited
82
+
83
+
84
def all_successors(g: nx.DiGraph, nodes: Iterable) -> Set:
    """Collect `nodes` together with every node reachable from one of them in `g`.

    Implemented as a predecessor search on the reversed graph.

    :param g: the directed graph to walk.
    :param nodes: the starting nodes. They are always included in the result.
    :return: the set of starting nodes plus all of their direct and transitive successors.
    """
    # copy=False yields a read-only reversed view rather than duplicating every node and edge;
    # the reversed graph is only traversed here and never escapes this function.
    return all_predecessors(nx.reverse(g, copy=False), nodes)
86
+
87
+
88
def predecessor_graph(g: nx.DiGraph, nodes: Iterable) -> nx.DiGraph:
    """Return the subgraph of `g` induced by the node set that `all_predecessors(g, nodes)` yields."""
    return nx.induced_subgraph(g, all_predecessors(g, nodes))
91
+
92
+
93
def successor_graph(g: nx.DiGraph, nodes: Iterable) -> nx.DiGraph:
    """Return the subgraph of `g` induced by the node set that `all_successors(g, nodes)` yields."""
    return nx.induced_subgraph(g, all_successors(g, nodes))
96
+
97
+
98
def import_func(path: str) -> Callable:
    """Import and return the callable named by a dotted path such as ``"package.module.func"``.

    Everything before the last dot is imported as a module; the part after it is looked up as an
    attribute of that module.

    :param path: dotted path whose last component names the callable.
    :return: the attribute found at `path`.
    :raises ValueError: if `path` has no module part (no dot, or nothing before the last dot).
    :raises ImportError: if the module part cannot be imported.
    :raises AttributeError: if the module has no attribute with the given name.
    :raises TypeError: if the attribute exists but is not callable.
    """
    module_name, _, attr_name = path.rpartition(".")
    if not module_name:
        # importlib would also raise ValueError here, but only with the opaque message
        # "Empty module name"; say what is actually wrong with the input instead.
        raise ValueError(
            f"expected a dotted path of the form 'package.module.name', got {path!r}"
        )

    mod = importlib.import_module(module_name)
    func = getattr(mod, attr_name)
    if not callable(func):
        raise TypeError(f"value {func} of type {type(func)} is not callable")

    return func
108
+
109
+
110
# Mixin for schema models that carry documentation, supplied either inline (`doc`) or as a path to
# a markup file (`markup`). Extra fields are rejected (`Extra.forbid`).
class DocumentedMixin(BaseModel, extra=Extra.forbid):
    # Inline documentation text; takes precedence over `markup` when both are set.
    doc: Optional[str] = None
    # Path to a file holding the documentation; it is only read when `doc` is None.
    markup: Optional[PathStr] = None

    @property
    def docstring(self) -> Optional[str]:
        """The documentation text: `doc` if set, else the contents of the `markup` file, else None.

        The markup file is opened with whatever working directory is current at access time and is
        re-read on every access.
        """
        if self.doc is not None:
            return self.doc
        if self.markup is None:
            return None
        # Decode explicitly as UTF-8 so the result does not depend on the platform's locale encoding.
        with open(self.markup, "r", encoding="utf-8") as f:
            return f.read()