thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/schema/compilation/pandas.py
@@ -0,0 +1,252 @@
+import itertools
+from typing import List, NamedTuple, Optional
+
+import numpy as np
+import pandas.core.dtypes.dtypes as pd_dtypes
+import pandera as pa
+
+import thds.tabularasa.loaders.util
+from thds.tabularasa.schema import metaschema
+
+from ._format import autoformat
+from .util import (
+    AUTOGEN_DISCLAIMER,
+    VarName,
+    _dict_literal,
+    _indent,
+    _list_literal,
+    constructor_template,
+    render_blob_store_def,
+    render_constructor,
+)
+
+REMOTE_BLOB_STORE_VAR_NAME = "REMOTE_BLOB_STORE"
+
+PANDERA_DATAFRAME_SCHEMA_TEMPLATE = (
+    """pa.%s(
+    columns={columns},
+    index={index},
+    checks={checks},
+    coerce={coerce!r},
+    strict={strict!r},
+    ordered={ordered!r},
+)"""
+    % pa.DataFrameSchema.__name__
+)
+
+PANDERA_COLUMN_SCHEMA_TEMPLATE = (
+    """pa.%s(
+    {dtype},
+    checks={checks},
+    nullable={nullable!r},
+    unique={unique!r},
+)"""
+    % pa.Column.__name__
+)
+
+PANDERA_INDEX_SCHEMA_TEMPLATE = (
+    """pa.%s(
+    {dtype},
+    checks={checks},
+    nullable={nullable!r},
+    unique={unique!r},
+    name={name!r},
+)"""
+    % pa.Index.__name__
+)
+
+PANDERA_MULTIINDEX_SCHEMA_TEMPLATE = (
+    """pa.%s(
+    [
+        {indexes},
+    ],
+    strict={strict!r},
+    ordered=True,
+)"""
+    % pa.MultiIndex.__name__
+)
+
+PANDAS_LOADER_TEMPLATE = constructor_template(
+    thds.tabularasa.loaders.util.PandasParquetLoader.from_pandera_schema,
+    module_name=(
+        f"{thds.tabularasa.loaders.util.__name__}.{thds.tabularasa.loaders.util.PandasParquetLoader.__name__}"
+    ),
+    exclude=["filename"],
+)
+
+
+def render_pandera_table_schema(table: metaschema.Table, coerce_run_time_tables: bool) -> str:
+    proxy_schema: metaschema._DataFrameSchemaProxy
+    proxy_schema = metaschema.render_pandera_schema(table, as_str=True)  # type: ignore
+
+    def render_check_exprs(check_exprs: Optional[List[str]]) -> str:
+        if check_exprs:
+            return _indent(_list_literal(check_exprs, linebreak=False), 1)
+        return repr(None)
+
+    def render_column_schema(schema: metaschema._ColumnSchemaProxy) -> str:
+        return PANDERA_COLUMN_SCHEMA_TEMPLATE.format(
+            dtype=schema.dtype,
+            checks=render_check_exprs(schema.checks),
+            nullable=schema.nullable,
+            unique=schema.unique,
+        )
+
+    def render_index_schema(schema: metaschema._IndexSchemaProxy) -> str:
+        return PANDERA_INDEX_SCHEMA_TEMPLATE.format(
+            dtype=schema.dtype,
+            checks=render_check_exprs(schema.checks),
+            nullable=schema.nullable,
+            unique=schema.unique,
+            name=schema.name,
+        )
+
+    def render_multiindex_schema(schema: metaschema._MultiIndexSchemaProxy) -> str:
+        indexes = ",\n".join(render_index_schema(index) for index in schema.indexes)
+        return PANDERA_MULTIINDEX_SCHEMA_TEMPLATE.format(
+            indexes=_indent(indexes, 2),
+            strict=schema.strict,
+        )
+
+    column_defs = [
+        (name, _indent(render_column_schema(column))) for name, column in proxy_schema.columns.items()
+    ]
+    column_def = _indent(_dict_literal(column_defs), 1) if column_defs else "{}"
+    if isinstance(proxy_schema.index, metaschema._IndexSchemaProxy):
+        index_def = _indent(render_index_schema(proxy_schema.index), 1)
+    elif isinstance(proxy_schema.index, metaschema._MultiIndexSchemaProxy):
+        index_def = _indent(render_multiindex_schema(proxy_schema.index), 1)
+    else:
+        index_def = repr(None)
+
+    if proxy_schema.checks:
+        checks = _indent(_list_literal(proxy_schema.checks, linebreak=False))
+    else:
+        checks = repr(None)
+
+    table_schema = PANDERA_DATAFRAME_SCHEMA_TEMPLATE.format(
+        columns=column_def,
+        index=index_def,
+        checks=checks,
+        strict=True,
+        # allow the pandera schema to coerce inputs if the table is dropped in at run time, in case e.g.
+        # we expected int32 but got int64, which is a non-fatal error
+        coerce=table.run_time_installed if coerce_run_time_tables else False,
+        ordered=False,
+    )
+    return f"{table.snake_case_name}_schema = {table_schema}"
+
+
+class ImportsAndCode(NamedTuple):
+    imports: List[str]
+    code: List[str]
+
+
+def render_pandera_loaders(
+    schema: metaschema.Schema,
+    package: str,
+) -> ImportsAndCode:
+    data_dir = schema.build_options.package_data_dir
+    render_pyarrow_schemas = schema.build_options.pyarrow
+    qualified_pyarrow_module_name = "pyarrow_schemas"
+    import_lines = list()
+    if render_pyarrow_schemas:
+        import_lines.append("\n")
+        import_lines.append(f"from . import pyarrow as {qualified_pyarrow_module_name}")
+    return ImportsAndCode(
+        import_lines,
+        [
+            render_constructor(
+                PANDAS_LOADER_TEMPLATE,
+                kwargs=dict(
+                    table_name=table.snake_case_name,
+                    schema=VarName(f"{table.snake_case_name}_schema"),
+                    package=package,
+                    data_dir=data_dir,
+                    blob_store=(
+                        None if schema.remote_blob_store is None else VarName(REMOTE_BLOB_STORE_VAR_NAME)
+                    ),
+                    md5=table.md5,
+                    pyarrow_schema=(
+                        VarName(
+                            f"{qualified_pyarrow_module_name}.{table.snake_case_name}_pyarrow_schema"
+                        )
+                        if render_pyarrow_schemas
+                        else None
+                    ),
+                ),
+                var_name=f"load_{table.snake_case_name}",
+            )
+            for table in schema.package_tables
+        ],
+    )
+
+
+def render_pandera_module(
+    schema: metaschema.Schema,
+    package: str,
+    coerce_run_time_tables: bool = False,
+    loader_defs: Optional[ImportsAndCode] = None,
+) -> str:
+    if loader_defs is None:
+        loader_defs = (
+            render_pandera_loaders(
+                schema,
+                package=package,
+            )
+            if schema.build_options.package_data_dir
+            else None
+        )
+
+    # stdlib imports
+    all_constraints = itertools.chain.from_iterable(t.constraints for t in schema.types.values())
+    required_stdlib_modules = sorted(
+        set(itertools.chain.from_iterable(c.required_modules() for c in all_constraints))
+    )
+    all_dtypes = set(
+        itertools.chain.from_iterable(
+            (c.pandas(index=c.name in (t.primary_key or [])) for c in t.columns)
+            for t in schema.package_tables
+        )
+    )
+
+    table_schemas = "\n\n".join(
+        render_pandera_table_schema(table, coerce_run_time_tables=coerce_run_time_tables)
+        for table in schema.package_tables
+    )
+
+    any_np_dtypes = any(isinstance(dt, np.dtype) for dt in all_dtypes)
+
+    import_lines = ["import " + modname + "\n" for modname in required_stdlib_modules]
+    if import_lines:
+        import_lines.append("\n")
+    if any_np_dtypes:
+        import_lines.append("import numpy as np\n")
+    if any(isinstance(dt, pd_dtypes.ExtensionDtype) for dt in all_dtypes):
+        import_lines.append("import pandas as pd\n")
+    import_lines.append("import pandera as pa\n")
+    import_lines.append("\n")
+    if any_np_dtypes:
+        import_lines.append(f"import {thds.tabularasa.compat.__name__} # noqa: F401\n")
+        # is there an effective way to check if we have any np numeric dtypes as indices so I can leave out the 'noqa'?
+    if loader_defs:
+        import_lines.append(f"import {thds.tabularasa.loaders.util.__name__}\n")
+    if schema.remote_blob_store is not None:
+        import_lines.append(f"import {thds.tabularasa.schema.files.__name__}\n")
+
+    if loader_defs:
+        import_lines += loader_defs.imports
+
+    imports = "".join(import_lines)
+
+    global_var_defs = []
+    if schema.remote_blob_store is not None:
+        global_var_defs.append(
+            render_blob_store_def(schema.remote_blob_store, REMOTE_BLOB_STORE_VAR_NAME)
+        )
+    globals_ = "\n".join(global_var_defs)
+    loaders = "\n\n".join(loader_defs.code) if loader_defs else ""

+    return autoformat(
+        f"{imports}\n# {AUTOGEN_DISCLAIMER}\n\n{globals_}\n\n{table_schemas}\n\n{loaders}\n"
+    )
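To make the templates above concrete: each `PANDERA_*_SCHEMA_TEMPLATE` expands into a plain pandera declaration in the generated module, and `render_pandera_module` stitches the pieces together in the order imports, autogen disclaimer, globals, table schemas, loaders. The following is a hand-written sketch of what a rendered table schema looks like, assuming a hypothetical table named `foo` with a single non-nullable int32 column and no index, checks, or loaders; the table and column names are illustrative, not taken from the package, and the exact dict formatting is up to `_dict_literal`:

import numpy as np
import pandera as pa

# roughly what render_pandera_table_schema would emit for the hypothetical
# table "foo"; keyword order follows PANDERA_DATAFRAME_SCHEMA_TEMPLATE
foo_schema = pa.DataFrameSchema(
    columns={
        "foo_id": pa.Column(
            np.int32,
            checks=None,
            nullable=False,
            unique=False,
        ),
    },
    index=None,
    checks=None,
    coerce=False,
    strict=True,
    ordered=False,
)

Per the comment in the source, `coerce` is the one knob wired to build behavior: tables installed at run time may be coerced (e.g. int64 data arriving against an int32 schema) rather than rejected outright.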
thds/tabularasa/schema/compilation/pyarrow.py
@@ -0,0 +1,93 @@
+from functools import partial
+from textwrap import indent
+from typing import Dict, Union
+
+import pyarrow as pa
+
+from thds.tabularasa.schema import metaschema
+
+from ._format import autoformat
+from .util import AUTOGEN_DISCLAIMER
+
+_pyarrow_type_to_name: Dict[pa.DataType, str] = {}
+_pyarrow_type_to_name.update(
+    (t(), f"int{t().bit_width}")
+    for t in [
+        pa.int8,
+        pa.int16,
+        pa.int32,
+        pa.int64,
+    ]
+)
+_pyarrow_type_to_name.update(
+    (t(), f"uint{t().bit_width}")
+    for t in [
+        pa.uint8,
+        pa.uint16,
+        pa.uint32,
+        pa.uint64,
+    ]
+)
+_pyarrow_type_to_name.update(
+    (t(), f"float{t().bit_width}") for t in [pa.float16, pa.float32, pa.float64]
+)
+_pyarrow_type_to_name.update((t(), f"date{t().bit_width}") for t in [pa.date32, pa.date64])
+_pyarrow_type_to_name[pa.string()] = "string"
+_pyarrow_type_to_name[pa.bool_()] = "bool_"
+_pyarrow_type_to_name[pa.null()] = "null"
+
+
+def render_pyarrow_schema(
+    schema: metaschema.Schema,
+) -> str:
+    pyarrow_schemas = "\n\n".join(
+        (
+            f"{table.snake_case_name}_pyarrow_schema = {pyarrow_schema_literal(table.parquet_schema)}"
+            for table in schema.package_tables
+        )
+    )
+    return autoformat(f"import {pa.__name__}\n\n# {AUTOGEN_DISCLAIMER}\n\n{pyarrow_schemas}\n")
+
+
+def pyarrow_schema_literal(schema: pa.Schema) -> str:
+    return _pyarrow_schema_literal(schema, "schema")
+
+
+def pyarrow_field_literal(field: pa.Field) -> str:
+    t = field.type
+    if not t.num_fields:
+        return (
+            f'{pa.__name__}.field("{field.name}", {pyarrow_type_literal(field.type)}, '
+            f"nullable={field.nullable!r})"
+        )
+    else:
+        return (
+            f'{pa.__name__}.field(\n    "{field.name}",\n'
+            f'{indent(pyarrow_type_literal(field.type), "    ")},\n    nullable={field.nullable!r},\n)'
+        )
+
+
+def pyarrow_type_literal(type_: pa.DataType) -> str:
+    if isinstance(type_, pa.StructType):
+        return _pyarrow_schema_literal(type_, "struct")
+    elif isinstance(type_, pa.ListType):
+        v = type_.value_type
+        return f"{pa.__name__}.list_({pyarrow_type_literal(v)})"
+    elif isinstance(type_, pa.FixedSizeListType):
+        v = type_.value_type
+        return f"{pa.__name__}.list_({pyarrow_type_literal(v)}, list_size={v.list_size})"
+    elif isinstance(type_, pa.MapType):
+        k, v = type_.key_type, type_.item_type
+        return f"{pa.__name__}.map_({pyarrow_type_literal(k)}, {pyarrow_type_literal(v)})"
+    elif isinstance(type_, pa.TimestampType):
+        tz = "None" if type_.tz is None else f'"{type_.tz}"'
+        return f'{pa.__name__}.timestamp("{type_.unit}", {tz})'
+    else:
+        return f"{pa.__name__}.{_pyarrow_type_to_name[type_]}()"
+
+
+def _pyarrow_schema_literal(type_: Union[pa.Schema, pa.StructType], kind: str) -> str:
+    indent_ = partial(indent, prefix="    ")
+    fields = map(indent_, map(pyarrow_field_literal, type_))
+    sep = ",\n"
+    return f"{pa.__name__}.{kind}([\n{sep.join(fields)}\n])"