thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,252 @@
1
+ import itertools
2
+ from typing import List, NamedTuple, Optional
3
+
4
+ import numpy as np
5
+ import pandas.core.dtypes.dtypes as pd_dtypes
6
+ import pandera as pa
7
+
8
+ import thds.tabularasa.loaders.util
9
+ from thds.tabularasa.schema import metaschema
10
+
11
+ from ._format import autoformat
12
+ from .util import (
13
+ AUTOGEN_DISCLAIMER,
14
+ VarName,
15
+ _dict_literal,
16
+ _indent,
17
+ _list_literal,
18
+ constructor_template,
19
+ render_blob_store_def,
20
+ render_constructor,
21
+ )
22
+
23
# Name of the module-level variable that generated modules use for the shared
# remote blob store instance (see render_blob_store_def / render_pandera_module).
REMOTE_BLOB_STORE_VAR_NAME = "REMOTE_BLOB_STORE"

# Source-code template for a `pa.DataFrameSchema(...)` literal.  The `%`
# interpolation bakes the class name in once at import time; the `{...}`
# placeholders are filled per table by render_pandera_table_schema.
PANDERA_DATAFRAME_SCHEMA_TEMPLATE = (
    """pa.%s(
    columns={columns},
    index={index},
    checks={checks},
    coerce={coerce!r},
    strict={strict!r},
    ordered={ordered!r},
)"""
    % pa.DataFrameSchema.__name__
)

# Source-code template for a single `pa.Column(...)` literal.
PANDERA_COLUMN_SCHEMA_TEMPLATE = (
    """pa.%s(
    {dtype},
    checks={checks},
    nullable={nullable!r},
    unique={unique!r},
)"""
    % pa.Column.__name__
)

# Source-code template for a single `pa.Index(...)` literal.
PANDERA_INDEX_SCHEMA_TEMPLATE = (
    """pa.%s(
    {dtype},
    checks={checks},
    nullable={nullable!r},
    unique={unique!r},
    name={name!r},
)"""
    % pa.Index.__name__
)

# Source-code template for a `pa.MultiIndex([...])` literal; `{indexes}` is a
# pre-rendered, pre-indented sequence of PANDERA_INDEX_SCHEMA_TEMPLATE literals.
PANDERA_MULTIINDEX_SCHEMA_TEMPLATE = (
    """pa.%s(
    [
        {indexes},
    ],
    strict={strict!r},
    ordered=True,
)"""
    % pa.MultiIndex.__name__
)

# Constructor-call template for PandasParquetLoader.from_pandera_schema, used to
# render one `load_<table> = ...` definition per table; `filename` is excluded
# because it is resolved by the loader at run time.
PANDAS_LOADER_TEMPLATE = constructor_template(
    thds.tabularasa.loaders.util.PandasParquetLoader.from_pandera_schema,
    module_name=(
        f"{thds.tabularasa.loaders.util.__name__}.{thds.tabularasa.loaders.util.PandasParquetLoader.__name__}"
    ),
    exclude=["filename"],
)
76
+
77
+
78
def render_pandera_table_schema(table: metaschema.Table, coerce_run_time_tables: bool) -> str:
    """Render an assignment `<table>_schema = pa.DataFrameSchema(...)` as source code.

    The pandera schema is first built as a string-proxy structure by the
    metaschema layer, then each piece is formatted into the module-level
    templates.  When ``coerce_run_time_tables`` is true, tables flagged as
    run-time-installed get ``coerce=True`` in the generated schema.
    """
    proxy: metaschema._DataFrameSchemaProxy
    proxy = metaschema.render_pandera_schema(table, as_str=True)  # type: ignore

    def _checks_literal(exprs: Optional[List[str]]) -> str:
        # An absent or empty check list renders as the literal `None`.
        return _indent(_list_literal(exprs, linebreak=False), 1) if exprs else repr(None)

    def _column_literal(col: metaschema._ColumnSchemaProxy) -> str:
        return PANDERA_COLUMN_SCHEMA_TEMPLATE.format(
            dtype=col.dtype,
            checks=_checks_literal(col.checks),
            nullable=col.nullable,
            unique=col.unique,
        )

    def _index_literal(idx: metaschema._IndexSchemaProxy) -> str:
        return PANDERA_INDEX_SCHEMA_TEMPLATE.format(
            dtype=idx.dtype,
            checks=_checks_literal(idx.checks),
            nullable=idx.nullable,
            unique=idx.unique,
            name=idx.name,
        )

    def _multiindex_literal(midx: metaschema._MultiIndexSchemaProxy) -> str:
        rendered = ",\n".join(_index_literal(ix) for ix in midx.indexes)
        return PANDERA_MULTIINDEX_SCHEMA_TEMPLATE.format(
            indexes=_indent(rendered, 2),
            strict=midx.strict,
        )

    entries = [(name, _indent(_column_literal(col))) for name, col in proxy.columns.items()]
    columns_def = _indent(_dict_literal(entries), 1) if entries else "{}"

    index = proxy.index
    if isinstance(index, metaschema._IndexSchemaProxy):
        index_def = _indent(_index_literal(index), 1)
    elif isinstance(index, metaschema._MultiIndexSchemaProxy):
        index_def = _indent(_multiindex_literal(index), 1)
    else:
        index_def = repr(None)

    table_checks = (
        _indent(_list_literal(proxy.checks, linebreak=False)) if proxy.checks else repr(None)
    )

    schema_literal = PANDERA_DATAFRAME_SCHEMA_TEMPLATE.format(
        columns=columns_def,
        index=index_def,
        checks=table_checks,
        strict=True,
        # allow the pandera schema to coerce inputs if the table is dropped in at run time, in case e.g.
        # we expected int32 but got int64, which is a non-fatal error
        coerce=table.run_time_installed if coerce_run_time_tables else False,
        ordered=False,
    )
    return f"{table.snake_case_name}_schema = {schema_literal}"
138
+
139
+
140
class ImportsAndCode(NamedTuple):
    """Source fragments for a generated module, split into imports and code."""

    # lines to emit in the generated module's import section
    imports: List[str]
    # top-level definition blocks to emit after the imports (joined with blank lines)
    code: List[str]
143
+
144
+
145
def render_pandera_loaders(
    schema: metaschema.Schema,
    package: str,
) -> ImportsAndCode:
    """Render one `load_<table> = PandasParquetLoader...` definition per package table.

    Returns the extra import lines the generated module needs (the local
    pyarrow-schemas module, when pyarrow schemas are being rendered) together
    with the loader constructor-call source for each table.
    """
    data_dir = schema.build_options.package_data_dir
    use_pyarrow = schema.build_options.pyarrow
    pyarrow_module = "pyarrow_schemas"

    imports: List[str] = []
    if use_pyarrow:
        imports.append("\n")
        imports.append(f"from . import pyarrow as {pyarrow_module}")

    # shared across all tables; only emitted when the schema has a blob store
    blob_store = None if schema.remote_blob_store is None else VarName(REMOTE_BLOB_STORE_VAR_NAME)

    loader_code = []
    for table in schema.package_tables:
        pyarrow_schema = (
            VarName(f"{pyarrow_module}.{table.snake_case_name}_pyarrow_schema")
            if use_pyarrow
            else None
        )
        loader_code.append(
            render_constructor(
                PANDAS_LOADER_TEMPLATE,
                kwargs=dict(
                    table_name=table.snake_case_name,
                    schema=VarName(f"{table.snake_case_name}_schema"),
                    package=package,
                    data_dir=data_dir,
                    blob_store=blob_store,
                    md5=table.md5,
                    pyarrow_schema=pyarrow_schema,
                ),
                var_name=f"load_{table.snake_case_name}",
            )
        )
    return ImportsAndCode(imports, loader_code)
183
+
184
+
185
def render_pandera_module(
    schema: metaschema.Schema,
    package: str,
    coerce_run_time_tables: bool = False,
    loader_defs: Optional[ImportsAndCode] = None,
) -> str:
    """Render the full source of the generated pandera module for *schema*.

    Assembles: required imports (computed from the dtypes/constraints actually
    used), an autogen disclaimer, the optional remote-blob-store global, one
    DataFrameSchema per package table, and (optionally) the loader defs.
    The result is passed through `autoformat` before returning.
    """
    # Default: render loaders only when the package ships data files at all.
    if loader_defs is None:
        loader_defs = (
            render_pandera_loaders(
                schema,
                package=package,
            )
            if schema.build_options.package_data_dir
            else None
        )

    # stdlib imports
    all_constraints = itertools.chain.from_iterable(t.constraints for t in schema.types.values())
    required_stdlib_modules = sorted(
        set(itertools.chain.from_iterable(c.required_modules() for c in all_constraints))
    )
    # Every pandas dtype used by any package table; primary-key columns are
    # rendered as index dtypes (hence the `index=` flag per column).
    all_dtypes = set(
        itertools.chain.from_iterable(
            (c.pandas(index=c.name in (t.primary_key or [])) for c in t.columns)
            for t in schema.package_tables
        )
    )

    table_schemas = "\n\n".join(
        render_pandera_table_schema(table, coerce_run_time_tables=coerce_run_time_tables)
        for table in schema.package_tables
    )

    any_np_dtypes = any(isinstance(dt, np.dtype) for dt in all_dtypes)

    # Import lines are emitted only when the generated code will reference them,
    # to keep the generated module lint-clean.
    import_lines = ["import " + modname + "\n" for modname in required_stdlib_modules]
    if import_lines:
        import_lines.append("\n")
    if any_np_dtypes:
        import_lines.append("import numpy as np\n")
    if any(isinstance(dt, pd_dtypes.ExtensionDtype) for dt in all_dtypes):
        import_lines.append("import pandas as pd\n")
    import_lines.append("import pandera as pa\n")
    import_lines.append("\n")
    if any_np_dtypes:
        # NOTE(review): relies on `thds.tabularasa.compat` being importable as an
        # attribute of the already-imported thds.tabularasa package — confirm it
        # is imported (directly or transitively) at module load.
        import_lines.append(f"import {thds.tabularasa.compat.__name__} # noqa: F401\n")
    # is there an effective way to check if we have any np numeric dtypes as indices so I can leave out the 'noqa'?
    if loader_defs:
        import_lines.append(f"import {thds.tabularasa.loaders.util.__name__}\n")
    # NOTE(review): presumably needed whenever the blob-store def is rendered
    # below (which is independent of loader_defs) — verify nesting against the
    # original source; extraction made the indentation ambiguous here.
    if schema.remote_blob_store is not None:
        import_lines.append(f"import {thds.tabularasa.schema.files.__name__}\n")

    if loader_defs:
        import_lines += loader_defs.imports

    imports = "".join(import_lines)

    global_var_defs: List[str] = []
    if schema.remote_blob_store is not None:
        global_var_defs.append(
            render_blob_store_def(schema.remote_blob_store, REMOTE_BLOB_STORE_VAR_NAME)
        )
    globals_ = "\n".join(global_var_defs)
    loaders = "\n\n".join(loader_defs.code) if loader_defs else ""

    return autoformat(
        f"{imports}\n# {AUTOGEN_DISCLAIMER}\n\n{globals_}\n\n{table_schemas}\n\n{loaders}\n"
    )
@@ -0,0 +1,93 @@
1
+ from functools import partial
2
+ from textwrap import indent
3
+ from typing import Dict, Union
4
+
5
+ import pyarrow as pa
6
+
7
+ from thds.tabularasa.schema import metaschema
8
+
9
+ from ._format import autoformat
10
+ from .util import AUTOGEN_DISCLAIMER
11
+
12
# Maps leaf (non-nested) pyarrow DataType instances to the name of the
# `pyarrow` constructor that produces them, so pyarrow_type_literal can render
# e.g. `pyarrow.int32()`.  Arrow DataType instances hash by value, so the
# singleton instances below work as dict keys for any equal type.
_pyarrow_type_to_name: Dict[pa.DataType, str] = {
    pa.int8(): "int8",
    pa.int16(): "int16",
    pa.int32(): "int32",
    pa.int64(): "int64",
    pa.uint8(): "uint8",
    pa.uint16(): "uint16",
    pa.uint32(): "uint32",
    pa.uint64(): "uint64",
    pa.float16(): "float16",
    pa.float32(): "float32",
    pa.float64(): "float64",
    pa.date32(): "date32",
    pa.date64(): "date64",
    pa.string(): "string",
    pa.bool_(): "bool_",
    pa.null(): "null",
}
38
+
39
+
40
def render_pyarrow_schema(
    schema: metaschema.Schema,
) -> str:
    """Render the generated pyarrow-schemas module source for *schema*.

    Emits one `<table>_pyarrow_schema = pyarrow.schema([...])` assignment per
    package table, preceded by the pyarrow import and the autogen disclaimer,
    then runs the whole module through `autoformat`.
    """
    schema_defs = []
    for table in schema.package_tables:
        literal = pyarrow_schema_literal(table.parquet_schema)
        schema_defs.append(f"{table.snake_case_name}_pyarrow_schema = {literal}")
    body = "\n\n".join(schema_defs)
    return autoformat(f"import {pa.__name__}\n\n# {AUTOGEN_DISCLAIMER}\n\n{body}\n")
50
+
51
+
52
def pyarrow_schema_literal(schema: pa.Schema) -> str:
    """Render *schema* as a `pyarrow.schema([...])` source-code literal."""
    return _pyarrow_schema_literal(schema, "schema")
54
+
55
+
56
def pyarrow_field_literal(field: pa.Field) -> str:
    """Render *field* as a `pyarrow.field(...)` source-code literal.

    Fields with leaf types render on a single line; fields whose type has
    child fields (structs, lists, maps) get a multi-line literal with the
    nested type literal indented.
    """
    t = field.type
    if not t.num_fields:
        # leaf type: one-line field literal
        return (
            f'{pa.__name__}.field("{field.name}", {pyarrow_type_literal(field.type)}, '
            f"nullable={field.nullable!r})"
        )
    else:
        # nested type: spread the field over multiple lines, indenting the
        # (itself possibly multi-line) type literal
        return (
            f'{pa.__name__}.field(\n "{field.name}",\n'
            f'{indent(pyarrow_type_literal(field.type), " ")},\n nullable={field.nullable!r},\n)'
        )
68
+
69
+
70
def pyarrow_type_literal(type_: pa.DataType) -> str:
    """Render *type_* as a pyarrow constructor-call source-code literal.

    Nested types (struct/list/fixed-size list/map/timestamp) recurse into
    their child types; leaf types are looked up in `_pyarrow_type_to_name`.

    Raises:
        KeyError: for leaf types not covered by `_pyarrow_type_to_name`.
    """
    if isinstance(type_, pa.StructType):
        return _pyarrow_schema_literal(type_, "struct")
    elif isinstance(type_, pa.ListType):
        v = type_.value_type
        return f"{pa.__name__}.list_({pyarrow_type_literal(v)})"
    elif isinstance(type_, pa.FixedSizeListType):
        v = type_.value_type
        # BUGFIX: `list_size` lives on the FixedSizeListType itself, not on its
        # value type — the old `v.list_size` raised AttributeError at run time.
        return f"{pa.__name__}.list_({pyarrow_type_literal(v)}, list_size={type_.list_size})"
    elif isinstance(type_, pa.MapType):
        k, v = type_.key_type, type_.item_type
        return f"{pa.__name__}.map_({pyarrow_type_literal(k)}, {pyarrow_type_literal(v)})"
    elif isinstance(type_, pa.TimestampType):
        tz = "None" if type_.tz is None else f'"{type_.tz}"'
        return f'{pa.__name__}.timestamp("{type_.unit}", {tz})'
    else:
        return f"{pa.__name__}.{_pyarrow_type_to_name[type_]}()"
87
+
88
+
89
def _pyarrow_schema_literal(type_: Union[pa.Schema, pa.StructType], kind: str) -> str:
    """Render the fields of *type_* as a multi-line `pyarrow.<kind>([...])` literal.

    *kind* is the pyarrow factory name to emit: "schema" for a top-level
    schema, "struct" for a nested struct type.
    """
    rendered_fields = (indent(pyarrow_field_literal(field), " ") for field in type_)
    body = ",\n".join(rendered_fields)
    return f"{pa.__name__}.{kind}([\n{body}\n])"