thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1007 @@
import datetime
import itertools
import typing
from collections import defaultdict
from typing import (
    Any,
    Callable,
    Dict,
    Iterator,
    List,
    Mapping,
    NamedTuple,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)
from warnings import warn

import networkx as nx
import numpy as np
import pandera as pa
import pyarrow
import typing_extensions
from pandas.core.dtypes import base as pd_dtypes
from pydantic import AnyUrl, BaseModel, Extra, Field

from thds.tabularasa.schema.files import FileSourceMixin

from .constraints import AnyColumnConstraint, EnumConstraint
from .dtypes import AnyDtype, DType
from .files import ADLSDataSpec, LocalDataSpec, RemoteBlobStoreSpec, TabularFileSource
from .util import (
    DocumentedMixin,
    DottedIdentifier,
    EnumList,
    HexStr,
    Identifier,
    NonEmptyStr,
    PathStr,
    predecessor_graph,
    render_dtype,
    snake_case,
    snake_to_title,
)

JSON = Dict[str, Union[Dict[str, Any], List[Any], int, float, str, bool, None]]

IdTuple = Tuple[Identifier, ...]


class AnonCustomType(DocumentedMixin):
    type: DType
    constraints: List[AnyColumnConstraint] = Field(min_items=1)

    def with_name(self, name: Identifier) -> "CustomType":
        return CustomType(
            type=self.type, constraints=self.constraints, doc=self.doc, markup=self.markup, name=name
        )

    @property
    def python(self) -> Type:
        enum = self.enum
        if enum is not None:
            return cast(Type, typing_extensions.Literal[tuple(enum.enum)])
        return self.type.python

    def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False) -> str:
        if builtin:
            return self.type.python_type_literal(build_options=build_options, builtin=builtin)
        else:
            return self.python_type_def_literal(build_options=build_options)

    def python_type_def_literal(self, build_options: "BuildOptions"):
        enum = self.enum
        if enum is not None:
            module = typing_extensions if build_options.require_typing_extensions else typing
            values = ", ".join(map(repr, enum.enum))
            return f"{module.__name__}.Literal[{values}]"
        else:
            return self.type.python_type_literal(builtin=False, build_options=build_options)

    @property
    def parquet(self) -> pyarrow.DataType:
        return self.type.parquet

    @property
    def enum(self) -> Optional[EnumConstraint]:
        return next((c for c in self.constraints if isinstance(c, EnumConstraint)), None)

    def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
        imports = self.type.attrs_required_imports(build_options=build_options)
        extra = None
        if self.enum is not None:
            # render as literal
            # typing_extensions is technically not a std-lib module, but it's easier to account for it this way
            extra = "typing_extensions" if build_options.require_typing_extensions else "typing"
        elif build_options.use_newtypes:
            # render as newtype
            extra = "typing"
        if extra is not None:
            imports.add(extra)
        return imports

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield from self.type.custom_type_refs

    @property
    def comment(self) -> Optional[str]:
        comments = list(filter(None, (con.comment_expr() for con in self.constraints)))
        if comments:
            return "; ".join(comments)
        return None


class CustomType(AnonCustomType, extra=Extra.forbid):
    name: Identifier

    @property
    def class_name(self) -> str:
        return snake_to_title(self.name)

    def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False) -> str:
        if builtin:
            return self.type.python_type_literal(build_options=build_options, builtin=builtin)
        else:
            return self.class_name

    def python_type_def_literal(self, build_options: "BuildOptions"):
        literal = super().python_type_def_literal(build_options)
        if build_options.use_newtypes and self.enum is None:
            return f'typing.NewType("{self.class_name}", {literal})'
        else:
            return literal

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield self.name
        yield from super().custom_type_refs

    def from_external(
        self, ref: "ExternalSchemaRef", new_name: Optional[str]
    ) -> Union["CustomType", "ExternalCustomType"]:
        if ref.package is None or ref.derived_code_submodule is None:
            warn(
                f"Either package or derived_code_submodule not specified for external type ref "
                f"'{new_name}' originally named '{self.name}'; definition will be duplicated rather "
                f"than imported"
            )
            return self

        return ExternalCustomType(
            type=self.type,
            constraints=self.constraints,
            name=new_name or self.name,
            external_name=self.name,
            package=ref.package,
            derived_code_submodule=ref.derived_code_submodule,
        )

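
When `use_newtypes` is enabled (see `BuildOptions` below), a named custom type without an enum constraint renders as a `typing.NewType` alias, while an enum-constrained type renders as a `Literal` (from `typing_extensions` when `require_typing_extensions` is set). A minimal sketch of what the generated definitions look like, with hypothetical type names:

    import typing
    import typing_extensions

    ZipCode = typing.NewType("ZipCode", str)  # non-enum custom type, use_newtypes=True
    Region = typing_extensions.Literal["northeast", "south"]  # enum-constrained custom type
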
class ExternalCustomType(CustomType, extra=Extra.forbid):
    package: DottedIdentifier
    derived_code_submodule: DottedIdentifier
    external_name: Identifier

    @property
    def module_path(self) -> str:
        return f"{self.package}.{self.derived_code_submodule}.attrs"

    @property
    def external_class_name(self) -> str:
        return snake_to_title(self.external_name)

    @property
    def import_spec(self) -> Tuple[str, str]:
        old_name = self.external_class_name
        new_name = self.class_name
        return self.module_path, old_name if old_name == new_name else f"{old_name} as {new_name}"

    def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
        if build_options.import_external_types:
            return set()
        else:
            return super().attrs_required_imports(build_options)


class _CustomTypeRef(BaseModel, extra=Extra.forbid):
    custom: Identifier

    def __str__(self):
        return repr(self.custom)

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield self.custom


class ExternalSchemaRef(BaseModel, extra=Extra.forbid):
    schema_path: str
    package: Optional[DottedIdentifier] = None
    derived_code_submodule: Optional[DottedIdentifier] = None


class ExternalTypeRef(BaseModel, extra=Extra.forbid):
    schema_name: Identifier
    type_name: Identifier


class _ComplexBaseType(BaseModel, extra=Extra.forbid):
    @property
    def sqlite(self) -> str:
        return "JSON"

    @property
    def enum(self) -> Optional[EnumConstraint]:
        return None

    def pandas(
        self,
        nullable: bool = False,
        index: bool = False,
        enum: Optional[EnumList] = None,
        ordered: bool = False,
    ):
        return np.dtype("object")


class _RawArrayType(_ComplexBaseType):
    values: Union[DType, _CustomTypeRef, AnonCustomType, "_RawArrayType", "_RawMappingType"]

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield from self.values.custom_type_refs


class ArrayType(_RawArrayType):
    values: Union[DType, CustomType, AnonCustomType, "ArrayType", "MappingType"]

    @property
    def python(self) -> Type[List]:
        return List[self.values.python]  # type: ignore

    @property
    def parquet(self) -> pyarrow.DataType:
        return pyarrow.list_(self.values.parquet)

    def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
        return {"typing", *self.values.attrs_required_imports(build_options=build_options)}

    def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False):
        return f"typing.List[{self.values.python_type_literal(build_options=build_options, builtin=builtin)}]"


class _RawMappingType(_ComplexBaseType, extra=Extra.forbid):
    keys: Union[DType, _CustomTypeRef, AnonCustomType]
    values: Union[DType, _CustomTypeRef, AnonCustomType, "_RawArrayType", "_RawMappingType"]

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield from self.keys.custom_type_refs
        yield from self.values.custom_type_refs


class MappingType(_RawMappingType, extra=Extra.forbid):
    keys: Union[DType, CustomType, AnonCustomType]
    values: Union[DType, CustomType, AnonCustomType, "ArrayType", "MappingType"]

    @property
    def python(self) -> Type[Dict]:
        return Dict[self.keys.python, self.values.python]  # type: ignore

    @property
    def parquet(self) -> pyarrow.DataType:
        return pyarrow.map_(self.keys.parquet, self.values.parquet)

    def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
        return {
            "typing",
            *self.keys.attrs_required_imports(build_options),
            *self.values.attrs_required_imports(build_options),
        }

    def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False):
        return (
            f"typing.Dict[{self.keys.python_type_literal(build_options=build_options, builtin=builtin)}, "
            f"{self.values.python_type_literal(build_options=build_options, builtin=builtin)}]"
        )


class UniqueColumnsConstraint(BaseModel, extra=Extra.forbid):
    unique: IdTuple

    @property
    def sqlite(self) -> str:
        return f'UNIQUE ({", ".join(self.unique)})'

    @staticmethod
    def make_pandera_check_expr(unique: IdTuple) -> str:
        return f"pa.{pa.Check.__name__}.unique_across_columns({list(unique)!r})"

    def pandera_check_expr(self) -> str:
        return self.make_pandera_check_expr(self.unique)

    @staticmethod
    def make_pandera_check(unique: IdTuple) -> pa.Check:
        return pa.Check.unique_across_columns(list(unique))

    def pandera_check(self) -> pa.Check:
        return self.make_pandera_check(self.unique)

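
Both pandera hooks delegate to `unique_across_columns`, a custom check registered by importing `thds.tabularasa.loaders.util` (see `render_pandera_schema` below). A quick sketch of the rendered expression, assuming the class above and hypothetical column names:

    constraint = UniqueColumnsConstraint(unique=("county", "state"))
    assert constraint.pandera_check_expr() == "pa.Check.unique_across_columns(['county', 'state'])"
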
# this allows the recursive types above
_RawArrayType.update_forward_refs()
_RawMappingType.update_forward_refs()
ArrayType.update_forward_refs()
MappingType.update_forward_refs()


# Remote data specifications - these are fetched at build time for preprocessing and packaging


class RawDataDependencies(DocumentedMixin):
    preprocessor: DottedIdentifier
    reference: List[Identifier] = Field(default_factory=list)
    adls: List[Identifier] = Field(default_factory=list)
    local: List[Identifier] = Field(default_factory=list)


# Raw types; these are parsed from the yaml and converted to the rich types below by resolving
# references and checking constraints


class _RawColumn(BaseModel, extra=Extra.forbid):
    name: Identifier
    type: Union[DType, _CustomTypeRef, AnonCustomType, _RawArrayType, _RawMappingType]
    nullable: bool = False
    doc: NonEmptyStr
    source_name: Optional[str] = None
    na_values: Optional[Set[str]] = None

    def with_attrs(
        self,
        *,
        name: Optional[Identifier] = None,
        nullable: Optional[bool] = None,
        doc: Optional[NonEmptyStr] = None,
        source_name: Optional[str] = None,
    ):
        cls = type(self)
        return cls(
            name=self.name if name is None else name,
            type=self.type,
            nullable=self.nullable if nullable is None else nullable,
            doc=self.doc if doc is None else doc,
            source_name=self.source_name if source_name is None else source_name,
        )

    @property
    def snake_case_name(self) -> str:
        return snake_case(self.name)

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        yield from self.type.custom_type_refs


class InheritanceSpec(BaseModel, extra=Extra.forbid):
    """Specification for columns in a table which inherits columns from other tables (usually transient
    tables which are then used in joins or filtered to a subset). The `tables` attribute is a list of
    tables to inherit columns from, in order of precedence. The `columns` attribute is an optional set
    of columns to include from any of the tables. When absent, all columns from all tables are included.
    """

    tables: Sequence[Identifier]
    columns: Set[Identifier] = Field(default_factory=set)
    update_docs: Mapping[Identifier, str] = Field(default_factory=dict)
    update_nullability: Mapping[Identifier, bool] = Field(default_factory=dict)
    update_source_name: Mapping[Identifier, str] = Field(default_factory=dict)

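
For illustration, a table inheriting from two upstream tables while restricting and re-documenting the inherited columns might declare (hypothetical table and column names):

    spec = InheritanceSpec(
        tables=["counties_raw", "states_raw"],  # earlier tables win on column-name collisions
        columns={"fips", "name"},               # inherit only these columns
        update_docs={"fips": "County FIPS code, carried over from the raw table"},
        update_nullability={"name": True},
    )
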
class _RawTable(BaseModel, extra=Extra.forbid):
    columns: Sequence[_RawColumn] = Field(default_factory=list)
    doc: NonEmptyStr
    dependencies: Optional[Union[TabularFileSource, RawDataDependencies]] = None
    inherit_schema: Optional[InheritanceSpec] = None
    constraints: List[UniqueColumnsConstraint] = Field(default_factory=list)
    primary_key: Optional[Tuple[Identifier, ...]] = None
    indexes: List[IdTuple] = Field(default_factory=list)
    md5: Optional[HexStr] = None
    # flag to indicate that a table is only defined as a dependency for other tables;
    # if true, no package data will be written and no accessor code will be generated
    transient: bool = False
    # flag to indicate that this table's data is installed at runtime;
    # accessor code will still be generated, but no package data will be produced at build time
    build_time_installed: bool = True
    title: Optional[str] = None

    def resolve_inherited_columns(self, schema: "_RawSchema") -> Sequence[_RawColumn]:
        if self.inherit_schema is None:
            return self.columns
        else:
            tables_with_precedence = [
                schema.tables[name] for name in self.inherit_schema.tables if name in schema.tables
            ]
            columns = list(self.columns)
            permitted_column_names = self.inherit_schema.columns
            used_column_names = set(c.name for c in self.columns)
            for table in tables_with_precedence:
                for column in table.resolve_inherited_columns(schema):
                    if column.name not in used_column_names and (
                        not permitted_column_names or (column.name in permitted_column_names)
                    ):
                        if (
                            column.name in self.inherit_schema.update_docs
                            or column.name in self.inherit_schema.update_nullability
                            or column.name in self.inherit_schema.update_source_name
                        ):
                            column = column.with_attrs(
                                doc=self.inherit_schema.update_docs.get(column.name),
                                nullable=self.inherit_schema.update_nullability.get(column.name),
                                source_name=self.inherit_schema.update_source_name.get(column.name),
                            )
                        columns.append(column)
                        used_column_names.add(column.name)
            return columns

    @property
    def packaged(self) -> bool:
        return not self.transient

    @property
    def run_time_installed(self) -> bool:
        return not self.build_time_installed

    @property
    def custom_type_refs(self) -> Iterator[Identifier]:
        for column in self.columns:
            yield from column.custom_type_refs

    def _graph_ref(self, name: str):
        # _RawTable has no name attribute but this is needed in DAG validation prior to construction of
        # final schema
        return TransientReferenceDataRef(name) if self.transient else ReferenceDataRef(name)


class CustomStr(str):
    """These exist to allow usage of strings as nodes in the networkx computational DAG without
    collision - a local table and an ADLS resource could have the same name and not collide in the hash
    table that underlies the networkx graph."""

    _name: str

    def __eq__(self, other):
        return type(self) is type(other) and super().__eq__(other)

    def __repr__(self):
        return f"{self._name}({super().__str__()})"

    def __hash__(self):
        return hash(repr(self))


class ADLSRef(CustomStr):
    _name = "ADLS"


class LocalRef(CustomStr):
    _name = "Local"


class TabularTextFileRef(CustomStr):
    _name = "TabularTextFile"


class ReferenceDataRef(CustomStr):
    _name = "ReferenceData"


class TransientReferenceDataRef(ReferenceDataRef):
    _name = "ReferenceData"

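
Because `__eq__` demands an exact type match while `__hash__` folds in the `_name` tag, the same underlying string wrapped in different ref classes yields distinct graph nodes. A quick illustration (the name "census" is hypothetical):

    assert ADLSRef("census") != LocalRef("census")
    assert hash(ADLSRef("census")) != hash(LocalRef("census"))
    assert repr(ADLSRef("census")) == "ADLS(census)"
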
class BuildOptions(BaseModel, extra=Extra.forbid):
    # interface
    derived_code_submodule: DottedIdentifier
    attrs: bool
    sqlite_data: bool
    sqlite_interface: bool
    pandas: bool
    pyarrow: bool
    # interface options
    type_constraint_comments: bool = True
    validate_transient_tables: bool = True
    # set this to true if you want to generate code that's compatible with python 3.7 and lower
    require_typing_extensions: bool = False
    # import types from external schemas, or re-render them?
    import_external_types: bool = True
    # render custom types with constraints as typing.NewType instances?
    use_newtypes: bool = True
    # boolean to override behavior of dropping types not referenced by any table;
    # allows a schema that defines only types to render source code definitions
    render_all_types: bool = False
    # data
    package_data_dir: Optional[PathStr] = None
    transient_data_dir: Optional[PathStr] = None
    sqlite_db_path: Optional[PathStr] = None
    package_data_file_size_limit: Optional[int] = None
    # docs
    repo_url: Optional[AnyUrl] = None
    table_docs_dir: Optional[str] = None
    type_docs_path: Optional[str] = None
    source_docs_path: Optional[str] = None
    curation_badge_path: Optional[str] = None

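
Only the six interface fields lack defaults, so a minimal configuration needs just those. A sketch (the submodule name is hypothetical):

    opts = BuildOptions(
        derived_code_submodule="generated",
        attrs=True,
        sqlite_data=False,
        sqlite_interface=False,
        pandas=True,
        pyarrow=True,
    )
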
class _RawSchema(BaseModel, extra=Extra.forbid):
    tables: Mapping[Identifier, _RawTable] = Field(default_factory=dict)
    types: Mapping[Identifier, Union[AnonCustomType, ExternalTypeRef]] = Field(default_factory=dict)
    external_schemas: Mapping[Identifier, ExternalSchemaRef] = Field(default_factory=dict)
    remote_data: Mapping[Identifier, ADLSDataSpec] = Field(default_factory=dict)
    local_data: Mapping[Identifier, LocalDataSpec] = Field(default_factory=dict)
    remote_blob_store: Optional[RemoteBlobStoreSpec] = None
    build_options: BuildOptions

    def inheritance_dag(self) -> nx.DiGraph:
        dag = nx.DiGraph()
        for table_name, table in self.tables.items():
            if table.inherit_schema is not None:
                dag.add_edges_from(
                    (table_name, inherited)
                    for inherited in table.inherit_schema.tables
                    if inherited in self.tables
                )
        return dag


# Final materialized schema types; these extend the raw types and override the types of some fields to
# reflect resolution of references within the schema

ResolvedDType = Union[DType, AnonCustomType, CustomType, ArrayType, MappingType]


class Column(_RawColumn):
    type: ResolvedDType

    @property
    def dtype(self) -> Union[DType, ArrayType, MappingType]:
        return self.type.type if isinstance(self.type, (AnonCustomType, CustomType)) else self.type

    def pandas(self, index: bool = False) -> AnyDtype:
        enum = self.type.enum
        if enum is not None:
            return self.dtype.pandas(
                nullable=self.nullable, enum=enum.enum, ordered=enum.ordered, index=index
            )
        else:
            return self.dtype.pandas(nullable=self.nullable, index=index)

    def pandas_dtype_literal(self, index: bool = False) -> str:
        dtype = self.pandas(index=index)
        rendered = render_dtype(dtype)

        if index and isinstance(dtype, np.dtype) and dtype.kind in "iuf":
            # numeric index dtypes must be rendered wrapped in this compat function so that schemas
            # rendered using pandas>=2.0 will still work with pandas<2.0
            return f"thds.tabularasa.compat.resolve_numeric_np_index_dtype_for_pd_version({rendered})"

        return rendered

    @property
    def python(self) -> Type:
        return Optional[self.type.python] if self.nullable else self.type.python  # type: ignore

    def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False) -> str:
        # column type literals are always within the body of a record class def,
        # i.e. not a custom type def
        literal = self.type.python_type_literal(build_options=build_options, builtin=builtin)
        return f"typing.Optional[{literal}]" if self.nullable else literal

    @property
    def header_name(self) -> str:
        if self.source_name is None:
            return self.name
        return self.source_name

    @property
    def parquet_field(self) -> pyarrow.Field:
        metadata = dict(doc=self.doc.encode())
        return pyarrow.field(self.snake_case_name, self.type.parquet, self.nullable, metadata=metadata)

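
The compat wrapper appears only in rendered literals for numeric index dtypes; everything else renders bare. The two shapes of output look like this, where `<rendered>` stands for whatever `render_dtype` produced:

    # non-index column, or non-numeric index:
    #     <rendered>
    # int/uint/float index column:
    #     thds.tabularasa.compat.resolve_numeric_np_index_dtype_for_pd_version(<rendered>)
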
class Table(_RawTable):
    # mypy prefers Sequence here since we subclass the type arg from _RawColumn, and Sequence is
    # covariant
    columns: Sequence[Column]
    name: Identifier
    dependencies: Optional[Union[TabularFileSource, RawDataDependencies]]

    @property
    def unique_constraints(self) -> List[UniqueColumnsConstraint]:
        if not self.constraints:
            return []
        return [c for c in self.constraints if isinstance(c, UniqueColumnsConstraint)]

    @property
    def single_column_unique_constraints(self) -> List[Identifier]:
        return [c.unique[0] for c in self.unique_constraints if len(c.unique) == 1]

    @property
    def class_name(self) -> str:
        return snake_to_title(self.name)

    @property
    def snake_case_name(self) -> str:
        return snake_case(self.name)

    @property
    def doc_title(self) -> str:
        if self.title is None:
            return snake_to_title(self.name, separator=" ")
        else:
            return self.title

    def _attrs_required_imports(
        self, build_options: "BuildOptions", sqlite_interface: bool = False
    ) -> Set[str]:
        columns: Iterator[Column]
        if sqlite_interface:
            index_cols = self.index_columns
            # don't need type literals from std lib for custom types; can import class names
            columns = (
                column
                for column in self.columns
                if column.name in index_cols and not isinstance(column.type, CustomType)
            )
        else:
            columns = iter(self.columns)

        modules = set()
        for column in columns:
            if column.nullable and not (
                isinstance(column.type, ExternalCustomType) and build_options.import_external_types
            ):
                modules.add("typing")
            modules.update(column.type.attrs_required_imports(build_options))
        return modules

    def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
        return self._attrs_required_imports(build_options=build_options, sqlite_interface=False)

    def attrs_sqlite_required_imports(self, build_options: "BuildOptions") -> Set[str]:
        return self._attrs_required_imports(build_options=build_options, sqlite_interface=True)

    @property
    def parquet_schema(self) -> pyarrow.Schema:
        metadata = dict(
            doc=self.doc.encode(),
            primary_key=(
                " ".join(map(snake_case, self.primary_key)).encode() if self.primary_key else b""
            ),
        )
        return pyarrow.schema([column.parquet_field for column in self.columns], metadata=metadata)

    @property
    def parquet_casts(self) -> Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]]:
        pk = self.primary_key or ()
        casts: Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]] = {}

        for c in self.columns:
            dtype = c.pandas(index=c.name in pk)
            if isinstance(dtype, pd_dtypes.ExtensionDtype):
                casts[c.snake_case_name] = dtype
            elif isinstance(dtype, np.dtype) and dtype.name not in ("int32", "int64"):
                casts[c.snake_case_name] = dtype

        return casts

    @property
    def csv_na_values(self) -> Dict[str, Set[str]]:
        """Dict of column name to set of string values that should be considered null when reading a
        tabular text file. Used for the `na_values` arg of `pandas.read_csv`."""
        na_values: Dict[str, Set[str]] = {}
        default_na_values = (
            self.dependencies.na_values if isinstance(self.dependencies, TabularFileSource) else None
        )
        for c in self.columns:
            if c.na_values is not None:
                na_values[c.header_name] = c.na_values
            elif c.nullable and default_na_values is not None:
                na_values[c.header_name] = default_na_values
        return na_values

    @property
    def pandera_schema(self) -> pa.DataFrameSchema:
        schema = render_pandera_schema(self, as_str=False)
        return schema  # type: ignore

    @property
    def graph_ref(self) -> ReferenceDataRef:
        """Reference to a node in the computational DAG"""
        return self._graph_ref(self.name)

    @property
    def has_indexes(self) -> bool:
        return bool(self.primary_key) or bool(self.indexes)

    @property
    def index_columns(self) -> Set[Identifier]:
        return set(itertools.chain(self.primary_key or [], itertools.chain.from_iterable(self.indexes)))


# classes for mimicking pandera schema classes, to allow the same code block to generate code and
# a true pandera schema dynamically at runtime


class _ColumnSchemaProxy(NamedTuple):
    dtype: str
    checks: Optional[List[str]]
    nullable: bool
    unique: bool


class _IndexSchemaProxy(NamedTuple):
    dtype: str
    name: Identifier
    checks: Optional[List[str]]
    nullable: bool
    unique: bool


class _MultiIndexSchemaProxy(NamedTuple):
    indexes: List[_IndexSchemaProxy]
    strict: bool


class _DataFrameSchemaProxy(NamedTuple):
    columns: Dict[Identifier, _ColumnSchemaProxy]
    index: Optional[Union[_IndexSchemaProxy, _MultiIndexSchemaProxy]]  # type: ignore
    checks: List[str]
    coerce: bool
    strict: bool
    ordered: bool

def render_pandera_schema(
    table: Table, as_str: bool
) -> Union[_DataFrameSchemaProxy, pa.DataFrameSchema]:
    column_defs: List[Tuple[str, Union[_ColumnSchemaProxy, pa.Column]]] = []
    index_defs: List[Tuple[str, Union[_IndexSchemaProxy, pa.Index]]] = []
    single_col_unique_constraints = set(table.single_column_unique_constraints)
    index_names = set() if table.primary_key is None else set(table.primary_key)
    single_col_index = len(index_names) == 1

    for column in table.columns:
        check_exprs: Optional[Union[List[str], List[pa.Check]]]
        if isinstance(column.type, (AnonCustomType, CustomType)):
            if as_str:
                check_exprs = [c.pandera_check_expr() for c in column.type.constraints]
            else:
                check_exprs = [c.pandera_check() for c in column.type.constraints]
        else:
            check_exprs = None

        if column.name in index_names:
            constructor = _IndexSchemaProxy if as_str else pa.Index
            exprlist = index_defs
            extra_kw = dict(name=column.snake_case_name)
        else:
            constructor = _ColumnSchemaProxy if as_str else pa.Column
            exprlist = column_defs  # type: ignore
            extra_kw = {}

        # always enforce that indexes are unique since they derive from primary key declarations;
        # multi-index uniqueness checks have to be handled with a custom check
        unique = column.name in single_col_unique_constraints or (
            single_col_index and column.name in index_names
        )
        pandas_type: Union[str, AnyDtype]
        if as_str:
            pandas_type = column.pandas_dtype_literal(index=column.name in index_names)
        else:
            pandas_type = column.pandas(index=column.name in index_names)
        expr = constructor(
            dtype=pandas_type,
            checks=check_exprs,
            nullable=column.nullable,
            unique=unique,
            **extra_kw,
        )
        exprlist.append((column.snake_case_name, expr))

    index_def: Optional[Union[_IndexSchemaProxy, _MultiIndexSchemaProxy, pa.Index, pa.MultiIndex]]
    if index_defs:
        if len(index_defs) == 1:
            _, index_def = index_defs[0]
        else:
            constructor = _MultiIndexSchemaProxy if as_str else pa.MultiIndex
            index_def = constructor(
                indexes=[expr for _name, expr in index_defs],
                strict=True,
            )
    else:
        index_def = None

    unique_constraints = [c.unique for c in table.unique_constraints if len(c.unique) > 1]
    if len(index_names) > 1 and not any(index_names == set(u) for u in unique_constraints):
        assert table.primary_key is not None  # make mypy happy
        unique_constraints.append(table.primary_key)

    if unique_constraints:
        from thds.tabularasa.loaders.util import unique_across_columns  # noqa: F401

        # Importing the above ensures the custom pandera check is registered.
        # Ideally, custom pandera checks would be registered in a more central location.
        df_check_exprs = [
            (
                UniqueColumnsConstraint.make_pandera_check_expr(constraint)
                if as_str
                else UniqueColumnsConstraint.make_pandera_check(constraint)
            )
            for constraint in unique_constraints
        ]
    else:
        df_check_exprs = None

    schema_cls = _DataFrameSchemaProxy if as_str else pa.DataFrameSchema
    return schema_cls(
        columns=dict(column_defs),
        index=index_def,
        checks=df_check_exprs,
        coerce=False,
        strict="filter" if table.transient else True,
        ordered=False,
    )

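
The function is thus called in two modes: `as_str=False` returns a live `pa.DataFrameSchema` ready for validation (this is what `Table.pandera_schema` uses), while `as_str=True` returns a `_DataFrameSchemaProxy` whose fields hold rendered source expressions for the code generator to print. A sketch, assuming some resolved `table: Table`:

    live = render_pandera_schema(table, as_str=False)  # pa.DataFrameSchema; validates dataframes now
    proxy = render_pandera_schema(table, as_str=True)  # proxy holding dtype/check source strings
    # each entry of proxy.columns maps a snake_case column name to a _ColumnSchemaProxy
    # whose .dtype is a string literal rather than a dtype object
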
def is_build_time_package_table(table: Table) -> bool:
    return table.build_time_installed and table.packaged


def is_run_time_package_table(table: Table) -> bool:
    return table.run_time_installed and table.packaged


class FileSourceMeta(NamedTuple):
    # full path to the data source spec in the schema structure
    schema_path: List[str]
    name: str
    source: FileSourceMixin


class Schema(_RawSchema):
    """Processed version of a `_RawSchema` that's been passed through validation to ensure the
    integrity of all references, and with names denormalized onto named objects (tables and types)"""

    tables: Mapping[Identifier, Table]
    types: Mapping[Identifier, CustomType]

    @property
    def build_time_package_tables(self) -> Iterator[Table]:
        return self.filter_tables(is_build_time_package_table)

    @property
    def run_time_package_tables(self) -> Iterator[Table]:
        return self.filter_tables(is_run_time_package_table)

    @property
    def package_tables(self) -> Iterator[Table]:
        return self.filter_tables(lambda table: table.packaged)

    @property
    def transient_tables(self) -> Iterator[Table]:
        return self.filter_tables(lambda table: table.transient)

    @property
    def computable_tables(self) -> Iterator[Table]:
        return self.filter_tables(lambda table: table.dependencies is not None)

    def filter_tables(self, predicate: Callable[[Table], bool]) -> Iterator[Table]:
        return filter(predicate, self.tables.values())

    @property
    def all_custom_type_refs(self) -> Set[Identifier]:
        """Every ref to a type that will be rendered as part of this schema"""
        if self.build_options.render_all_types:
            return set(self.types)
        else:
            return set(ref for table in self.package_tables for ref in table.custom_type_refs)

    @property
    def packaged_custom_type_refs(self) -> Set[Identifier]:
        return set(
            ref
            for ref in self.all_custom_type_refs
            if not isinstance(self.types[ref], ExternalCustomType)
        )

    @property
    def external_type_refs(self) -> Set[Identifier]:
        return set(
            ref for ref in self.all_custom_type_refs if isinstance(self.types[ref], ExternalCustomType)
        )

    @property
    def attrs_required_imports(self) -> Set[str]:
        assert self.build_options is not None, "can't generate attrs schema without `build_options`"
        # all types referenced in tables. Includes imports needed for inline-defined anonymous types
        modules = set(
            itertools.chain.from_iterable(
                t.attrs_required_imports(self.build_options) for t in self.tables.values()
            )
        )
        # all top-level defined field types. Includes imports needed for types not used in any table
        modules.update(
            itertools.chain.from_iterable(
                t.attrs_required_imports(self.build_options) for t in self.defined_types
            )
        )
        return modules

    @property
    def all_file_sources(self) -> Iterator[FileSourceMeta]:
        for table_name, table in self.tables.items():
            if isinstance(table.dependencies, FileSourceMixin):
                yield FileSourceMeta(
                    ["tables", table_name, "dependencies"], table_name, table.dependencies
                )
        sources: Mapping[str, FileSourceMixin]
        for type_name, sources in [("local_data", self.local_data), ("remote_data", self.remote_data)]:
            for source_name, source in sources.items():
                yield FileSourceMeta([type_name, source_name], source_name, source)

    def sources_needing_update(self, as_of: Optional[datetime.date] = None) -> List[FileSourceMeta]:
        as_of_ = as_of or datetime.date.today()
        return [meta for meta in self.all_file_sources if meta.source.needs_update(as_of_)]

    @property
    def external_type_imports(self) -> Dict[str, Set[str]]:
        """Mapping from qualified module name to class name or
        '<external class name> as <internal class name>' expression"""
        if not self.build_options.import_external_types:
            return {}

        imports: Dict[str, Set[str]] = defaultdict(set)
        for ref in self.external_type_refs:
            t = self.types[ref]
            # true by definition of `self.external_type_refs`
            assert isinstance(t, ExternalCustomType)
            module, import_name = t.import_spec
            imports[module].add(import_name)

        return imports

    @property
    def defined_types(self) -> List[CustomType]:
        """All field types which are defined non-anonymously in the generated attrs code for this schema"""
        referenced_custom_type_refs = set(self.packaged_custom_type_refs)
        if not self.build_options.import_external_types:
            referenced_custom_type_refs.update(self.external_type_refs)

        return [self.types[name] for name in referenced_custom_type_refs]

    def dependency_dag(
        self, table_predicate: Callable[[Table], bool] = is_build_time_package_table
    ) -> nx.DiGraph:
        """Directed graph of dependencies between all data packaging steps"""
        dag = nx.DiGraph()
        tables = set()
        for tablename, table in self.tables.items():
            # run-time-installed tables have no dependencies
            table_ref = table._graph_ref(tablename)
            if table_predicate(table):
                tables.add(table_ref)
                dag.add_node(table_ref)

            if isinstance(table.dependencies, RawDataDependencies):
                for reflist, refcls in [
                    (table.dependencies.adls, ADLSRef),
                    (table.dependencies.local, LocalRef),
                ]:
                    if reflist:
                        dag.add_edges_from((refcls(dep), table_ref) for dep in reflist)
                for table_dep in table.dependencies.reference:
                    if table_dep in self.tables:
                        ref = self.tables[table_dep]._graph_ref(table_dep)
                    else:
                        # this can't actually happen post-validation, but we need it in case of a bad
                        # ref during validation
                        ref = ReferenceDataRef(table_dep)
                    dag.add_edge(ref, table_ref)
            elif isinstance(table.dependencies, TabularFileSource):
                dag.add_edge(TabularTextFileRef(tablename), table_ref)
            elif table.dependencies is None and table_predicate(table):
                warn(
                    f"Table '{tablename}' has no dependencies and cannot be included in the "
                    f"computational DAG; it must be installed manually via parquet files"
                )

        if len(tables) < len(dag):
            dag = predecessor_graph(dag, tables).copy()

        return dag
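
Downstream build tooling can walk this DAG in dependency order. A sketch of consuming it, assuming a validated `Schema` instance named `schema`:

    import networkx as nx

    dag = schema.dependency_dag()
    for node in nx.topological_sort(dag):
        if isinstance(node, ReferenceDataRef):
            ...  # package the table named str(node); its ADLS/local/reference inputs sort earlier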