thds_tabularasa-0.13.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
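
Since a wheel is a zip archive, the 46-file inventory above can be reproduced locally with nothing but the standard library. A minimal sketch, assuming the wheel has been downloaded under its normalized filename:

    import zipfile

    # List every file packaged in the wheel (mirrors the RECORD-backed listing above).
    with zipfile.ZipFile("thds_tabularasa-0.13.0-py3-none-any.whl") as whl:
        for name in whl.namelist():
            print(name)

The diff body below shows the largest new module, thds/tabularasa/schema/metaschema.py (+1007 lines).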
thds/tabularasa/schema/metaschema.py
@@ -0,0 +1,1007 @@
+ import datetime
+ import itertools
+ import typing
+ from collections import defaultdict
+ from typing import (
+     Any,
+     Callable,
+     Dict,
+     Iterator,
+     List,
+     Mapping,
+     NamedTuple,
+     Optional,
+     Sequence,
+     Set,
+     Tuple,
+     Type,
+     Union,
+     cast,
+ )
+ from warnings import warn
+
+ import networkx as nx
+ import numpy as np
+ import pandera as pa
+ import pyarrow
+ import typing_extensions
+ from pandas.core.dtypes import base as pd_dtypes
+ from pydantic import AnyUrl, BaseModel, Extra, Field
+
+ from thds.tabularasa.schema.files import FileSourceMixin
+
+ from .constraints import AnyColumnConstraint, EnumConstraint
+ from .dtypes import AnyDtype, DType
+ from .files import ADLSDataSpec, LocalDataSpec, RemoteBlobStoreSpec, TabularFileSource
+ from .util import (
+     DocumentedMixin,
+     DottedIdentifier,
+     EnumList,
+     HexStr,
+     Identifier,
+     NonEmptyStr,
+     PathStr,
+     predecessor_graph,
+     render_dtype,
+     snake_case,
+     snake_to_title,
+ )
+
+ JSON = Dict[str, Union[Dict[str, Any], List[Any], int, float, str, bool, None]]
+
+ IdTuple = Tuple[Identifier, ...]
+
+
+ class AnonCustomType(DocumentedMixin):
+     type: DType
+     constraints: List[AnyColumnConstraint] = Field(min_items=1)
+
+     def with_name(self, name: Identifier) -> "CustomType":
+         return CustomType(
+             type=self.type, constraints=self.constraints, doc=self.doc, markup=self.markup, name=name
+         )
+
+     @property
+     def python(self) -> Type:
+         enum = self.enum
+         if enum is not None:
+             return cast(Type, typing_extensions.Literal[tuple(enum.enum)])
+         return self.type.python
+
+     def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False) -> str:
+         if builtin:
+             return self.type.python_type_literal(build_options=build_options, builtin=builtin)
+         else:
+             return self.python_type_def_literal(build_options=build_options)
+
+     def python_type_def_literal(self, build_options: "BuildOptions"):
+         enum = self.enum
+         if enum is not None:
+             module = typing_extensions if build_options.require_typing_extensions else typing
+             values = ", ".join(map(repr, enum.enum))
+             return f"{module.__name__}.Literal[{values}]"
+         else:
+             return self.type.python_type_literal(builtin=False, build_options=build_options)
+
+     @property
+     def parquet(self) -> pyarrow.DataType:
+         return self.type.parquet
+
+     @property
+     def enum(self) -> Optional[EnumConstraint]:
+         return next((c for c in self.constraints if isinstance(c, EnumConstraint)), None)
+
+     def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
+         imports = self.type.attrs_required_imports(build_options=build_options)
+         extra = None
+         if self.enum is not None:
+             # render as literal
+             # extensions is technically not a std lib but it's easier to account for this way
+             extra = "typing_extensions" if build_options.require_typing_extensions else "typing"
+         elif build_options.use_newtypes:
+             # render as newtype
+             extra = "typing"
+         if extra is not None:
+             imports.add(extra)
+         return imports
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         yield from self.type.custom_type_refs
+
+     @property
+     def comment(self) -> Optional[str]:
+         comments = list(filter(None, (con.comment_expr() for con in self.constraints)))
+         if comments:
+             return "; ".join(comments)
+         return None
+
+
+ class CustomType(AnonCustomType, extra=Extra.forbid):
+     name: Identifier
+
+     @property
+     def class_name(self) -> str:
+         return snake_to_title(self.name)
+
+     def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False) -> str:
+         if builtin:
+             return self.type.python_type_literal(build_options=build_options, builtin=builtin)
+         else:
+             return self.class_name
+
+     def python_type_def_literal(self, build_options: "BuildOptions"):
+         literal = super().python_type_def_literal(build_options)
+         if build_options.use_newtypes and self.enum is None:
+             return f'typing.NewType("{self.class_name}", {literal})'
+         else:
+             return literal
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         yield self.name
+         yield from super().custom_type_refs
+
+     def from_external(
+         self, ref: "ExternalSchemaRef", new_name: Optional[str]
+     ) -> Union["CustomType", "ExternalCustomType"]:
+         if ref.package is None or ref.derived_code_submodule is None:
+             warn(
+                 f"Either package or derived_code_submodule not specified for external type ref "
+                 f"'{new_name}' originally named '{self.name}'; definition will be duplicated rather "
+                 f"than imported"
+             )
+             return self
+
+         return ExternalCustomType(
+             type=self.type,
+             constraints=self.constraints,
+             name=new_name or self.name,
+             external_name=self.name,
+             package=ref.package,
+             derived_code_submodule=ref.derived_code_submodule,
+         )
+
+
+ class ExternalCustomType(CustomType, extra=Extra.forbid):
+     package: DottedIdentifier
+     derived_code_submodule: DottedIdentifier
+     external_name: Identifier
+
+     @property
+     def module_path(self) -> str:
+         return f"{self.package}.{self.derived_code_submodule}.attrs"
+
+     @property
+     def external_class_name(self) -> str:
+         return snake_to_title(self.external_name)
+
+     @property
+     def import_spec(self) -> Tuple[str, str]:
+         old_name = self.external_class_name
+         new_name = self.class_name
+         return self.module_path, old_name if old_name == new_name else f"{old_name} as {new_name}"
+
+     def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
+         if build_options.import_external_types:
+             return set()
+         else:
+             return super().attrs_required_imports(build_options)
+
+
+ class _CustomTypeRef(BaseModel, extra=Extra.forbid):
+     custom: Identifier
+
+     def __str__(self):
+         return repr(self.custom)
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         yield self.custom
+
+
+ class ExternalSchemaRef(BaseModel, extra=Extra.forbid):
+     schema_path: str
+     package: Optional[DottedIdentifier] = None
+     derived_code_submodule: Optional[DottedIdentifier] = None
+
+
+ class ExternalTypeRef(BaseModel, extra=Extra.forbid):
+     schema_name: Identifier
+     type_name: Identifier
+
+
+ class _ComplexBaseType(BaseModel, extra=Extra.forbid):
+     @property
+     def sqlite(self) -> str:
+         return "JSON"
+
+     @property
+     def enum(self) -> Optional[EnumConstraint]:
+         return None
+
+     def pandas(
+         self,
+         nullable: bool = False,
+         index: bool = False,
+         enum: Optional[EnumList] = None,
+         ordered: bool = False,
+     ):
+         return np.dtype("object")
+
+
+ class _RawArrayType(_ComplexBaseType):
+     values: Union[DType, _CustomTypeRef, AnonCustomType, "_RawArrayType", "_RawMappingType"]
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         yield from self.values.custom_type_refs
+
+
+ class ArrayType(_RawArrayType):
+     values: Union[DType, CustomType, AnonCustomType, "ArrayType", "MappingType"]
+
+     @property
+     def python(self) -> Type[List]:
+         return List[self.values.python]  # type: ignore
+
+     @property
+     def parquet(self) -> pyarrow.DataType:
+         return pyarrow.list_(self.values.parquet)
+
+     def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
+         return {"typing", *self.values.attrs_required_imports(build_options=build_options)}
+
+     def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False):
+         return f"typing.List[{self.values.python_type_literal(build_options=build_options, builtin=builtin)}]"
+
+
+ class _RawMappingType(_ComplexBaseType, extra=Extra.forbid):
+     keys: Union[DType, _CustomTypeRef, AnonCustomType]
+     values: Union[DType, _CustomTypeRef, AnonCustomType, "_RawArrayType", "_RawMappingType"]
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         yield from self.keys.custom_type_refs
+         yield from self.values.custom_type_refs
+
+
+ class MappingType(_RawMappingType, extra=Extra.forbid):
+     keys: Union[DType, CustomType, AnonCustomType]
+     values: Union[DType, CustomType, AnonCustomType, "ArrayType", "MappingType"]
+
+     @property
+     def python(self) -> Type[Dict]:
+         return Dict[self.keys.python, self.values.python]  # type: ignore
+
+     @property
+     def parquet(self) -> pyarrow.DataType:
+         return pyarrow.map_(self.keys.parquet, self.values.parquet)
+
+     def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
+         return {
+             "typing",
+             *self.keys.attrs_required_imports(build_options),
+             *self.values.attrs_required_imports(build_options),
+         }
+
+     def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False):
+         return (
+             f"typing.Dict[{self.keys.python_type_literal(build_options=build_options, builtin=builtin)}, "
+             f"{self.values.python_type_literal(build_options=build_options, builtin=builtin)}]"
+         )
+
+
+ class UniqueColumnsConstraint(BaseModel, extra=Extra.forbid):
+     unique: IdTuple
+
+     @property
+     def sqlite(self) -> str:
+         return f'UNIQUE ({", ".join(self.unique)})'
+
+     @staticmethod
+     def make_pandera_check_expr(unique: IdTuple) -> str:
+         return f"pa.{pa.Check.__name__}.unique_across_columns({list(unique)!r})"
+
+     def pandera_check_expr(self) -> str:
+         return self.make_pandera_check_expr(self.unique)
+
+     @staticmethod
+     def make_pandera_check(unique: IdTuple) -> pa.Check:
+         return pa.Check.unique_across_columns(list(unique))
+
+     def pandera_check(self) -> pa.Check:
+         return self.make_pandera_check(self.unique)
+
+
+ # this allows the recursive types above
+ _RawArrayType.update_forward_refs()
+ _RawMappingType.update_forward_refs()
+ ArrayType.update_forward_refs()
+ MappingType.update_forward_refs()
+
+
+ # Remote data specifications - these are fetched at build time for preprocessing and packaging
+
+
+ class RawDataDependencies(DocumentedMixin):
+     preprocessor: DottedIdentifier
+     reference: List[Identifier] = Field(default_factory=list)
+     adls: List[Identifier] = Field(default_factory=list)
+     local: List[Identifier] = Field(default_factory=list)
+
+
+ # Raw types; these are parsed from the yaml and converted to the rich types below by resolving
+ # references, and checking constraints
+
+
+ class _RawColumn(BaseModel, extra=Extra.forbid):
+     name: Identifier
+     type: Union[DType, _CustomTypeRef, AnonCustomType, _RawArrayType, _RawMappingType]
+     nullable: bool = False
+     doc: NonEmptyStr
+     source_name: Optional[str] = None
+     na_values: Optional[Set[str]] = None
+
+     def with_attrs(
+         self,
+         *,
+         name: Optional[Identifier] = None,
+         nullable: Optional[bool] = None,
+         doc: Optional[NonEmptyStr] = None,
+         source_name: Optional[str] = None,
+     ):
+         cls = type(self)
+         return cls(
+             name=self.name if name is None else name,
+             type=self.type,
+             nullable=self.nullable if nullable is None else nullable,
+             doc=self.doc if doc is None else doc,
+             source_name=self.source_name if source_name is None else source_name,
+         )
+
+     @property
+     def snake_case_name(self) -> str:
+         return snake_case(self.name)
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         yield from self.type.custom_type_refs
+
+
+ class InheritanceSpec(BaseModel, extra=Extra.forbid):
+     """Specification for columns in a table which inherits columns from other tables (usually transient
+     tables which are then used in joins or filtered to a subset). The `tables` attribute is a list of
+     tables to inherit columns from in order of precedence. The `columns` attribute is an optional list
+     of columns to include from any of the tables. When absent, all columns from all tables are included.
+     """
+
+     tables: Sequence[Identifier]
+     columns: Set[Identifier] = Field(default_factory=set)
+     update_docs: Mapping[Identifier, str] = Field(default_factory=dict)
+     update_nullability: Mapping[Identifier, bool] = Field(default_factory=dict)
+     update_source_name: Mapping[Identifier, str] = Field(default_factory=dict)
+
+
+ class _RawTable(BaseModel, extra=Extra.forbid):
+     columns: Sequence[_RawColumn] = Field(default_factory=list)
+     doc: NonEmptyStr
+     dependencies: Optional[Union[TabularFileSource, RawDataDependencies]] = None
+     inherit_schema: Optional[InheritanceSpec] = None
+     constraints: List[UniqueColumnsConstraint] = Field(default_factory=list)
+     primary_key: Optional[Tuple[Identifier, ...]] = None
+     indexes: List[IdTuple] = Field(default_factory=list)
+     md5: Optional[HexStr] = None
+     # flag to indicate that a table is only defined as a dependency for other tables;
+     # if true, no package data will be written and no accessor code will be generated
+     transient: bool = False
+     # flag to indicate that this table's data is installed at runtime
+     # accessor code will still be generated, but no package data will be produced at build time
+     build_time_installed: bool = True
+     title: Optional[str] = None
+
+     def resolve_inherited_columns(self, schema: "_RawSchema") -> Sequence[_RawColumn]:
+         if self.inherit_schema is None:
+             return self.columns
+         else:
+             tables_with_precedence = [
+                 schema.tables[name] for name in self.inherit_schema.tables if name in schema.tables
+             ]
+             columns = list(self.columns)
+             permitted_column_names = self.inherit_schema.columns
+             used_column_names = set(c.name for c in self.columns)
+             for table in tables_with_precedence:
+                 for column in table.resolve_inherited_columns(schema):
+                     if column.name not in used_column_names and (
+                         not permitted_column_names or (column.name in permitted_column_names)
+                     ):
+                         if (
+                             column.name in self.inherit_schema.update_docs
+                             or column.name in self.inherit_schema.update_nullability
+                             or column.name in self.inherit_schema.update_source_name
+                         ):
+                             column = column.with_attrs(
+                                 doc=self.inherit_schema.update_docs.get(column.name),
+                                 nullable=self.inherit_schema.update_nullability.get(column.name),
+                                 source_name=self.inherit_schema.update_source_name.get(column.name),
+                             )
+                         columns.append(column)
+                         used_column_names.add(column.name)
+             return columns
+
+     @property
+     def packaged(self) -> bool:
+         return not self.transient
+
+     @property
+     def run_time_installed(self) -> bool:
+         return not self.build_time_installed
+
+     @property
+     def custom_type_refs(self) -> Iterator[Identifier]:
+         for column in self.columns:
+             yield from column.custom_type_refs
+
+     def _graph_ref(self, name: str):
+         # _RawTable has no name attribute but this is needed in DAG validation prior to construction of
+         # final schema
+         return TransientReferenceDataRef(name) if self.transient else ReferenceDataRef(name)
+
+
+ class CustomStr(str):
+     """These exist to allow usage of strings as nodes in the networkx computational DAG without
+     collision - a local table and an ADLS resource could have the same name and not collide in the hash
+     table that underlies the networkx graph."""
+
+     _name: str
+
+     def __eq__(self, other):
+         return type(self) is type(other) and super().__eq__(other)
+
+     def __repr__(self):
+         return f"{self._name}({super().__str__()})"
+
+     def __hash__(self):
+         return hash(repr(self))
+
+
+ class ADLSRef(CustomStr):
+     _name = "ADLS"
+
+
+ class LocalRef(CustomStr):
+     _name = "Local"
+
+
+ class TabularTextFileRef(CustomStr):
+     _name = "TabularTextFile"
+
+
+ class ReferenceDataRef(CustomStr):
+     _name = "ReferenceData"
+
+
+ class TransientReferenceDataRef(ReferenceDataRef):
+     _name = "ReferenceData"
+
+
+ class BuildOptions(BaseModel, extra=Extra.forbid):
+     # interface
+     derived_code_submodule: DottedIdentifier
+     attrs: bool
+     sqlite_data: bool
+     sqlite_interface: bool
+     pandas: bool
+     pyarrow: bool
+     # interface options
+     type_constraint_comments: bool = True
+     validate_transient_tables: bool = True
+     # set this to true if you want to generate code that's compatible with python 3.7 and lower
+     require_typing_extensions: bool = False
+     # import types from external schemas, or re-render them?
+     import_external_types: bool = True
+     # render custom types with constraints as typing.NewType instances?
+     use_newtypes: bool = True
+     # boolean to override behavior of dropping types not referenced by any table;
+     # allows a schema that defines only types to render source code definitions
+     render_all_types: bool = False
+     # data
+     package_data_dir: Optional[PathStr] = None
+     transient_data_dir: Optional[PathStr] = None
+     sqlite_db_path: Optional[PathStr] = None
+     package_data_file_size_limit: Optional[int] = None
+     # docs
+     repo_url: Optional[AnyUrl] = None
+     table_docs_dir: Optional[str] = None
+     type_docs_path: Optional[str] = None
+     source_docs_path: Optional[str] = None
+     curation_badge_path: Optional[str] = None
+
+
+ class _RawSchema(BaseModel, extra=Extra.forbid):
+     tables: Mapping[Identifier, _RawTable] = Field(default_factory=dict)
+     types: Mapping[Identifier, Union[AnonCustomType, ExternalTypeRef]] = Field(default_factory=dict)
+     external_schemas: Mapping[Identifier, ExternalSchemaRef] = Field(default_factory=dict)
+     remote_data: Mapping[Identifier, ADLSDataSpec] = Field(default_factory=dict)
+     local_data: Mapping[Identifier, LocalDataSpec] = Field(default_factory=dict)
+     remote_blob_store: Optional[RemoteBlobStoreSpec] = None
+     build_options: BuildOptions
+
+     def inheritance_dag(self) -> nx.DiGraph:
+         dag = nx.DiGraph()
+         for table_name, table in self.tables.items():
+             if table.inherit_schema is not None:
+                 dag.add_edges_from(
+                     (table_name, inherited)
+                     for inherited in table.inherit_schema.tables
+                     if inherited in self.tables
+                 )
+         return dag
+
+
+ # Final materialized schema types; these extend the raw types and override the types of some fields to
+ # reflect resolution of references within the schema
+
+ ResolvedDType = Union[DType, AnonCustomType, CustomType, ArrayType, MappingType]
+
+
+ class Column(_RawColumn):
+     type: ResolvedDType
+
+     @property
+     def dtype(self) -> Union[DType, ArrayType, MappingType]:
+         return self.type.type if isinstance(self.type, (AnonCustomType, CustomType)) else self.type
+
+     def pandas(self, index: bool = False) -> AnyDtype:
+         enum = self.type.enum
+         if enum is not None:
+             return self.dtype.pandas(
+                 nullable=self.nullable, enum=enum.enum, ordered=enum.ordered, index=index
+             )
+         else:
+             return self.dtype.pandas(nullable=self.nullable, index=index)
+
+     def pandas_dtype_literal(self, index: bool = False) -> str:
+         dtype = self.pandas(index=index)
+         rendered = render_dtype(dtype)
+
+         if index and isinstance(dtype, np.dtype) and dtype.kind in "iuf":
+             return f"thds.tabularasa.compat.resolve_numeric_np_index_dtype_for_pd_version({rendered})"
+         # we actually need to render these dtypes wrapped in this compat function so that we can
+         # render schemas using pandas>=2.0, but they will still work with pandas<2.0
+
+         return rendered
+
+     @property
+     def python(self) -> Type:
+         return Optional[self.type.python] if self.nullable else self.type.python  # type: ignore
+
+     def python_type_literal(self, build_options: "BuildOptions", builtin: bool = False) -> str:
+         # column type literals are always within the body of a record class def, i.e. not a custom type
+         # def
+         literal = self.type.python_type_literal(build_options=build_options, builtin=builtin)
+         return f"typing.Optional[{literal}]" if self.nullable else literal
+
+     @property
+     def header_name(self) -> str:
+         if self.source_name is None:
+             return self.name
+         return self.source_name
+
+     @property
+     def parquet_field(self) -> pyarrow.Field:
+         metadata = dict(doc=self.doc.encode())
+         return pyarrow.field(self.snake_case_name, self.type.parquet, self.nullable, metadata=metadata)
+
+
+ class Table(_RawTable):
+     # mypy prefers sequence here since we subclass the type arg from _RawColumn, and Sequence is
+     # covariant
+     columns: Sequence[Column]
+     name: Identifier
+     dependencies: Optional[Union[TabularFileSource, RawDataDependencies]]
+
+     @property
+     def unique_constraints(self) -> List[UniqueColumnsConstraint]:
+         if not self.constraints:
+             return []
+         return [c for c in self.constraints if isinstance(c, UniqueColumnsConstraint)]
+
+     @property
+     def single_column_unique_constraints(self) -> List[Identifier]:
+         return [c.unique[0] for c in self.unique_constraints if len(c.unique) == 1]
+
+     @property
+     def class_name(self) -> str:
+         return snake_to_title(self.name)
+
+     @property
+     def snake_case_name(self) -> str:
+         return snake_case(self.name)
+
+     @property
+     def doc_title(self) -> str:
+         if self.title is None:
+             return snake_to_title(self.name, separator=" ")
+         else:
+             return self.title
+
+     def _attrs_required_imports(
+         self, build_options: "BuildOptions", sqlite_interface: bool = False
+     ) -> Set[str]:
+         columns: Iterator[Column]
+         if sqlite_interface:
+             index_cols = self.index_columns
+             # don't need type literals from std lib for custom types; can import class names
+             columns = (
+                 column
+                 for column in self.columns
+                 if column.name in index_cols and not isinstance(column.type, CustomType)
+             )
+         else:
+             columns = iter(self.columns)
+
+         modules = set()
+         for column in columns:
+             if column.nullable and not (
+                 isinstance(column.type, ExternalCustomType) and build_options.import_external_types
+             ):
+                 modules.add("typing")
+             modules.update(column.type.attrs_required_imports(build_options))
+         return modules
+
+     def attrs_required_imports(self, build_options: "BuildOptions") -> Set[str]:
+         return self._attrs_required_imports(build_options=build_options, sqlite_interface=False)
+
+     def attrs_sqlite_required_imports(self, build_options: "BuildOptions") -> Set[str]:
+         return self._attrs_required_imports(build_options=build_options, sqlite_interface=True)
+
+     @property
+     def parquet_schema(self) -> pyarrow.Schema:
+         metadata = dict(
+             doc=self.doc.encode(),
+             primary_key=(
+                 " ".join(map(snake_case, self.primary_key)).encode() if self.primary_key else b""
+             ),
+         )
+         return pyarrow.schema([column.parquet_field for column in self.columns], metadata=metadata)
+
+     @property
+     def parquet_casts(self) -> Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]]:
+         pk = self.primary_key or ()
+         casts: Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]] = {}
+
+         for c in self.columns:
+             dtype = c.pandas(index=c.name in pk)
+             if isinstance(dtype, pd_dtypes.ExtensionDtype):
+                 casts[c.snake_case_name] = dtype
+             elif isinstance(dtype, np.dtype) and dtype.name not in ("int32", "int64"):
+                 casts[c.snake_case_name] = dtype
+
+         return casts
+
+     @property
+     def csv_na_values(self) -> Dict[str, Set[str]]:
+         """Dict of column name to set of string values that should be considered null when reading a
+         tabular text file. Used for `na_values` arg of `pandas.read_csv`"""
+         na_values: Dict[str, Set[str]] = {}
+         default_na_values = (
+             self.dependencies.na_values if isinstance(self.dependencies, TabularFileSource) else None
+         )
+         for c in self.columns:
+             if c.na_values is not None:
+                 na_values[c.header_name] = c.na_values
+             elif c.nullable and default_na_values is not None:
+                 na_values[c.header_name] = default_na_values
+         return na_values
+
+     @property
+     def pandera_schema(self) -> pa.DataFrameSchema:
+         schema = render_pandera_schema(self, as_str=False)
+         return schema  # type: ignore
+
+     @property
+     def graph_ref(self) -> ReferenceDataRef:
+         """Reference to a node in the computational DAG"""
+         return self._graph_ref(self.name)
+
+     @property
+     def has_indexes(self) -> bool:
+         return bool(self.primary_key) or bool(self.indexes)
+
+     @property
+     def index_columns(self) -> Set[Identifier]:
+         return set(itertools.chain(self.primary_key or [], itertools.chain.from_iterable(self.indexes)))
+
+
+ # classes for mimicking pandera schema classes, to allow the same code block to generate code and
+ # a true pandera schema dynamically at runtime
+
+
+ class _ColumnSchemaProxy(NamedTuple):
+     dtype: str
+     checks: Optional[List[str]]
+     nullable: bool
+     unique: bool
+
+
+ class _IndexSchemaProxy(NamedTuple):
+     dtype: str
+     name: Identifier
+     checks: Optional[List[str]]
+     nullable: bool
+     unique: bool
+
+
+ class _MultiIndexSchemaProxy(NamedTuple):
+     indexes: List[_IndexSchemaProxy]
+     strict: bool
+
+
+ class _DataFrameSchemaProxy(NamedTuple):
+     columns: Dict[Identifier, _ColumnSchemaProxy]
+     index: Optional[Union[_IndexSchemaProxy, _MultiIndexSchemaProxy]]  # type: ignore
+     checks: List[str]
+     coerce: bool
+     strict: bool
+     ordered: bool
+
+
+ def render_pandera_schema(
+     table: Table, as_str: bool
+ ) -> Union[_DataFrameSchemaProxy, pa.DataFrameSchema]:
+     column_defs: List[Tuple[str, Union[_ColumnSchemaProxy, pa.Column]]] = []
+     index_defs: List[Tuple[str, Union[_IndexSchemaProxy, pa.Index]]] = []
+     single_col_unique_constraints = set(table.single_column_unique_constraints)
+     index_names = set() if table.primary_key is None else set(table.primary_key)
+     single_col_index = len(index_names) == 1
+
+     for column in table.columns:
+         check_exprs: Optional[Union[List[str], List[pa.Check]]]
+         if isinstance(column.type, (AnonCustomType, CustomType)):
+             if as_str:
+                 check_exprs = [c.pandera_check_expr() for c in column.type.constraints]
+             else:
+                 check_exprs = [c.pandera_check() for c in column.type.constraints]
+         else:
+             check_exprs = None
+
+         if column.name in index_names:
+             constructor = _IndexSchemaProxy if as_str else pa.Index
+             exprlist = index_defs
+             extra_kw = dict(name=column.snake_case_name)
+         else:
+             constructor = _ColumnSchemaProxy if as_str else pa.Column
+             exprlist = column_defs  # type: ignore
+             extra_kw = {}
+
+         # always enforce that indexes are unique since they derive from primary key declarations
+         # multi-index uniqueness checks have to be handled with a custom check
+         unique = column.name in single_col_unique_constraints or (
+             single_col_index and column.name in index_names
+         )
+         pandas_type: Union[str, AnyDtype]
+         if as_str:
+             pandas_type = column.pandas_dtype_literal(index=column.name in index_names)
+         else:
+             pandas_type = column.pandas(index=column.name in index_names)
+         expr = constructor(
+             dtype=pandas_type,
+             checks=check_exprs,
+             nullable=column.nullable,
+             unique=unique,
+             **extra_kw,
+         )
+         exprlist.append((column.snake_case_name, expr))
+
+     index_def: Optional[Union[_IndexSchemaProxy, _MultiIndexSchemaProxy, pa.Index, pa.MultiIndex]]
+     if index_defs:
+         if len(index_defs) == 1:
+             _, index_def = index_defs[0]
+         else:
+             constructor = _MultiIndexSchemaProxy if as_str else pa.MultiIndex
+             index_def = constructor(
+                 indexes=[expr for _name, expr in index_defs],
+                 strict=True,
+             )
+     else:
+         index_def = None
+
+     unique_constraints = [c.unique for c in table.unique_constraints if len(c.unique) > 1]
+     if len(index_names) > 1 and not any(index_names == set(u) for u in unique_constraints):
+         assert table.primary_key is not None  # make mypy happy
+         unique_constraints.append(table.primary_key)
+
+     if unique_constraints:
+         from thds.tabularasa.loaders.util import unique_across_columns  # noqa: F401
+
+         # Importing the above to ensure the custom pandera check is registered.
+         # Ideally, custom pandera checks would be registered in a more central location.
+         df_check_exprs = [
+             (
+                 UniqueColumnsConstraint.make_pandera_check_expr(constraint)
+                 if as_str
+                 else UniqueColumnsConstraint.make_pandera_check(constraint)
+             )
+             for constraint in unique_constraints
+         ]
+     else:
+         df_check_exprs = None
+
+     schema_cls = _DataFrameSchemaProxy if as_str else pa.DataFrameSchema
+     return schema_cls(
+         columns=dict(column_defs),
+         index=index_def,
+         checks=df_check_exprs,
+         coerce=False,
+         strict="filter" if table.transient else True,
+         ordered=False,
+     )
+
+
+ def is_build_time_package_table(table: Table) -> bool:
+     return table.build_time_installed and table.packaged
+
+
+ def is_run_time_package_table(table: Table) -> bool:
+     return table.run_time_installed and table.packaged
+
+
+ class FileSourceMeta(NamedTuple):
+     # full path to the data source spec in the schema structure
+     schema_path: List[str]
+     name: str
+     source: FileSourceMixin
+
+
+ class Schema(_RawSchema):
+     """Processed version of a `_RawSchema` that's been passed through validation to ensure integrity of
+     all references, and with names denormalized onto named objects (tables and types)"""
+
+     tables: Mapping[Identifier, Table]
+     types: Mapping[Identifier, CustomType]
+
+     @property
+     def build_time_package_tables(self) -> Iterator[Table]:
+         return self.filter_tables(is_build_time_package_table)
+
+     @property
+     def run_time_package_tables(self) -> Iterator[Table]:
+         return self.filter_tables(is_run_time_package_table)
+
+     @property
+     def package_tables(self) -> Iterator[Table]:
+         return self.filter_tables(lambda table: table.packaged)
+
+     @property
+     def transient_tables(self) -> Iterator[Table]:
+         return self.filter_tables(lambda table: table.transient)
+
+     @property
+     def computable_tables(self) -> Iterator[Table]:
+         return self.filter_tables(lambda table: table.dependencies is not None)
+
+     def filter_tables(self, predicate: Callable[[Table], bool]) -> Iterator[Table]:
+         return filter(predicate, self.tables.values())
+
+     @property
+     def all_custom_type_refs(self) -> Set[Identifier]:
+         """Every ref to a type that will be rendered as part of this schema"""
+         if self.build_options.render_all_types:
+             return set(self.types)
+         else:
+             return set(ref for table in self.package_tables for ref in table.custom_type_refs)
+
+     @property
+     def packaged_custom_type_refs(self) -> Set[Identifier]:
+         return set(
+             ref
+             for ref in self.all_custom_type_refs
+             if not isinstance(self.types[ref], ExternalCustomType)
+         )
+
+     @property
+     def external_type_refs(self) -> Set[Identifier]:
+         return set(
+             ref for ref in self.all_custom_type_refs if isinstance(self.types[ref], ExternalCustomType)
+         )
+
+     @property
+     def attrs_required_imports(self) -> Set[str]:
+         assert self.build_options is not None, "can't generate attrs schema without `build_options`"
+         # all types referenced in tables. Includes imports needed for inline-defined anonymous types
+         modules = set(
+             itertools.chain.from_iterable(
+                 t.attrs_required_imports(self.build_options) for t in self.tables.values()
+             )
+         )
+         # all top-level defined field types. Includes imports needed for types not used in any table
+         modules.update(
+             itertools.chain.from_iterable(
+                 t.attrs_required_imports(self.build_options) for t in self.defined_types
+             )
+         )
+         return modules
+
+     @property
+     def all_file_sources(self) -> Iterator[FileSourceMeta]:
+         for table_name, table in self.tables.items():
+             if isinstance(table.dependencies, FileSourceMixin):
+                 yield FileSourceMeta(
+                     ["tables", table_name, "dependencies"], table_name, table.dependencies
+                 )
+         sources: Mapping[str, FileSourceMixin]
+         for type_name, sources in [("local_data", self.local_data), ("remote_data", self.remote_data)]:
+             for source_name, source in sources.items():
+                 yield FileSourceMeta([type_name, source_name], source_name, source)
+
+     def sources_needing_update(self, as_of: Optional[datetime.date] = None) -> List[FileSourceMeta]:
+         as_of_ = as_of or datetime.date.today()
+         return [meta for meta in self.all_file_sources if meta.source.needs_update(as_of_)]
+
+     @property
+     def external_type_imports(self) -> Dict[str, Set[str]]:
+         """Mapping from qualified module name to class name or
+         '<external class name> as <internal class name>' expression"""
+         if not self.build_options.import_external_types:
+             return {}
+
+         imports: Dict[str, Set[str]] = defaultdict(set)
+         for ref in self.external_type_refs:
+             t = self.types[ref]
+             # true by definition of `self.external_type_refs`
+             assert isinstance(t, ExternalCustomType)
+             module, import_name = t.import_spec
+             imports[module].add(import_name)
+
+         return imports
+
+     @property
+     def defined_types(self) -> List[CustomType]:
+         """All field types which are defined non-anonymously in the generated attrs code for this schema"""
+         referenced_custom_type_refs = set(self.packaged_custom_type_refs)
+         if not self.build_options.import_external_types:
+             referenced_custom_type_refs.update(self.external_type_refs)
+
+         return [self.types[name] for name in referenced_custom_type_refs]
+
+     def dependency_dag(
+         self, table_predicate: Callable[[Table], bool] = is_build_time_package_table
+     ) -> nx.DiGraph:
+         """Directed graph of dependencies between all data packaging steps"""
+         dag = nx.DiGraph()
+         tables = set()
+         for tablename, table in self.tables.items():
+             # run-time-installed tables have no dependencies
+             table_ref = table._graph_ref(tablename)
+             if table_predicate(table):
+                 tables.add(table_ref)
+                 dag.add_node(table_ref)
+
+             if isinstance(table.dependencies, RawDataDependencies):
+                 for reflist, refcls in [
+                     (table.dependencies.adls, ADLSRef),
+                     (table.dependencies.local, LocalRef),
+                 ]:
+                     if reflist:
+                         dag.add_edges_from((refcls(dep), table_ref) for dep in reflist)
+                 for table_dep in table.dependencies.reference:
+                     if table_dep in self.tables:
+                         ref = self.tables[table_dep]._graph_ref(table_dep)
+                     else:
+                         # this can't actually happen post-validation but we need it in case of a bad ref
+                         # during validation
+                         ref = ReferenceDataRef(table_dep)
+                     dag.add_edge(ref, table_ref)
+             elif isinstance(table.dependencies, TabularFileSource):
+                 dag.add_edge(TabularTextFileRef(tablename), table_ref)
+             elif table.dependencies is None and table_predicate(table):
+                 warn(
+                     f"Table '{tablename}' has no dependencies and can not be included in the "
+                     f"computational DAG; it must be installed manually via parquet files"
+                 )
+
+         if len(tables) < len(dag):
+             dag = predecessor_graph(dag, tables).copy()
+
+         return dag
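
The `CustomStr` subclasses above exist so that, in `dependency_dag`, a table and an ADLS or local resource sharing the same name remain distinct graph nodes. A minimal standalone sketch of that behavior; the class bodies mirror the diff, while the resource name "census" and the assertions are illustrative assumptions, not code shipped in the package:

    class CustomStr(str):
        _name: str

        def __eq__(self, other):
            # equality is type-sensitive: the same text under a different ref class is not equal
            return type(self) is type(other) and super().__eq__(other)

        def __repr__(self):
            return f"{self._name}({super().__str__()})"

        def __hash__(self):
            # hashing the repr keeps distinct ref classes in distinct hash buckets
            return hash(repr(self))

    class ADLSRef(CustomStr):
        _name = "ADLS"

    class LocalRef(CustomStr):
        _name = "Local"

    # An ADLS resource and a local file with the same (hypothetical) name stay
    # distinct as dict/networkx nodes, where plain str keys would collide:
    assert not (ADLSRef("census") == LocalRef("census"))
    assert len({ADLSRef("census"), LocalRef("census")}) == 2
    assert len({"census", "census"}) == 1

Note that `render_pandera_schema` follows the same dual-use pattern throughout: with `as_str=True` it returns the `_*SchemaProxy` named tuples consumed by the code generators, and with `as_str=False` it builds a live `pa.DataFrameSchema` for runtime validation.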