thds_tabularasa-0.13.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,878 @@
+ import io
+ import itertools
+ import os
+ from collections import Counter
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any, Collection, Dict, List, Mapping, Optional, Set, Tuple, Type, Union, cast
+
+ import networkx as nx
+ import pkg_resources
+ import yaml
+ from warnings import warn
+
+ from .. import git_util
+ from .constraints import AnyColumnConstraint, EnumConstraint
+ from .dtypes import DType
+ from .files import ADLSDataSpec, LocalDataSpec, TabularFileSource
+ from .metaschema import (
+     JSON,
+     AnonCustomType,
+     ArrayType,
+     Column,
+     CustomType,
+     ExternalCustomType,
+     ExternalTypeRef,
+     MappingType,
+     RawDataDependencies,
+     Schema,
+     Table,
+     _CustomTypeRef,
+     _RawArrayType,
+     _RawColumn,
+     _RawMappingType,
+     _RawSchema,
+     _RawTable,
+ )
+ from .util import Identifier, import_func, predecessor_graph
+
+ ErrorMessage = str
+
+
+ class MetaschemaValidationError(ValueError):
+     def __init__(self, errors: List[ErrorMessage]):
+         self.errors = errors
+
+     def __str__(self):
+         return "\n".join(self.errors)
+
+
+ def empty_column_tuple(table_name: str, kind: str, index: Optional[int] = None) -> ErrorMessage:
+     index_expr = "" if index is None else f"at index {index} "
+     return f"Table '{table_name}' {kind} {index_expr}is empty"
+
+
+ def repeated_cols_in_table(table_name: str, repeated_cols: Collection[str]) -> ErrorMessage:
+     return f"Table '{table_name}' has repeated column names {sorted(repeated_cols)}"
+
+
+ def missing_cols_in_constraint(
+     table_name: str, constraint_type: str, missing_cols: Collection[str], index: Optional[int] = None
+ ) -> ErrorMessage:
+     index_ = "" if index is None else f" (index {index})"
+     return (
+         f"Table '{table_name}' {constraint_type}{index_} references columns {sorted(missing_cols)}"
+         f" which are undefined"
+     )
+
+
+ def repeated_cols_in_constraint(
+     table_name: str, constraint_type: str, repeated_cols: Collection[str], index: Optional[int] = None
+ ) -> ErrorMessage:
+     index_ = "" if index is None else f" (index {index})"
+     return f"Table '{table_name}' {constraint_type}{index_} has repeated columns {sorted(repeated_cols)}"
+
+
+ def uniqueness_check_invalid_for_collection_type(
+     table_name: str,
+     column_name: str,
+     constraint_index: Optional[int] = None,
+ ) -> ErrorMessage:
+     index_spec = "" if constraint_index is None else f" at index {constraint_index}"
+     return (
+         f"Cannot check uniqueness for collection-valued column '{column_name}' in table "
+         f"'{table_name}'; occurred in table constraint{index_spec}"
+     )
+
+
+ def index_invalid_for_collection_type(
+     table_name: str,
+     column_name: str,
+     constraint_index: Optional[int] = None,
+ ) -> ErrorMessage:
+     index_spec = "primary key" if constraint_index is None else f"index at index {constraint_index}"
+     return (
+         f"Cannot use collection-valued column '{column_name}' in table '{table_name}' in an index; "
+         f"occurred in {index_spec}"
+     )
+
+
+ def missing_custom_type(column_name: str, index: int, table_name: str, type_name: str) -> ErrorMessage:
+     return (
+         f"Column '{column_name}' (index {index}) of table '{table_name}' references custom type "
+         f"'{type_name}' which does not exist"
+     )
+
+
+ def missing_inherited_table(table_name: str, inherited_table_name: str) -> ErrorMessage:
+     return (
+         f"Table '{table_name}' references inherited table '{inherited_table_name}' which does not exist"
+     )
+
+
+ def missing_inherited_column(
+     table_name: str, inherited_column: str, inherited_tables: Collection[str], reason: str
+ ) -> ErrorMessage:
+     return (
+         f"Table '{table_name}' references column '{inherited_column}' for {reason} in its inheritance "
+         f"specification, which is present in none of the inherited tables: {list(inherited_tables)}"
+     )
+
+
+ def missing_remote_data_spec(table_name: str, remote_data_ref_name: str) -> ErrorMessage:
+     return (
+         f"Table '{table_name}' references remote data dependency '{remote_data_ref_name}' which "
+         f"does not exist"
+     )
+
+
+ def missing_local_data_spec(
+     table_name: str, local_data_ref_name: str, local_data_type: str
+ ) -> ErrorMessage:
+     return (
+         f"Table '{table_name}' references '{local_data_type}' data dependency "
+         f"'{local_data_ref_name}' which does not exist"
+     )
+
+
+ def missing_external_schema(
+     type_name: str, external_schema_name: str, failed_to_load: bool
+ ) -> ErrorMessage:
+     return (
+         f"Type '{type_name}' references external schema '{external_schema_name}' "
+         f"which {'failed to load' if failed_to_load else 'is undefined'}"
+     )
+
+
+ def missing_external_type(
+     type_name: str, external_schema_name: str, external_type_name: str
+ ) -> ErrorMessage:
+     return (
+         f"Type '{type_name}' references type '{external_type_name}' which isn't present in external "
+         f"schema '{external_schema_name}' (possibly dropped as not referenced)"
+     )
+
+
+ def source_name_defined_for_derived_table(table_name: str, column_name: str) -> ErrorMessage:
+     return (
+         f"Table '{table_name}' is derived but defines a source name for column {column_name}; "
+         f"derived tables should be written in the same schema in which they are read"
+     )
+
+
+ def constraint_doesnt_apply(
+     type_name: str, index: int, constraint: AnyColumnConstraint, dtype: str
+ ) -> ErrorMessage:
+     return (
+         f"Constraint {constraint} (index {index}) of custom type '{type_name}' doesn't apply to "
+         f"dtype '{dtype}'"
+     )
+
+
+ def dependencies_required_for_build_time_tables(table_name: str) -> ErrorMessage:
+     return (
+         f"Table '{table_name}' is marked as build-time-installed but has no dependencies; "
+         f"build-time-installed tables must specify data dependencies"
+     )
+
+
+ def repeated_constraint_type(type_name: str, constraint_type: Type) -> ErrorMessage:
+     return f"Constraint type {constraint_type} is repeated for custom type '{type_name}'"
+
+
+ def empty_enum(type_name: str, index: int) -> ErrorMessage:
+     return f"Constraint for type '{type_name}' (index {index}) is an empty enum"
+
+
+ def resource_doesnt_exist(
+     resource_name: str, resource_type: str, package_name: Optional[str], file_name: str
+ ) -> ErrorMessage:
+     package_addendum = f" in package '{package_name}'" if package_name else ""
+     return (
+         f"Resource for {resource_type} '{resource_name}' doesn't exist{package_addendum}"
+         f" at path '{file_name}'"
+     )
+
+
+ def resource_order_mismatch(
+     resource_name: str,
+     resource_type: str,
+     package_name: Optional[str],
+     resource_paths: Set[str],
+     ordered_paths: Set[str],
+ ) -> ErrorMessage:
+     package_addendum = f" from the package '{package_name}'" if package_name else ""
+     return (
+         f"The set of files in the resource for {resource_type} '{resource_name}'{package_addendum} "
+         f"does not equal the set of files specified in the resource's order: "
+         f"{resource_paths} != {ordered_paths}"
+     )
+
+
+ def ordered_resource_is_not_dir(
+     resource_name: str,
+     resource_type: str,
+     package_name: Optional[str],
+     file_name: str,
+     ordered_paths: Set[str],
+ ) -> ErrorMessage:
+     package_addendum = f" in package '{package_name}'" if package_name else ""
+     return (
+         f"Package resource for {resource_type} '{resource_name}'{package_addendum}"
+         f" at path '{file_name}' is not a directory but the resource has an order set: {ordered_paths}"
+     )
+
+
+ def package_not_installed(resource_name: str, resource_type: str, package_name: str) -> ErrorMessage:
+     return f"Package '{package_name}' in {resource_type} '{resource_name}' is not installed"
+
+
+ def preprocessor_not_importable(
+     table_name: str, preprocessor_path: str, exception: Exception
+ ) -> ErrorMessage:
+     return (
+         f"Preprocessor function path {preprocessor_path} for table {table_name} is not importable: "
+         f"{exception!r}"
+     )
+
+
+ def preprocessor_not_callable(
+     table_name: str, preprocessor_path: str, exception: Exception
+ ) -> ErrorMessage:
+     return (
+         f"Preprocessor function path {preprocessor_path} for table {table_name} does not reference"
+         f" a function: {exception!r}"
+     )
+
+
+ def external_schema_invalid(schema_name: str) -> ErrorMessage:
+     return f"External schema '{schema_name}' failed to validate"
+
+
+ def external_schema_not_found(
+     schema_name: str, package_name: Optional[str], schema_path: str, module_not_found: bool
+ ) -> ErrorMessage:
+     package = "" if package_name is None else f" in package {package_name}"
+     return (
+         f"External schema '{schema_name}' was not loaded at path '{schema_path}'{package}; "
+         f"{'module' if module_not_found else 'file'} not found"
+     )
+
+
+ def run_time_table_is_build_time_dependency(table_name: str) -> ErrorMessage:
+     return (
+         f"Run-time-installed table '{table_name}' is a transitive dependency of "
+         f"build-time-installed tables"
+     )
+
+
+ def dependency_graph_not_a_dag(cycle: List[Tuple[Any, Any]]) -> ErrorMessage:
+     return graph_not_a_dag("Data dependency", cycle)
+
+
+ def inheritance_graph_not_a_dag(cycle: List[Tuple[Any, Any]]) -> ErrorMessage:
+     return graph_not_a_dag("Table inheritance", cycle)
+
+
+ def graph_not_a_dag(kind: str, cycle: List[Tuple[Any, Any]]) -> ErrorMessage:
+     nodes = [*(e[0] for e in cycle), cycle[-1][1]]
+     cycle_str = " -> ".join(map(repr, nodes))
+     return f"{kind} graph is not a DAG; example cycle: {cycle_str}"
+
+
+ def _validate_unique_column_names(table: _RawTable, tablename: str) -> List[ErrorMessage]:
+     errors = []
+     colnames = {c.snake_case_name for c in table.columns}
+     if len(colnames) < len(table.columns):
+         counts = Counter(c.snake_case_name for c in table.columns)
+         duped = {n for n, c in counts.items() if c > 1}
+         errors.append(repeated_cols_in_table(tablename, duped))
+
+     return errors
+
+
+ def _validate_table_constraints(
+     table: _RawTable, tablename: str, schema: _RawSchema
+ ) -> List[ErrorMessage]:
+     errors = []
+     colnames = {c.name: c for c in table.resolve_inherited_columns(schema)}
+
+     def repeated(xs: Optional[Collection]) -> List:
+         if xs is None:
+             return []
+         counts = Counter(xs)
+         return [x for x, n in counts.items() if n > 1]
+
+     for constraint_kind, column_tuples in [
+         ("unique constraint", ((i, c.unique) for i, c in enumerate(table.constraints))),
+         ("index", enumerate(table.indexes)),
+         ("primary key", [] if table.primary_key is None else [(None, table.primary_key)]),
+     ]:
+         for i, columns in column_tuples:
+             if not len(columns):
+                 errors.append(empty_column_tuple(tablename, constraint_kind, i))
+                 continue
+
+             missing_cols = set(columns).difference(colnames)
+             if missing_cols:
+                 errors.append(missing_cols_in_constraint(tablename, constraint_kind, missing_cols, i))
+
+             repeated_cols = repeated(columns)
+             if repeated_cols:
+                 errors.append(repeated_cols_in_constraint(tablename, constraint_kind, repeated_cols, i))
+
+             for colname in columns:
+                 column = colnames.get(colname)
+                 if column is not None and isinstance(
+                     column.type, (_RawArrayType, _RawMappingType, ArrayType, MappingType)
+                 ):
+                     if constraint_kind == "unique constraint":
+                         errors.append(
+                             uniqueness_check_invalid_for_collection_type(tablename, colname, i)
+                         )
+                     else:
+                         errors.append(index_invalid_for_collection_type(tablename, colname, i))
+
+     return errors
+
+
+ def _validate_column_types(
+     table: _RawTable,
+     tablename: str,
+     custom_types: Collection[Identifier],
+ ) -> List[ErrorMessage]:
+     errors = []
+     for i, column in enumerate(table.columns):
+         for refname in column.custom_type_refs:
+             if refname not in custom_types:
+                 errors.append(missing_custom_type(column.name, i, tablename, refname))
+
+     return errors
+
+
+ def _validate_table_inheritance(
+     table: _RawTable,
+     tablename: str,
+     schema: _RawSchema,
+ ) -> List[ErrorMessage]:
+     inheritance = table.inherit_schema
+     if inheritance is None:
+         return []
+
+     errors = []
+     inherited_table_names = []
+     inherited_tables = []
+     for inherited_table_name in inheritance.tables:
+         if inherited_table_name not in schema.tables:
+             errors.append(missing_inherited_table(tablename, inherited_table_name))
+         elif inherited_table_name in inherited_table_names:
+             warn(
+                 f"Table '{inherited_table_name}' is repeated in inherited table list for table "
+                 f"'{tablename}'"
+             )
+         else:
+             inherited_table_names.append(inherited_table_name)
+             inherited_tables.append(schema.tables[inherited_table_name])
+
+     heritable_column_names = {
+         c.name for table in inherited_tables for c in table.resolve_inherited_columns(schema)
+     }
+     defined_column_names = {c.name for c in table.columns}
+
+     for column_set, kind in [
+         (inheritance.columns, "inclusion"),
+         (inheritance.update_docs, "docstring update"),
+         (inheritance.update_nullability, "nullability update"),
+         (inheritance.update_source_name, "source name update"),
+     ]:
+         for column_name in column_set:
+             if column_name not in heritable_column_names:
+                 errors.append(
+                     missing_inherited_column(tablename, column_name, inherited_table_names, kind)
+                 )
+             defined_explicitly = column_name in defined_column_names
+             excluded = (
+                 bool(inheritance.columns)
+                 and (column_name not in inheritance.columns)
+                 and kind != "inclusion"
+             )
+             if defined_explicitly or excluded:
+                 reference_type = " and ".join(
+                     s
+                     for s, condition in [
+                         ("not marked for inclusion", excluded),
+                         ("defined explicitly", defined_explicitly),
+                     ]
+                     if condition
+                 )
+                 addendum = (
+                     "; it will not be present in the resulting table schema"
+                     if excluded and not defined_explicitly
+                     else ""
+                 )
+                 warn(
+                     f"Column '{column_name}' is marked for {kind} in inheritance specification for "
+                     f"table '{tablename}', but also {reference_type}{addendum}"
+                 )
+
+     return errors
+
+
+ def _validate_data_dependencies(
+     table: _RawTable,
+     tablename: str,
+     tables: Mapping[Identifier, _RawTable],
+     remote_data: Mapping[Identifier, ADLSDataSpec],
+     local_data: Mapping[Identifier, LocalDataSpec],
+ ) -> List[ErrorMessage]:
+     errors = []
+     if table.build_time_installed and table.dependencies is None:
+         errors.append(dependencies_required_for_build_time_tables(tablename))
+
+     if isinstance(table.dependencies, RawDataDependencies):
+         for refname in table.dependencies.adls:
+             if refname not in remote_data:
+                 errors.append(missing_remote_data_spec(tablename, refname))
+
+         for refname in table.dependencies.reference:
+             if refname not in tables:
+                 errors.append(missing_local_data_spec(tablename, refname, "reference"))
+
+         for refname in table.dependencies.local:
+             if refname not in local_data:
+                 errors.append(missing_local_data_spec(tablename, refname, "raw"))
+
+         for column in table.columns:
+             if column.source_name is not None:
+                 errors.append(source_name_defined_for_derived_table(tablename, column.name))
+
+     return errors
+
+
+ def _validate_type_constraints(type_: AnonCustomType, typename: str) -> List[ErrorMessage]:
+     errors = []
+     for i, constraint in enumerate(type_.constraints):
+         if not constraint.applies_to(type_.type):
+             errors.append(constraint_doesnt_apply(typename, i, constraint, type_.type.value))
+         if isinstance(constraint, EnumConstraint):
+             if not constraint.enum:
+                 errors.append(empty_enum(typename, i))
+
+     constraint_type_counts = Counter(map(type, type_.constraints))
+     repeated_constraint_types = [t for t, c in constraint_type_counts.items() if c > 1]
+     if repeated_constraint_types:
+         errors.extend(repeated_constraint_type(typename, t) for t in repeated_constraint_types)
+
+     return errors
+
+
+ def _validate_external_type_ref(
+     type_: ExternalTypeRef,
+     external_schemas: Mapping[Identifier, Schema],
+     typename: str,
+     failed_external_schemas: Set[str],
+ ) -> List[ErrorMessage]:
+     errors = []
+     if type_.schema_name not in external_schemas:
+         errors.append(
+             missing_external_schema(
+                 typename, type_.schema_name, type_.schema_name in failed_external_schemas
+             )
+         )
+     else:
+         external_schema = external_schemas[type_.schema_name]
+         if type_.type_name not in external_schema.types:
+             errors.append(missing_external_type(typename, type_.schema_name, type_.type_name))
+
+     return errors
+
+
+ def _validate_local_data_resource(
+     package: Optional[str], data_path: str, resource_name: str, resource_desc: str
+ ) -> List[ErrorMessage]:
+     errors: List[ErrorMessage] = []
+     if package is None:
+         exists = os.path.isfile(data_path) or os.path.isdir(data_path)
+     else:
+         try:
+             exists = pkg_resources.resource_exists(package, data_path)
+         except ModuleNotFoundError:
+             errors.append(package_not_installed(resource_name, resource_desc, package))
+             exists = True
+
+     if not exists:
+         errors.append(resource_doesnt_exist(resource_name, resource_desc, package, data_path))
+
+     return errors
+
+
+ def _validate_local_ordered_data_resource(
+     resource: LocalDataSpec, resource_name: str, resource_desc: str
+ ) -> List[ErrorMessage]:
+     errors: List[ErrorMessage] = []
+     assert resource.order is not None
+     ordered_paths = set(resource.order)
+     if resource.is_dir:
+         files = set()
+         for filename in resource.list_dir():
+             files.add(os.path.basename(filename))
+         if files != ordered_paths:
+             errors.append(
+                 resource_order_mismatch(
+                     resource_name, resource_desc, resource.package, files, ordered_paths
+                 )
+             )
+     else:
+         errors.append(
+             ordered_resource_is_not_dir(
+                 resource_name, resource_desc, resource.package, resource.filename, ordered_paths
+             )
+         )
+     return errors
+
+
+ def _validate_preprocessor(table: _RawTable, tablename: str) -> List[ErrorMessage]:
+     errors: List[ErrorMessage] = []
+     if not isinstance(table.dependencies, RawDataDependencies):
+         return errors
+
+     funcpath = table.dependencies.preprocessor
+     try:
+         import_func(funcpath)
+     except (ImportError, AttributeError) as e:
+         errors.append(preprocessor_not_importable(tablename, funcpath, e))
+     except TypeError as e:
+         errors.append(preprocessor_not_callable(tablename, funcpath, e))
+
+     return errors
+
+
+ def _validate_dependency_dag(schema: _RawSchema) -> List[ErrorMessage]:
+     errors: List[ErrorMessage] = []
+     full_graph = Schema.dependency_dag(schema, lambda table: True)  # type: ignore
+
+     def to_nodeset(predicate):
+         return set(
+             table._graph_ref(tablename)
+             for tablename, table in schema.tables.items()
+             if predicate(table)
+         )
+
+     build_time_tables = to_nodeset(lambda table: table.build_time_installed)
+     run_time_tables = to_nodeset(lambda table: table.run_time_installed)
+     transient_tables = to_nodeset(lambda table: table.transient)
+
+     if not nx.is_directed_acyclic_graph(full_graph):
+         cycle = nx.find_cycle(full_graph)
+         errors.append(dependency_graph_not_a_dag(cycle))
+
+     # no run-time-installed table should be a transitive dependency of any build-time-installed table
+     build_time_graph = predecessor_graph(full_graph, build_time_tables)
+     for ref in build_time_graph:
+         if ref in run_time_tables:
+             errors.append(run_time_table_is_build_time_dependency(str(ref)))
+
+     # transient tables should have successors; however, this is not an error, just a warning that
+     # such tables will be silently ignored at build time
+     for ref in transient_tables:
+         if not list(full_graph.successors(ref)):
+             warn(
+                 f"Table '{ref}' is marked as transient but has no downstream dependencies; it will not "
+                 f"be computed in builds"
+             )
+
+     return errors
+
+
+ def _validate_inheritance_dag(schema: _RawSchema) -> List[ErrorMessage]:
+     full_graph = schema.inheritance_dag()
+     errors = []
+     if not nx.is_directed_acyclic_graph(full_graph):
+         cycle = nx.find_cycle(full_graph)
+         errors.append(inheritance_graph_not_a_dag(cycle))
+     return errors
+
+
+ def _resolve_typeref(
+     dtype: Union[DType, AnonCustomType, _CustomTypeRef, _RawArrayType, _RawMappingType],
+     custom_types: Mapping[Identifier, CustomType],
+ ) -> Union[DType, AnonCustomType, CustomType, ArrayType, MappingType]:
+     if isinstance(dtype, _CustomTypeRef):
+         return custom_types[dtype.custom]
+     elif isinstance(dtype, _RawArrayType):
+         if isinstance(dtype.values, (AnonCustomType, _CustomTypeRef)):
+             warn(f"Array elements with custom type {dtype.values} cannot currently be validated")
+         return ArrayType(values=_resolve_typeref(dtype.values, custom_types))
+     elif isinstance(dtype, _RawMappingType):
+         if isinstance(dtype.keys, (AnonCustomType, _CustomTypeRef)):
+             warn(f"Mapping keys with custom type {dtype.keys} cannot currently be validated")
+         if isinstance(dtype.values, (AnonCustomType, _CustomTypeRef)):
+             warn(f"Mapping values with custom type {dtype.values} cannot currently be validated")
+         return MappingType(
+             keys=cast(
+                 Union[DType, CustomType, AnonCustomType], _resolve_typeref(dtype.keys, custom_types)
+             ),
+             values=_resolve_typeref(dtype.values, custom_types),
+         )
+     else:
+         return dtype
+
+
+ def _resolve_column_typerefs(
+     column: _RawColumn, custom_types: Mapping[Identifier, CustomType]
+ ) -> Column:
+     return Column(
+         name=column.name,
+         type=_resolve_typeref(column.type, custom_types),
+         nullable=column.nullable,
+         doc=column.doc,
+         source_name=column.source_name,
+         na_values=column.na_values,
+     )
+
+
+ def distinct_indexes(table: _RawTable, table_name: str) -> List[Tuple[str, ...]]:
+     indexes = []
+     for index in table.indexes:
+         if index == table.primary_key:
+             warn(
+                 f"Table {table_name} has its primary key re-defined as an index: {table.primary_key}; "
+                 f"discarding"
+             )
+         elif index in indexes:
+             warn(f"Table {table_name} has a duplicate definition of index {index}; discarding")
+         else:
+             indexes.append(index)
+
+     return indexes
+
+
+ def _load_external_schema(
+     schema_name: str,
+     package: Optional[str],
+     schema_path: str,
+     git_ref: Optional[str] = None,
+ ) -> Tuple[Optional[Schema], List[ErrorMessage]]:
+     errors = []
+     external_schema: Optional[Schema] = None
+     try:
+         external_schema = load_schema(
+             package,
+             schema_path,
+             require_data_resources=False,
+             require_preprocessors=False,
+             git_ref=git_ref,
+         )
+     except ModuleNotFoundError:
+         errors.append(
+             external_schema_not_found(schema_name, package, schema_path, module_not_found=True)
+         )
+     except FileNotFoundError:
+         errors.append(
+             external_schema_not_found(schema_name, package, schema_path, module_not_found=False)
+         )
+     except MetaschemaValidationError:
+         errors.append(external_schema_invalid(schema_name))
+
+     return external_schema, errors
+
+
+ def validation_errors(
+     raw_schema: _RawSchema,
+     require_external_schemas: bool = True,
+     require_data_resources: bool = False,
+     require_preprocessors: bool = False,
+     git_ref: Optional[str] = None,
+ ) -> Tuple[List[ErrorMessage], Mapping[str, Schema]]:
+     errors = _validate_inheritance_dag(raw_schema)
+     bad_inheritance_graph = bool(errors)
+
+     # load external schemas
+     external_schemas = {}
+     failed_external_schemas = set()
+     if require_external_schemas:
+         for schema_name, schema_ref in raw_schema.external_schemas.items():
+             external_schema, load_errors = _load_external_schema(
+                 schema_name,
+                 schema_ref.package,
+                 schema_ref.schema_path,
+                 git_ref=git_ref,
+             )
+             if load_errors:
+                 errors.extend(load_errors)
+                 failed_external_schemas.add(schema_name)
+             else:
+                 assert external_schema is not None
+                 external_schemas[schema_name] = external_schema
+
+     # verify all column name refs
+     for tablename, table in raw_schema.tables.items():
+         errors.extend(_validate_unique_column_names(table, tablename))
+         if not bad_inheritance_graph or table.inherit_schema is None:
+             # this check involves resolving inherited columns, so we skip it if the inheritance
+             # graph is badly formed
+             errors.extend(_validate_table_constraints(table, tablename, raw_schema))
+
+         errors.extend(_validate_column_types(table, tablename, raw_schema.types))
+         errors.extend(
+             _validate_data_dependencies(
+                 table, tablename, raw_schema.tables, raw_schema.remote_data, raw_schema.local_data
+             )
+         )
+         if not bad_inheritance_graph and table.inherit_schema is not None:
+             errors.extend(_validate_table_inheritance(table, tablename, raw_schema))
+         if require_data_resources and isinstance(table.dependencies, TabularFileSource):
+             errors.extend(
+                 _validate_local_data_resource(
+                     table.dependencies.package,
+                     table.dependencies.filename,
+                     tablename,
+                     "tabular file source for table",
+                 )
+             )
+         if require_preprocessors:
+             errors.extend(_validate_preprocessor(table, tablename))
+
+     if require_data_resources:
+         for resourcename, local_resource in raw_schema.local_data.items():
+             errors.extend(
+                 _validate_local_data_resource(
+                     local_resource.package,
+                     local_resource.filename,
+                     resourcename,
+                     "local data specification",
+                 )
+             )
+             if local_resource.order:
+                 errors.extend(
+                     _validate_local_ordered_data_resource(
+                         local_resource,
+                         resourcename,
+                         "local data specification",
+                     )
+                 )
+                 for i, order_path in enumerate(local_resource.order):
+                     errors.extend(
+                         _validate_local_data_resource(
+                             local_resource.package,
+                             "/".join([local_resource.filename, order_path]),
+                             resourcename,
+                             f"local data order [{i}] specification",
+                         )
+                     )
+
+     for typename, dtype in raw_schema.types.items():
+         if isinstance(dtype, AnonCustomType):
+             errors.extend(_validate_type_constraints(dtype, typename))
+         elif require_external_schemas:
+             errors.extend(
+                 _validate_external_type_ref(dtype, external_schemas, typename, failed_external_schemas)
+             )
+
+     errors.extend(_validate_dependency_dag(raw_schema))
+
+     return errors, external_schemas
+
+
+ def validate(
+     json: Dict,
+     require_data_resources: bool = False,
+     require_preprocessors: bool = False,
+     git_ref: Optional[str] = None,
+ ) -> Schema:
+     # low-level pydantic validation happens here
+     raw_schema = _RawSchema(**json)
+     # higher-level semantic validation happens here
+     errors, external_schemas = validation_errors(
+         raw_schema,
+         require_external_schemas=True,
+         require_data_resources=require_data_resources,
+         require_preprocessors=require_preprocessors,
+         git_ref=git_ref,
+     )
+     if errors:
+         raise MetaschemaValidationError(errors)
+
+     named_custom_types: Dict[Identifier, Union[CustomType, ExternalCustomType]] = {}
+     referenced_custom_types = set(
+         itertools.chain.from_iterable(table.custom_type_refs for table in raw_schema.tables.values())
+     )
+     for name, t in raw_schema.types.items():
+         if not raw_schema.build_options.render_all_types and name not in referenced_custom_types:
+             warn(f"Discarding type {name!r} which is referenced in no table")
+         else:
+             if isinstance(t, ExternalTypeRef):
+                 external_schema = external_schemas[t.schema_name]
+                 external_type = external_schema.types[t.type_name]
+                 schema_ref = raw_schema.external_schemas[t.schema_name]
+                 schema_ref.derived_code_submodule = external_schema.build_options.derived_code_submodule
+                 named_custom_types[name] = external_type.from_external(schema_ref, name)
+             else:
+                 named_custom_types[name] = t.with_name(name)
+
+     for name, spec in raw_schema.remote_data.items():
+         non_version_controlled_paths = [p for p in spec.paths if p.md5 is None]
+         if non_version_controlled_paths:
+             warn(
+                 f"Remote data specification '{name}' has {len(non_version_controlled_paths)} "
+                 "paths with no specified hash; build consistency cannot be guaranteed for any "
+                 "tables depending on these"
+             )
+
+     return Schema(
+         build_options=raw_schema.build_options,
+         tables={
+             tablename: Table(
+                 name=tablename,
+                 columns=[
+                     _resolve_column_typerefs(column, named_custom_types)
+                     for column in table.resolve_inherited_columns(raw_schema)
+                 ],
+                 constraints=table.constraints,
+                 primary_key=table.primary_key,
+                 indexes=distinct_indexes(table, tablename),
+                 doc=table.doc,
+                 dependencies=table.dependencies,
+                 md5=table.md5,
+                 transient=table.transient,
+                 build_time_installed=table.build_time_installed,
+             )
+             for tablename, table in raw_schema.tables.items()
+         },
+         types=named_custom_types,
+         external_schemas=raw_schema.external_schemas,
+         remote_data=raw_schema.remote_data,
+         remote_blob_store=raw_schema.remote_blob_store,
+         local_data=raw_schema.local_data,
+     )
+
+
+ @lru_cache(None)  # singleton
+ def load_schema(
+     package: Optional[str],
+     schema_path: str,
+     require_data_resources: bool = False,
+     require_preprocessors: bool = False,
+     git_ref: Optional[str] = None,
+ ) -> Schema:
+     if git_ref is None:
+         if package is None:
+             with open(schema_path, "r") as f:
+                 json: JSON = yaml.safe_load(f)
+         else:
+             with pkg_resources.resource_stream(package, schema_path) as f:
+                 json = yaml.safe_load(f)
+
+     else:
+         abspath = (
+             Path(schema_path)
+             if package is None
+             else Path(pkg_resources.resource_filename(package, str(schema_path)))
+         )
+         contents = git_util.blob_contents(abspath, git_ref)
+         json = yaml.safe_load(io.BytesIO(contents))
+
+     return validate(
+         json,
+         require_data_resources=require_data_resources,
+         require_preprocessors=require_preprocessors,
+         git_ref=git_ref,
+     )
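
For orientation, here is a minimal usage sketch of this module's public entry points. It is not part of the released wheel: the local "schema.yaml" path is hypothetical, and the sketch relies only on the load_schema signature and MetaschemaValidationError behavior visible in the diff above.

    from thds.tabularasa.schema.validation import MetaschemaValidationError, load_schema

    try:
        # package=None reads "schema.yaml" from the filesystem rather than from
        # package resources; git_ref=None reads the current working-tree version.
        schema = load_schema(None, "schema.yaml", require_data_resources=True)
    except MetaschemaValidationError as exc:
        # semantic errors are accumulated and reported together, one per line
        print(f"Schema failed validation:\n{exc}")
    else:
        print(f"Loaded {len(schema.tables)} tables and {len(schema.types)} custom types")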