thds.tabularasa 0.13.0 (py3-none-any.whl)
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
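
Only thds/tabularasa/schema/validation.py (+878 lines) is reproduced in full below. Note that the wheel ships a __main__.py and declares entry points in entry_points.txt, so the distribution exposes a command-line interface alongside its importable API. A minimal sketch for inspecting the installed distribution using only the standard library (the distribution name and version are taken from this diff; the printed output is illustrative):

    # Sketch: confirm the installed version and declared entry points with the
    # standard library only (importlib.metadata, Python 3.8+). Name normalization
    # matches the thds_tabularasa-0.13.0.dist-info directory listed above.
    from importlib.metadata import distribution

    dist = distribution("thds.tabularasa")
    print(dist.version)  # expected: 0.13.0 for the release this diff covers
    for ep in dist.entry_points:  # declared in entry_points.txt
        print(f"{ep.name} -> {ep.value}")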
thds/tabularasa/schema/validation.py
@@ -0,0 +1,878 @@
+import io
+import itertools
+import os
+from collections import Counter
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Collection, Dict, List, Mapping, Optional, Set, Tuple, Type, Union, cast
+
+import networkx as nx
+import pkg_resources
+import yaml
+from _warnings import warn
+
+from .. import git_util
+from .constraints import AnyColumnConstraint, EnumConstraint
+from .dtypes import DType
+from .files import ADLSDataSpec, LocalDataSpec, TabularFileSource
+from .metaschema import (
+    JSON,
+    AnonCustomType,
+    ArrayType,
+    Column,
+    CustomType,
+    ExternalCustomType,
+    ExternalTypeRef,
+    MappingType,
+    RawDataDependencies,
+    Schema,
+    Table,
+    _CustomTypeRef,
+    _RawArrayType,
+    _RawColumn,
+    _RawMappingType,
+    _RawSchema,
+    _RawTable,
+)
+from .util import Identifier, import_func, predecessor_graph
+
+ErrorMessage = str
+
+
+class MetaschemaValidationError(ValueError):
+    def __init__(self, errors: List[ErrorMessage]):
+        self.errors = errors
+
+    def __str__(self):
+        return "\n".join(self.errors)
+
+
+def empty_column_tuple(table_name: str, kind: str, index: Optional[int] = None):
+    index_expr = "" if index is None else f"at index {index} "
+    return f"Table '{table_name}' {kind} {index_expr}is empty"
+
+
+def repeated_cols_in_table(table_name: str, repeated_cols: Collection[str]) -> ErrorMessage:
+    return f"Table '{table_name}' has repeated column names {sorted(repeated_cols)}"
+
+
+def missing_cols_in_constraint(
+    table_name: str, constraint_type: str, missing_cols: Collection[str], index: Optional[int] = None
+) -> ErrorMessage:
+    index_ = "" if index is None else f" (index {index})"
+    return (
+        f"Table '{table_name}' {constraint_type}{index_} references columns {sorted(missing_cols)}"
+        f" which are undefined"
+    )
+
+
+def repeated_cols_in_constraint(
+    table_name: str, constraint_type: str, repeated_cols: Collection[str], index: Optional[int] = None
+) -> ErrorMessage:
+    index_ = "" if index is None else f" (index {index})"
+    return f"Table '{table_name}' {constraint_type}{index_} has repeated columns {sorted(repeated_cols)}"
+
+
+def uniqueness_check_invalid_for_collection_type(
+    table_name: str,
+    column_name: str,
+    constraint_index: Optional[int] = None,
+):
+    index_spec = "" if constraint_index is None else f" at index {constraint_index}"
+    return (
+        f"Cannot check uniqueness for collection-valued column '{column_name}' in table "
+        f"'{table_name}'; occurred in table constraint{index_spec}"
+    )
+
+
+def index_invalid_for_collection_type(
+    table_name: str,
+    column_name: str,
+    constraint_index: Optional[int] = None,
+) -> ErrorMessage:
+    index_spec = "primary key" if constraint_index is None else f"index at index {constraint_index}"
+    return (
+        f"Cannot use collection-valued column '{column_name}' in table '{table_name}' in an index; "
+        f"occurred in {index_spec}"
+    )
+
+
+def missing_custom_type(column_name: str, index: int, table_name: str, type_name: str) -> ErrorMessage:
+    return (
+        f"Column '{column_name}' (index {index}) of table '{table_name}' references custom type "
+        f"'{type_name}' which does not exist"
+    )
+
+
+def missing_inherited_table(table_name: str, inherited_table_name: str) -> ErrorMessage:
+    return (
+        f"Table '{table_name}' references inherited table '{inherited_table_name}' which does not exist"
+    )
+
+
+def missing_inherited_column(
+    table_name: str, inherited_column: str, inherited_tables: Collection[str], reason: str
+) -> ErrorMessage:
+    return (
+        f"Table '{table_name}' references column '{inherited_column}' for {reason} in its inheritance "
+        f"specification, which is present in none of the inherited tables: {list(inherited_tables)}"
+    )
+
+
+def missing_remote_data_spec(table_name: str, remote_data_ref_name: str) -> ErrorMessage:
+    return (
+        f"Table '{table_name}' references remote data dependency '{remote_data_ref_name}' which "
+        f"does not exist"
+    )
+
+
+def missing_local_data_spec(
+    table_name: str, local_data_ref_name: str, local_data_type: str
+) -> ErrorMessage:
+    return (
+        f"Table '{table_name}' references '{local_data_type}' data dependency "
+        f"'{local_data_ref_name}' which does not exist"
+    )
+
+
+def missing_external_schema(
+    type_name: str, external_schema_name: str, failed_to_load: bool
+) -> ErrorMessage:
+    return (
+        f"Type '{type_name}' references external schema '{external_schema_name}' "
+        f"which {'failed to load' if failed_to_load else 'is undefined'}"
+    )
+
+
+def missing_external_type(
+    type_name: str, external_schema_name: str, external_type_name: str
+) -> ErrorMessage:
+    return (
+        f"Type '{type_name}' references type '{external_type_name}' which isn't present in external schema "
+        f"'{external_schema_name}' (possibly dropped as not referenced)"
+    )
+
+
+def source_name_defined_for_derived_table(table_name: str, column_name: str) -> ErrorMessage:
+    return (
+        f"Table '{table_name}' is derived but defines a source name for column {column_name}; "
+        f"derived tables should be written in the same schema in which they are read"
+    )
+
+
+def constraint_doesnt_apply(
+    type_name: str, index: int, constraint: AnyColumnConstraint, dtype: str
+) -> ErrorMessage:
+    return (
+        f"Constraint {constraint} (index {index}) of custom type '{type_name}' doesn't apply to "
+        f"dtype '{dtype}'"
+    )
+
+
+def dependencies_required_for_build_time_tables(table_name: str) -> ErrorMessage:
+    return (
+        f"Table '{table_name}' is marked as build-time-installed but has no dependencies; "
+        f"build-time-installed tables must specify data dependencies"
+    )
+
+
+def repeated_constraint_type(type_name: str, constraint_type: Type) -> ErrorMessage:
+    return f"Constraint type {constraint_type} is repeated for custom type '{type_name}'"
+
+
+def empty_enum(type_name: str, index: int) -> ErrorMessage:
+    return f"Constraint for type '{type_name}' (index {index}) is empty enum"
+
+
+def resource_doesnt_exist(
+    resource_name: str, resource_type: str, package_name: Optional[str], file_name: str
+) -> ErrorMessage:
+    package_addendum = f" in package '{package_name}'" if package_name else ""
+    return (
+        f"Resource for {resource_type} '{resource_name}' doesn't exist{package_addendum}"
+        f"at path '{file_name}'"
+    )
+
+
+def resource_order_mismatch(
+    resource_name: str,
+    resource_type: str,
+    package_name: Optional[str],
+    resource_paths: Set[str],
+    ordered_paths: Set[str],
+) -> ErrorMessage:
+    package_addendum = f" from the package '{package_name}'" if package_name else ""
+    return (
+        f"The set of files in the resource for {resource_type} '{resource_name}'{package_addendum} "
+        f"does not equal the set of files specified in the resource's order: "
+        f"{resource_paths} != {ordered_paths}"
+    )
+
+
+def ordered_resource_is_not_dir(
+    resource_name: str,
+    resource_type: str,
+    package_name: Optional[str],
+    file_name: str,
+    ordered_paths: Set[str],
+) -> ErrorMessage:
+    package_addendum = f" in package '{package_name}'" if package_name else ""
+    return (
+        f"Package resource for {resource_type} '{resource_name}'{package_addendum}"
+        f"at path '{file_name}' is not a directory but the resource has an order set: {ordered_paths}"
+    )
+
+
+def package_not_installed(resource_name: str, resource_type: str, package_name: str) -> ErrorMessage:
+    return f"Package '{package_name}' in {resource_type} '{resource_name}' is not installed"
+
+
+def preprocessor_not_importable(
+    table_name: str, preprocessor_path: str, exception: Exception
+) -> ErrorMessage:
+    return (
+        f"Preprocessor function path {preprocessor_path} for table {table_name} is not importable: "
+        f"{exception!r}"
+    )
+
+
+def preprocessor_not_callable(
+    table_name: str, preprocessor_path: str, exception: Exception
+) -> ErrorMessage:
+    return (
+        f"Preprocessor function path {preprocessor_path} for table {table_name} does not reference"
+        f" a function: {exception!r}"
+    )
+
+
+def external_schema_invalid(schema_name: str) -> ErrorMessage:
+    return f"External schema '{schema_name}' failed to validate"
+
+
+def external_schema_not_found(
+    schema_name: str, package_name: Optional[str], schema_path: str, module_not_found: bool
+) -> ErrorMessage:
+    package = "" if package_name is None else f" in package {package_name}"
+    return (
+        f"External schema '{schema_name}' was not loaded at path '{schema_path}'{package}; "
+        f"{'module' if module_not_found else 'file'} not found"
+    )
+
+
+def run_time_table_is_build_time_dependency(table_name: str) -> ErrorMessage:
+    return (
+        f"Run-time-installed table '{table_name}' is a transitive dependency of "
+        f"build-time-installed tables"
+    )
+
+
+def dependency_graph_not_a_dag(cycle: List[Tuple[Any, Any]]) -> ErrorMessage:
+    return graph_not_a_dag("Data dependency", cycle)
+
+
+def inheritance_graph_not_a_dag(cycle: List[Tuple[Any, Any]]) -> ErrorMessage:
+    return graph_not_a_dag("Table inheritance", cycle)
+
+
+def graph_not_a_dag(kind: str, cycle: List[Tuple[Any, Any]]) -> ErrorMessage:
+    nodes = [*(e[0] for e in cycle), cycle[-1][1]]
+    cycle_str = " -> ".join(map(repr, nodes))
+    return f"{kind} graph is not a DAG; example cycle: {cycle_str}"
+
+
+def _validate_unique_column_names(table: _RawTable, tablename: str) -> List[ErrorMessage]:
+    errors = []
+    colnames = {c.snake_case_name for c in table.columns}
+    if len(colnames) < len(table.columns):
+        counts = Counter(c.snake_case_name for c in table.columns)
+        duped = {n for n, c in counts.items() if c > 1}
+        errors.append(repeated_cols_in_table(tablename, duped))
+
+    return errors
+
+
+def _validate_table_constraints(
+    table: _RawTable, tablename: str, schema: _RawSchema
+) -> List[ErrorMessage]:
+    errors = []
+    colnames = {c.name: c for c in table.resolve_inherited_columns(schema)}
+
+    def repeated(xs: Optional[Collection]) -> List:
+        if xs is None:
+            return []
+        counts = Counter(xs)
+        return [x for x, n in counts.items() if n > 1]
+
+    for constraint_kind, column_tuples in [
+        ("unique constraint", ((i, c.unique) for i, c in enumerate(table.constraints))),
+        ("index", enumerate(table.indexes)),
+        ("primary key", [] if table.primary_key is None else [(None, table.primary_key)]),
+    ]:
+        for i, columns in column_tuples:
+            if not len(columns):
+                errors.append(empty_column_tuple(tablename, constraint_kind, i))
+                continue
+
+            missing_cols = set(columns).difference(colnames)
+            if missing_cols:
+                errors.append(missing_cols_in_constraint(tablename, constraint_kind, missing_cols, i))
+
+            repeated_cols = repeated(columns)
+            if repeated_cols:
+                errors.append(repeated_cols_in_constraint(tablename, constraint_kind, repeated_cols, i))
+
+            for colname in columns:
+                column = colnames.get(colname)
+                if column is not None and isinstance(
+                    column.type, (_RawArrayType, _RawMappingType, ArrayType, MappingType)
+                ):
+                    if constraint_kind == "unique constraint":
+                        errors.append(
+                            uniqueness_check_invalid_for_collection_type(tablename, colname, i)
+                        )
+                    else:
+                        errors.append(index_invalid_for_collection_type(tablename, colname, i))
+
+    return errors
+
+
+def _validate_column_types(
+    table: _RawTable,
+    tablename: str,
+    custom_types: Collection[Identifier],
+) -> List[ErrorMessage]:
+    errors = []
+    for i, column in enumerate(table.columns):
+        for refname in column.custom_type_refs:
+            if refname not in custom_types:
+                errors.append(missing_custom_type(column.name, i, tablename, refname))
+
+    return errors
+
+
+def _validate_table_inheritance(
+    table: _RawTable,
+    tablename: str,
+    schema: _RawSchema,
+):
+    inheritance = table.inherit_schema
+    if inheritance is None:
+        return []
+
+    errors = []
+    inherited_table_names = []
+    inherited_tables = []
+    for inherited_table_name in inheritance.tables:
+        if inherited_table_name not in schema.tables:
+            errors.append(missing_inherited_table(tablename, inherited_table_name))
+        elif inherited_table_name in inherited_table_names:
+            warn(
+                f"Table '{inherited_table_name}' is repeated in inherited table list for table "
+                f"'{tablename}'"
+            )
+        else:
+            inherited_table_names.append(inherited_table_name)
+            inherited_tables.append(schema.tables[inherited_table_name])
+
+    heritable_column_names = {
+        c.name for table in inherited_tables for c in table.resolve_inherited_columns(schema)
+    }
+    defined_column_names = {c.name for c in table.columns}
+
+    for column_set, kind in [
+        (inheritance.columns, "inclusion"),
+        (inheritance.update_docs, "docstring update"),
+        (inheritance.update_nullability, "nullability update"),
+        (inheritance.update_source_name, "source name update"),
+    ]:
+        for column_name in column_set:
+            if column_name not in heritable_column_names:
+                errors.append(
+                    missing_inherited_column(tablename, column_name, inherited_table_names, kind)
+                )
+            defined_explicitly = column_name in defined_column_names
+            excluded = (
+                bool(inheritance.columns)
+                and (column_name not in inheritance.columns)
+                and kind != "inclusion"
+            )
+            if defined_explicitly or excluded:
+                reference_type = " and ".join(
+                    s
+                    for s, condition in [
+                        ("not marked for inclusion", excluded),
+                        ("defined explicitly", defined_explicitly),
+                    ]
+                    if condition
+                )
+                addendum = (
+                    "; it will not be present in the resulting table schema"
+                    if excluded and not defined_explicitly
+                    else ""
+                )
+                warn(
+                    f"Column '{column_name}' is marked for {kind} in inheritance specification for "
+                    f"table '{tablename}', but also {reference_type}{addendum}"
+                )
+
+    return errors
+
+
+def _validate_data_dependencies(
+    table: _RawTable,
+    tablename: str,
+    tables: Mapping[Identifier, _RawTable],
+    remote_data: Mapping[Identifier, ADLSDataSpec],
+    local_data: Mapping[Identifier, LocalDataSpec],
+) -> List[ErrorMessage]:
+    errors = []
+    if table.build_time_installed and table.dependencies is None:
+        errors.append(dependencies_required_for_build_time_tables(tablename))
+
+    if isinstance(table.dependencies, RawDataDependencies):
+        for refname in table.dependencies.adls:
+            if refname not in remote_data:
+                errors.append(missing_remote_data_spec(tablename, refname))
+
+        for refname in table.dependencies.reference:
+            if refname not in tables:
+                errors.append(missing_local_data_spec(tablename, refname, "reference"))
+
+        for refname in table.dependencies.local:
+            if refname not in local_data:
+                errors.append(missing_local_data_spec(tablename, refname, "raw"))
+
+        for column in table.columns:
+            if column.source_name is not None:
+                errors.append(source_name_defined_for_derived_table(tablename, column.name))
+
+    return errors
+
+
+def _validate_type_constraints(type_: AnonCustomType, typename: str) -> List[ErrorMessage]:
+    errors = []
+    for i, constraint in enumerate(type_.constraints):
+        if not constraint.applies_to(type_.type):
+            errors.append(constraint_doesnt_apply(typename, i, constraint, type_.type.value))
+        if isinstance(constraint, EnumConstraint):
+            if not constraint.enum:
+                errors.append(empty_enum(typename, i))
+
+    constraint_type_counts = Counter(map(type, type_.constraints))
+    repeated_constraint_types = [t for t, c in constraint_type_counts.items() if c > 1]
+    if repeated_constraint_types:
+        errors.extend(repeated_constraint_type(typename, t) for t in repeated_constraint_types)
+
+    return errors
+
+
+def _validate_external_type_ref(
+    type_: ExternalTypeRef,
+    external_schemas: Mapping[Identifier, Schema],
+    typename: str,
+    failed_external_schemas: Set[str],
+) -> List[ErrorMessage]:
+    errors = []
+    if type_.schema_name not in external_schemas:
+        errors.append(
+            missing_external_schema(
+                typename, type_.schema_name, type_.schema_name in failed_external_schemas
+            )
+        )
+    else:
+        external_schema = external_schemas[type_.schema_name]
+        if type_.type_name not in external_schema.types:
+            errors.append(missing_external_type(typename, type_.schema_name, type_.type_name))
+
+    return errors
+
+
+def _validate_local_data_resource(
+    package: Optional[str], data_path: str, resource_name: str, resource_desc: str
+) -> List[ErrorMessage]:
+    errors: List[ErrorMessage] = []
+    if package is None:
+        exists = os.path.isfile(data_path) or os.path.isdir(data_path)
+    else:
+        try:
+            exists = pkg_resources.resource_exists(package, data_path)
+        except ModuleNotFoundError:
+            errors.append(package_not_installed(resource_name, resource_desc, package))
+            exists = True
+
+    if not exists:
+        errors.append(resource_doesnt_exist(resource_name, resource_desc, package, data_path))
+
+    return errors
+
+
+def _validate_local_ordered_data_resource(
+    resource: LocalDataSpec, resource_name: str, resource_desc: str
+) -> List[ErrorMessage]:
+    errors: List[ErrorMessage] = []
+    assert resource.order is not None
+    ordered_paths = set(resource.order)
+    if resource.is_dir:
+        files = set()
+        for filename in resource.list_dir():
+            files.add(os.path.basename(filename))
+        if files != ordered_paths:
+            errors.append(
+                resource_order_mismatch(
+                    resource_name, resource_desc, resource.package, files, ordered_paths
+                )
+            )
+    else:
+        errors.append(
+            ordered_resource_is_not_dir(
+                resource_name, resource_desc, resource.package, resource.filename, ordered_paths
+            )
+        )
+    return errors
+
+
+def _validate_preprocessor(table: _RawTable, tablename: str) -> List[ErrorMessage]:
+    errors: List[ErrorMessage] = []
+    if not isinstance(table.dependencies, RawDataDependencies):
+        return errors
+
+    funcpath = table.dependencies.preprocessor
+    try:
+        import_func(funcpath)
+    except (ImportError, AttributeError) as e:
+        errors.append(preprocessor_not_importable(tablename, funcpath, e))
+    except TypeError as e:
+        errors.append(preprocessor_not_callable(tablename, funcpath, e))
+
+    return errors
+
+
+def _validate_dependency_dag(schema: _RawSchema) -> List[ErrorMessage]:
+    errors: List[ErrorMessage] = []
+    full_graph = Schema.dependency_dag(schema, lambda table: True)  # type: ignore
+
+    def to_nodeset(predicate):
+        return set(
+            table._graph_ref(tablename) for tablename, table in schema.tables.items() if predicate(table)
+        )
+
+    build_time_tables = to_nodeset(lambda table: table.build_time_installed)
+    run_time_tables = to_nodeset(lambda table: table.run_time_installed)
+    transient_tables = to_nodeset(lambda table: table.transient)
+
+    if not nx.is_directed_acyclic_graph(full_graph):
+        cycle = nx.find_cycle(full_graph)
+        errors.append(dependency_graph_not_a_dag(cycle))
+
+    # no runtime-installed table should be a recursive dependency of any build-time-installed table
+    build_time_graph = predecessor_graph(full_graph, build_time_tables)
+    for ref in build_time_graph:
+        if ref in run_time_tables:
+            errors.append(run_time_table_is_build_time_dependency(str(ref)))
+
+    # transient tables should have successors; however, this is not an error, just a warning that such
+    # tables will be silently ignored at build time
+    for ref in transient_tables:
+        if not list(full_graph.successors(ref)):
+            warn(
+                f"Table '{ref}' is marked as transient but has no downstream dependencies; it will not "
+                f"be computed in builds"
+            )
+
+    return errors
+
+
+def _validate_inheritance_dag(schema: _RawSchema) -> List[ErrorMessage]:
+    full_graph = schema.inheritance_dag()
+    errors = []
+    if not nx.is_directed_acyclic_graph(full_graph):
+        cycle = nx.find_cycle(full_graph)
+        errors.append(inheritance_graph_not_a_dag(cycle))
+    return errors
+
+
+def _resolve_typeref(
+    dtype: Union[DType, AnonCustomType, _CustomTypeRef, _RawArrayType, _RawMappingType],
+    custom_types: Mapping[Identifier, CustomType],
+) -> Union[DType, AnonCustomType, CustomType, ArrayType, MappingType]:
+    if isinstance(dtype, _CustomTypeRef):
+        return custom_types[dtype.custom]
+    elif isinstance(dtype, _RawArrayType):
+        if isinstance(dtype.values, (AnonCustomType, _CustomTypeRef)):
+            warn(f"Array elements with custom type {dtype.values} cannot currently be validated")
+        return ArrayType(values=_resolve_typeref(dtype.values, custom_types))
+    elif isinstance(dtype, _RawMappingType):
+        if isinstance(dtype.keys, (AnonCustomType, _CustomTypeRef)):
+            warn(f"Mapping keys with custom type {dtype.keys} cannot currently be validated")
+        if isinstance(dtype.values, (AnonCustomType, _CustomTypeRef)):
+            warn(f"Mapping values with custom type {dtype.values} cannot currently be validated")
+        return MappingType(
+            keys=cast(
+                Union[DType, CustomType, AnonCustomType], _resolve_typeref(dtype.keys, custom_types)
+            ),
+            values=_resolve_typeref(dtype.values, custom_types),
+        )
+    else:
+        return dtype
+
+
+def _resolve_column_typerefs(
+    column: _RawColumn, custom_types: Mapping[Identifier, CustomType]
+) -> Column:
+    return Column(
+        name=column.name,
+        type=_resolve_typeref(column.type, custom_types),
+        nullable=column.nullable,
+        doc=column.doc,
+        source_name=column.source_name,
+        na_values=column.na_values,
+    )
+
+
+def distinct_indexes(table: _RawTable, table_name: str) -> List[Tuple[str, ...]]:
+    indexes = []
+    for index in table.indexes:
+        if index == table.primary_key:
+            warn(
+                f"Table {table_name} has its primary key re-defined as an index: {table.primary_key}; "
+                f"discarding"
+            )
+        elif index in indexes:
+            warn(f"Table {table_name} has a duplicate definition of index {index}; discarding")
+        else:
+            indexes.append(index)
+
+    return indexes
+
+
+def _load_external_schema(
+    schema_name: str,
+    package: Optional[str],
+    schema_path: str,
+    git_ref: Optional[str] = None,
+) -> Tuple[Optional[Schema], List[ErrorMessage]]:
+    errors = []
+    external_schema: Optional[Schema] = None
+    try:
+        external_schema = load_schema(
+            package,
+            schema_path,
+            require_data_resources=False,
+            require_preprocessors=False,
+            git_ref=git_ref,
+        )
+    except ModuleNotFoundError:
+        errors.append(
+            external_schema_not_found(schema_name, package, schema_path, module_not_found=True)
+        )
+    except FileNotFoundError:
+        errors.append(
+            external_schema_not_found(schema_name, package, schema_path, module_not_found=False)
+        )
+    except MetaschemaValidationError:
+        errors.append(external_schema_invalid(schema_name))
+
+    return external_schema, errors
+
+
+def validation_errors(
+    raw_schema: _RawSchema,
+    require_external_schemas: bool = True,
+    require_data_resources: bool = False,
+    require_preprocessors: bool = False,
+    git_ref: Optional[str] = None,
+) -> Tuple[List[ErrorMessage], Mapping[str, Schema]]:
+    errors = _validate_inheritance_dag(raw_schema)
+    bad_inheritance_graph = bool(errors)
+
+    # load external schemas
+    external_schemas = {}
+    failed_external_schemas = set()
+    if require_external_schemas:
+        for schema_name, schema_ref in raw_schema.external_schemas.items():
+            external_schema, load_errors = _load_external_schema(
+                schema_name,
+                schema_ref.package,
+                schema_ref.schema_path,
+                git_ref=git_ref,
+            )
+            if load_errors:
+                errors.extend(load_errors)
+                failed_external_schemas.add(schema_name)
+            else:
+                assert external_schema is not None
+                external_schemas[schema_name] = external_schema
+
+    # verify all column name refs
+    for tablename, table in raw_schema.tables.items():
+        errors.extend(_validate_unique_column_names(table, tablename))
+        if not bad_inheritance_graph or table.inherit_schema is None:
+            # this check involves resolving inherited columns, so we skip it if the inheritance graph is
+            # badly formed
+            errors.extend(_validate_table_constraints(table, tablename, raw_schema))
+
+        errors.extend(_validate_column_types(table, tablename, raw_schema.types))
+        errors.extend(
+            _validate_data_dependencies(
+                table, tablename, raw_schema.tables, raw_schema.remote_data, raw_schema.local_data
+            )
+        )
+        if not bad_inheritance_graph and table.inherit_schema is not None:
+            errors.extend(_validate_table_inheritance(table, tablename, raw_schema))
+        if require_data_resources and isinstance(table.dependencies, TabularFileSource):
+            errors.extend(
+                _validate_local_data_resource(
+                    table.dependencies.package,
+                    table.dependencies.filename,
+                    tablename,
+                    "tabular file source for table",
+                )
+            )
+        if require_preprocessors:
+            errors.extend(_validate_preprocessor(table, tablename))
+
+    if require_data_resources:
+        for resourcename, local_resource in raw_schema.local_data.items():
+            errors.extend(
+                _validate_local_data_resource(
+                    local_resource.package,
+                    local_resource.filename,
+                    resourcename,
+                    "local data specification",
+                )
+            )
+            if local_resource.order:
+                errors.extend(
+                    _validate_local_ordered_data_resource(
+                        local_resource,
+                        resourcename,
+                        "local data specification",
+                    )
+                )
+                for i, order_path in enumerate(local_resource.order):
+                    errors.extend(
+                        _validate_local_data_resource(
+                            local_resource.package,
+                            "/".join([local_resource.filename, order_path]),
+                            resourcename,
+                            f"local data order [{i}] specification",
+                        )
+                    )
+
+    for typename, dtype in raw_schema.types.items():
+        if isinstance(dtype, AnonCustomType):
+            errors.extend(_validate_type_constraints(dtype, typename))
+        elif require_external_schemas:
+            errors.extend(
+                _validate_external_type_ref(dtype, external_schemas, typename, failed_external_schemas)
+            )
+
+    errors.extend(_validate_dependency_dag(raw_schema))
+
+    return errors, external_schemas
+
+
+def validate(
+    json: Dict,
+    require_data_resources: bool = False,
+    require_preprocessors: bool = False,
+    git_ref: Optional[str] = None,
+) -> Schema:
+    # low-level pydantic validation happens here
+    raw_schema = _RawSchema(**json)
+    # higher-level semantic validation happens here
+    errors, external_schemas = validation_errors(
+        raw_schema,
+        require_external_schemas=True,
+        require_data_resources=require_data_resources,
+        require_preprocessors=require_preprocessors,
+        git_ref=git_ref,
+    )
+    if errors:
+        raise MetaschemaValidationError(errors)
+
+    named_custom_types: Dict[Identifier, Union[CustomType, ExternalCustomType]] = {}
+    referenced_custom_types = set(
+        itertools.chain.from_iterable(table.custom_type_refs for table in raw_schema.tables.values())
+    )
+    for name, t in raw_schema.types.items():
+        if not raw_schema.build_options.render_all_types and name not in referenced_custom_types:
+            warn(f"Discarding type {name!r} which is referenced in no table")
+        else:
+            if isinstance(t, ExternalTypeRef):
+                external_schema = external_schemas[t.schema_name]
+                external_type = external_schema.types[t.type_name]
+                schema_ref = raw_schema.external_schemas[t.schema_name]
+                schema_ref.derived_code_submodule = external_schema.build_options.derived_code_submodule
+                named_custom_types[name] = external_type.from_external(schema_ref, name)
+            else:
+                named_custom_types[name] = t.with_name(name)
+
+    for name, spec in raw_schema.remote_data.items():
+        non_version_controlled_paths = [p for p in spec.paths if p.md5 is None]
+        if non_version_controlled_paths:
+            warn(
+                f"Remote data specification '{name}' has {len(non_version_controlled_paths)} "
+                "paths with no specified hash; build consistency cannot be guaranteed for any "
+                "tables depending on these"
+            )
+
+    return Schema(
+        build_options=raw_schema.build_options,
+        tables={
+            tablename: Table(
+                name=tablename,
+                columns=[
+                    _resolve_column_typerefs(column, named_custom_types)
+                    for column in table.resolve_inherited_columns(raw_schema)
+                ],
+                constraints=table.constraints,
+                primary_key=table.primary_key,
+                indexes=distinct_indexes(table, tablename),
+                doc=table.doc,
+                dependencies=table.dependencies,
+                md5=table.md5,
+                transient=table.transient,
+                build_time_installed=table.build_time_installed,
+            )
+            for tablename, table in raw_schema.tables.items()
+        },
+        types=named_custom_types,
+        external_schemas=raw_schema.external_schemas,
+        remote_data=raw_schema.remote_data,
+        remote_blob_store=raw_schema.remote_blob_store,
+        local_data=raw_schema.local_data,
+    )
+
+
+@lru_cache(None)  # singleton
+def load_schema(
+    package: Optional[str],
+    schema_path: str,
+    require_data_resources: bool = False,
+    require_preprocessors: bool = False,
+    git_ref: Optional[str] = None,
+) -> Schema:
+    if git_ref is None:
+        if package is None:
+            with open(schema_path, "r") as f:
+                json: JSON = yaml.safe_load(f)
+        else:
+            with pkg_resources.resource_stream(package, schema_path) as f:
+                json = yaml.safe_load(f)
+
+    else:
+        abspath = (
+            Path(schema_path)
+            if package is None
+            else Path(pkg_resources.resource_filename(package, str(schema_path)))
+        )
+        contents = git_util.blob_contents(abspath, git_ref)
+        json = yaml.safe_load(io.BytesIO(contents))
+
+    return validate(
+        json,
+        require_data_resources=require_data_resources,
+        require_preprocessors=require_preprocessors,
+        git_ref=git_ref,
+    )
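
For orientation, the public surface of the module above is load_schema / validate plus MetaschemaValidationError. A minimal usage sketch based only on the signatures shown in this diff (the schema path is a hypothetical example, and reading from the local filesystem via package=None is an illustrative choice):

    # Sketch: load and validate a schema YAML from disk. load_schema() parses the
    # YAML, runs validate(), and returns a Schema; validation failures raise
    # MetaschemaValidationError, whose .errors attribute carries the full list of
    # human-readable messages (str() of the exception joins them with newlines).
    from thds.tabularasa.schema.validation import MetaschemaValidationError, load_schema

    try:
        schema = load_schema(
            package=None,                 # package=None reads schema_path from the filesystem
            schema_path="schema.yaml",    # hypothetical path
            require_data_resources=True,  # also check that referenced local files exist
            require_preprocessors=True,   # also check that preprocessor functions import
        )
    except MetaschemaValidationError as exc:
        for message in exc.errors:
            print(message)
    else:
        print(sorted(schema.tables))      # table names defined by the validated schema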