thds.tabularasa 0.13.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/loaders/parquet_util.py
@@ -0,0 +1,385 @@
+ import datetime
+ from enum import IntEnum
+ from functools import singledispatch
+ from logging import getLogger
+ from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
+
+ import numpy as np
+ import pandas as pd
+ import pyarrow
+
+ K1 = TypeVar("K1")
+ K2 = TypeVar("K2")
+ V1 = TypeVar("V1")
+ V2 = TypeVar("V2")
+
+ TuplesToDict = Callable[[Iterable[Tuple[K1, V1]]], Dict[K2, V2]]
+ IterableToList = Callable[[Iterable[V1]], List[V2]]
+ DictToList = Callable[[Dict[K1, V1]], List[Tuple[K2, V2]]]
+
+ PANDAS_NULL_VALUES = {None, np.nan, pd.NA}
+ NONHASHABLE_TYPES = {dict, list, np.ndarray}
+
+
+ def identity(x):
+     return x
+
+
+ def pandas_maybe(f: Callable[[V1], V2]) -> Callable[[Optional[V1]], Optional[V2]]:
+     """Wrap a function with this to allow passing it to `pandas.Series.apply` in case null values are
+     present"""
+
+     def f_(x):
+         if (type(x) not in NONHASHABLE_TYPES) and (x in PANDAS_NULL_VALUES):
+             return None
+         return f(x)
+
+     return f_
+
+
+ def is_primitive_type(t: pyarrow.DataType) -> bool:
+     return pyarrow.types.is_primitive(t) or pyarrow.types.is_string(t)
+
+
+ # helpers for postprocessing dataframes read from parquet files possibly with complex types
+
+
+ @singledispatch
+ def to_pyiterable(a: Union[np.ndarray, Iterable]) -> Iterable:
+     return a
+
+
+ @to_pyiterable.register(np.ndarray)
+ def to_pyiterable_array(a: np.ndarray) -> Iterable:
+     if a.dtype.kind == "O":
+         # can iterate over object type array and get python objects; no need to make a copy
+         return a
+     return a.tolist()
+
+
+ @singledispatch
+ def tolist(list_: Iterable[V1]) -> List[V1]:
+     raise NotImplementedError(type(list_))
+
+
+ tolist.register(list)(identity)
+ tolist.register(np.ndarray)(np.ndarray.tolist)
+
+
+ def list_map(values: Callable[[V1], V2]) -> IterableToList:
+     def mapped(it):
+         return list(map(values, to_pyiterable(it)))
+
+     return mapped
+
+
+ def dict_map_keys_values(keys: Callable[[K1], K2], values: Callable[[V1], V2]) -> TuplesToDict:
+     def mapped(it):
+         return dict((keys(k), values(v)) for k, v in it)
+
+     return mapped
+
+
+ def dict_map_keys(keys: Callable[[K1], K2]) -> TuplesToDict[K1, V1, K2, V1]:
+     def mapped(it):
+         return dict((keys(k), v) for k, v in it)
+
+     return mapped
+
+
+ def dict_map_values(values: Callable[[V1], V2]) -> TuplesToDict[K1, V1, K1, V2]:
+     def mapped(it):
+         return dict((k, values(v)) for k, v in it)
+
+     return mapped
+
+
+ def todate(x: datetime.date) -> datetime.date:
+     return x.date() if isinstance(x, datetime.datetime) else x
+
+
+ def postprocessor_for_pyarrow_value_type(value_type: pyarrow.DataType) -> Optional[Callable]:
+     # Only for entries in arrays/maps; some newer versions of pyarrow load date types as datetime there.
+     # Not for scalar columns, where we'll just allow the more efficient pandas datetime/timestamp dtypes
+     if value_type in (pyarrow.date32(), pyarrow.date64()):
+         return todate
+     return postprocessor_for_pyarrow_type(value_type)
+
+
+ @singledispatch
+ def postprocessor_for_pyarrow_type(t: pyarrow.DataType) -> Optional[Callable]:
+     return None
+
+
+ @postprocessor_for_pyarrow_type.register(pyarrow.ListType)
+ def postprocessor_for_pyarrow_array(t: pyarrow.ListType) -> IterableToList:
+     pproc = postprocessor_for_pyarrow_value_type(t.value_type)
+     if pproc is None:
+         return tolist
+     return list_map(pproc)
+
+
+ @postprocessor_for_pyarrow_type.register(pyarrow.MapType)
+ def postprocessor_for_pyarrow_map(t: pyarrow.MapType) -> TuplesToDict:
+     key_pproc = postprocessor_for_pyarrow_value_type(t.key_type)
+     val_pproc = postprocessor_for_pyarrow_value_type(t.item_type)
+     if key_pproc is None:
+         if val_pproc is None:
+             return dict
+         return dict_map_values(val_pproc)
+     elif val_pproc is None:
+         return dict_map_keys(key_pproc)
+     else:
+         return dict_map_keys_values(key_pproc, val_pproc)
+
+
+ def postprocess_parquet_dataframe(df: pd.DataFrame, schema: pyarrow.Schema) -> pd.DataFrame:
+     """Postprocess a dataframe read from an arrow table (casts collection types to dicts and lists)"""
+     for name in schema.names:
+         field = schema.field(name)
+         pproc = postprocessor_for_pyarrow_type(field.type)
+         if pproc is not None:
+             if field.nullable:
+                 pproc = pandas_maybe(pproc)
+             df[name] = df[name].apply(pproc)
+
+     return df
+
+
+ # helpers for preprocessing dataframes for writing to parquet files possibly with complex types
+
+
+ def dict_to_list(d: Dict[K1, V1]) -> List[Tuple[K1, V1]]:
+     return list(d.items())
+
+
+ def dict_to_list_keys_values(keys: Callable[[K1], K2], values: Callable[[V1], V2]) -> DictToList:
+     def mapped(it: Dict[K1, V1]):
+         return [(keys(k), values(v)) for k, v in it.items()]
+
+     return mapped
+
+
+ def dict_to_list_keys(keys: Callable[[K1], K2]) -> DictToList[K1, V1, K2, V1]:
+     def mapped(it: Dict[K1, V1]):
+         return [(keys(k), v) for k, v in it.items()]
+
+     return mapped
+
+
+ def dict_to_list_values(values: Callable[[V1], V2]) -> DictToList[K1, V1, K1, V2]:
+     def mapped(it: Dict[K1, V1]):
+         return [(k, values(v)) for k, v in it.items()]
+
+     return mapped
+
+
+ @singledispatch
+ def preprocessor_for_pyarrow_type(t: pyarrow.DataType) -> Optional[Callable]:
+     return None
+
+
+ @preprocessor_for_pyarrow_type.register(pyarrow.MapType)
+ def preprocessor_for_pyarrow_map(t: pyarrow.MapType) -> DictToList:
+     key_pproc = preprocessor_for_pyarrow_type(t.key_type)
+     val_pproc = preprocessor_for_pyarrow_type(t.item_type)
+     if key_pproc is None:
+         if val_pproc is None:
+             return dict_to_list
+         return dict_to_list_values(val_pproc)
+     elif val_pproc is None:
+         return dict_to_list_keys(key_pproc)
+     else:
+         return dict_to_list_keys_values(key_pproc, val_pproc)
+
+
+ # parquet type safety
+
+ _pyarrow_type_to_py_type: Dict[pyarrow.DataType, Type] = {}
+ _pyarrow_type_to_py_type.update(
+     (t(), int) for t in [pyarrow.uint8, pyarrow.uint16, pyarrow.uint32, pyarrow.uint64]
+ )
+ _pyarrow_type_to_py_type.update(
+     (t(), int) for t in [pyarrow.int8, pyarrow.int16, pyarrow.int32, pyarrow.int64]
+ )
+ _pyarrow_type_to_py_type.update(
+     (t(), float) for t in [pyarrow.float16, pyarrow.float32, pyarrow.float64]
+ )
+ _pyarrow_type_to_py_type.update((t(), datetime.date) for t in [pyarrow.date32, pyarrow.date64])
+ _pyarrow_type_to_py_type[pyarrow.string()] = str
+ _pyarrow_type_to_py_type[pyarrow.bool_()] = bool
+ _pyarrow_type_to_py_type[pyarrow.null()] = type(None)
+
+
+ class TypeCheckLevel(IntEnum):
+     """Enum specifying a level of type safety when checking arrow schemas at runtime
+     same_names: only require that the expected field name set is a subset of the supplied field name set.
+         This applies recursively to record types
+     compatible: also require that all types are semantically compatible; e.g. if floats are expected but
+         ints are given, that will pass, but not vice-versa. This also includes nullability constraints:
+         if a nullable type is expected and a non-nullable version is given, that will pass, but not
+         vice-versa
+     same_kind: additionally require that types given have the same kind as those expected. E.g. an int32
+         in place of an int8 will be fine, but not a float type
+     exact: require exactly the same types and nullability constraints as expected
+     """
+
+     same_names = 0
+     compatible = 1
+     same_kind = 2
+     exact = 3
+
+
+ def type_check_pyarrow_schemas(
+     actual_schema: Union[pyarrow.Schema, pyarrow.StructType],
+     expected_schema: Union[pyarrow.Schema, pyarrow.StructType],
+     type_check_level: TypeCheckLevel,
+     columns: Optional[List[str]] = None,
+     raise_: bool = True,
+     warn_inexact: bool = True,
+ ) -> bool:
+     actual_fields = {field.name: field for field in actual_schema}
+     expected_fields = {field.name: field for field in expected_schema}
+     if columns is None:
+         columns = [field.name for field in expected_schema]
+
+     missing = set(columns).difference(actual_fields)
+     extra = set(actual_fields).difference(columns)
+     errors = []
+     logger = getLogger(__name__)
+     if extra:
+         error = f"Expected only columns {columns}, but {sorted(extra)} were also present"
+         if type_check_level >= TypeCheckLevel.exact:
+             logger.error(error)
+             errors.append(error)
+         else:
+             logger.warning(error)
+     if missing:
+         error = f"Expected columns {columns}, but {sorted(missing)} were missing"
+         logger.error(error)
+         errors.append(error)
+
+     for column in columns:
+         if column not in missing:
+             expected = expected_fields[column]
+             actual = actual_fields[column]
+
+             if (
+                 warn_inexact
+                 and (type_check_level < TypeCheckLevel.exact)
+                 and not pyarrow_field_compatible(actual, expected, TypeCheckLevel.exact)
+             ):
+                 logger.warning(
+                     f"Field {actual} didn't match expected {expected} "
+                     f"according to type check rule {TypeCheckLevel.exact.name!r}"
+                 )
+             if not pyarrow_field_compatible(actual, expected, type_check_level):
+                 error = (
+                     f"Field {actual} didn't match expected {expected} "
+                     f"according to type check rule {type_check_level.name!r}"
+                 )
+                 logger.error(error)
+                 errors.append(error)
+
+     if raise_ and errors:
+         raise TypeError("\n".join(errors))
+
+     return not bool(errors)
+
+
+ def pyarrow_field_compatible(
+     actual: pyarrow.Field, expected: pyarrow.Field, level: TypeCheckLevel
+ ) -> bool:
+     if level >= TypeCheckLevel.exact and actual.nullable != expected.nullable:
+         return False
+     elif level >= TypeCheckLevel.compatible and actual.nullable and not expected.nullable:
+         return False
+     elif level >= TypeCheckLevel.compatible and level < TypeCheckLevel.same_kind:
+         return pyarrow_type_compatible(actual.type, expected.type, level) or (
+             (actual.type == pyarrow.null()) and expected.nullable
+         )
+     else:
+         return pyarrow_type_compatible(actual.type, expected.type, level)
+
+
+ @singledispatch
+ def pyarrow_type_compatible(
+     actual: pyarrow.DataType, expected: pyarrow.DataType, level: TypeCheckLevel
+ ) -> bool:
+     if level >= TypeCheckLevel.exact:
+         return actual == expected
+     elif level >= TypeCheckLevel.same_kind:
+         return _pyarrow_type_to_py_type[actual] == _pyarrow_type_to_py_type.get(expected)
+     elif level >= TypeCheckLevel.compatible:
+         actual_kind = _pyarrow_type_to_py_type[actual]
+         expected_kind = _pyarrow_type_to_py_type.get(expected)
+         if expected_kind is int:
+             return actual_kind is int
+         elif expected_kind is float:
+             return actual_kind is int or actual_kind is float
+         else:
+             return actual_kind == expected_kind
+     return True
+
+
+ @pyarrow_type_compatible.register(pyarrow.StructType)
+ def _pyarrow_type_compatible_struct(
+     actual: pyarrow.StructType, expected: pyarrow.DataType, level: TypeCheckLevel
+ ) -> bool:
+     return isinstance(expected, pyarrow.StructType) and type_check_pyarrow_schemas(
+         actual, expected, level, raise_=False
+     )
+
+
+ @pyarrow_type_compatible.register(pyarrow.ListType)
+ def _pyarrow_type_compatible_list(
+     actual: pyarrow.ListType, expected: pyarrow.DataType, level: TypeCheckLevel
+ ) -> bool:
+     return isinstance(expected, pyarrow.ListType) and pyarrow_field_compatible(
+         actual.value_field, expected.value_field, level
+     )
+
+
+ @pyarrow_type_compatible.register(pyarrow.FixedSizeListType)
+ def _pyarrow_type_compatible_fixed_size_list(
+     actual: pyarrow.FixedSizeListType, expected: pyarrow.DataType, level: TypeCheckLevel
+ ) -> bool:
+     if level >= TypeCheckLevel.compatible:
+         if isinstance(expected, pyarrow.FixedSizeListType) and actual.list_size != expected.list_size:
+             return False
+     return isinstance(
+         expected, (pyarrow.FixedSizeListType, pyarrow.ListType)
+     ) and pyarrow_field_compatible(actual.value_field, expected.value_field, level)
+
+
+ @pyarrow_type_compatible.register(pyarrow.MapType)
+ def _pyarrow_type_compatible_map(
+     actual: pyarrow.MapType, expected: pyarrow.DataType, level: TypeCheckLevel
+ ) -> bool:
+     return (
+         isinstance(expected, pyarrow.MapType)
+         and pyarrow_field_compatible(actual.key_field, expected.key_field, level)
+         and pyarrow_field_compatible(actual.item_field, expected.item_field, level)
+     )
+
+
+ @pyarrow_type_compatible.register(pyarrow.TimestampType)
+ def _pyarrow_type_compatible_timestamp(
+     actual: pyarrow.TimestampType, expected: pyarrow.DataType, level: TypeCheckLevel
+ ):
+     if level == TypeCheckLevel.compatible:
+         expected_kind = _pyarrow_type_to_py_type.get(expected)
+         if expected_kind is datetime.date:
+             return actual.tz is None
+     elif level < TypeCheckLevel.compatible:
+         return True
+
+     if not isinstance(expected, pyarrow.TimestampType):
+         return False
+     elif level >= TypeCheckLevel.exact:
+         return actual.unit == expected.unit and actual.tz == expected.tz
+     elif level >= TypeCheckLevel.compatible:
+         units = ["s", "ms", "us", "ns"]
+         if units.index(actual.unit) < units.index(expected.unit):
+             return False
+     return actual.tz == expected.tz
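
The helpers in this module are easiest to see with small examples. First, a minimal sketch of `pandas_maybe`, which null-safes a converter before handing it to `pandas.Series.apply`; the series and the `int` converter here are hypothetical, not from the package:

import pandas as pd

from thds.tabularasa.loaders.parquet_util import pandas_maybe

# hypothetical object-dtype column containing a null entry
s = pd.Series(["1", None, "3"], dtype="object")
out = s.apply(pandas_maybe(int))  # bare int would raise a TypeError on None
assert out[0] == 1 and pd.isna(out[1]) and out[2] == 3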
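Next, a sketch (with made-up data) of the read-side path: `postprocess_parquet_dataframe` turns the (key, value) pairs that pyarrow's pandas conversion produces for a MapType column back into plain dicts, with nulls passed through for nullable fields. `pyarrow.Table.from_pydict` stands in here for an actual parquet read:

import pyarrow

from thds.tabularasa.loaders.parquet_util import postprocess_parquet_dataframe

schema = pyarrow.schema(
    [
        pyarrow.field("name", pyarrow.string(), nullable=False),
        pyarrow.field("scores", pyarrow.map_(pyarrow.string(), pyarrow.int64())),
    ]
)
table = pyarrow.Table.from_pydict(
    {"name": ["a", "b"], "scores": [[("x", 1), ("y", 2)], None]}, schema=schema
)
df = table.to_pandas()  # map cells arrive as sequences of (key, value) pairs
df = postprocess_parquet_dataframe(df, table.schema)
assert df["scores"][0] == {"x": 1, "y": 2}  # now a plain dict
assert df["scores"][1] is None  # nullable field: null cells survive the apply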
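Going the other way, a sketch of the write-side helpers: for a MapType, `preprocessor_for_pyarrow_type` hands back `dict_to_list`, normalizing dict cells into the (key, value)-tuple form that map columns take on the arrow side. The dataframe is again hypothetical:

import pandas as pd
import pyarrow

from thds.tabularasa.loaders.parquet_util import preprocessor_for_pyarrow_type

map_type = pyarrow.map_(pyarrow.string(), pyarrow.int64())
pproc = preprocessor_for_pyarrow_type(map_type)  # dict_to_list for this type
df = pd.DataFrame({"scores": [{"x": 1, "y": 2}]})
df["scores"] = df["scores"].apply(pproc)
assert df["scores"][0] == [("x", 1), ("y", 2)]
# the normalized column can now be converted under the map schema
table = pyarrow.Table.from_pandas(
    df, schema=pyarrow.schema([pyarrow.field("scores", map_type)])
)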
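Finally, a sketch of the runtime type-safety levels, checking a hypothetical actual schema against an expected one: int32 data satisfies an int64 expectation at `same_kind` (both map to the integer kind), but not at `exact`:

import pyarrow

from thds.tabularasa.loaders.parquet_util import TypeCheckLevel, type_check_pyarrow_schemas

expected = pyarrow.schema([pyarrow.field("n", pyarrow.int64())])
actual = pyarrow.schema([pyarrow.field("n", pyarrow.int32())])

# raise_=False returns a bool instead of raising TypeError on mismatch
assert type_check_pyarrow_schemas(actual, expected, TypeCheckLevel.same_kind, raise_=False)
assert not type_check_pyarrow_schemas(actual, expected, TypeCheckLevel.exact, raise_=False)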