thds.tabularasa 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/loaders/util.py
ADDED
@@ -0,0 +1,532 @@
import os
from logging import getLogger
from pathlib import Path
from typing import (
    IO,
    Callable,
    Dict,
    Generic,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
    Union,
)

import attr
import numpy as np
import pandas as pd
import pandas.core.dtypes.base as pd_dtypes
import pandera as pa
import pkg_resources
import pyarrow
import pyarrow.parquet as pq

from thds.tabularasa.data_dependencies.adls import sync_adls_data
from thds.tabularasa.data_dependencies.util import check_categorical_values, hash_file
from thds.tabularasa.schema.dtypes import PyType
from thds.tabularasa.schema.metaschema import RemoteBlobStoreSpec, Table
from thds.tabularasa.schema.util import snake_case

from .parquet_util import (
    TypeCheckLevel,
    list_map,
    postprocess_parquet_dataframe,
    postprocessor_for_pyarrow_type,
    type_check_pyarrow_schemas,
)

# sqlite_constructor_for_record_type and AttrsSQLiteDatabase are not
# used here but they're imported for backward compatibility with
# existing generated code, which expects it to be importable from
# here. They were moved to sqlite_util to reduce the size of this file.
from .sqlite_util import AttrsSQLiteDatabase  # noqa: F401
from .sqlite_util import sqlite_constructor_for_record_type  # noqa: F401

T = TypeVar("T")
K = TypeVar("K", bound=PyType)
V = TypeVar("V", bound=PyType)
Record = TypeVar("Record", bound=attr.AttrsInstance)

PARQUET_EXT = ".parquet"
PQ_BATCH_SIZE_ATTRS = 100
PQ_BATCH_SIZE_PANDAS = 2**16


def identity(x):
    return x


def maybe(f: Callable[[T], V]) -> Callable[[Optional[T]], Optional[V]]:
    def f_(x):
        return None if x is None else f(x)

    return f_


def default_parquet_package_data_path(
    table_name: str, data_dir: str, as_package_data: bool = True
) -> str:
    """Standardized path to a parquet file resource for a packaged table inside a
    shared package subdirectory.
    When `as_package_data == True`, return a *package data* (not OS) path to a resource; otherwise return
    a regular OS-compatible file path."""
    return package_data_path(
        f"{snake_case(table_name)}{PARQUET_EXT}", data_dir, as_package_data=as_package_data
    )


def package_data_path(filename: str, data_dir: str, as_package_data: bool = True) -> str:
    """Standardized path to a file resource inside a shared package subdirectory.
    When `as_package_data == True`, return a *package data* (not OS) path to a resource; otherwise return
    a regular OS-compatible file path.
    see https://setuptools.pypa.io/en/latest/pkg_resources.html#basic-resource-access"""
    return (
        f"{data_dir.rstrip('/')}/{filename}"
        if as_package_data
        else str(Path(data_dir.replace("/", os.sep)) / filename)
    )


def unique_across_columns(df: pd.DataFrame, colnames: Sequence[str]) -> pd.Series:
    index_cols = [c for c in colnames if c in df.index.names]
    cols = [c for c in colnames if c in df.columns]
    if not index_cols:
        check_cols = df[cols]
    elif not cols and set(index_cols) == set(df.index.names):
        # optimization - don't duplicate the index if we don't have to
        check_cols = df.index  # type: ignore
    else:
        check_cols = pd.DataFrame(
            {
                **{c: df.index.get_level_values(c).values for c in index_cols},
                **{c: df[c].values for c in cols},
            }
        )

    duped = check_cols.duplicated(keep=False)
    if isinstance(duped, pd.Series):
        # if check_cols was a frame vs an index
        duped = duped.values  # type: ignore

    return pd.Series(~duped, index=df.index)


def _register_unique_across_columns() -> None:
    # make sure the registration runs once
    # forced re-importing with `mops.testing.deferred_imports.assert_dev_deps_not_imported` raises an error
    if hasattr(pa.Check, "unique_across_columns"):
        return None

    pa.extensions.register_check_method(statistics=["colnames"], supported_types=pd.DataFrame)(
        unique_across_columns
    )


_register_unique_across_columns()


class _PackageDataOrFileInterface:
    package: Optional[str]
    data_path: str
    md5: Optional[str] = None
    blob_store: Optional[RemoteBlobStoreSpec] = None

    def set_path(self, table_name: str, data_dir: Union[str, Path], filename: Optional[str]):
        # package data or local file
        as_package_data = self.package is not None
        if filename is None:
            self.data_path = default_parquet_package_data_path(
                table_name, str(data_dir), as_package_data=as_package_data
            )
        else:
            self.data_path = package_data_path(filename, str(data_dir), as_package_data=as_package_data)
            assert os.path.exists(self.data_path)
        if self.blob_store is not None and self.md5 is None:
            raise ValueError(
                f"No md5 defined for remote file in blob store {self.blob_store} for table with local path "
                f"{self.data_path}"
            )

    def file_exists(self) -> bool:
        if self.package is None:
            return os.path.exists(self.data_path)
        else:
            return pkg_resources.resource_exists(self.package, self.data_path)

    def _resource_stream(self, sync: bool = True) -> IO[bytes]:
        if sync:
            maybe_path = self.sync_blob()
            if maybe_path:
                return open(maybe_path, "rb")

        if self.package is None:
            return open(self.data_path, "rb")
        else:
            return pkg_resources.resource_stream(self.package, self.data_path)

    def file_path(self, sync: bool = True) -> Path:
        """Path on the local filesystem to the file underlying this loader. If a blob store is specified
        and the local path doesn't exist, it will be synced, unless `sync=False` is passed."""
        if sync:
            maybe_path = self.sync_blob()
            if maybe_path:
                return maybe_path

        if self.package is None:
            return Path(self.data_path)
        else:
            return Path(pkg_resources.resource_filename(self.package, self.data_path))

    def sync_blob(self, link: bool = False) -> Optional[Path]:
        """Ensure that the local file underlying this loader is available.
        If the file does not exist, sync it from the blob store, or raise `FileNotFoundError` when no
        blob store is defined. Returns a local path to the cached download if a sync was performed,
        otherwise returns `None`.
        When `link` is True and a download is performed, the resulting file is linked to the local file
        associated with this resource.
        """
        if not self.file_exists():
            if self.blob_store is None:
                raise FileNotFoundError(
                    "Local or package data file doesn't exist and no remote blob is defined for table "
                    f"with local path {self.data_path}"
                )
            else:
                assert (
                    self.md5 is not None
                ), f"No md5 defined for {self.data_path}; can't safely sync blob"
                target_local_path = self.file_path(sync=False)
                getLogger(__name__).info(
                    f"Syncing blob with hash {self.md5}" f" to {target_local_path}" if link else ""
                )
                remote_data_spec = self.blob_store.data_spec(self.md5)
                local_files = sync_adls_data(remote_data_spec)
                local_path = local_files[0].local_path
                if link:
                    os.link(local_path, target_local_path)
                return local_path
        else:
            return None

    def file_hash(self) -> str:
        with self._resource_stream() as f:
            return hash_file(f)


class _ParquetPackageDataOrFileInterface(_PackageDataOrFileInterface):
    def metadata(self) -> pq.FileMetaData:
        with self._resource_stream() as f:
            return pq.ParquetFile(f).metadata

    def num_rows(self) -> int:
        return self.metadata().num_rows


class AttrsParquetLoader(Generic[Record], _ParquetPackageDataOrFileInterface):
    """Base interface for loading package resources as record iterators"""

    def __init__(
        self,
        table_name: str,
        type_: Type[Record],
        *,
        package: Optional[str],
        data_dir: Union[str, Path],
        filename: Optional[str] = None,
        pyarrow_schema: Optional[pyarrow.Schema] = None,
        md5: Optional[str] = None,
        blob_store: Optional[RemoteBlobStoreSpec] = None,
    ):
        self.type_ = type_
        self.table_name = table_name
        self.package = package
        self.pyarrow_schema = pyarrow_schema
        self.md5 = md5
        self.blob_store = blob_store
        self.set_path(table_name=table_name, data_dir=data_dir, filename=filename)

    def __call__(
        self, path: Optional[Path] = None, type_check: Optional[Union[int, TypeCheckLevel]] = None
    ) -> Iterator[Record]:
        """Load an iterator of instances of the attrs record type `self.type_` from a package data
        parquet file.

        :param path: Optional path to a local parquet file. Overrides the underlying package data file
            when passed.
        :param type_check: Optional `reference_data.loaders.parquet_util.TypeCheckLevel` indicating that
            a type check should be performed on the arrow schema of the parquet file _before_ reading any
            data, and at what level of strictness.
        :return: an iterator of instances of `self.type_`, an attrs class matching the schema of the
            parquet file being read.
        """
        if type_check is not None and self.pyarrow_schema is None:
            raise ValueError(f"Can't type check table {self.table_name} with no pyarrow schema")

        with self._resource_stream() if path is None else open(path, "rb") as f:
            col_order = [col.name for col in attr.fields(self.type_)]
            parquet_file = pq.ParquetFile(f)
            schema = parquet_file.schema.to_arrow_schema()
            if type_check is not None:
                assert self.pyarrow_schema is not None  # make mypy happy; this condition is checked
                type_check_pyarrow_schemas(
                    schema, self.pyarrow_schema, TypeCheckLevel(type_check), col_order
                )
            # this is to re-order columns *just in case* they're in a different order in the parquet file
            ixs_postprocessors: List[Tuple[int, Callable]] = []
            for name in col_order:
                i = schema.names.index(name)
                field = schema.field(i)
                pproc = postprocessor_for_pyarrow_type(field.type)
                if pproc is not None:
                    if field.nullable:
                        pproc = maybe(pproc)
                    pproc = list_map(pproc)
                else:
                    pproc = identity
                ixs_postprocessors.append((i, pproc))
            for batch in parquet_file.iter_batches(batch_size=PQ_BATCH_SIZE_ATTRS):
                columns = [pproc(batch.columns[i].to_pylist()) for i, pproc in ixs_postprocessors]
                parsed_rows = map(self.type_, *columns)
                yield from parsed_rows


class PandasParquetLoader(_ParquetPackageDataOrFileInterface):
    def __init__(
        self,
        table_name: str,
        *,
        package: Optional[str],
        data_dir: Union[str, Path],
        filename: Optional[str] = None,
        md5: Optional[str] = None,
        blob_store: Optional[RemoteBlobStoreSpec] = None,
        columns: Optional[List[str]] = None,
        schema: Optional[pa.DataFrameSchema] = None,
        pyarrow_schema: Optional[pyarrow.Schema] = None,
        index_columns: Optional[List[str]] = None,
        casts: Optional[Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]]] = None,
    ):
        self.table_name = table_name
        self.schema = schema
        self.pyarrow_schema = pyarrow_schema
        self.columns = columns
        self.index_columns = index_columns
        self.casts = casts
        self.package = package
        self.md5 = md5
        self.blob_store = blob_store
        self.set_path(table_name=table_name, data_dir=data_dir, filename=filename)

    def __call__(
        self,
        path: Optional[Path] = None,
        validate: bool = False,
        type_check: Optional[Union[int, TypeCheckLevel]] = None,
        postprocess: bool = True,
        cast: bool = False,
    ) -> pd.DataFrame:
        """Load a `pandas.DataFrame` from a package data parquet file

        See the `load_batched` method for documentation of the parameters"""
        return next(
            self.load_batched(
                path,
                validate=validate,
                type_check=type_check,
                postprocess=postprocess,
                cast=cast,
                batch_size=None,
            )
        )

    def load_batched(
        self,
        path: Optional[Path] = None,
        batch_size: Optional[int] = PQ_BATCH_SIZE_PANDAS,
        validate: bool = False,
        type_check: Optional[Union[int, TypeCheckLevel]] = None,
        postprocess: bool = True,
        cast: bool = False,
    ) -> Iterator[pd.DataFrame]:
        """Load an iterator of `pandas.DataFrame`s from a package data parquet file in a memory-efficient way.

        :param path: Optional path to a local parquet file. Overrides the underlying package data file
            when passed.
        :param batch_size: Read the data in batches of this many rows. Every DataFrame yielded except
            possibly the last will have this many rows. Allows for control of memory usage. If `None`,
            the entire table will be read and yielded as a single DataFrame.
        :param validate: validate against the associated `pandera` schema?
        :param postprocess: apply postprocessors to complex types? E.g., `pyarrow` returns lists of
            tuples for mapping types; this will cast those to dicts
        :param type_check: Optional `reference_data.loaders.parquet_util.TypeCheckLevel` indicating that
            a type check should be performed on the arrow schema of the parquet file _before_ reading any
            data, and at what level of strictness.
        :param cast: Indicates whether to attempt a pyarrow table cast on read. When `False`, no cast
            will ever be performed. When `True`, the behavior depends on the value of `type_check`:
            when `type_check` is supplied, in case of a type check failure, attempt to cast the
            arrow table to the arrow schema for this table. When `type_check` is `None`, always cast the
            arrow table to the arrow schema for this table.
        :return: iterator of the loaded and possibly postprocessed and validated DataFrames
        """
        if validate and self.schema is None:
            raise ValueError(f"Can't validate table {self.table_name} with no pandera schema")

        if type_check is not None and self.pyarrow_schema is None:
            raise ValueError(f"Can't type check table {self.table_name} with no pyarrow schema")

        if cast and self.pyarrow_schema is None:
            raise ValueError(f"Can't cast table {self.table_name} with no pyarrow schema")

        logger = getLogger(__name__)

        with self._resource_stream() if path is None else open(path, "rb") as f:
            pq_file = pyarrow.parquet.ParquetFile(f)
            schema = pq_file.schema.to_arrow_schema()
            if type_check is not None:
                assert self.pyarrow_schema is not None  # make mypy happy; this condition is checked
                try:
                    logger.info(f"Type-checking parquet file for table {self.table_name}")
                    type_check_pyarrow_schemas(
                        schema, self.pyarrow_schema, TypeCheckLevel(type_check), self.columns
                    )
                except TypeError as e:
                    if cast:
                        logger.warning(
                            f"Type-checking failed at level '{type_check.name}'; "  # type: ignore
                            "a type cast will be attempted on read"
                        )
                    else:
                        raise e
            else:
                cast = False

            logger.info(f"Loading arrow data for table {self.table_name} from parquet")
            if batch_size is None:
                table = pq_file.read(self.columns)
                batches: Iterable[pyarrow.Table] = [table]
            else:
                batches = (
                    pyarrow.Table.from_batches([b])
                    for b in pq_file.iter_batches(batch_size, columns=self.columns)
                )

            categorical_dtypes = (
                [
                    (name, dtype)
                    for name, dtype in self.casts.items()
                    if isinstance(dtype, pd.CategoricalDtype)
                ]
                if self.casts
                else []
            )

            for table in batches:
                if cast:
                    assert self.pyarrow_schema is not None  # make mypy happy; this condition is checked
                    table = table.cast(self.pyarrow_schema)

                # ignore_metadata is only here because table.to_pandas has a bug wherein some dtypes get
                # changed for index columns (which are specified in the metadata under a 'pandas' key).
                # We handle setting the index ourselves below.
                # Likewise, we omit the `categories` arg here, as pyarrow cannot in general recover the
                # order of the original categories from the parquet file - those are handled using
                # `self.casts` below.
                df = table.to_pandas(date_as_object=False, ignore_metadata=True)

                if self.casts:
                    for name, dtype in categorical_dtypes:
                        check_categorical_values(df[name], dtype)

                    df = df.astype(self.casts, copy=False)

                if postprocess:
                    df = postprocess_parquet_dataframe(df, schema)

                if self.index_columns is not None:
                    df.set_index(self.index_columns, inplace=True)

                if validate:
                    assert self.schema is not None  # make mypy happy; this condition is checked above
                    df = self.schema.validate(df)

                yield df

    @classmethod
    def from_pandera_schema(
        cls,
        table_name: str,
        schema: pa.DataFrameSchema,
        package: str,
        data_dir: str,
        *,
        blob_store: Optional[RemoteBlobStoreSpec] = None,
        md5: Optional[str] = None,
        filename: Optional[str] = None,
        pyarrow_schema: Optional[pyarrow.Schema] = None,
    ) -> "PandasParquetLoader":
        casts: Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]] = {}
        index_columns: Optional[List[str]]
        all_cols = list(schema.columns.items())
        if isinstance(schema.index, pa.MultiIndex):
            all_cols.extend(schema.index.columns.items())
            index_columns = list(schema.index.names)
        elif isinstance(schema.index, pa.Index):
            all_cols.append((schema.index.name, schema.index))
            index_columns = list(schema.index.names)
        else:
            index_columns = None

        for name, col in all_cols:
            assert isinstance(name, str)
            dtype = col.dtype.type if isinstance(col.dtype, pa.DataType) else col.dtype

            if isinstance(dtype, pd_dtypes.ExtensionDtype):
                casts[name] = dtype
            elif isinstance(dtype, pa.dtypes.Int) and (  # type: ignore
                (not dtype.signed) or dtype.bit_width not in (32, 64)
            ):
                typename = f"{'' if dtype.signed else 'u'}int{dtype.bit_width}"
                casts[name] = np.dtype(typename)
            elif np.issubdtype(dtype, np.datetime64):
                casts[name] = np.dtype("datetime64[ns]")

        return cls(
            table_name,
            schema=schema,
            pyarrow_schema=pyarrow_schema,
            columns=[name for name, _col in all_cols],
            index_columns=index_columns,
            casts=casts,
            package=package,
            data_dir=data_dir,
            filename=filename,
            blob_store=blob_store,
            md5=md5,
        )

    @classmethod
    def from_schema_table(
        cls,
        table: Table,
        package: Optional[str],
        data_dir: Union[str, Path],
        filename: Optional[str] = None,
        derive_schema: bool = False,
    ) -> "PandasParquetLoader":
        return cls(
            table.name,
            schema=table.pandera_schema if derive_schema else None,
            pyarrow_schema=table.parquet_schema,
            columns=[t.name for t in table.columns],
            index_columns=table.primary_key if table.primary_key is None else list(table.primary_key),
            casts=table.parquet_casts,
            package=package,
            data_dir=data_dir,
            filename=filename,
            md5=table.md5,
        )
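The two public classes above are the runtime entry points for packaged tables. For orientation, here is a minimal usage sketch; the package name, data directory, table name, and attrs record class are hypothetical placeholders, since in practice tabularasa's generated code constructs these loaders with the correct arguments:

# Hypothetical sketch: "my_pkg", "data", "demo_table", and DemoRecord are
# placeholders, not names shipped with this package.
import attr

from thds.tabularasa.loaders.util import AttrsParquetLoader, PandasParquetLoader


@attr.s(auto_attribs=True, frozen=True)
class DemoRecord:
    id: int
    name: str


# DataFrame interface: resolves data/demo_table.parquet inside my_pkg's package data.
df_loader = PandasParquetLoader("demo_table", package="my_pkg", data_dir="data")
df = df_loader()  # the whole table as a single DataFrame
for chunk in df_loader.load_batched(batch_size=10_000):  # or stream it in batches
    ...  # each chunk is a DataFrame of at most 10_000 rows

# Record interface: yields one DemoRecord per row, in attrs field order.
record_loader = AttrsParquetLoader("demo_table", DemoRecord, package="my_pkg", data_dir="data")
for record in record_loader():
    ...

Both loaders share the blob-store behavior of `_PackageDataOrFileInterface`, so passing `md5=` and `blob_store=` lets the parquet file be synced on first use rather than shipped as package data.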
thds/tabularasa/py.typed
ADDED
File without changes
thds/tabularasa/schema/__init__.py
ADDED
@@ -0,0 +1,7 @@
__all__ = ["constraints", "dtypes", "files", "metaschema", "util", "validation", "load_schema"]

from . import constraints, dtypes, files, metaschema, util, validation
from .dtypes import DType  # noqa: F401
from .files import ADLSDataSpec, LocalDataSpec, TabularFileSource, VersionControlledPath  # noqa: F401
from .metaschema import Column, CustomType, RawDataDependencies, Schema, Table  # noqa: F401
from .validation import load_schema
thds/tabularasa/schema/compilation/__init__.py
ADDED
@@ -0,0 +1,20 @@
__all__ = [
    "render_attrs_loaders",
    "render_attrs_module",
    "render_attrs_sqlite_schema",
    "render_pandera_module",
    "render_pandera_loaders",
    "render_pyarrow_schema",
    "render_sphinx_docs",
    "render_sql_schema",
    "write_if_ast_changed",
    "write_sql",
]

from .attrs import render_attrs_loaders, render_attrs_module
from .attrs_sqlite import render_attrs_sqlite_schema
from .io import write_if_ast_changed, write_sql
from .pandas import render_pandera_loaders, render_pandera_module
from .pyarrow import render_pyarrow_schema
from .sphinx import render_sphinx_docs
from .sqlite import render_sql_schema
thds/tabularasa/schema/compilation/_format.py
ADDED
@@ -0,0 +1,50 @@
import logging
import os.path
import tempfile
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple
from warnings import warn


@lru_cache
def __autoformat_imports() -> Tuple[Optional[Any], Optional[Callable[[List[str]], int]]]:
    try:
        import black
    except ImportError:
        warn(
            "`black` is unavailable; generated python code will not be auto-formatted. "
            "Specify the 'cli' extra to ensure this dependency is present."
        )
        black = None  # type: ignore
    try:
        from isort.main import main as isort_main  # type: ignore
    except ImportError:
        warn(
            "`isort` is unavailable; imports in generated python code will not be automatically sorted. "
            "Specify the 'cli' extra to ensure this dependency is present."
        )
        isort_main = None  # type: ignore
    return black, isort_main  # type: ignore


def autoformat(py_code: str) -> str:
    _LOGGER = logging.getLogger(__name__)
    try:
        black, isort_main = __autoformat_imports()
        if black is not None:
            _LOGGER.info("Applying `black` formatting to auto-generated code")
            py_code = black.format_str(py_code, mode=black.FileMode())
        if isort_main is not None:
            _LOGGER.info("Applying `isort` formatting to auto-generated code")
            with tempfile.TemporaryDirectory() as d:
                outfile = os.path.join(d, "tmp.py")
                with open(outfile, "w") as f:
                    f.write(py_code)
                isort_main([outfile, "--profile", "black"])
                with open(outfile, "r") as f_:
                    py_code = f_.read()
        return py_code
    except Exception as ex:
        print(f"{repr(ex)} when attempting to format code:")
        print(py_code)
        raise
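As a rough illustration of `autoformat`, the sketch below feeds it an invented, deliberately messy snippet; with the 'cli' extra installed, `black` and `isort --profile black` normalize it, otherwise the warnings above fire and the input is returned unchanged. The import path simply mirrors the private module shown in this diff.

# Illustrative only; the unformatted snippet is made up for this example.
from thds.tabularasa.schema.compilation._format import autoformat

ugly = "import sys\nimport os\n\n\ndef f( x ):\n    return {'a':1,'b':2}\n"
print(autoformat(ugly))
# With black and isort available, this prints the snippet with imports sorted
# and black-style spacing applied; without them, the original string comes
# back and a warning points at the 'cli' extra.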