thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,532 @@
+ import os
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     IO,
+     Callable,
+     Dict,
+     Generic,
+     Iterable,
+     Iterator,
+     List,
+     Optional,
+     Sequence,
+     Tuple,
+     Type,
+     TypeVar,
+     Union,
+ )
+
+ import attr
+ import numpy as np
+ import pandas as pd
+ import pandas.core.dtypes.base as pd_dtypes
+ import pandera as pa
+ import pkg_resources
+ import pyarrow
+ import pyarrow.parquet as pq
+
+ from thds.tabularasa.data_dependencies.adls import sync_adls_data
+ from thds.tabularasa.data_dependencies.util import check_categorical_values, hash_file
+ from thds.tabularasa.schema.dtypes import PyType
+ from thds.tabularasa.schema.metaschema import RemoteBlobStoreSpec, Table
+ from thds.tabularasa.schema.util import snake_case
+
+ from .parquet_util import (
+     TypeCheckLevel,
+     list_map,
+     postprocess_parquet_dataframe,
+     postprocessor_for_pyarrow_type,
+     type_check_pyarrow_schemas,
+ )
+
+ # sqlite_constructor_for_record_type and AttrsSQLiteDatabase are not
+ # used here, but they are imported for backward compatibility with
+ # existing generated code, which expects them to be importable from
+ # here. They were moved to sqlite_util to reduce the size of this file.
+ from .sqlite_util import AttrsSQLiteDatabase  # noqa: F401
+ from .sqlite_util import sqlite_constructor_for_record_type  # noqa: F401
+
+ T = TypeVar("T")
+ K = TypeVar("K", bound=PyType)
+ V = TypeVar("V", bound=PyType)
+ Record = TypeVar("Record", bound=attr.AttrsInstance)
+
+ PARQUET_EXT = ".parquet"
+ PQ_BATCH_SIZE_ATTRS = 100
+ PQ_BATCH_SIZE_PANDAS = 2**16
+
+
+ def identity(x):
+     return x
+
+
+ def maybe(f: Callable[[T], V]) -> Callable[[Optional[T]], Optional[V]]:
+     def f_(x):
+         return None if x is None else f(x)
+
+     return f_
+
+
+ def default_parquet_package_data_path(
+     table_name: str, data_dir: str, as_package_data: bool = True
+ ) -> str:
+     """Standardized path to a parquet file resource for a packaged table inside a
+     shared package subdirectory.
+     When `as_package_data == True`, return a *package data* (not OS) path to a resource; otherwise return
+     a regular OS-compatible file path."""
+     return package_data_path(
+         f"{snake_case(table_name)}{PARQUET_EXT}", data_dir, as_package_data=as_package_data
+     )
+
+
+ def package_data_path(filename: str, data_dir: str, as_package_data: bool = True) -> str:
+     """Standardized path to a file resource inside a shared package subdirectory.
+     When `as_package_data == True`, return a *package data* (not OS) path to a resource; otherwise return
+     a regular OS-compatible file path.
+     See https://setuptools.pypa.io/en/latest/pkg_resources.html#basic-resource-access"""
+     return (
+         f"{data_dir.rstrip('/')}/{filename}"
+         if as_package_data
+         else str(Path(data_dir.replace("/", os.sep)) / filename)
+     )
+
+
+ def unique_across_columns(df: pd.DataFrame, colnames: Sequence[str]) -> pd.Series:
+     index_cols = [c for c in colnames if c in df.index.names]
+     cols = [c for c in colnames if c in df.columns]
+     if not index_cols:
+         check_cols = df[cols]
+     elif not cols and set(index_cols) == set(df.index.names):
+         # optimization - don't duplicate the index if we don't have to
+         check_cols = df.index  # type: ignore
+     else:
+         check_cols = pd.DataFrame(
+             {
+                 **{c: df.index.get_level_values(c).values for c in index_cols},
+                 **{c: df[c].values for c in cols},
+             }
+         )
+
+     duped = check_cols.duplicated(keep=False)
+     if isinstance(duped, pd.Series):
+         # if check_cols was a frame vs an index
+         duped = duped.values  # type: ignore
+
+     return pd.Series(~duped, index=df.index)
+
+
+ def _register_unique_across_columns() -> None:
+     # make sure the registration runs once
+     # forced re-importing with `mops.testing.deferred_imports.assert_dev_deps_not_imported` raises an error
+     if hasattr(pa.Check, "unique_across_columns"):
+         return None
+
+     pa.extensions.register_check_method(statistics=["colnames"], supported_types=pd.DataFrame)(
+         unique_across_columns
+     )
+
+
+ _register_unique_across_columns()
+
+
+ class _PackageDataOrFileInterface:
+     package: Optional[str]
+     data_path: str
+     md5: Optional[str] = None
+     blob_store: Optional[RemoteBlobStoreSpec] = None
+
+     def set_path(self, table_name: str, data_dir: Union[str, Path], filename: Optional[str]):
+         # package data or local file
+         as_package_data = self.package is not None
+         if filename is None:
+             self.data_path = default_parquet_package_data_path(
+                 table_name, str(data_dir), as_package_data=as_package_data
+             )
+         else:
+             self.data_path = package_data_path(filename, str(data_dir), as_package_data=as_package_data)
+             assert os.path.exists(self.data_path)
+         if self.blob_store is not None and self.md5 is None:
+             raise ValueError(
+                 f"No md5 defined for remote file in blob store {self.blob_store} for table with local path "
+                 f"{self.data_path}"
+             )
+
+     def file_exists(self) -> bool:
+         if self.package is None:
+             return os.path.exists(self.data_path)
+         else:
+             return pkg_resources.resource_exists(self.package, self.data_path)
+
+     def _resource_stream(self, sync: bool = True) -> IO[bytes]:
+         if sync:
+             maybe_path = self.sync_blob()
+             if maybe_path:
+                 return open(maybe_path, "rb")
+
+         if self.package is None:
+             return open(self.data_path, "rb")
+         else:
+             return pkg_resources.resource_stream(self.package, self.data_path)
+
+     def file_path(self, sync: bool = True) -> Path:
+         """Path on the local filesystem to the file underlying this loader. If a blob store is specified
+         and the local path doesn't exist, it will be synced, unless `sync=False` is passed."""
+         if sync:
+             maybe_path = self.sync_blob()
+             if maybe_path:
+                 return maybe_path
+
+         if self.package is None:
+             return Path(self.data_path)
+         else:
+             return Path(pkg_resources.resource_filename(self.package, self.data_path))
+
+     def sync_blob(self, link: bool = False) -> Optional[Path]:
+         """Ensure that the local file underlying this loader is available.
+         If the file does not exist, sync it from the blob store, or raise `FileNotFoundError` when no
+         blob store is defined. Returns a local path to the cached download if a sync was performed,
+         otherwise returns `None`.
+         When `link` is True and a download is performed, the resulting file is linked to the local file
+         associated with this resource.
+         """
+         if not self.file_exists():
+             if self.blob_store is None:
+                 raise FileNotFoundError(
+                     "Local or package data file doesn't exist and no remote blob is defined for table "
+                     f"with local path {self.data_path}"
+                 )
+             else:
+                 assert (
+                     self.md5 is not None
+                 ), f"No md5 defined for {self.data_path}; can't safely sync blob"
+                 target_local_path = self.file_path(sync=False)
+                 getLogger(__name__).info(
+                     f"Syncing blob with hash {self.md5}" + (f" to {target_local_path}" if link else "")
+                 )
+                 remote_data_spec = self.blob_store.data_spec(self.md5)
+                 local_files = sync_adls_data(remote_data_spec)
+                 local_path = local_files[0].local_path
+                 if link:
+                     os.link(local_path, target_local_path)
+                 return local_path
+         else:
+             return None
+
+     def file_hash(self) -> str:
+         with self._resource_stream() as f:
+             return hash_file(f)
+
+
+ class _ParquetPackageDataOrFileInterface(_PackageDataOrFileInterface):
+     def metadata(self) -> pq.FileMetaData:
+         with self._resource_stream() as f:
+             return pq.ParquetFile(f).metadata
+
+     def num_rows(self) -> int:
+         return self.metadata().num_rows
+
+
+ class AttrsParquetLoader(Generic[Record], _ParquetPackageDataOrFileInterface):
+     """Base interface for loading package resources as record iterators"""
+
+     def __init__(
+         self,
+         table_name: str,
+         type_: Type[Record],
+         *,
+         package: Optional[str],
+         data_dir: Union[str, Path],
+         filename: Optional[str] = None,
+         pyarrow_schema: Optional[pyarrow.Schema] = None,
+         md5: Optional[str] = None,
+         blob_store: Optional[RemoteBlobStoreSpec] = None,
+     ):
+         self.type_ = type_
+         self.table_name = table_name
+         self.package = package
+         self.pyarrow_schema = pyarrow_schema
+         self.md5 = md5
+         self.blob_store = blob_store
+         self.set_path(table_name=table_name, data_dir=data_dir, filename=filename)
+
+     def __call__(
+         self, path: Optional[Path] = None, type_check: Optional[Union[int, TypeCheckLevel]] = None
+     ) -> Iterator[Record]:
+         """Load an iterator of instances of the attrs record type `self.type_` from a package data
+         parquet file.
+
+         :param path: Optional path to a local parquet file. Overrides the underlying package data file
+             when passed.
+         :param type_check: Optional `reference_data.loaders.parquet_util.TypeCheckLevel` indicating that
+             a type check should be performed on the arrow schema of the parquet file _before_ reading any
+             data, and at what level of strictness.
+         :return: an iterator of instances of `self.type_`, an attrs class matching the schema of the
+             parquet file being read.
+         """
+         if type_check is not None and self.pyarrow_schema is None:
+             raise ValueError(f"Can't type check table {self.table_name} with no pyarrow schema")
+
+         with self._resource_stream() if path is None else open(path, "rb") as f:
+             col_order = [col.name for col in attr.fields(self.type_)]
+             parquet_file = pq.ParquetFile(f)
+             schema = parquet_file.schema.to_arrow_schema()
+             if type_check is not None:
+                 assert self.pyarrow_schema is not None  # make mypy happy; this condition is checked
+                 type_check_pyarrow_schemas(
+                     schema, self.pyarrow_schema, TypeCheckLevel(type_check), col_order
+                 )
+             # this is to re-order columns *just in case* they're in a different order in the parquet file
+             ixs_postprocessors: List[Tuple[int, Callable]] = []
+             for name in col_order:
+                 i = schema.names.index(name)
+                 field = schema.field(i)
+                 pproc = postprocessor_for_pyarrow_type(field.type)
+                 if pproc is not None:
+                     if field.nullable:
+                         pproc = maybe(pproc)
+                     pproc = list_map(pproc)
+                 else:
+                     pproc = identity
+                 ixs_postprocessors.append((i, pproc))
+             for batch in parquet_file.iter_batches(batch_size=PQ_BATCH_SIZE_ATTRS):
+                 columns = [pproc(batch.columns[i].to_pylist()) for i, pproc in ixs_postprocessors]
+                 parsed_rows = map(self.type_, *columns)
+                 yield from parsed_rows
+
+
+ class PandasParquetLoader(_ParquetPackageDataOrFileInterface):
+     def __init__(
+         self,
+         table_name: str,
+         *,
+         package: Optional[str],
+         data_dir: Union[str, Path],
+         filename: Optional[str] = None,
+         md5: Optional[str] = None,
+         blob_store: Optional[RemoteBlobStoreSpec] = None,
+         columns: Optional[List[str]] = None,
+         schema: Optional[pa.DataFrameSchema] = None,
+         pyarrow_schema: Optional[pyarrow.Schema] = None,
+         index_columns: Optional[List[str]] = None,
+         casts: Optional[Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]]] = None,
+     ):
+         self.table_name = table_name
+         self.schema = schema
+         self.pyarrow_schema = pyarrow_schema
+         self.columns = columns
+         self.index_columns = index_columns
+         self.casts = casts
+         self.package = package
+         self.md5 = md5
+         self.blob_store = blob_store
+         self.set_path(table_name=table_name, data_dir=data_dir, filename=filename)
+
+     def __call__(
+         self,
+         path: Optional[Path] = None,
+         validate: bool = False,
+         type_check: Optional[Union[int, TypeCheckLevel]] = None,
+         postprocess: bool = True,
+         cast: bool = False,
+     ) -> pd.DataFrame:
+         """Load a `pandas.DataFrame` from a package data parquet file.
+
+         See the `load_batched` method for documentation of the parameters."""
+         return next(
+             self.load_batched(
+                 path,
+                 validate=validate,
+                 type_check=type_check,
+                 postprocess=postprocess,
+                 cast=cast,
+                 batch_size=None,
+             )
+         )
+
+     def load_batched(
+         self,
+         path: Optional[Path] = None,
+         batch_size: Optional[int] = PQ_BATCH_SIZE_PANDAS,
+         validate: bool = False,
+         type_check: Optional[Union[int, TypeCheckLevel]] = None,
+         postprocess: bool = True,
+         cast: bool = False,
+     ) -> Iterator[pd.DataFrame]:
+         """Load an iterator of `pandas.DataFrame`s from a package data parquet file in a memory-efficient way.
+
+         :param path: Optional path to a local parquet file. Overrides the underlying package data file
+             when passed.
+         :param batch_size: Read the data in batches of this many rows. Every DataFrame yielded except
+             possibly the last will have this many rows. Allows for control of memory usage. If `None`,
+             the entire table will be read and yielded as a single DataFrame.
+         :param validate: validate against the associated `pandera` schema?
+         :param postprocess: apply postprocessors to complex types? E.g., `pyarrow` returns lists of
+             tuples for mapping types; this will cast those to dicts.
+         :param type_check: Optional `reference_data.loaders.parquet_util.TypeCheckLevel` indicating that
+             a type check should be performed on the arrow schema of the parquet file _before_ reading any
+             data, and at what level of strictness.
+         :param cast: Indicates whether to attempt a pyarrow table cast on read. When `False`, no cast
+             will ever be performed. When `True`, the behavior depends on the value of `type_check`:
+             when `type_check` is supplied and the type check fails, attempt to cast the arrow table to
+             the arrow schema for this table; when `type_check` is `None`, always cast the arrow table
+             to the arrow schema for this table.
+         :return: iterator of the loaded and possibly postprocessed and validated DataFrames
+         """
+         if validate and self.schema is None:
+             raise ValueError(f"Can't validate table {self.table_name} with no pandera schema")
+
+         if type_check is not None and self.pyarrow_schema is None:
+             raise ValueError(f"Can't type check table {self.table_name} with no pyarrow schema")
+
+         if cast and self.pyarrow_schema is None:
+             raise ValueError(f"Can't cast table {self.table_name} with no pyarrow schema")
+
+         logger = getLogger(__name__)
+
+         with self._resource_stream() if path is None else open(path, "rb") as f:
+             pq_file = pyarrow.parquet.ParquetFile(f)
+             schema = pq_file.schema.to_arrow_schema()
+             if type_check is not None:
+                 assert self.pyarrow_schema is not None  # make mypy happy; this condition is checked
+                 try:
+                     logger.info(f"Type-checking parquet file for table {self.table_name}")
+                     type_check_pyarrow_schemas(
+                         schema, self.pyarrow_schema, TypeCheckLevel(type_check), self.columns
+                     )
+                 except TypeError as e:
+                     if cast:
+                         logger.warning(
+                             f"Type-checking failed at level '{type_check.name}'; "  # type: ignore
+                             "a type cast will be attempted on read"
+                         )
+                     else:
+                         raise e
+                 else:
+                     cast = False
+
+             logger.info(f"Loading arrow data for table {self.table_name} from parquet")
+             if batch_size is None:
+                 table = pq_file.read(self.columns)
+                 batches: Iterable[pyarrow.Table] = [table]
+             else:
+                 batches = (
+                     pyarrow.Table.from_batches([b])
+                     for b in pq_file.iter_batches(batch_size, columns=self.columns)
+                 )
+
+             categorical_dtypes = (
+                 [
+                     (name, dtype)
+                     for name, dtype in self.casts.items()
+                     if isinstance(dtype, pd.CategoricalDtype)
+                 ]
+                 if self.casts
+                 else []
+             )
+
+             for table in batches:
+                 if cast:
+                     assert self.pyarrow_schema is not None  # make mypy happy; this condition is checked
+                     table = table.cast(self.pyarrow_schema)
+
+                 # ignore_metadata is only here because table.to_pandas has a bug wherein some dtypes get
+                 # changed for index columns (which are specified in the metadata under a 'pandas' key).
+                 # We handle setting the index ourselves below.
+                 # Likewise, we omit the `categories` arg here, as pyarrow cannot in general recover the
+                 # order of the original categories from the parquet file - those are handled using
+                 # `self.casts` below.
+                 df = table.to_pandas(date_as_object=False, ignore_metadata=True)
+
+                 if self.casts:
+                     for name, dtype in categorical_dtypes:
+                         check_categorical_values(df[name], dtype)
+
+                     df = df.astype(self.casts, copy=False)
+
+                 if postprocess:
+                     df = postprocess_parquet_dataframe(df, schema)
+
+                 if self.index_columns is not None:
+                     df.set_index(self.index_columns, inplace=True)
+
+                 if validate:
+                     assert self.schema is not None  # make mypy happy; this condition is checked above
+                     df = self.schema.validate(df)
+
+                 yield df
+
+     @classmethod
+     def from_pandera_schema(
+         cls,
+         table_name: str,
+         schema: pa.DataFrameSchema,
+         package: str,
+         data_dir: str,
+         *,
+         blob_store: Optional[RemoteBlobStoreSpec] = None,
+         md5: Optional[str] = None,
+         filename: Optional[str] = None,
+         pyarrow_schema: Optional[pyarrow.Schema] = None,
+     ) -> "PandasParquetLoader":
+         casts: Dict[str, Union[np.dtype, pd_dtypes.ExtensionDtype]] = {}
+         index_columns: Optional[List[str]]
+         all_cols = list(schema.columns.items())
+         if isinstance(schema.index, pa.MultiIndex):
+             all_cols.extend(schema.index.columns.items())
+             index_columns = list(schema.index.names)
+         elif isinstance(schema.index, pa.Index):
+             all_cols.append((schema.index.name, schema.index))
+             index_columns = list(schema.index.names)
+         else:
+             index_columns = None
+
+         for name, col in all_cols:
+             assert isinstance(name, str)
+             dtype = col.dtype.type if isinstance(col.dtype, pa.DataType) else col.dtype
+
+             if isinstance(dtype, pd_dtypes.ExtensionDtype):
+                 casts[name] = dtype
+             elif isinstance(dtype, pa.dtypes.Int) and (  # type: ignore
+                 (not dtype.signed) or dtype.bit_width not in (32, 64)
+             ):
+                 typename = f"{'' if dtype.signed else 'u'}int{dtype.bit_width}"
+                 casts[name] = np.dtype(typename)
+             elif np.issubdtype(dtype, np.datetime64):
+                 casts[name] = np.dtype("datetime64[ns]")
+
+         return cls(
+             table_name,
+             schema=schema,
+             pyarrow_schema=pyarrow_schema,
+             columns=[name for name, _col in all_cols],
+             index_columns=index_columns,
+             casts=casts,
+             package=package,
+             data_dir=data_dir,
+             filename=filename,
+             blob_store=blob_store,
+             md5=md5,
+         )
+
+     @classmethod
+     def from_schema_table(
+         cls,
+         table: Table,
+         package: Optional[str],
+         data_dir: Union[str, Path],
+         filename: Optional[str] = None,
+         derive_schema: bool = False,
+     ) -> "PandasParquetLoader":
+         return cls(
+             table.name,
+             schema=table.pandera_schema if derive_schema else None,
+             pyarrow_schema=table.parquet_schema,
+             columns=[t.name for t in table.columns],
+             index_columns=table.primary_key if table.primary_key is None else list(table.primary_key),
+             casts=table.parquet_casts,
+             package=package,
+             data_dir=data_dir,
+             filename=filename,
+             md5=table.md5,
+         )
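
The loader classes above are normally instantiated by code that tabularasa generates from a schema, but they can also be constructed directly. A minimal usage sketch follows (illustrative only, not part of the packaged module above), assuming a hypothetical package `my_pkg` that ships `data/my_table.parquet` and a hand-written attrs record class standing in for a generated one:

import attr
import pandas as pd

from thds.tabularasa.loaders.util import AttrsParquetLoader, PandasParquetLoader


@attr.s(auto_attribs=True, frozen=True)
class MyTableRow:  # hypothetical stand-in; real record classes are code-generated
    id: int
    name: str


# Record-oriented access: stream attrs instances out of the packaged parquet file.
row_loader = AttrsParquetLoader(
    "MyTable",
    MyTableRow,
    package="my_pkg",  # resolved as package data via pkg_resources
    data_dir="data",
)
first_row = next(row_loader())

# DataFrame-oriented access: load the whole table, or iterate in bounded batches.
df_loader = PandasParquetLoader("MyTable", package="my_pkg", data_dir="data")
df: pd.DataFrame = df_loader()
for chunk in df_loader.load_batched(batch_size=50_000):
    ...  # process each ~50k-row DataFrame without holding the full table in memory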
File without changes
@@ -0,0 +1,7 @@
+ __all__ = ["constraints", "dtypes", "files", "metaschema", "util", "validation", "load_schema"]
+
+ from . import constraints, dtypes, files, metaschema, util, validation
+ from .dtypes import DType  # noqa: F401
+ from .files import ADLSDataSpec, LocalDataSpec, TabularFileSource, VersionControlledPath  # noqa: F401
+ from .metaschema import Column, CustomType, RawDataDependencies, Schema, Table  # noqa: F401
+ from .validation import load_schema
@@ -0,0 +1,20 @@
+ __all__ = [
+     "render_attrs_loaders",
+     "render_attrs_module",
+     "render_attrs_sqlite_schema",
+     "render_pandera_module",
+     "render_pandera_loaders",
+     "render_pyarrow_schema",
+     "render_sphinx_docs",
+     "render_sql_schema",
+     "write_if_ast_changed",
+     "write_sql",
+ ]
+
+ from .attrs import render_attrs_loaders, render_attrs_module
+ from .attrs_sqlite import render_attrs_sqlite_schema
+ from .io import write_if_ast_changed, write_sql
+ from .pandas import render_pandera_loaders, render_pandera_module
+ from .pyarrow import render_pyarrow_schema
+ from .sphinx import render_sphinx_docs
+ from .sqlite import render_sql_schema
@@ -0,0 +1,50 @@
+ import logging
+ import os.path
+ import tempfile
+ from functools import lru_cache
+ from typing import Any, Callable, List, Optional, Tuple
+ from warnings import warn
+
+
+ @lru_cache
+ def __autoformat_imports() -> Tuple[Optional[Any], Optional[Callable[[List[str]], int]]]:
+     try:
+         import black
+     except ImportError:
+         warn(
+             "`black` is unavailable; generated python code will not be auto-formatted. "
+             "Specify the 'cli' extra to ensure this dependency is present."
+         )
+         black = None  # type: ignore
+     try:
+         from isort.main import main as isort_main  # type: ignore
+     except ImportError:
+         warn(
+             "`isort` is unavailable; imports in generated python code will not be automatically sorted. "
+             "Specify the 'cli' extra to ensure this dependency is present."
+         )
+         isort_main = None  # type: ignore
+     return black, isort_main  # type: ignore
+
+
+ def autoformat(py_code: str) -> str:
+     _LOGGER = logging.getLogger(__name__)
+     try:
+         black, isort_main = __autoformat_imports()
+         if black is not None:
+             _LOGGER.info("Applying `black` formatting to auto-generated code")
+             py_code = black.format_str(py_code, mode=black.FileMode())
+         if isort_main is not None:
+             _LOGGER.info("Applying `isort` formatting to auto-generated code")
+             with tempfile.TemporaryDirectory() as d:
+                 outfile = os.path.join(d, "tmp.py")
+                 with open(outfile, "w") as f:
+                     f.write(py_code)
+                 isort_main([outfile, "--profile", "black"])
+                 with open(outfile, "r") as f_:
+                     py_code = f_.read()
+         return py_code
+     except Exception as ex:
+         print(f"{repr(ex)} when attempting to format code:")
+         print(py_code)
+         raise
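
As the hunk above shows, code generation degrades gracefully when the optional formatters are missing: if `black` or `isort` cannot be imported, `autoformat` warns once (the imports are cached via `lru_cache`) and returns the code with only the available steps applied. A minimal usage sketch (illustrative only, not part of the wheel):

from thds.tabularasa.schema.compilation._format import autoformat

# Deliberately messy generated code; with the 'cli' extra installed, black
# reformats it and isort orders the imports, otherwise it comes back
# unchanged after a warning.
messy = "import os\nimport abc\ndef f( x ):\n    return  x\n"
print(autoformat(messy))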