thds_tabularasa-0.13.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/compat.py
@@ -0,0 +1,33 @@
+import numpy as np
+import pandas as pd
+from packaging import version
+
+PANDAS_VERSION_LT_2_0 = version.parse(pd.__version__) < version.parse("2.0")
+
+
+def resolve_numeric_np_index_dtype_for_pd_version(dtype: str | np.dtype) -> np.dtype:
+    """Resolve the numeric numpy index dtype depending on the installed pandas version."""
+    dtype_ = np.dtype(dtype)
+
+    if dtype_.kind not in ("iuf"):
+        raise TypeError(
+            f"{dtype} is not, or does not resolve to, an (unsigned) integer or float. Resolved to {dtype_}"
+        )
+
+    if not PANDAS_VERSION_LT_2_0:
+        return dtype_
+
+    if hasattr(np, "dtypes"):  # 2.x introduces .dtypes
+        # NumPy 2.x
+        if dtype_.kind == "i":
+            return np.dtypes.Int64DType()
+        elif dtype_.kind == "u":
+            return np.dtypes.UInt64DType()
+        return np.dtypes.Float64DType()
+
+    # NumPy 1.x fallback
+    if dtype_.kind == "i":
+        return np.dtype("int64")
+    elif dtype_.kind == "u":
+        return np.dtype("uint64")
+    return np.dtype("float64")
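For context, a minimal usage sketch of the helper above (not part of the package; the import path is taken from the file listing, and the "uint32" example dtype is arbitrary):

```python
# Hypothetical usage of thds.tabularasa.compat (sketch only).
import numpy as np

from thds.tabularasa.compat import resolve_numeric_np_index_dtype_for_pd_version

# On pandas >= 2.0 the dtype passes through unchanged; on pandas < 2.0 it is
# widened to its 64-bit equivalent, since older pandas Index types only
# support 64-bit numeric dtypes.
idx_dtype = resolve_numeric_np_index_dtype_for_pd_version("uint32")
assert idx_dtype in (np.dtype("uint32"), np.dtype("uint64"))

# Non-numeric kinds are rejected:
# resolve_numeric_np_index_dtype_for_pd_version("object")  # raises TypeError
```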
thds/tabularasa/data_dependencies/__init__.py: file without changes
thds/tabularasa/data_dependencies/adls.py
@@ -0,0 +1,97 @@
+import os.path
+from functools import lru_cache
+from pathlib import Path
+from typing import List, Optional, Protocol, Union
+
+import attr
+
+from thds.adls import ADLSFileSystem, fqn
+from thds.tabularasa.schema.files import ADLSDataSpec, RemoteBlobStoreSpec
+
+CACHE_DIR = ".cache/"
+
+
+@attr.s(auto_attribs=True)
+class ADLSFileIntegrityError(FileNotFoundError):
+    adls_account: str
+    adls_filesystem: str
+    adls_path: str
+    expected_md5: str
+    md5: str
+
+    def __str__(self):
+        return (
+            f"Unexpected contents for ADLS file: account={self.adls_account} "
+            f"filesystem={self.adls_filesystem} path={self.adls_path} "
+            f"expected_md5={self.expected_md5} md5={self.md5}"
+        )
+
+
+@attr.s(auto_attribs=True)
+class ADLSDownloadResult:
+    adls_account: str
+    adls_filesystem: str
+    adls_path: str
+    local_path: Path
+
+
+@lru_cache(None)
+def adls_filesystem(account: str, filesystem: str, cache_dir: Optional[Union[Path, str]] = CACHE_DIR):
+    return ADLSFileSystem(account, filesystem, cache_dir=cache_dir)
+
+
+def sync_adls_data(
+    adls_spec: ADLSDataSpec, cache_dir: Optional[Union[Path, str]] = CACHE_DIR
+) -> List[ADLSDownloadResult]:
+    from .util import hash_file
+
+    adls = adls_filesystem(adls_spec.adls_account, adls_spec.adls_filesystem, cache_dir)
+    adls_paths = [adls_path.name for adls_path in adls_spec.paths]
+    expected_hashes = [adls_path.md5 for adls_path in adls_spec.paths]
+    cache_paths = adls.fetch_files(adls_paths)
+    results = []
+    for adls_path, cache_path, expected_md5 in zip(adls_paths, cache_paths, expected_hashes):
+        download_result = ADLSDownloadResult(
+            adls_account=adls_spec.adls_account,
+            adls_filesystem=adls_spec.adls_filesystem,
+            adls_path=adls_path,
+            local_path=cache_path,
+        )
+        if expected_md5 is None:
+            results.append(download_result)
+            continue
+
+        md5 = hash_file(cache_path)
+        if md5 != expected_md5:
+            raise ADLSFileIntegrityError(
+                adls_account=adls_spec.adls_account,
+                adls_filesystem=adls_spec.adls_filesystem,
+                adls_path=adls_path,
+                expected_md5=expected_md5,
+                md5=md5,
+            )
+
+        results.append(download_result)
+    if adls_spec.ordered:
+        result_order = {os.path.basename(result.local_path): result for result in results}
+        return [result_order[os.path.basename(path.name)] for path in adls_spec.paths]
+    else:
+        return results
+
+
+class SupportsRemoteData(Protocol):
+    md5: Optional[str] = None
+    blob_store: Optional[RemoteBlobStoreSpec] = None
+
+
+def get_remote_data_fqn(interface: SupportsRemoteData) -> fqn.AdlsFqn:
+    if interface.md5 and interface.blob_store:
+        return (
+            fqn.of(
+                storage_account=interface.blob_store.adls_account,
+                container=interface.blob_store.adls_filesystem,
+                path=interface.blob_store.path,
+            )
+            / interface.md5
+        )
+    raise ValueError("Getting a remote data path requires both the `md5` and `blob_store` to be set.")
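For context, a minimal sketch of how the download helper above might be driven (not part of the package; `fetch_spec_files` is a hypothetical caller, and `adls_spec` is assumed to be an ADLSDataSpec built or loaded elsewhere from the package's schema):

```python
# Hypothetical caller of sync_adls_data (sketch only).
from thds.tabularasa.data_dependencies.adls import ADLSFileIntegrityError, sync_adls_data


def fetch_spec_files(adls_spec):  # adls_spec: ADLSDataSpec, assumed to exist
    try:
        # Downloads into the default ".cache/" directory and verifies MD5s.
        results = sync_adls_data(adls_spec)
    except ADLSFileIntegrityError as err:
        # Raised when a downloaded file's MD5 does not match the spec's expected MD5.
        raise RuntimeError(f"checksum mismatch: {err}") from err
    # Map each remote path to the local file it was cached at.
    return {result.adls_path: result.local_path for result in results}
```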