thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,33 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from packaging import version
4
+
5
# True when the installed pandas is older than 2.0. Evaluated once at import
# time; compared as parsed versions rather than as raw strings.
_INSTALLED_PANDAS = version.parse(pd.__version__)
PANDAS_VERSION_LT_2_0 = _INSTALLED_PANDAS < version.parse("2.0")
6
+
7
+
8
+ def resolve_numeric_np_index_dtype_for_pd_version(dtype: str | np.dtype) -> np.dtype:
9
+ """Resolve the numeric numpy index dtype depending on the installed pandas version."""
10
+ dtype_ = np.dtype(dtype)
11
+
12
+ if dtype_.kind not in ("iuf"):
13
+ raise TypeError(
14
+ f"{dtype} is not, or does not resolve to, an (unsigned) integer or float. Resolved to {dtype_}"
15
+ )
16
+
17
+ if not PANDAS_VERSION_LT_2_0:
18
+ return dtype_
19
+
20
+ if hasattr(np, "dtypes"): # 2.x introduces .dtypes
21
+ # NumPy 2.x
22
+ if dtype_.kind == "i":
23
+ return np.dtypes.Int64DType()
24
+ elif dtype_.kind == "u":
25
+ return np.dtypes.UInt64DType()
26
+ return np.dtypes.Float64DType()
27
+
28
+ # NumPy 1.x fallback
29
+ if dtype_.kind == "i":
30
+ return np.dtype("int64")
31
+ elif dtype_.kind == "u":
32
+ return np.dtype("uint64")
33
+ return np.dtype("float64")
File without changes
@@ -0,0 +1,97 @@
1
+ import os.path
2
+ from functools import lru_cache
3
+ from pathlib import Path
4
+ from typing import List, Optional, Protocol, Union
5
+
6
+ import attr
7
+
8
+ from thds.adls import ADLSFileSystem, fqn
9
+ from thds.tabularasa.schema.files import ADLSDataSpec, RemoteBlobStoreSpec
10
+
11
# Default cache directory handed to the ADLS helpers in this module
# (a relative path).
CACHE_DIR = ".cache/"
12
+
13
+
14
# ``auto_exc=True`` makes attrs treat this as a well-behaved exception class:
# instances compare and hash by identity (instead of becoming unhashable) and
# the field values are mirrored into ``args``, which is what the default
# exception pickling support relies on.
@attr.s(auto_attribs=True, auto_exc=True)
class ADLSFileIntegrityError(FileNotFoundError):
    """Raised when a downloaded ADLS file's md5 does not match the expected md5."""

    # Location of the offending file.
    adls_account: str
    adls_filesystem: str
    adls_path: str
    # The md5 that was expected for the file, and the md5 that was found.
    expected_md5: str
    md5: str

    def __str__(self) -> str:
        """Return a one-line description of the file and both md5 values."""
        return (
            f"Unexpected contents for ADLS file: account={self.adls_account} "
            f"filesystem={self.adls_filesystem} path={self.adls_path} "
            f"expected_md5={self.expected_md5} md5={self.md5}"
        )
28
+
29
+
30
@attr.s(auto_attribs=True)
class ADLSDownloadResult:
    """Record pairing a remote ADLS file with the local path it was fetched to."""

    # Remote location of the file.
    adls_account: str
    adls_filesystem: str
    adls_path: str
    # Local path of the fetched copy.
    local_path: Path
36
+
37
+
38
@lru_cache(maxsize=None)
def adls_filesystem(account: str, filesystem: str, cache_dir: Optional[Union[Path, str]] = CACHE_DIR):
    """Return an ``ADLSFileSystem`` for the given account, filesystem and cache dir.

    Results are memoized without bound, so repeated calls with the same
    arguments return the same client object.
    """
    client = ADLSFileSystem(account, filesystem, cache_dir=cache_dir)
    return client
41
+
42
+
43
def sync_adls_data(
    adls_spec: ADLSDataSpec, cache_dir: Optional[Union[Path, str]] = CACHE_DIR
) -> List[ADLSDownloadResult]:
    """Fetch every file listed in ``adls_spec`` and verify its md5 when one is given.

    Args:
        adls_spec: Spec naming the ADLS account, filesystem and paths to fetch.
        cache_dir: Cache directory passed through to ``adls_filesystem``.

    Returns:
        One ``ADLSDownloadResult`` per fetched file. When ``adls_spec.ordered``
        is truthy the results are re-listed in ``adls_spec.paths`` order,
        matched by file basename.

    Raises:
        ADLSFileIntegrityError: On the first file whose computed md5 differs
            from the expected md5 in the spec.
    """
    from .util import hash_file

    account = adls_spec.adls_account
    filesystem = adls_spec.adls_filesystem

    fs = adls_filesystem(account, filesystem, cache_dir)
    remote_names = [spec_path.name for spec_path in adls_spec.paths]
    wanted_md5s = [spec_path.md5 for spec_path in adls_spec.paths]
    # NOTE(review): the zip below assumes fetch_files returns local paths in the
    # same order as the requested names - confirm against ADLSFileSystem.
    local_paths = fs.fetch_files(remote_names)

    downloads = []
    for remote_name, local_path, wanted_md5 in zip(remote_names, local_paths, wanted_md5s):
        download = ADLSDownloadResult(
            adls_account=account,
            adls_filesystem=filesystem,
            adls_path=remote_name,
            local_path=local_path,
        )
        # Files without an expected md5 are accepted without hashing.
        if wanted_md5 is not None:
            actual_md5 = hash_file(local_path)
            if actual_md5 != wanted_md5:
                raise ADLSFileIntegrityError(
                    adls_account=account,
                    adls_filesystem=filesystem,
                    adls_path=remote_name,
                    expected_md5=wanted_md5,
                    md5=actual_md5,
                )
        downloads.append(download)

    if not adls_spec.ordered:
        return downloads

    # Re-list results in spec order, matching on basename only.
    # NOTE(review): two spec paths sharing a basename would collapse to a single
    # entry here - presumably basenames are unique within a spec; verify.
    by_basename = {os.path.basename(download.local_path): download for download in downloads}
    return [by_basename[os.path.basename(spec_path.name)] for spec_path in adls_spec.paths]
80
+
81
+
82
class SupportsRemoteData(Protocol):
    """Structural type for objects carrying an optional ``md5`` and ``blob_store``."""

    # Both members are optional and default to None.
    md5: Optional[str] = None
    blob_store: Optional[RemoteBlobStoreSpec] = None
85
+
86
+
87
def get_remote_data_fqn(interface: SupportsRemoteData) -> fqn.AdlsFqn:
    """Build the ADLS FQN for ``interface``'s remote data.

    The FQN is made from the blob store's account, filesystem and path, then
    joined with ``interface.md5`` using the ``/`` operator.

    Raises:
        ValueError: If either ``md5`` or ``blob_store`` is unset (falsy).
    """
    if not interface.md5 or not interface.blob_store:
        raise ValueError("Getting a remote data path requires both the `md5` and `blob_store` to be set.")

    store = interface.blob_store
    store_root = fqn.of(
        storage_account=store.adls_account,
        container=store.adls_filesystem,
        path=store.path,
    )
    return store_root / interface.md5