thds.tabularasa 0.13.0 (thds_tabularasa-0.13.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/tabularasa/__init__.py +6 -0
- thds/tabularasa/__main__.py +1122 -0
- thds/tabularasa/compat.py +33 -0
- thds/tabularasa/data_dependencies/__init__.py +0 -0
- thds/tabularasa/data_dependencies/adls.py +97 -0
- thds/tabularasa/data_dependencies/build.py +573 -0
- thds/tabularasa/data_dependencies/sqlite.py +286 -0
- thds/tabularasa/data_dependencies/tabular.py +167 -0
- thds/tabularasa/data_dependencies/util.py +209 -0
- thds/tabularasa/diff/__init__.py +0 -0
- thds/tabularasa/diff/data.py +346 -0
- thds/tabularasa/diff/schema.py +254 -0
- thds/tabularasa/diff/summary.py +249 -0
- thds/tabularasa/git_util.py +37 -0
- thds/tabularasa/loaders/__init__.py +0 -0
- thds/tabularasa/loaders/lazy_adls.py +44 -0
- thds/tabularasa/loaders/parquet_util.py +385 -0
- thds/tabularasa/loaders/sqlite_util.py +346 -0
- thds/tabularasa/loaders/util.py +532 -0
- thds/tabularasa/py.typed +0 -0
- thds/tabularasa/schema/__init__.py +7 -0
- thds/tabularasa/schema/compilation/__init__.py +20 -0
- thds/tabularasa/schema/compilation/_format.py +50 -0
- thds/tabularasa/schema/compilation/attrs.py +257 -0
- thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
- thds/tabularasa/schema/compilation/io.py +96 -0
- thds/tabularasa/schema/compilation/pandas.py +252 -0
- thds/tabularasa/schema/compilation/pyarrow.py +93 -0
- thds/tabularasa/schema/compilation/sphinx.py +550 -0
- thds/tabularasa/schema/compilation/sqlite.py +69 -0
- thds/tabularasa/schema/compilation/util.py +117 -0
- thds/tabularasa/schema/constraints.py +327 -0
- thds/tabularasa/schema/dtypes.py +153 -0
- thds/tabularasa/schema/extract_from_parquet.py +132 -0
- thds/tabularasa/schema/files.py +215 -0
- thds/tabularasa/schema/metaschema.py +1007 -0
- thds/tabularasa/schema/util.py +123 -0
- thds/tabularasa/schema/validation.py +878 -0
- thds/tabularasa/sqlite3_compat.py +41 -0
- thds/tabularasa/sqlite_from_parquet.py +34 -0
- thds/tabularasa/to_sqlite.py +56 -0
- thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
- thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
- thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
- thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
- thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/schema/extract_from_parquet.py
@@ -0,0 +1,132 @@
"""Should be able to 'extract' a Schema object from an existing parquet file.

You might want to use this to convert a Parquet file directly to
SQLite without additional ceremony - use `tabularasa.to_sqlite` in
conjunction with this.
"""

import typing as ty
from functools import partial
from pathlib import Path

import pyarrow as pa
import pyarrow.lib
import pyarrow.parquet as pq

from . import metaschema as ms

_LEAF_MAPPINGS = {
    # DType doesn't currently support everything:
    # https://arrow.apache.org/docs/python/api/datatypes.html
    pa.string(): ms.DType.STR,
    pa.bool_(): ms.DType.BOOL,
    # ints
    pa.int8(): ms.DType.INT8,
    pa.int16(): ms.DType.INT16,
    pa.int32(): ms.DType.INT32,
    pa.int64(): ms.DType.INT64,
    pa.uint8(): ms.DType.UINT8,
    pa.uint16(): ms.DType.UINT16,
    pa.uint32(): ms.DType.UINT32,
    pa.uint64(): ms.DType.UINT64,
    # reals
    pa.float16(): ms.DType.FLOAT32,  # should we support float16 in DType?
    pa.float32(): ms.DType.FLOAT32,
    pa.float64(): ms.DType.FLOAT64,
    # dates/times
    pa.date32(): ms.DType.DATE,
    pa.date64(): ms.DType.DATETIME,
    pa.timestamp("s"): ms.DType.DATETIME,
    pa.timestamp("ms"): ms.DType.DATETIME,
    pa.timestamp("us"): ms.DType.DATETIME,
    pa.timestamp("ns"): ms.DType.DATETIME,
}
ColumnType = ty.Union[ms.DType, ms.AnonCustomType, ms.CustomType, ms.ArrayType, ms.MappingType]


def pyarrow_type_to_dtype(pyarrow_type: pyarrow.lib.DataType) -> ColumnType:
    if pyarrow_type in _LEAF_MAPPINGS:
        return _LEAF_MAPPINGS[pyarrow_type]
    if pa.types.is_map(pyarrow_type):
        key_type = pyarrow_type_to_dtype(pyarrow_type.key_type)
        assert not isinstance(key_type, (ms._RawArrayType, ms._RawMappingType))
        return ms.MappingType(
            keys=key_type,
            values=pyarrow_type_to_dtype(pyarrow_type.item_type),
        )
    if pa.types.is_list(pyarrow_type):
        return ms.ArrayType(
            values=pyarrow_type_to_dtype(pyarrow_type.value_type),
        )
    if pa.types.is_struct(pyarrow_type):
        # TODO support these as though they were mappings, possibly?
        raise ValueError("Structs are not yet supported by tabularasa.")
    raise ValueError(f"Unsupported pyarrow type: {pyarrow_type}")


def _decide_field_nullability(pyarrow_field: pyarrow.lib.Field, pq_file: pq.ParquetFile) -> bool:
    if not pyarrow_field.nullable:
        # if the incoming schema is certain about this, then
        # maintain their declaration without inspecting the actual file.
        return False
    # otherwise, infer it from the data
    for batch in pq_file.iter_batches(columns=[pyarrow_field.name]):
        if batch[pyarrow_field.name].null_count:
            return True
    # the impact of saying this is False if no nulls were found but it
    # was theoretically possible is low - we're trying to create a
    # schema based on the data we _already have_, rather than the
    # entire world of possible data.
    return False


def pyarrow_field_to_column(pq_file: pq.ParquetFile, pyarrow_field: pyarrow.lib.Field) -> ms.Column:
    """Convert a pyarrow field to a Column object."""
    return ms.Column(
        name=pyarrow_field.name,
        type=pyarrow_type_to_dtype(pyarrow_field.type),
        nullable=_decide_field_nullability(pyarrow_field, pq_file),
        doc=pyarrow_field.metadata and pyarrow_field.metadata.get("doc", "") or "autoextracted",
    )


def define_table_from_parquet(
    pq_file: Path,
    name: str,
    *,
    primary_key: ty.Optional[ms.IdTuple] = None,
    indexes: ty.Collection[ms.IdTuple] = tuple(),
) -> ms.Table:
    """Extract a table from parquet into a Schema object.

    The filename will be embedded in the doc field.
    """
    pq_schema = pq.read_schema(pq_file)

    columns = list(map(partial(pyarrow_field_to_column, pq.ParquetFile(pq_file)), pq_schema))
    valid_colnames = {column.name for column in columns}

    # validate that primary_key and indexes match
    def _validate(id_tuple: ms.IdTuple, descrip: str):
        for identifier in id_tuple:
            if identifier not in valid_colnames:
                raise ValueError(
                    f"Cannot specify name {identifier} as part of {descrip}"
                    " since it is not a valid column name."
                    f" Options are: {valid_colnames}"
                )

    for keys in indexes:
        _validate(keys, f"index {keys}")
    if primary_key:
        _validate(primary_key, f"primary key {primary_key}")

    return ms.Table(
        name=name,
        columns=columns,
        doc=str(pq_file),
        dependencies=None,
        transient=True,
        indexes=list(indexes),
        primary_key=primary_key,
    )
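For context, a minimal usage sketch of the `define_table_from_parquet` function added above. The parquet path, table name, and key/index column names here are hypothetical and must correspond to real columns in the file, otherwise the internal `_validate` helper raises `ValueError`. The module docstring suggests pairing this with `tabularasa.to_sqlite`, but that function's signature is not part of this diff, so it is not shown.

# Hypothetical usage sketch (not part of the package). "events.parquet",
# "events", "event_id" and "occurred_on" are illustrative names only.
from pathlib import Path

from thds.tabularasa.schema.extract_from_parquet import define_table_from_parquet

table = define_table_from_parquet(
    Path("events.parquet"),
    "events",
    primary_key=("event_id",),   # must name columns actually present in the file
    indexes=[("occurred_on",)],
)
print(table.name, [column.name for column in table.columns])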
thds/tabularasa/schema/files.py
@@ -0,0 +1,215 @@
import csv
import datetime
import os
from enum import Enum
from pathlib import Path
from typing import Dict, List, Literal, Optional, Set, Tuple, Union

import pkg_resources
from pydantic import AnyUrl, BaseModel, Extra, Field

from .util import DashedIdentifier, DocumentedMixin, DottedIdentifier, HexStr, PathStr


class CSVQuotingConvention(Enum):
    QUOTE_ALL = "quote_all"
    QUOTE_NONE = "quote_none"
    QUOTE_NONNUMERIC = "quote_nonnumeric"
    QUOTE_MINIMAL = "quote_minimal"


UpdateFrequency = Literal["Yearly", "Quarterly", "Monthly", "Biannual"]


def quarter(date: datetime.date) -> int:
    return (date.month - 1) // 3 + 1


def half(date: datetime.date) -> int:
    return (date.month - 1) // 6 + 1


def _get_tail(freq: UpdateFrequency, date: datetime.date) -> Tuple[int, ...]:
    if freq == "Yearly":
        return ()
    if freq == "Quarterly":
        return (quarter(date),)
    if freq == "Monthly":
        return (date.month,)
    return (half(date),)


def _date_tuple(date: datetime.date, freq: UpdateFrequency) -> Tuple[int, ...]:
    return (date.year, *_get_tail(freq, date))


current_date = datetime.date.today()


class FileSourceMixin(BaseModel, extra=Extra.forbid):
    authority: Optional[str] = None
    url: Optional[AnyUrl] = None
    landing_page: Optional[AnyUrl] = None
    last_checked: Optional[datetime.date] = None
    last_updated: Optional[datetime.date] = None
    update_frequency: Optional[UpdateFrequency] = None
    is_open_access: Optional[bool] = None
    doc: Optional[str] = None

    def needs_update(self, current_date: datetime.date) -> bool:
        if self.update_frequency is not None:
            if self.last_updated is None:
                return True
            return _date_tuple(current_date, self.update_frequency) > _date_tuple(
                self.last_updated, self.update_frequency
            )
        return False


class LocalFileSourceMixin(FileSourceMixin):
    filename: PathStr
    package: Optional[DottedIdentifier] = None
    encoding: Optional[str] = None

    @property
    def full_path(self) -> Path:
        return Path(
            self.filename
            if self.package is None
            else pkg_resources.resource_filename(self.package, self.filename)
        )

    @property
    def file_handle(self):
        if self.package is None:
            return open(self.filename, "rb")
        else:
            if pkg_resources.resource_isdir(self.package, self.filename):
                raise IsADirectoryError(pkg_resources.resource_filename(self.package, self.filename))
            return pkg_resources.resource_stream(self.package, self.filename)

    @property
    def is_dir(self) -> bool:
        if self.package is None:
            return os.path.isdir(self.filename)
        else:
            return pkg_resources.resource_isdir(self.package, self.filename)


class TabularFileSource(DocumentedMixin, LocalFileSourceMixin):
    delimiter: Optional[str] = csv.excel.delimiter
    quotechar: Optional[str] = csv.excel.quotechar
    escapechar: Optional[str] = csv.excel.escapechar
    doublequote: Optional[bool] = csv.excel.doublequote
    skipinitialspace: Optional[bool] = csv.excel.skipinitialspace
    lineterminator: Optional[str] = csv.excel.lineterminator
    skiprows: Optional[int] = None
    quoting: Optional[CSVQuotingConvention] = CSVQuotingConvention.QUOTE_MINIMAL
    package: Optional[DottedIdentifier] = None
    # Fairly conservative choice - only empty string is treated as explicitly null,
    # and only on nullable columns
    na_values: Optional[Set[str]] = Field(default_factory=lambda: {""})

    @property
    def csv_dialect(self) -> csv.Dialect:
        # This is ugly but required for flexibility when using pandas.read_csv.
        # The reason is that while read_csv allows passing all the attributes of a csv.Dialect,
        # it does _not_ allow passing multi-char lineterminators (as are present in csv.excel as '\r\n'
        # and this is often required in practice). The simple-seeming thing then is to pass the excel
        # dialect and let the keyword args override it, but pandas overrides in the _opposite_ direction:
        # the dialect overrides the keyword args. So in order to inject e.g. different delimiters or
        # quoting conventions while keeping the '\r\n' lineterminator of the excel dialect, we have to
        # build a new dialect object here.
        kwargs = self.csv_reader_kwargs

        if all(v == getattr(csv.excel, k) for k, v in kwargs.items()):
            return csv.excel()
        else:
            dialect_name = "csv_dialect%d" % abs(hash(tuple(sorted(kwargs.items()))))
            try:
                dialect = csv.get_dialect(dialect_name)
            except Exception:
                dialect_cls = type(dialect_name, (csv.excel,), kwargs)
                csv.register_dialect(dialect_name, dialect_cls)
                dialect = dialect_cls()

            return dialect

    @property
    def csv_reader_kwargs(self) -> Dict[str, Union[str, int, bool]]:
        kw: Dict[str, Union[str, int, bool]] = {}
        if self.quoting is not None:
            kw.update(quoting=getattr(csv, self.quoting.name))

        for name in [
            "delimiter",
            "quotechar",
            "escapechar",
            "doublequote",
            "skipinitialspace",
            "lineterminator",
            "skiprows",
        ]:
            value = self.__dict__.get(name)
            if value is not None:
                kw[name] = value

        return kw


class VersionControlledPath(BaseModel, extra=Extra.forbid):
    name: PathStr
    md5: Optional[HexStr] = None


class ADLSDataSpec(FileSourceMixin):
    adls_account: DashedIdentifier
    adls_filesystem: DashedIdentifier
    paths: List[VersionControlledPath] = Field(min_items=1)
    ordered: bool = False


class LocalDataSpec(LocalFileSourceMixin):
    order: List[PathStr] = Field(default_factory=list, unique_items=True)
    package: Optional[DottedIdentifier] = None

    def list_dir(self):
        if self.package is None:
            return os.listdir(self.filename)
        else:
            return pkg_resources.resource_listdir(self.package, self.filename)

    @property
    def all_data_specs(self) -> List["LocalDataSpec"]:
        if self.is_dir:
            return [
                LocalDataSpec(package=self.package, filename=os.path.join(self.filename, filename))
                for filename in self.list_dir()
            ]
        else:
            return [self]

    @property
    def ordered_data_specs(self) -> List["LocalDataSpec"]:
        if not self.order:
            raise AttributeError("No `order` is set - `ordered_data_specs` are not available")
        spec_order = {os.path.basename(spec.filename): spec for spec in self.all_data_specs}
        if set(spec_order.keys()) != set(self.order):
            raise ValueError(
                f"`order` does not match the file names in the spec: {set(self.order)} != {set(spec_order.keys())}"
            )
        return [spec_order[name] for name in self.order]


class RemoteBlobStoreSpec(BaseModel, extra=Extra.forbid):
    adls_account: DashedIdentifier
    adls_filesystem: DashedIdentifier
    path: PathStr

    def data_spec(self, md5: str, extension: Optional[str] = None) -> ADLSDataSpec:
        ext = extension or ""
        return ADLSDataSpec(
            adls_account=self.adls_account,
            adls_filesystem=self.adls_filesystem,
            paths=[VersionControlledPath(name=f"{self.path.rstrip('/')}/{md5}{ext}", md5=md5)],
        )
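To illustrate the staleness check and the blob-store helper added above, here is a small, hypothetical sketch. The dates, account, filesystem, path, and md5 values are illustrative only, and the example assumes the DashedIdentifier, PathStr, and HexStr constraints imported from `.util` accept them.

# Hypothetical usage sketch (all values are illustrative).
import datetime

from thds.tabularasa.schema.files import FileSourceMixin, RemoteBlobStoreSpec

src = FileSourceMixin(update_frequency="Quarterly", last_updated=datetime.date(2023, 2, 10))
# needs_update compares (year, quarter) tuples, so a later quarter means an update is due:
print(src.needs_update(datetime.date(2023, 7, 1)))   # True  (Q3 2023 > Q1 2023)
print(src.needs_update(datetime.date(2023, 3, 31)))  # False (still Q1 2023)

store = RemoteBlobStoreSpec(adls_account="my-account", adls_filesystem="raw-data", path="tables/")
spec = store.data_spec(md5="d41d8cd98f00b204e9800998ecf8427e", extension=".parquet")
print(spec.paths[0].name)  # tables/d41d8cd98f00b204e9800998ecf8427e.parquet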