thds.tabularasa 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. thds/tabularasa/__init__.py +6 -0
  2. thds/tabularasa/__main__.py +1122 -0
  3. thds/tabularasa/compat.py +33 -0
  4. thds/tabularasa/data_dependencies/__init__.py +0 -0
  5. thds/tabularasa/data_dependencies/adls.py +97 -0
  6. thds/tabularasa/data_dependencies/build.py +573 -0
  7. thds/tabularasa/data_dependencies/sqlite.py +286 -0
  8. thds/tabularasa/data_dependencies/tabular.py +167 -0
  9. thds/tabularasa/data_dependencies/util.py +209 -0
  10. thds/tabularasa/diff/__init__.py +0 -0
  11. thds/tabularasa/diff/data.py +346 -0
  12. thds/tabularasa/diff/schema.py +254 -0
  13. thds/tabularasa/diff/summary.py +249 -0
  14. thds/tabularasa/git_util.py +37 -0
  15. thds/tabularasa/loaders/__init__.py +0 -0
  16. thds/tabularasa/loaders/lazy_adls.py +44 -0
  17. thds/tabularasa/loaders/parquet_util.py +385 -0
  18. thds/tabularasa/loaders/sqlite_util.py +346 -0
  19. thds/tabularasa/loaders/util.py +532 -0
  20. thds/tabularasa/py.typed +0 -0
  21. thds/tabularasa/schema/__init__.py +7 -0
  22. thds/tabularasa/schema/compilation/__init__.py +20 -0
  23. thds/tabularasa/schema/compilation/_format.py +50 -0
  24. thds/tabularasa/schema/compilation/attrs.py +257 -0
  25. thds/tabularasa/schema/compilation/attrs_sqlite.py +278 -0
  26. thds/tabularasa/schema/compilation/io.py +96 -0
  27. thds/tabularasa/schema/compilation/pandas.py +252 -0
  28. thds/tabularasa/schema/compilation/pyarrow.py +93 -0
  29. thds/tabularasa/schema/compilation/sphinx.py +550 -0
  30. thds/tabularasa/schema/compilation/sqlite.py +69 -0
  31. thds/tabularasa/schema/compilation/util.py +117 -0
  32. thds/tabularasa/schema/constraints.py +327 -0
  33. thds/tabularasa/schema/dtypes.py +153 -0
  34. thds/tabularasa/schema/extract_from_parquet.py +132 -0
  35. thds/tabularasa/schema/files.py +215 -0
  36. thds/tabularasa/schema/metaschema.py +1007 -0
  37. thds/tabularasa/schema/util.py +123 -0
  38. thds/tabularasa/schema/validation.py +878 -0
  39. thds/tabularasa/sqlite3_compat.py +41 -0
  40. thds/tabularasa/sqlite_from_parquet.py +34 -0
  41. thds/tabularasa/to_sqlite.py +56 -0
  42. thds_tabularasa-0.13.0.dist-info/METADATA +530 -0
  43. thds_tabularasa-0.13.0.dist-info/RECORD +46 -0
  44. thds_tabularasa-0.13.0.dist-info/WHEEL +5 -0
  45. thds_tabularasa-0.13.0.dist-info/entry_points.txt +2 -0
  46. thds_tabularasa-0.13.0.dist-info/top_level.txt +1 -0
thds/tabularasa/schema/extract_from_parquet.py
@@ -0,0 +1,132 @@
+ """Should be able to 'extract' a Schema object from an existing parquet file.
+
+ You might want to use this to convert a Parquet file directly to
+ SQLite without additional ceremony - use `tabularasa.to_sqlite` in
+ conjunction with this.
+ """
+
+ import typing as ty
+ from functools import partial
+ from pathlib import Path
+
+ import pyarrow as pa
+ import pyarrow.lib
+ import pyarrow.parquet as pq
+
+ from . import metaschema as ms
+
+ _LEAF_MAPPINGS = {
+     # DType doesn't currently support everything:
+     # https://arrow.apache.org/docs/python/api/datatypes.html
+     pa.string(): ms.DType.STR,
+     pa.bool_(): ms.DType.BOOL,
+     # ints
+     pa.int8(): ms.DType.INT8,
+     pa.int16(): ms.DType.INT16,
+     pa.int32(): ms.DType.INT32,
+     pa.int64(): ms.DType.INT64,
+     pa.uint8(): ms.DType.UINT8,
+     pa.uint16(): ms.DType.UINT16,
+     pa.uint32(): ms.DType.UINT32,
+     pa.uint64(): ms.DType.UINT64,
+     # reals
+     pa.float16(): ms.DType.FLOAT32,  # should we support float16 in DType?
+     pa.float32(): ms.DType.FLOAT32,
+     pa.float64(): ms.DType.FLOAT64,
+     # dates/times
+     pa.date32(): ms.DType.DATE,
+     pa.date64(): ms.DType.DATETIME,
+     pa.timestamp("s"): ms.DType.DATETIME,
+     pa.timestamp("ms"): ms.DType.DATETIME,
+     pa.timestamp("us"): ms.DType.DATETIME,
+     pa.timestamp("ns"): ms.DType.DATETIME,
+ }
+ ColumnType = ty.Union[ms.DType, ms.AnonCustomType, ms.CustomType, ms.ArrayType, ms.MappingType]
+
+
+ def pyarrow_type_to_dtype(pyarrow_type: pyarrow.lib.DataType) -> ColumnType:
+     if pyarrow_type in _LEAF_MAPPINGS:
+         return _LEAF_MAPPINGS[pyarrow_type]
+     if pa.types.is_map(pyarrow_type):
+         key_type = pyarrow_type_to_dtype(pyarrow_type.key_type)
+         assert not isinstance(key_type, (ms._RawArrayType, ms._RawMappingType))
+         return ms.MappingType(
+             keys=key_type,
+             values=pyarrow_type_to_dtype(pyarrow_type.item_type),
+         )
+     if pa.types.is_list(pyarrow_type):
+         return ms.ArrayType(
+             values=pyarrow_type_to_dtype(pyarrow_type.value_type),
+         )
+     if pa.types.is_struct(pyarrow_type):
+         # TODO support these as though they were mappings, possibly?
+         raise ValueError("Structs are not yet supported by tabularasa.")
+     raise ValueError(f"Unsupported pyarrow type: {pyarrow_type}")
+
+
+ def _decide_field_nullability(pyarrow_field: pyarrow.lib.Field, pq_file: pq.ParquetFile) -> bool:
+     if not pyarrow_field.nullable:
+         # if the incoming schema is certain about this, then
+         # maintain their declaration without inspecting the actual file.
+         return False
+     # otherwise, infer it from the data
+     for batch in pq_file.iter_batches(columns=[pyarrow_field.name]):
+         if batch[pyarrow_field.name].null_count:
+             return True
+     # the impact of saying this is False if no nulls were found but it
+     # was theoretically possible is low - we're trying to create a
+     # schema based on the data we _already have_, rather than the
+     # entire world of possible data.
+     return False
+
+
+ def pyarrow_field_to_column(pq_file: pq.ParquetFile, pyarrow_field: pyarrow.lib.Field) -> ms.Column:
+     """Convert a pyarrow field to a Column object."""
+     return ms.Column(
+         name=pyarrow_field.name,
+         type=pyarrow_type_to_dtype(pyarrow_field.type),
+         nullable=_decide_field_nullability(pyarrow_field, pq_file),
+         doc=pyarrow_field.metadata and pyarrow_field.metadata.get("doc", "") or "autoextracted",
+     )
+
+
+ def define_table_from_parquet(
+     pq_file: Path,
+     name: str,
+     *,
+     primary_key: ty.Optional[ms.IdTuple] = None,
+     indexes: ty.Collection[ms.IdTuple] = tuple(),
+ ) -> ms.Table:
+     """Extract a table from parquet into a Schema object.
+
+     The filename will be embedded in the doc field.
+     """
+     pq_schema = pq.read_schema(pq_file)
+
+     columns = list(map(partial(pyarrow_field_to_column, pq.ParquetFile(pq_file)), pq_schema))
+     valid_colnames = {column.name for column in columns}
+
+     # validate that primary_key and indexes match
+     def _validate(id_tuple: ms.IdTuple, descrip: str):
+         for identifier in id_tuple:
+             if identifier not in valid_colnames:
+                 raise ValueError(
+                     f"Cannot specify name {identifier} as part of {descrip}"
+                     " since it is not a valid column name."
+                     f" Options are: {valid_colnames}"
+                 )
+
+     for keys in indexes:
+         _validate(keys, f"index {keys}")
+     if primary_key:
+         _validate(primary_key, f"primary key {primary_key}")
+
+     return ms.Table(
+         name=name,
+         columns=columns,
+         doc=str(pq_file),
+         dependencies=None,
+         transient=True,
+         indexes=list(indexes),
+         primary_key=primary_key,
+     )
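
For orientation (not part of the wheel contents above), a minimal usage sketch of `define_table_from_parquet`, assuming a hypothetical local `events.parquet` file and that `ms.IdTuple` accepts a plain tuple of column-name strings; per the module docstring, the resulting `ms.Table` could then be handed to `tabularasa.to_sqlite`:

from pathlib import Path

from thds.tabularasa.schema.extract_from_parquet import define_table_from_parquet

# Build a Table definition directly from the file's parquet schema;
# the file path is embedded in the table's doc field.
table = define_table_from_parquet(
    Path("events.parquet"),
    name="events",
    primary_key=("event_id",),      # must name columns present in the file
    indexes=[("event_date",)],
)
for column in table.columns:
    print(column.name, column.type, column.nullable)
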
thds/tabularasa/schema/files.py
@@ -0,0 +1,215 @@
+ import csv
+ import datetime
+ import os
+ from enum import Enum
+ from pathlib import Path
+ from typing import Dict, List, Literal, Optional, Set, Tuple, Union
+
+ import pkg_resources
+ from pydantic import AnyUrl, BaseModel, Extra, Field
+
+ from .util import DashedIdentifier, DocumentedMixin, DottedIdentifier, HexStr, PathStr
+
+
+ class CSVQuotingConvention(Enum):
+     QUOTE_ALL = "quote_all"
+     QUOTE_NONE = "quote_none"
+     QUOTE_NONNUMERIC = "quote_nonnumeric"
+     QUOTE_MINIMAL = "quote_minimal"
+
+
+ UpdateFrequency = Literal["Yearly", "Quarterly", "Monthly", "Biannual"]
+
+
+ def quarter(date: datetime.date) -> int:
+     return (date.month - 1) // 3 + 1
+
+
+ def half(date: datetime.date) -> int:
+     return (date.month - 1) // 6 + 1
+
+
+ def _get_tail(freq: UpdateFrequency, date: datetime.date) -> Tuple[int, ...]:
+     if freq == "Yearly":
+         return ()
+     if freq == "Quarterly":
+         return (quarter(date),)
+     if freq == "Monthly":
+         return (date.month,)
+     return (half(date),)
+
+
+ def _date_tuple(date: datetime.date, freq: UpdateFrequency) -> Tuple[int, ...]:
+     return (date.year, *_get_tail(freq, date))
+
+
+ current_date = datetime.date.today()
+
+
+ class FileSourceMixin(BaseModel, extra=Extra.forbid):
+     authority: Optional[str] = None
+     url: Optional[AnyUrl] = None
+     landing_page: Optional[AnyUrl] = None
+     last_checked: Optional[datetime.date] = None
+     last_updated: Optional[datetime.date] = None
+     update_frequency: Optional[UpdateFrequency] = None
+     is_open_access: Optional[bool] = None
+     doc: Optional[str] = None
+
+     def needs_update(self, current_date: datetime.date) -> bool:
+         if self.update_frequency is not None:
+             if self.last_updated is None:
+                 return True
+             return _date_tuple(current_date, self.update_frequency) > _date_tuple(
+                 self.last_updated, self.update_frequency
+             )
+         return False
+
+
+ class LocalFileSourceMixin(FileSourceMixin):
+     filename: PathStr
+     package: Optional[DottedIdentifier] = None
+     encoding: Optional[str] = None
+
+     @property
+     def full_path(self) -> Path:
+         return Path(
+             self.filename
+             if self.package is None
+             else pkg_resources.resource_filename(self.package, self.filename)
+         )
+
+     @property
+     def file_handle(self):
+         if self.package is None:
+             return open(self.filename, "rb")
+         else:
+             if pkg_resources.resource_isdir(self.package, self.filename):
+                 raise IsADirectoryError(pkg_resources.resource_filename(self.package, self.filename))
+             return pkg_resources.resource_stream(self.package, self.filename)
+
+     @property
+     def is_dir(self) -> bool:
+         if self.package is None:
+             return os.path.isdir(self.filename)
+         else:
+             return pkg_resources.resource_isdir(self.package, self.filename)
+
+
+ class TabularFileSource(DocumentedMixin, LocalFileSourceMixin):
+     delimiter: Optional[str] = csv.excel.delimiter
+     quotechar: Optional[str] = csv.excel.quotechar
+     escapechar: Optional[str] = csv.excel.escapechar
+     doublequote: Optional[bool] = csv.excel.doublequote
+     skipinitialspace: Optional[bool] = csv.excel.skipinitialspace
+     lineterminator: Optional[str] = csv.excel.lineterminator
+     skiprows: Optional[int] = None
+     quoting: Optional[CSVQuotingConvention] = CSVQuotingConvention.QUOTE_MINIMAL
+     package: Optional[DottedIdentifier] = None
+     # Fairly conservative choice - only empty string is treated as explicitly null,
+     # and only on nullable columns
+     na_values: Optional[Set[str]] = Field(default_factory=lambda: {""})
+
+     @property
+     def csv_dialect(self) -> csv.Dialect:
+         # This is ugly but required for flexibility when using pandas.read_csv.
+         # The reason is that while read_csv allows passing all the attributes of a csv.Dialect,
+         # it does _not_ allow passing multi-char lineterminators (as are present in csv.excel as '\r\n'
+         # and this is often required in practice). The simple-seeming thing then is to pass the excel
+         # dialect and let the keyword args override it, but pandas overrides in the _opposite_ direction:
+         # the dialect overrides the keyword args. So in order to inject e.g. different delimiters or
+         # quoting conventions while keeping the '\r\n' lineterminator of the excel dialect, we have to
+         # build a new dialect object here.
+         kwargs = self.csv_reader_kwargs
+
+         if all(v == getattr(csv.excel, k) for k, v in kwargs.items()):
+             return csv.excel()
+         else:
+             dialect_name = "csv_dialect%d" % abs(hash(tuple(sorted(kwargs.items()))))
+             try:
+                 dialect = csv.get_dialect(dialect_name)
+             except Exception:
+                 dialect_cls = type(dialect_name, (csv.excel,), kwargs)
+                 csv.register_dialect(dialect_name, dialect_cls)
+                 dialect = dialect_cls()
+
+             return dialect
+
+     @property
+     def csv_reader_kwargs(self) -> Dict[str, Union[str, int, bool]]:
+         kw: Dict[str, Union[str, int, bool]] = {}
+         if self.quoting is not None:
+             kw.update(quoting=getattr(csv, self.quoting.name))
+
+         for name in [
+             "delimiter",
+             "quotechar",
+             "escapechar",
+             "doublequote",
+             "skipinitialspace",
+             "lineterminator",
+             "skiprows",
+         ]:
+             value = self.__dict__.get(name)
+             if value is not None:
+                 kw[name] = value
+
+         return kw
+
+
+ class VersionControlledPath(BaseModel, extra=Extra.forbid):
+     name: PathStr
+     md5: Optional[HexStr] = None
+
+
+ class ADLSDataSpec(FileSourceMixin):
+     adls_account: DashedIdentifier
+     adls_filesystem: DashedIdentifier
+     paths: List[VersionControlledPath] = Field(min_items=1)
+     ordered: bool = False
+
+
+ class LocalDataSpec(LocalFileSourceMixin):
+     order: List[PathStr] = Field(default_factory=list, unique_items=True)
+     package: Optional[DottedIdentifier] = None
+
+     def list_dir(self):
+         if self.package is None:
+             return os.listdir(self.filename)
+         else:
+             return pkg_resources.resource_listdir(self.package, self.filename)
+
+     @property
+     def all_data_specs(self) -> List["LocalDataSpec"]:
+         if self.is_dir:
+             return [
+                 LocalDataSpec(package=self.package, filename=os.path.join(self.filename, filename))
+                 for filename in self.list_dir()
+             ]
+         else:
+             return [self]
+
+     @property
+     def ordered_data_specs(self) -> List["LocalDataSpec"]:
+         if not self.order:
+             raise AttributeError("No `order` is set - `ordered_data_specs` are not available")
+         spec_order = {os.path.basename(spec.filename): spec for spec in self.all_data_specs}
+         if set(spec_order.keys()) != set(self.order):
+             raise ValueError(
+                 f"`order` does not match the file names in the spec: {set(self.order)} != {set(spec_order.keys())}"
+             )
+         return [spec_order[name] for name in self.order]
+
+
+ class RemoteBlobStoreSpec(BaseModel, extra=Extra.forbid):
+     adls_account: DashedIdentifier
+     adls_filesystem: DashedIdentifier
+     path: PathStr
+
+     def data_spec(self, md5: str, extension: Optional[str] = None) -> ADLSDataSpec:
+         ext = extension or ""
+         return ADLSDataSpec(
+             adls_account=self.adls_account,
+             adls_filesystem=self.adls_filesystem,
+             paths=[VersionControlledPath(name=f"{self.path.rstrip('/')}/{md5}{ext}", md5=md5)],
+         )
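
As a quick illustration (again, not part of the package files), `FileSourceMixin.needs_update` compares coarse date tuples rather than raw dates, so a source only becomes stale once the current date crosses into a new period of its declared `update_frequency`; a small sketch, assuming the module imports cleanly as `thds.tabularasa.schema.files`:

import datetime

from thds.tabularasa.schema.files import FileSourceMixin

# "Quarterly" reduces dates to (year, quarter): 2023-02-15 becomes (2023, 1).
src = FileSourceMixin(
    update_frequency="Quarterly",
    last_updated=datetime.date(2023, 2, 15),
)
print(src.needs_update(datetime.date(2023, 3, 31)))  # False: still (2023, 1)
print(src.needs_update(datetime.date(2023, 4, 1)))   # True: (2023, 2) > (2023, 1)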