interloper-google-cloud 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
"""BigQuery IO implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
from google.cloud import bigquery
|
|
8
|
+
from google.cloud.exceptions import NotFound
|
|
9
|
+
from interloper.errors import ConfigError, TableNotFoundError
|
|
10
|
+
from interloper.io.database import DatabaseIO, WriteDisposition
|
|
11
|
+
from interloper.serialization.io import IOSpec
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from interloper.io.adapter import DataAdapter
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _infer_bq_type(value: Any) -> str:
|
|
18
|
+
"""Infer a BigQuery field type from a Python value.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
value: A sample Python value used to determine the field type.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
A BigQuery standard SQL type name.
|
|
25
|
+
"""
|
|
26
|
+
import datetime
|
|
27
|
+
from decimal import Decimal
|
|
28
|
+
|
|
29
|
+
if isinstance(value, bool):
|
|
30
|
+
return "BOOLEAN"
|
|
31
|
+
if isinstance(value, int):
|
|
32
|
+
return "INTEGER"
|
|
33
|
+
if isinstance(value, float):
|
|
34
|
+
return "FLOAT"
|
|
35
|
+
if isinstance(value, Decimal):
|
|
36
|
+
return "NUMERIC"
|
|
37
|
+
if isinstance(value, datetime.datetime):
|
|
38
|
+
return "TIMESTAMP"
|
|
39
|
+
if isinstance(value, datetime.date):
|
|
40
|
+
return "DATE"
|
|
41
|
+
if isinstance(value, bytes):
|
|
42
|
+
return "BYTES"
|
|
43
|
+
return "STRING"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BigQueryIO(DatabaseIO):
    """BigQuery IO manager.

    Provides read and write access to Google BigQuery tables. Uses the
    ``google-cloud-bigquery`` client directly (no SQLAlchemy).

    The BigQuery *dataset* is resolved from the asset's ``dataset`` attribute
    (i.e. the schema parameter in :class:`DatabaseIO` hooks). If the asset has
    no ``dataset``, the ``default_dataset`` constructor argument is used as a
    fallback.

    Args:
        project: Google Cloud project ID.
        default_dataset: Fallback BigQuery dataset when the asset has no
            ``dataset`` attribute. At least one of the asset's ``dataset`` or
            this parameter must be set.
        location: BigQuery location (e.g. ``"US"``, ``"EU"``).
        credentials: Optional Google credentials object. When *None*, the
            default application credentials are used.
        write_disposition: Controls whether existing rows are deleted before
            writing. Defaults to :attr:`WriteDisposition.REPLACE`.
        chunk_size: Number of rows per insert batch.
        adapter: Optional data adapter for type conversion.
    """

    def __init__(
        self,
        project: str,
        default_dataset: str | None = None,
        location: str = "EU",
        credentials: Any = None,
        write_disposition: WriteDisposition = WriteDisposition.REPLACE,
        chunk_size: int = 1000,
        adapter: DataAdapter | str | None = None,
    ) -> None:
        super().__init__(write_disposition, chunk_size, adapter)
        self.project = project
        self.default_dataset = default_dataset
        self.location = location
        # The credentials are handed to the client only; they are not kept on
        # the instance and are therefore not part of ``to_spec``.
        self._client = bigquery.Client(project=project, credentials=credentials, location=location)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _resolve_dataset(self, schema: str | None) -> str:
        """Return the BigQuery dataset to use.

        Prefers ``schema`` (from the asset's ``dataset``). Falls back to
        :attr:`default_dataset`.

        Args:
            schema: Schema parameter from the asset context.

        Returns:
            The resolved dataset name.

        Raises:
            ConfigError: If neither *schema* nor *default_dataset* is set
                (``None`` and the empty string both count as unset).
        """
        dataset = schema or self.default_dataset
        # An empty ``default_dataset`` would otherwise slip through and yield
        # a malformed ``project..table`` reference further down.
        if not dataset:
            raise ConfigError(
                "BigQueryIO requires a dataset. Either set 'dataset' on the asset "
                "or provide 'default_dataset' to BigQueryIO."
            )
        return dataset

    def _table_ref(self, table: str, schema: str | None) -> str:
        """Build a fully-qualified BigQuery table reference.

        Args:
            table: Table name.
            schema: Schema (dataset) override.

        Returns:
            ``project.dataset.table`` string.

        Raises:
            ConfigError: If no dataset can be resolved.
        """
        dataset = self._resolve_dataset(schema)
        return f"{self.project}.{dataset}.{table}"

    def _table_exists(self, table: str, schema: str | None) -> bool:
        """Check whether a BigQuery table exists.

        Args:
            table: Table name.
            schema: Schema (dataset) override.

        Returns:
            ``True`` if the table exists, ``False`` otherwise.
        """
        try:
            self._client.get_table(self._table_ref(table, schema))
        except NotFound:
            return False
        return True

    def _require_table(self, table: str, schema: str | None) -> str:
        """Return the table reference, insisting that the table exists.

        Args:
            table: Table name.
            schema: Schema (dataset) override.

        Returns:
            ``project.dataset.table`` string of the existing table.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._table_ref(table, schema)
        if not self._table_exists(table, schema):
            raise TableNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
        return ref

    def _create_table(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
        """Create a BigQuery table from sample row data.

        Column types are inferred with :func:`_infer_bq_type` from the first
        non-``None`` value seen for each column across *rows*. Columns keep
        the order in which they first appear, so for homogeneous rows the
        layout is that of the first row. A column that is ``None`` in every
        row falls back to the type inferred for ``None``.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            rows: Row data (at least one row required for schema inference).

        Raises:
            ValueError: If *rows* is empty.
        """
        if not rows:
            raise ValueError("Cannot infer a BigQuery schema from an empty list of rows.")

        # Re-assigning an existing key does not change dict ordering, so a
        # later non-None value replaces a None sample in place.
        samples: dict[str, Any] = {}
        for row in rows:
            for name, value in row.items():
                if samples.get(name) is None:
                    samples[name] = value

        bq_schema = [bigquery.SchemaField(name, _infer_bq_type(value)) for name, value in samples.items()]
        bq_table = bigquery.Table(self._table_ref(table, schema), schema=bq_schema)
        self._client.create_table(bq_table)

    def _ensure_dataset(self, schema: str | None) -> None:
        """Create the BigQuery dataset if it does not already exist.

        Args:
            schema: Schema (dataset) override.
        """
        dataset = self._resolve_dataset(schema)
        dataset_ref = bigquery.DatasetReference(self.project, dataset)
        try:
            self._client.get_dataset(dataset_ref)
        except NotFound:
            bq_dataset = bigquery.Dataset(dataset_ref)
            bq_dataset.location = self.location
            # ``exists_ok`` tolerates another writer creating the dataset
            # between the lookup above and this call.
            self._client.create_dataset(bq_dataset, exists_ok=True)

    # ------------------------------------------------------------------
    # DatabaseIO hooks
    # ------------------------------------------------------------------

    def _insert(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
        """Insert rows into BigQuery using a load job.

        If the table does not exist yet, the dataset is ensured and the table is
        created from the row data before loading. An empty *rows* list is a
        no-op: there is nothing to load and no sample to infer a schema from.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            rows: Row data as list of dicts.
        """
        if not rows:
            return

        ref = self._table_ref(table, schema)
        if not self._table_exists(table, schema):
            self._ensure_dataset(schema)
            self._create_table(table, schema, rows)

        job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        )
        # NOTE(review): the rows are sent as JSON, so values are presumably
        # expected to be JSON-serialisable by the time they get here - confirm
        # against the adapter / base class.
        job = self._client.load_table_from_json(rows, ref, job_config=job_config)
        job.result()  # Wait for completion

    def _delete_all(self, table: str, schema: str | None) -> None:
        """Truncate all rows from the BigQuery table.

        No-op when the table does not exist yet.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
        """
        if not self._table_exists(table, schema):
            return
        ref = self._table_ref(table, schema)
        self._client.query(f"TRUNCATE TABLE `{ref}`").result()

    def _delete_partition(self, table: str, schema: str | None, column: str, value: Any) -> None:
        """Delete rows matching a partition value.

        No-op when the table does not exist yet.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            column: Partition column name.
            value: Partition value to match.
        """
        if not self._table_exists(table, schema):
            return
        ref = self._table_ref(table, schema)
        # The value travels as a bound query parameter. The column name is an
        # identifier and cannot be bound, so it is interpolated.
        # NOTE(review): ``column`` is not escaped; it is assumed to come from
        # trusted asset configuration - confirm.
        query = f"DELETE FROM `{ref}` WHERE `{column}` = @partition_value"
        job_config = bigquery.QueryJobConfig(
            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
        )
        self._client.query(query, job_config=job_config).result()

    def _select_all(self, table: str, schema: str | None) -> list[dict[str, Any]]:
        """Select all rows from the BigQuery table.

        Args:
            table: Target table name.
            schema: Database schema (dataset).

        Returns:
            All rows as list of dicts.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._require_table(table, schema)
        rows = self._client.query(f"SELECT * FROM `{ref}`").result()
        return [dict(row) for row in rows]

    def _select_partition(self, table: str, schema: str | None, column: str, value: Any) -> list[dict[str, Any]]:
        """Select rows matching a partition value.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            column: Partition column name.
            value: Partition value to match.

        Returns:
            Matching rows as list of dicts.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._require_table(table, schema)
        # NOTE(review): ``column`` is interpolated unescaped (identifiers
        # cannot be bound as parameters); assumed to be trusted - confirm.
        query = f"SELECT * FROM `{ref}` WHERE `{column}` = @partition_value"
        job_config = bigquery.QueryJobConfig(
            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
        )
        rows = self._client.query(query, job_config=job_config).result()
        return [dict(row) for row in rows]

    # ------------------------------------------------------------------
    # Introspection
    # ------------------------------------------------------------------

    def _count_by_partition(
        self, table: str, schema: str | None, column: str,
    ) -> dict[str, int]:
        """Return row counts grouped by partition column via BigQuery SQL.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            column: Column to group by.

        Returns:
            Mapping from partition value (as string) to row count.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._require_table(table, schema)
        # NOTE(review): rows whose partition column is NULL presumably come
        # back under a ``None`` key, since the cast keeps NULL - confirm
        # whether callers need to handle that.
        query = (
            f"SELECT CAST(`{column}` AS STRING) AS partition_value, "
            f"COUNT(*) AS cnt FROM `{ref}` GROUP BY 1"
        )
        rows = self._client.query(query).result()
        return {row["partition_value"]: row["cnt"] for row in rows}

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def to_spec(self) -> IOSpec:
        """Convert to serializable spec.

        Returns:
            An :class:`IOSpec` whose ``init`` holds the base-class keyword
            arguments plus ``project``, ``location`` and, when set,
            ``default_dataset``.
        """
        init = self._base_init_kwargs()
        init["project"] = self.project
        if self.default_dataset is not None:
            init["default_dataset"] = self.default_dataset
        init["location"] = self.location
        return IOSpec(path=self.path, init=init)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def dispose(self) -> None:
        """Close the BigQuery client."""
        self._client.close()
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _bq_param_type(value: Any) -> str:
|
|
336
|
+
"""Map a Python value to a BigQuery query parameter type.
|
|
337
|
+
|
|
338
|
+
Args:
|
|
339
|
+
value: A Python value.
|
|
340
|
+
|
|
341
|
+
Returns:
|
|
342
|
+
BigQuery parameter type string.
|
|
343
|
+
"""
|
|
344
|
+
import datetime
|
|
345
|
+
from decimal import Decimal
|
|
346
|
+
|
|
347
|
+
if isinstance(value, bool):
|
|
348
|
+
return "BOOL"
|
|
349
|
+
if isinstance(value, int):
|
|
350
|
+
return "INT64"
|
|
351
|
+
if isinstance(value, float):
|
|
352
|
+
return "FLOAT64"
|
|
353
|
+
if isinstance(value, Decimal):
|
|
354
|
+
return "NUMERIC"
|
|
355
|
+
if isinstance(value, datetime.datetime):
|
|
356
|
+
return "TIMESTAMP"
|
|
357
|
+
if isinstance(value, datetime.date):
|
|
358
|
+
return "DATE"
|
|
359
|
+
if isinstance(value, bytes):
|
|
360
|
+
return "BYTES"
|
|
361
|
+
return "STRING"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: interloper-google-cloud
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Interloper Google Cloud IO managers
|
|
5
|
+
Author: Guillaume Onfroy
|
|
6
|
+
Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
|
|
7
|
+
Requires-Dist: google-cloud-bigquery>=3.0
|
|
8
|
+
Requires-Dist: interloper-core
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# interloper-google-cloud
|
|
13
|
+
|
|
14
|
+
Google Cloud IO managers for Interloper.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
interloper_google_cloud/__init__.py,sha256=qr06LkU-jsRvTlCtP5NoMvT3L8jEPj_jmjVt87mOqGw,143
|
|
2
|
+
interloper_google_cloud/io/__init__.py,sha256=wBQWh3mRgsHJwrLc6gB139BOAwjpjQe7MBDP5JW14mw,161
|
|
3
|
+
interloper_google_cloud/io/bigquery.py,sha256=LPTleEVO024q3XlLV_FlR-G_NIjNQNFvDTMalUvFRY8,12897
|
|
4
|
+
interloper_google_cloud-0.2.0.dist-info/WHEEL,sha256=01-mvBXsCWcapci73Y4TRTWrxqv9JijDtCFiicuPHXE,80
|
|
5
|
+
interloper_google_cloud-0.2.0.dist-info/METADATA,sha256=F6-Vd0AS4x0efqBYPgPVFNbIDGCAEvpjdftYxIxVs7c,402
|
|
6
|
+
interloper_google_cloud-0.2.0.dist-info/RECORD,,
|