interloper-google-cloud 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.3
2
+ Name: interloper-google-cloud
3
+ Version: 0.2.0
4
+ Summary: Interloper Google Cloud IO managers
5
+ Author: Guillaume Onfroy
6
+ Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
7
+ Requires-Dist: google-cloud-bigquery>=3.0
8
+ Requires-Dist: interloper-core
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+
12
+ # interloper-google-cloud
13
+
14
+ Google Cloud IO managers for Interloper.
@@ -0,0 +1,3 @@
1
+ # interloper-google-cloud
2
+
3
+ Google Cloud IO managers for Interloper.
@@ -0,0 +1,43 @@
1
+ # ###############
2
+ # PROJECT / UV
3
+ # ###############
4
+ [project]
5
+ name = "interloper-google-cloud"
6
+ version = "0.2.0"
7
+ description = "Interloper Google Cloud IO managers"
8
+ readme = "README.md"
9
+ authors = [{ name = "Guillaume Onfroy", email = "guillaume@digitlcloud.com" }]
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "google-cloud-bigquery>=3.0",
13
+ "interloper-core",
14
+ ]
15
+
16
+ [build-system]
17
+ requires = ["uv_build>=0.9.5,<0.10.0"]
18
+ build-backend = "uv_build"
19
+
20
+ [tool.uv.sources]
21
+ interloper-core = { workspace = true }
22
+
23
+ # ###############
24
+ # RUFF
25
+ # ###############
26
+ [tool.ruff]
27
+ line-length = 120
28
+
29
+ [tool.ruff.lint]
30
+ extend-select = ["E", "I", "UP", "ANN001", "ANN201", "ANN202"]
31
+
32
+ [tool.ruff.lint.per-file-ignores]
33
+ "__init__.py" = ["F401", "F403"]
34
+ "tests/**" = ["ANN", "F811"]
35
+
36
+ # ###############
37
+ # PYRIGHT
38
+ # ###############
39
+ [tool.pyright]
40
+ include = ["src"]
41
+ typeCheckingMode = "basic"
42
+ reportMissingParameterType = true
43
+ ignore = ["libs/**", "tests/**", "scripts/**"]
@@ -0,0 +1,7 @@
1
+ """Interloper Google Cloud integration for BigQuery IO."""
2
+
3
+ from interloper_google_cloud.io import BigQueryIO
4
+
5
+ __all__ = [
6
+ "BigQueryIO",
7
+ ]
@@ -0,0 +1,7 @@
1
+ """Google Cloud IO managers for reading and writing to BigQuery."""
2
+
3
+ from interloper_google_cloud.io.bigquery import BigQueryIO
4
+
5
+ __all__ = [
6
+ "BigQueryIO",
7
+ ]
@@ -0,0 +1,361 @@
1
+ """BigQuery IO implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from google.cloud import bigquery
8
+ from google.cloud.exceptions import NotFound
9
+ from interloper.errors import ConfigError, TableNotFoundError
10
+ from interloper.io.database import DatabaseIO, WriteDisposition
11
+ from interloper.serialization.io import IOSpec
12
+
13
+ if TYPE_CHECKING:
14
+ from interloper.io.adapter import DataAdapter
15
+
16
+
17
def _infer_bq_type(value: Any) -> str:
    """Pick the BigQuery column type that matches a sample Python value.

    Args:
        value: Sample value taken from a row.

    Returns:
        The BigQuery type name for the value; ``"STRING"`` when the value's
        type is not one of the recognized Python types.
    """
    import datetime
    from decimal import Decimal

    # Order matters: ``bool`` subclasses ``int`` and ``datetime`` subclasses
    # ``date``, so the more specific type has to be probed first.
    candidates = (
        (bool, "BOOLEAN"),
        (int, "INTEGER"),
        (float, "FLOAT"),
        (Decimal, "NUMERIC"),
        (datetime.datetime, "TIMESTAMP"),
        (datetime.date, "DATE"),
        (bytes, "BYTES"),
    )
    return next(
        (bq_name for py_type, bq_name in candidates if isinstance(value, py_type)),
        "STRING",
    )
44
+
45
+
46
class BigQueryIO(DatabaseIO):
    """BigQuery IO manager.

    Provides read and write access to Google BigQuery tables. Uses the
    ``google-cloud-bigquery`` client directly (no SQLAlchemy).

    The BigQuery *dataset* is resolved from the asset's ``dataset`` attribute
    (i.e. the schema parameter in :class:`DatabaseIO` hooks). If the asset has
    no ``dataset``, the ``default_dataset`` constructor argument is used as a
    fallback.

    Args:
        project: Google Cloud project ID.
        default_dataset: Fallback BigQuery dataset when the asset has no
            ``dataset`` attribute. At least one of the asset's ``dataset`` or
            this parameter must be set.
        location: BigQuery location (e.g. ``"US"``, ``"EU"``).
        credentials: Optional Google credentials object. When *None*, the
            default application credentials are used.
        write_disposition: Controls whether existing rows are deleted before
            writing. Defaults to :attr:`WriteDisposition.REPLACE`.
        chunk_size: Number of rows per insert batch.
        adapter: Optional data adapter for type conversion.
    """

    def __init__(
        self,
        project: str,
        default_dataset: str | None = None,
        location: str = "EU",
        credentials: Any = None,
        write_disposition: WriteDisposition = WriteDisposition.REPLACE,
        chunk_size: int = 1000,
        adapter: DataAdapter | str | None = None,
    ) -> None:
        super().__init__(write_disposition, chunk_size, adapter)
        self.project = project
        self.default_dataset = default_dataset
        self.location = location
        self._client = bigquery.Client(project=project, credentials=credentials, location=location)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _resolve_dataset(self, schema: str | None) -> str:
        """Return the BigQuery dataset to use.

        Prefers ``schema`` (from the asset's ``dataset``). Falls back to
        :attr:`default_dataset`.

        Args:
            schema: Schema parameter from the asset context.

        Returns:
            The resolved dataset name.

        Raises:
            ConfigError: If neither *schema* nor *default_dataset* is set
                (``None`` or an empty string).
        """
        dataset = schema or self.default_dataset
        # ``not dataset`` also rejects an empty string, which would otherwise
        # produce an invalid ``project..table`` reference further down.
        if not dataset:
            raise ConfigError(
                "BigQueryIO requires a dataset. Either set 'dataset' on the asset "
                "or provide 'default_dataset' to BigQueryIO."
            )
        return dataset

    def _table_ref(self, table: str, schema: str | None) -> str:
        """Build a fully-qualified BigQuery table reference.

        Args:
            table: Table name.
            schema: Schema (dataset) override.

        Returns:
            ``project.dataset.table`` string.

        Raises:
            ConfigError: If no dataset can be resolved.
        """
        dataset = self._resolve_dataset(schema)
        return f"{self.project}.{dataset}.{table}"

    def _table_exists(self, table: str, schema: str | None) -> bool:
        """Check whether a BigQuery table exists.

        Args:
            table: Table name.
            schema: Schema (dataset) override.

        Returns:
            ``True`` if the table exists, ``False`` otherwise.
        """
        try:
            self._client.get_table(self._table_ref(table, schema))
        except NotFound:
            return False
        return True

    def _create_table(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
        """Create a BigQuery table from sample row data.

        Column types are inferred with :func:`_infer_bq_type` from the first
        non-``None`` value found for each column across *rows*. Columns keep
        the order in which they are first seen. A column whose values are all
        ``None`` falls back to whatever :func:`_infer_bq_type` returns for
        ``None``.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            rows: Row data (at least one row required for schema inference).

        Raises:
            ValueError: If *rows* is empty, since no schema can be inferred.
        """
        if not rows:
            raise ValueError("Cannot infer a BigQuery schema from an empty list of rows.")

        # Looking only at the first row would mistype columns whose first value
        # is None and would drop columns that only appear in later rows.
        samples: dict[str, Any] = {}
        for row in rows:
            for name, value in row.items():
                if samples.get(name) is None:
                    samples[name] = value

        bq_schema = [bigquery.SchemaField(name, _infer_bq_type(value)) for name, value in samples.items()]
        bq_table = bigquery.Table(self._table_ref(table, schema), schema=bq_schema)
        self._client.create_table(bq_table)

    def _ensure_dataset(self, schema: str | None) -> None:
        """Create the BigQuery dataset if it does not already exist.

        Args:
            schema: Schema (dataset) override.
        """
        dataset = self._resolve_dataset(schema)
        dataset_ref = bigquery.DatasetReference(self.project, dataset)
        try:
            self._client.get_dataset(dataset_ref)
        except NotFound:
            bq_dataset = bigquery.Dataset(dataset_ref)
            bq_dataset.location = self.location
            self._client.create_dataset(bq_dataset)

    # ------------------------------------------------------------------
    # DatabaseIO hooks
    # ------------------------------------------------------------------

    def _insert(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
        """Insert rows into BigQuery using a load job.

        If the table does not exist yet, the dataset is ensured and the table is
        created from the row data before loading. An empty *rows* list is a
        no-op: nothing is created and no load job is submitted.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            rows: Row data as list of dicts.
        """
        if not rows:
            # Nothing to load, and no sample data to infer a schema from.
            return

        if not self._table_exists(table, schema):
            self._ensure_dataset(schema)
            self._create_table(table, schema, rows)

        ref = self._table_ref(table, schema)
        job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        )
        # NOTE(review): load_table_from_json presumably serializes rows as JSON,
        # so values such as datetime/Decimal/bytes would need to be converted
        # upstream (e.g. by the adapter) - confirm against the client docs.
        job = self._client.load_table_from_json(rows, ref, job_config=job_config)
        job.result()  # Wait for completion

    def _delete_all(self, table: str, schema: str | None) -> None:
        """Truncate all rows from the BigQuery table.

        No-op when the table does not exist yet.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
        """
        if not self._table_exists(table, schema):
            return
        ref = self._table_ref(table, schema)
        self._client.query(f"TRUNCATE TABLE `{ref}`").result()

    def _delete_partition(self, table: str, schema: str | None, column: str, value: Any) -> None:
        """Delete rows matching a partition value.

        No-op when the table does not exist yet.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            column: Partition column name.
            value: Partition value to match.
        """
        if not self._table_exists(table, schema):
            return
        ref = self._table_ref(table, schema)
        # The value is bound as a query parameter; only identifiers are
        # interpolated into the SQL text.
        query = f"DELETE FROM `{ref}` WHERE `{column}` = @partition_value"
        job_config = bigquery.QueryJobConfig(
            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
        )
        self._client.query(query, job_config=job_config).result()

    def _select_all(self, table: str, schema: str | None) -> list[dict[str, Any]]:
        """Select all rows from the BigQuery table.

        Args:
            table: Target table name.
            schema: Database schema (dataset).

        Returns:
            All rows as list of dicts.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._table_ref(table, schema)
        if not self._table_exists(table, schema):
            raise TableNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
        rows = self._client.query(f"SELECT * FROM `{ref}`").result()
        return [dict(row) for row in rows]

    def _select_partition(self, table: str, schema: str | None, column: str, value: Any) -> list[dict[str, Any]]:
        """Select rows matching a partition value.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            column: Partition column name.
            value: Partition value to match.

        Returns:
            Matching rows as list of dicts.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._table_ref(table, schema)
        if not self._table_exists(table, schema):
            raise TableNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
        query = f"SELECT * FROM `{ref}` WHERE `{column}` = @partition_value"
        job_config = bigquery.QueryJobConfig(
            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
        )
        rows = self._client.query(query, job_config=job_config).result()
        return [dict(row) for row in rows]

    # ------------------------------------------------------------------
    # Introspection
    # ------------------------------------------------------------------

    def _count_by_partition(
        self, table: str, schema: str | None, column: str,
    ) -> dict[str, int]:
        """Return row counts grouped by partition column via BigQuery SQL.

        Args:
            table: Target table name.
            schema: Database schema (dataset).
            column: Column to group by.

        Returns:
            Mapping from partition value (as string) to row count.

        Raises:
            TableNotFoundError: If the table does not exist.
        """
        ref = self._table_ref(table, schema)
        if not self._table_exists(table, schema):
            raise TableNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")

        query = (
            f"SELECT CAST(`{column}` AS STRING) AS partition_value, "
            f"COUNT(*) AS cnt FROM `{ref}` GROUP BY 1"
        )
        rows = self._client.query(query).result()
        return {row["partition_value"]: row["cnt"] for row in rows}

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def to_spec(self) -> IOSpec:
        """Convert to serializable spec.

        Returns:
            An :class:`IOSpec` whose ``init`` holds the base keyword arguments
            plus ``project``, ``location`` and, when set, ``default_dataset``.
            Credentials are not included.
        """
        init = self._base_init_kwargs()
        init["project"] = self.project
        if self.default_dataset is not None:
            init["default_dataset"] = self.default_dataset
        init["location"] = self.location
        return IOSpec(path=self.path, init=init)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def dispose(self) -> None:
        """Close the BigQuery client."""
        self._client.close()
333
+
334
+
335
def _bq_param_type(value: Any) -> str:
    """Return the BigQuery query-parameter type name for a Python value.

    Args:
        value: The Python value that will be bound as a query parameter.

    Returns:
        The parameter type string; ``"STRING"`` when the value's type is not
        one of the recognized Python types.
    """
    import datetime
    from decimal import Decimal

    # Insertion order is the probe order: ``bool`` must come before ``int`` and
    # ``datetime`` before ``date`` because each is a subclass of the latter.
    type_names = {
        bool: "BOOL",
        int: "INT64",
        float: "FLOAT64",
        Decimal: "NUMERIC",
        datetime.datetime: "TIMESTAMP",
        datetime.date: "DATE",
        bytes: "BYTES",
    }
    for py_type, param_type in type_names.items():
        if isinstance(value, py_type):
            return param_type
    return "STRING"