interloper-google-cloud 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: interloper-google-cloud
3
- Version: 0.2.0
4
- Summary: Interloper Google Cloud IO managers
3
+ Version: 0.3.0
4
+ Summary: Interloper Google Cloud integration: BigQuery destination
5
5
  Author: Guillaume Onfroy
6
6
  Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
7
7
  Requires-Dist: google-cloud-bigquery>=3.0
@@ -11,4 +11,4 @@ Description-Content-Type: text/markdown
11
11
 
12
12
  # interloper-google-cloud
13
13
 
14
- Google Cloud IO managers for Interloper.
14
+ Google Cloud integration for the Interloper framework. Provides a BigQuery destination and Google Cloud connection resource.
@@ -0,0 +1,3 @@
1
+ # interloper-google-cloud
2
+
3
+ Google Cloud integration for the Interloper framework. Provides a BigQuery destination and Google Cloud connection resource.
@@ -3,8 +3,8 @@
3
3
  # ###############
4
4
  [project]
5
5
  name = "interloper-google-cloud"
6
- version = "0.2.0"
7
- description = "Interloper Google Cloud IO managers"
6
+ version = "0.3.0"
7
+ description = "Interloper Google Cloud integration: BigQuery destination"
8
8
  readme = "README.md"
9
9
  authors = [{ name = "Guillaume Onfroy", email = "guillaume@digitlcloud.com" }]
10
10
  requires-python = ">=3.10"
@@ -14,7 +14,7 @@ dependencies = [
14
14
  ]
15
15
 
16
16
  [build-system]
17
- requires = ["uv_build>=0.9.5,<0.10.0"]
17
+ requires = ["uv_build>=0.11.5,<0.12"]
18
18
  build-backend = "uv_build"
19
19
 
20
20
  [tool.uv.sources]
@@ -40,4 +40,4 @@ extend-select = ["E", "I", "UP", "ANN001", "ANN201", "ANN202"]
40
40
  include = ["src"]
41
41
  typeCheckingMode = "basic"
42
42
  reportMissingParameterType = true
43
- ignore = ["libs/**", "tests/**", "scripts/**"]
43
+ ignore = ["tests/**"]
@@ -0,0 +1,9 @@
1
+ """Interloper Google Cloud integration: BigQuery destination and connection."""
2
+
3
+ from interloper_google_cloud.bigquery import BigQueryDestination
4
+ from interloper_google_cloud.connection import GoogleCloudConnection
5
+
6
+ __all__ = [
7
+ "BigQueryDestination",
8
+ "GoogleCloudConnection",
9
+ ]
@@ -0,0 +1,7 @@
1
+ """Google Cloud destination implementations."""
2
+
3
+ from interloper_google_cloud.bigquery.destination import BigQueryDestination
4
+
5
+ __all__ = [
6
+ "BigQueryDestination",
7
+ ]
@@ -1,88 +1,66 @@
1
- """BigQuery IO implementation."""
1
+ """BigQuery destination implementation."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import TYPE_CHECKING, Any
5
+ import datetime
6
+ import json
7
+ from decimal import Decimal
8
+ from functools import cached_property
9
+ from typing import Any
6
10
 
11
+ import google.auth
7
12
  from google.cloud import bigquery
8
- from google.cloud.exceptions import NotFound
9
- from interloper.errors import ConfigError, TableNotFoundError
10
- from interloper.io.database import DatabaseIO, WriteDisposition
11
- from interloper.serialization.io import IOSpec
12
-
13
- if TYPE_CHECKING:
14
- from interloper.io.adapter import DataAdapter
15
-
16
-
17
- def _infer_bq_type(value: Any) -> str:
18
- """Infer a BigQuery field type from a Python value.
19
-
20
- Args:
21
- value: A sample Python value used to determine the field type.
22
-
23
- Returns:
24
- A BigQuery standard SQL type name.
25
- """
26
- import datetime
27
- from decimal import Decimal
28
-
29
- if isinstance(value, bool):
30
- return "BOOLEAN"
31
- if isinstance(value, int):
32
- return "INTEGER"
33
- if isinstance(value, float):
34
- return "FLOAT"
35
- if isinstance(value, Decimal):
36
- return "NUMERIC"
37
- if isinstance(value, datetime.datetime):
38
- return "TIMESTAMP"
39
- if isinstance(value, datetime.date):
40
- return "DATE"
41
- if isinstance(value, bytes):
42
- return "BYTES"
43
- return "STRING"
44
-
45
-
46
- class BigQueryIO(DatabaseIO):
47
- """BigQuery IO manager.
48
-
49
- Provides read and write access to Google BigQuery tables. Uses the
50
- ``google-cloud-bigquery`` client directly (no SQLAlchemy).
51
-
52
- The BigQuery *dataset* is resolved from the asset's ``dataset`` attribute
53
- (i.e. the schema parameter in :class:`DatabaseIO` hooks). If the asset has
54
- no ``dataset``, the ``default_dataset`` constructor argument is used as a
55
- fallback.
56
-
57
- Args:
58
- project: Google Cloud project ID.
59
- default_dataset: Fallback BigQuery dataset when the asset has no
60
- ``dataset`` attribute. At least one of the asset's ``dataset`` or
61
- this parameter must be set.
62
- location: BigQuery location (e.g. ``"US"``, ``"EU"``).
63
- credentials: Optional Google credentials object. When *None*, the
64
- default application credentials are used.
65
- write_disposition: Controls whether existing rows are deleted before
66
- writing. Defaults to :attr:`WriteDisposition.REPLACE`.
67
- chunk_size: Number of rows per insert batch.
68
- adapter: Optional data adapter for type conversion.
69
- """
70
-
71
- def __init__(
72
- self,
73
- project: str,
74
- default_dataset: str | None = None,
75
- location: str = "EU",
76
- credentials: Any = None,
77
- write_disposition: WriteDisposition = WriteDisposition.REPLACE,
78
- chunk_size: int = 1000,
79
- adapter: DataAdapter | str | None = None,
80
- ) -> None:
81
- super().__init__(write_disposition, chunk_size, adapter)
82
- self.project = project
83
- self.default_dataset = default_dataset
84
- self.location = location
85
- self._client = bigquery.Client(project=project, credentials=credentials, location=location)
13
+ from google.cloud.exceptions import Conflict, NotFound
14
+ from google.oauth2 import service_account
15
+ from interloper.destination import destination
16
+ from interloper.destination.adapter import DataAdapter
17
+ from interloper.destination.database import DatabaseDestination
18
+ from interloper.errors import ConfigError, DataNotFoundError
19
+ from interloper.resource.fields import InputField, SelectField
20
+ from interloper_pandas import DataFrameAdapter
21
+
22
+ from interloper_google_cloud.connection import GoogleCloudConnection
23
+
24
+
25
+ @destination(
26
+ key="bigquery_destination",
27
+ name="BigQuery",
28
+ icon="icon:bigquery",
29
+ tags=["Cloud"],
30
+ )
31
+ class BigQueryDestination(DatabaseDestination):
32
+ """BigQuery destination."""
33
+
34
+ connection: GoogleCloudConnection
35
+
36
+ # Config fields (previously on BigQueryConfig)
37
+ project: str = InputField(description="Google Cloud project ID")
38
+ location: str = SelectField(
39
+ description="BigQuery dataset location",
40
+ options=[
41
+ {"label": "EU", "value": "EU"},
42
+ {"label": "US", "value": "US"},
43
+ ],
44
+ )
45
+ default_dataset: str | None = InputField(default=None, description="Default BigQuery dataset")
46
+
47
+ @property
48
+ def adapters(self) -> list[DataAdapter]:
49
+ return [DataFrameAdapter()]
50
+
51
+ @cached_property
52
+ def client(self) -> bigquery.Client:
53
+ if self.connection and self.connection.service_account_key:
54
+ key_info = json.loads(self.connection.service_account_key)
55
+ credentials = service_account.Credentials.from_service_account_info(key_info)
56
+ else:
57
+ credentials, _ = google.auth.default()
58
+
59
+ return bigquery.Client(
60
+ project=self.project,
61
+ credentials=credentials,
62
+ location=self.location,
63
+ )
86
64
 
87
65
  # ------------------------------------------------------------------
88
66
  # Helpers
@@ -92,7 +70,7 @@ class BigQueryIO(DatabaseIO):
92
70
  """Return the BigQuery dataset to use.
93
71
 
94
72
  Prefers ``schema`` (from the asset's ``dataset``). Falls back to
95
- :attr:`default_dataset`.
73
+ the destination's ``default_dataset`` field.
96
74
 
97
75
  Args:
98
76
  schema: Schema parameter from the asset context.
@@ -101,15 +79,15 @@ class BigQueryIO(DatabaseIO):
101
79
  The resolved dataset name.
102
80
 
103
81
  Raises:
104
- ValueError: If neither *schema* nor *default_dataset* is set.
82
+ ConfigError: If neither *schema* nor *default_dataset* is set.
105
83
  """
106
- dataset = schema or self.default_dataset
107
- if dataset is None:
84
+ ds = schema or self.default_dataset
85
+ if ds is None:
108
86
  raise ConfigError(
109
- "BigQueryIO requires a dataset. Either set 'dataset' on the asset "
110
- "or provide 'default_dataset' to BigQueryIO."
87
+ "BigQueryDestination requires a dataset. Either set 'dataset' on the asset "
88
+ "or provide 'default_dataset' on the destination."
111
89
  )
112
- return dataset
90
+ return ds
113
91
 
114
92
  def _table_ref(self, table: str, schema: str | None) -> str:
115
93
  """Build a fully-qualified BigQuery table reference.
@@ -121,8 +99,8 @@ class BigQueryIO(DatabaseIO):
121
99
  Returns:
122
100
  ``project.dataset.table`` string.
123
101
  """
124
- dataset = self._resolve_dataset(schema)
125
- return f"{self.project}.{dataset}.{table}"
102
+ ds = self._resolve_dataset(schema)
103
+ return f"{self.project}.{ds}.{table}"
126
104
 
127
105
  def _table_exists(self, table: str, schema: str | None) -> bool:
128
106
  """Check whether a BigQuery table exists.
@@ -135,7 +113,7 @@ class BigQueryIO(DatabaseIO):
135
113
  ``True`` if the table exists, ``False`` otherwise.
136
114
  """
137
115
  try:
138
- self._client.get_table(self._table_ref(table, schema))
116
+ self.client.get_table(self._table_ref(table, schema))
139
117
  except NotFound:
140
118
  return False
141
119
  return True
@@ -143,8 +121,7 @@ class BigQueryIO(DatabaseIO):
143
121
  def _create_table(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
144
122
  """Create a BigQuery table from sample row data.
145
123
 
146
- Column types are inferred from the Python values in the first row
147
- using :func:`_infer_bq_type`.
124
+ Column types are inferred from the Python values in the first row.
148
125
 
149
126
  Args:
150
127
  table: Target table name.
@@ -152,9 +129,9 @@ class BigQueryIO(DatabaseIO):
152
129
  rows: Row data (at least one row required for schema inference).
153
130
  """
154
131
  sample = rows[0]
155
- bq_schema = [bigquery.SchemaField(name, _infer_bq_type(value)) for name, value in sample.items()]
132
+ bq_schema = [bigquery.SchemaField(name, _py_to_bq_type(value)) for name, value in sample.items()]
156
133
  bq_table = bigquery.Table(self._table_ref(table, schema), schema=bq_schema)
157
- self._client.create_table(bq_table)
134
+ self.client.create_table(bq_table)
158
135
 
159
136
  def _ensure_dataset(self, schema: str | None) -> None:
160
137
  """Create the BigQuery dataset if it does not already exist.
@@ -162,17 +139,20 @@ class BigQueryIO(DatabaseIO):
162
139
  Args:
163
140
  schema: Schema (dataset) override.
164
141
  """
165
- dataset = self._resolve_dataset(schema)
166
- dataset_ref = bigquery.DatasetReference(self.project, dataset)
142
+ ds = self._resolve_dataset(schema)
143
+ dataset_ref = bigquery.DatasetReference(self.project, ds)
167
144
  try:
168
- self._client.get_dataset(dataset_ref)
145
+ self.client.get_dataset(dataset_ref)
169
146
  except NotFound:
170
147
  bq_dataset = bigquery.Dataset(dataset_ref)
171
- bq_dataset.location = self.location
172
- self._client.create_dataset(bq_dataset)
148
+ bq_dataset.location = self.client.location
149
+ try:
150
+ self.client.create_dataset(bq_dataset)
151
+ except Conflict:
152
+ pass # Created by a concurrent asset — already exists
173
153
 
174
154
  # ------------------------------------------------------------------
175
- # DatabaseIO hooks
155
+ # DatabaseDestination hooks
176
156
  # ------------------------------------------------------------------
177
157
 
178
158
  def _insert(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
@@ -195,8 +175,10 @@ class BigQueryIO(DatabaseIO):
195
175
  source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
196
176
  write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
197
177
  )
198
- job = self._client.load_table_from_json(rows, ref, job_config=job_config)
199
- job.result() # Wait for completion
178
+ safe_rows = [json.loads(json.dumps(row, default=_json_default)) for row in rows]
179
+
180
+ job = self.client.load_table_from_json(safe_rows, ref, job_config=job_config)
181
+ job.result()
200
182
 
201
183
  def _delete_all(self, table: str, schema: str | None) -> None:
202
184
  """Truncate all rows from the BigQuery table.
@@ -210,7 +192,7 @@ class BigQueryIO(DatabaseIO):
210
192
  if not self._table_exists(table, schema):
211
193
  return
212
194
  ref = self._table_ref(table, schema)
213
- self._client.query(f"TRUNCATE TABLE `{ref}`").result()
195
+ self.client.query(f"TRUNCATE TABLE `{ref}`").result()
214
196
 
215
197
  def _delete_partition(self, table: str, schema: str | None, column: str, value: Any) -> None:
216
198
  """Delete rows matching a partition value.
@@ -228,9 +210,9 @@ class BigQueryIO(DatabaseIO):
228
210
  ref = self._table_ref(table, schema)
229
211
  query = f"DELETE FROM `{ref}` WHERE `{column}` = @partition_value"
230
212
  job_config = bigquery.QueryJobConfig(
231
- query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
213
+ query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_to_py_type(value), value)],
232
214
  )
233
- self._client.query(query, job_config=job_config).result()
215
+ self.client.query(query, job_config=job_config).result()
234
216
 
235
217
  def _select_all(self, table: str, schema: str | None) -> list[dict[str, Any]]:
236
218
  """Select all rows from the BigQuery table.
@@ -243,16 +225,22 @@ class BigQueryIO(DatabaseIO):
243
225
  All rows as list of dicts.
244
226
 
245
227
  Raises:
246
- ValueError: If the table does not exist.
228
+ DataNotFoundError: If the table does not exist.
247
229
  """
248
230
  if not self._table_exists(table, schema):
249
231
  qualified = self._table_ref(table, schema)
250
- raise TableNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
232
+ raise DataNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
251
233
  ref = self._table_ref(table, schema)
252
- rows = self._client.query(f"SELECT * FROM `{ref}`").result()
234
+ rows = self.client.query(f"SELECT * FROM `{ref}`").result()
253
235
  return [dict(row) for row in rows]
254
236
 
255
- def _select_partition(self, table: str, schema: str | None, column: str, value: Any) -> list[dict[str, Any]]:
237
+ def _select_partition(
238
+ self,
239
+ table: str,
240
+ schema: str | None,
241
+ column: str,
242
+ value: Any,
243
+ ) -> list[dict[str, Any]]:
256
244
  """Select rows matching a partition value.
257
245
 
258
246
  Args:
@@ -265,17 +253,17 @@ class BigQueryIO(DatabaseIO):
265
253
  Matching rows as list of dicts.
266
254
 
267
255
  Raises:
268
- ValueError: If the table does not exist.
256
+ DataNotFoundError: If the table does not exist.
269
257
  """
270
258
  if not self._table_exists(table, schema):
271
259
  qualified = self._table_ref(table, schema)
272
- raise TableNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
260
+ raise DataNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
273
261
  ref = self._table_ref(table, schema)
274
262
  query = f"SELECT * FROM `{ref}` WHERE `{column}` = @partition_value"
275
263
  job_config = bigquery.QueryJobConfig(
276
- query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
264
+ query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_to_py_type(value), value)],
277
265
  )
278
- rows = self._client.query(query, job_config=job_config).result()
266
+ rows = self.client.query(query, job_config=job_config).result()
279
267
  return [dict(row) for row in rows]
280
268
 
281
269
  # ------------------------------------------------------------------
@@ -283,7 +271,10 @@ class BigQueryIO(DatabaseIO):
283
271
  # ------------------------------------------------------------------
284
272
 
285
273
  def _count_by_partition(
286
- self, table: str, schema: str | None, column: str,
274
+ self,
275
+ table: str,
276
+ schema: str | None,
277
+ column: str,
287
278
  ) -> dict[str, int]:
288
279
  """Return row counts grouped by partition column via BigQuery SQL.
289
280
 
@@ -296,54 +287,61 @@ class BigQueryIO(DatabaseIO):
296
287
  Mapping from partition value (as string) to row count.
297
288
 
298
289
  Raises:
299
- TableNotFoundError: If the table does not exist.
290
+ DataNotFoundError: If the table does not exist.
300
291
  """
301
292
  if not self._table_exists(table, schema):
302
293
  ref = self._table_ref(table, schema)
303
- raise TableNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
294
+ raise DataNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
304
295
 
305
296
  ref = self._table_ref(table, schema)
306
- query = (
307
- f"SELECT CAST(`{column}` AS STRING) AS partition_value, "
308
- f"COUNT(*) AS cnt FROM `{ref}` GROUP BY 1"
309
- )
310
- rows = self._client.query(query).result()
297
+ query = f"SELECT CAST(`{column}` AS STRING) AS partition_value, COUNT(*) AS cnt FROM `{ref}` GROUP BY 1"
298
+ rows = self.client.query(query).result()
311
299
  return {row["partition_value"]: row["cnt"] for row in rows}
312
300
 
313
- # ------------------------------------------------------------------
314
- # Serialization
315
- # ------------------------------------------------------------------
316
-
317
- def to_spec(self) -> IOSpec:
318
- """Convert to serializable spec."""
319
- init = self._base_init_kwargs()
320
- init["project"] = self.project
321
- if self.default_dataset is not None:
322
- init["default_dataset"] = self.default_dataset
323
- init["location"] = self.location
324
- return IOSpec(path=self.path, init=init)
325
-
326
301
  # ------------------------------------------------------------------
327
302
  # Lifecycle
328
303
  # ------------------------------------------------------------------
329
304
 
330
305
  def dispose(self) -> None:
331
- """Close the BigQuery client."""
332
- self._client.close()
306
+ if self.client:
307
+ self.client.close()
333
308
 
334
309
 
335
- def _bq_param_type(value: Any) -> str:
336
- """Map a Python value to a BigQuery query parameter type.
310
+ # ---------------------------------------------------------------------------
311
+ # Utility functions
312
+ # ---------------------------------------------------------------------------
337
313
 
338
- Args:
339
- value: A Python value.
340
314
 
341
- Returns:
342
- BigQuery parameter type string.
343
- """
344
- import datetime
345
- from decimal import Decimal
315
+ def _json_default(o: Any) -> Any:
316
+ """JSON serializer for types not handled by the default encoder."""
317
+ if isinstance(o, (datetime.date, datetime.datetime)):
318
+ return o.isoformat()
319
+ if isinstance(o, Decimal):
320
+ return str(o)
321
+ raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
322
+
323
+
324
+ def _py_to_bq_type(value: Any) -> str:
325
+ """Infer a BigQuery field type from a Python value."""
326
+ if isinstance(value, bool):
327
+ return "BOOLEAN"
328
+ if isinstance(value, int):
329
+ return "INTEGER"
330
+ if isinstance(value, float):
331
+ return "FLOAT"
332
+ if isinstance(value, Decimal):
333
+ return "NUMERIC"
334
+ if isinstance(value, datetime.datetime):
335
+ return "TIMESTAMP"
336
+ if isinstance(value, datetime.date):
337
+ return "DATE"
338
+ if isinstance(value, bytes):
339
+ return "BYTES"
340
+ return "STRING"
341
+
346
342
 
343
+ def _bq_to_py_type(value: Any) -> str:
344
+ """Map a Python value to a BigQuery query parameter type."""
347
345
  if isinstance(value, bool):
348
346
  return "BOOL"
349
347
  if isinstance(value, int):
@@ -0,0 +1,31 @@
1
+ """Google Cloud connection resource for service account credentials."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from interloper.connection import Connection, connection
8
+ from interloper.resource.fields import JsonField
9
+ from pydantic import field_validator
10
+ from pydantic_settings import SettingsConfigDict
11
+
12
+
13
+ @connection(
14
+ key="google_cloud_connection",
15
+ name="Google Cloud",
16
+ icon="devicon:googlecloud",
17
+ tags=["Cloud"],
18
+ )
19
+ class GoogleCloudConnection(Connection):
20
+ """Connection resource holding Google Cloud credentials."""
21
+
22
+ model_config = SettingsConfigDict(env_prefix="google_cloud_")
23
+
24
+ service_account_key: str = JsonField()
25
+
26
+ @field_validator("service_account_key", mode="before")
27
+ @classmethod
28
+ def _serialize_key(cls, v: object) -> object:
29
+ if isinstance(v, dict):
30
+ return json.dumps(v)
31
+ return v
@@ -1,3 +0,0 @@
1
- # interloper-google-cloud
2
-
3
- Google Cloud IO managers for Interloper.
@@ -1,7 +0,0 @@
1
- """Interloper Google Cloud integration for BigQuery IO."""
2
-
3
- from interloper_google_cloud.io import BigQueryIO
4
-
5
- __all__ = [
6
- "BigQueryIO",
7
- ]
@@ -1,7 +0,0 @@
1
- """Google Cloud IO managers for reading and writing to BigQuery."""
2
-
3
- from interloper_google_cloud.io.bigquery import BigQueryIO
4
-
5
- __all__ = [
6
- "BigQueryIO",
7
- ]