PyPI - interloper-google-cloud - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

interloper-google-cloud 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

{interloper_google_cloud-0.2.0 → interloper_google_cloud-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: interloper-google-cloud
-Version: 0.2.0
-Summary: Interloper Google Cloud IO managers
+Version: 0.3.0
+Summary: Interloper Google Cloud integration: BigQuery destination
 Author: Guillaume Onfroy
 Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
 Requires-Dist: google-cloud-bigquery>=3.0
@@ -11,4 +11,4 @@ Description-Content-Type: text/markdown
 # interloper-google-cloud
-Google Cloud IO managers for Interloper.
+Google Cloud integration for the Interloper framework. Provides a BigQuery destination and Google Cloud connection resource.

interloper_google_cloud-0.3.0/README.md ADDED Viewed

@@ -0,0 +1,3 @@
+# interloper-google-cloud
+Google Cloud integration for the Interloper framework. Provides a BigQuery destination and Google Cloud connection resource.

{interloper_google_cloud-0.2.0 → interloper_google_cloud-0.3.0}/pyproject.toml RENAMED Viewed

@@ -3,8 +3,8 @@
 # ###############
 [project]
 name = "interloper-google-cloud"
-version = "0.2.0"
-description = "Interloper Google Cloud IO managers"
+version = "0.3.0"
+description = "Interloper Google Cloud integration: BigQuery destination"
 readme = "README.md"
 authors = [{ name = "Guillaume Onfroy", email = "guillaume@digitlcloud.com" }]
 requires-python = ">=3.10"
@@ -14,7 +14,7 @@ dependencies = [
 ]
 [build-system]
-requires = ["uv_build>=0.9.5,<0.10.0"]
+requires = ["uv_build>=0.11.5,<0.12"]
 build-backend = "uv_build"
 [tool.uv.sources]
@@ -40,4 +40,4 @@ extend-select = ["E", "I", "UP", "ANN001", "ANN201", "ANN202"]
 include = ["src"]
 typeCheckingMode = "basic"
 reportMissingParameterType = true
-ignore = ["libs/**", "tests/**", "scripts/**"]
+ignore = ["tests/**"]

interloper_google_cloud-0.3.0/src/interloper_google_cloud/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Interloper Google Cloud integration: BigQuery destination and connection."""
+from interloper_google_cloud.bigquery import BigQueryDestination
+from interloper_google_cloud.connection import GoogleCloudConnection
+__all__ = [
+    "BigQueryDestination",
+    "GoogleCloudConnection",
+]

interloper_google_cloud-0.3.0/src/interloper_google_cloud/bigquery/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Google Cloud destination implementations."""
+from interloper_google_cloud.bigquery.destination import BigQueryDestination
+__all__ = [
+    "BigQueryDestination",
+]

interloper_google_cloud-0.2.0/src/interloper_google_cloud/io/bigquery.py → interloper_google_cloud-0.3.0/src/interloper_google_cloud/bigquery/destination.py RENAMED Viewed

@@ -1,88 +1,66 @@
-"""BigQuery IO implementation."""
+"""BigQuery destination implementation."""
 from __future__ import annotations
-from typing import TYPE_CHECKING, Any
+import datetime
+import json
+from decimal import Decimal
+from functools import cached_property
+from typing import Any
+import google.auth
 from google.cloud import bigquery
-from google.cloud.exceptions import NotFound
-from interloper.errors import ConfigError, TableNotFoundError
-from interloper.io.database import DatabaseIO, WriteDisposition
-from interloper.serialization.io import IOSpec
-if TYPE_CHECKING:
-    from interloper.io.adapter import DataAdapter
-def _infer_bq_type(value: Any) -> str:
-    """Infer a BigQuery field type from a Python value.
-    Args:
-        value: A sample Python value used to determine the field type.
-    Returns:
-        A BigQuery standard SQL type name.
-    """
-    import datetime
-    from decimal import Decimal
-    if isinstance(value, bool):
-        return "BOOLEAN"
-    if isinstance(value, int):
-        return "INTEGER"
-    if isinstance(value, float):
-        return "FLOAT"
-    if isinstance(value, Decimal):
-        return "NUMERIC"
-    if isinstance(value, datetime.datetime):
-        return "TIMESTAMP"
-    if isinstance(value, datetime.date):
-        return "DATE"
-    if isinstance(value, bytes):
-        return "BYTES"
-    return "STRING"
-class BigQueryIO(DatabaseIO):
-    """BigQuery IO manager.
-    Provides read and write access to Google BigQuery tables.  Uses the
-    ``google-cloud-bigquery`` client directly (no SQLAlchemy).
-    The BigQuery *dataset* is resolved from the asset's ``dataset`` attribute
-    (i.e. the schema parameter in :class:`DatabaseIO` hooks).  If the asset has
-    no ``dataset``, the ``default_dataset`` constructor argument is used as a
-    fallback.
-    Args:
-        project: Google Cloud project ID.
-        default_dataset: Fallback BigQuery dataset when the asset has no
-            ``dataset`` attribute.  At least one of the asset's ``dataset`` or
-            this parameter must be set.
-        location: BigQuery location (e.g. ``"US"``, ``"EU"``).
-        credentials: Optional Google credentials object.  When *None*, the
-            default application credentials are used.
-        write_disposition: Controls whether existing rows are deleted before
-            writing.  Defaults to :attr:`WriteDisposition.REPLACE`.
-        chunk_size: Number of rows per insert batch.
-        adapter: Optional data adapter for type conversion.
-    """
-    def __init__(
-        self,
-        project: str,
-        default_dataset: str | None = None,
-        location: str = "EU",
-        credentials: Any = None,
-        write_disposition: WriteDisposition = WriteDisposition.REPLACE,
-        chunk_size: int = 1000,
-        adapter: DataAdapter | str | None = None,
-    ) -> None:
-        super().__init__(write_disposition, chunk_size, adapter)
-        self.project = project
-        self.default_dataset = default_dataset
-        self.location = location
-        self._client = bigquery.Client(project=project, credentials=credentials, location=location)
+from google.cloud.exceptions import Conflict, NotFound
+from google.oauth2 import service_account
+from interloper.destination import destination
+from interloper.destination.adapter import DataAdapter
+from interloper.destination.database import DatabaseDestination
+from interloper.errors import ConfigError, DataNotFoundError
+from interloper.resource.fields import InputField, SelectField
+from interloper_pandas import DataFrameAdapter
+from interloper_google_cloud.connection import GoogleCloudConnection
+@destination(
+    key="bigquery_destination",
+    name="BigQuery",
+    icon="icon:bigquery",
+    tags=["Cloud"],
+)
+class BigQueryDestination(DatabaseDestination):
+    """BigQuery destination."""
+    connection: GoogleCloudConnection
+    # Config fields (previously on BigQueryConfig)
+    project: str = InputField(description="Google Cloud project ID")
+    location: str = SelectField(
+        description="BigQuery dataset location",
+        options=[
+            {"label": "EU", "value": "EU"},
+            {"label": "US", "value": "US"},
+        ],
+    )
+    default_dataset: str | None = InputField(default=None, description="Default BigQuery dataset")
+    @property
+    def adapters(self) -> list[DataAdapter]:
+        return [DataFrameAdapter()]
+    @cached_property
+    def client(self) -> bigquery.Client:
+        if self.connection and self.connection.service_account_key:
+            key_info = json.loads(self.connection.service_account_key)
+            credentials = service_account.Credentials.from_service_account_info(key_info)
+        else:
+            credentials, _ = google.auth.default()
+        return bigquery.Client(
+            project=self.project,
+            credentials=credentials,
+            location=self.location,
+        )
     # ------------------------------------------------------------------
     # Helpers
@@ -92,7 +70,7 @@ class BigQueryIO(DatabaseIO):
         """Return the BigQuery dataset to use.
         Prefers ``schema`` (from the asset's ``dataset``).  Falls back to
-        :attr:`default_dataset`.
+        the destination's ``dataset`` field.
         Args:
             schema: Schema parameter from the asset context.
@@ -101,15 +79,15 @@ class BigQueryIO(DatabaseIO):
             The resolved dataset name.
         Raises:
-            ValueError: If neither *schema* nor *default_dataset* is set.
+            ConfigError: If neither *schema* nor *dataset* is set.
         """
-        dataset = schema or self.default_dataset
-        if dataset is None:
+        ds = schema or self.default_dataset
+        if ds is None:
             raise ConfigError(
-                "BigQueryIO requires a dataset. Either set 'dataset' on the asset "
-                "or provide 'default_dataset' to BigQueryIO."
+                "BigQueryDestination requires a dataset. Either set 'dataset' on the asset "
+                "or provide 'default_dataset' on the destination."
             )
-        return dataset
+        return ds
     def _table_ref(self, table: str, schema: str | None) -> str:
         """Build a fully-qualified BigQuery table reference.
@@ -121,8 +99,8 @@ class BigQueryIO(DatabaseIO):
         Returns:
             ``project.dataset.table`` string.
         """
-        dataset = self._resolve_dataset(schema)
-        return f"{self.project}.{dataset}.{table}"
+        ds = self._resolve_dataset(schema)
+        return f"{self.project}.{ds}.{table}"
     def _table_exists(self, table: str, schema: str | None) -> bool:
         """Check whether a BigQuery table exists.
@@ -135,7 +113,7 @@ class BigQueryIO(DatabaseIO):
             ``True`` if the table exists, ``False`` otherwise.
         """
         try:
-            self._client.get_table(self._table_ref(table, schema))
+            self.client.get_table(self._table_ref(table, schema))
         except NotFound:
             return False
         return True
@@ -143,8 +121,7 @@ class BigQueryIO(DatabaseIO):
     def _create_table(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
         """Create a BigQuery table from sample row data.
-        Column types are inferred from the Python values in the first row
-        using :func:`_infer_bq_type`.
+        Column types are inferred from the Python values in the first row.
         Args:
             table: Target table name.
@@ -152,9 +129,9 @@ class BigQueryIO(DatabaseIO):
             rows: Row data (at least one row required for schema inference).
         """
         sample = rows[0]
-        bq_schema = [bigquery.SchemaField(name, _infer_bq_type(value)) for name, value in sample.items()]
+        bq_schema = [bigquery.SchemaField(name, _py_to_bq_type(value)) for name, value in sample.items()]
         bq_table = bigquery.Table(self._table_ref(table, schema), schema=bq_schema)
-        self._client.create_table(bq_table)
+        self.client.create_table(bq_table)
     def _ensure_dataset(self, schema: str | None) -> None:
         """Create the BigQuery dataset if it does not already exist.
@@ -162,17 +139,20 @@ class BigQueryIO(DatabaseIO):
         Args:
             schema: Schema (dataset) override.
         """
-        dataset = self._resolve_dataset(schema)
-        dataset_ref = bigquery.DatasetReference(self.project, dataset)
+        ds = self._resolve_dataset(schema)
+        dataset_ref = bigquery.DatasetReference(self.project, ds)
         try:
-            self._client.get_dataset(dataset_ref)
+            self.client.get_dataset(dataset_ref)
         except NotFound:
             bq_dataset = bigquery.Dataset(dataset_ref)
-            bq_dataset.location = self.location
-            self._client.create_dataset(bq_dataset)
+            bq_dataset.location = self.client.location
+            try:
+                self.client.create_dataset(bq_dataset)
+            except Conflict:
+                pass  # Created by a concurrent asset — already exists
     # ------------------------------------------------------------------
-    # DatabaseIO hooks
+    # DatabaseDestination hooks
     # ------------------------------------------------------------------
     def _insert(self, table: str, schema: str | None, rows: list[dict[str, Any]]) -> None:
@@ -195,8 +175,10 @@ class BigQueryIO(DatabaseIO):
             source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
             write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
         )
-        job = self._client.load_table_from_json(rows, ref, job_config=job_config)
-        job.result()  # Wait for completion
+        safe_rows = [json.loads(json.dumps(row, default=_json_default)) for row in rows]
+        job = self.client.load_table_from_json(safe_rows, ref, job_config=job_config)
+        job.result()
     def _delete_all(self, table: str, schema: str | None) -> None:
         """Truncate all rows from the BigQuery table.
@@ -210,7 +192,7 @@ class BigQueryIO(DatabaseIO):
         if not self._table_exists(table, schema):
             return
         ref = self._table_ref(table, schema)
-        self._client.query(f"TRUNCATE TABLE `{ref}`").result()
+        self.client.query(f"TRUNCATE TABLE `{ref}`").result()
     def _delete_partition(self, table: str, schema: str | None, column: str, value: Any) -> None:
         """Delete rows matching a partition value.
@@ -228,9 +210,9 @@ class BigQueryIO(DatabaseIO):
         ref = self._table_ref(table, schema)
         query = f"DELETE FROM `{ref}` WHERE `{column}` = @partition_value"
         job_config = bigquery.QueryJobConfig(
-            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
+            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_to_py_type(value), value)],
         )
-        self._client.query(query, job_config=job_config).result()
+        self.client.query(query, job_config=job_config).result()
     def _select_all(self, table: str, schema: str | None) -> list[dict[str, Any]]:
         """Select all rows from the BigQuery table.
@@ -243,16 +225,22 @@ class BigQueryIO(DatabaseIO):
             All rows as list of dicts.
         Raises:
-            ValueError: If the table does not exist.
+            DataNotFoundError: If the table does not exist.
         """
         if not self._table_exists(table, schema):
             qualified = self._table_ref(table, schema)
-            raise TableNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
+            raise DataNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
         ref = self._table_ref(table, schema)
-        rows = self._client.query(f"SELECT * FROM `{ref}`").result()
+        rows = self.client.query(f"SELECT * FROM `{ref}`").result()
         return [dict(row) for row in rows]
-    def _select_partition(self, table: str, schema: str | None, column: str, value: Any) -> list[dict[str, Any]]:
+    def _select_partition(
+        self,
+        table: str,
+        schema: str | None,
+        column: str,
+        value: Any,
+    ) -> list[dict[str, Any]]:
         """Select rows matching a partition value.
         Args:
@@ -265,17 +253,17 @@ class BigQueryIO(DatabaseIO):
             Matching rows as list of dicts.
         Raises:
-            ValueError: If the table does not exist.
+            DataNotFoundError: If the table does not exist.
         """
         if not self._table_exists(table, schema):
             qualified = self._table_ref(table, schema)
-            raise TableNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
+            raise DataNotFoundError(f"Table '{qualified}' does not exist. Has the asset been materialized?")
         ref = self._table_ref(table, schema)
         query = f"SELECT * FROM `{ref}` WHERE `{column}` = @partition_value"
         job_config = bigquery.QueryJobConfig(
-            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_param_type(value), value)],
+            query_parameters=[bigquery.ScalarQueryParameter("partition_value", _bq_to_py_type(value), value)],
         )
-        rows = self._client.query(query, job_config=job_config).result()
+        rows = self.client.query(query, job_config=job_config).result()
         return [dict(row) for row in rows]
     # ------------------------------------------------------------------
@@ -283,7 +271,10 @@ class BigQueryIO(DatabaseIO):
     # ------------------------------------------------------------------
     def _count_by_partition(
-        self, table: str, schema: str | None, column: str,
+        self,
+        table: str,
+        schema: str | None,
+        column: str,
     ) -> dict[str, int]:
         """Return row counts grouped by partition column via BigQuery SQL.
@@ -296,54 +287,61 @@ class BigQueryIO(DatabaseIO):
             Mapping from partition value (as string) to row count.
         Raises:
-            TableNotFoundError: If the table does not exist.
+            DataNotFoundError: If the table does not exist.
         """
         if not self._table_exists(table, schema):
             ref = self._table_ref(table, schema)
-            raise TableNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
+            raise DataNotFoundError(f"Table '{ref}' does not exist. Has the asset been materialized?")
         ref = self._table_ref(table, schema)
-        query = (
-            f"SELECT CAST(`{column}` AS STRING) AS partition_value, "
-            f"COUNT(*) AS cnt FROM `{ref}` GROUP BY 1"
-        )
-        rows = self._client.query(query).result()
+        query = f"SELECT CAST(`{column}` AS STRING) AS partition_value, COUNT(*) AS cnt FROM `{ref}` GROUP BY 1"
+        rows = self.client.query(query).result()
         return {row["partition_value"]: row["cnt"] for row in rows}
-    # ------------------------------------------------------------------
-    # Serialization
-    # ------------------------------------------------------------------
-    def to_spec(self) -> IOSpec:
-        """Convert to serializable spec."""
-        init = self._base_init_kwargs()
-        init["project"] = self.project
-        if self.default_dataset is not None:
-            init["default_dataset"] = self.default_dataset
-        init["location"] = self.location
-        return IOSpec(path=self.path, init=init)
     # ------------------------------------------------------------------
     # Lifecycle
     # ------------------------------------------------------------------
     def dispose(self) -> None:
-        """Close the BigQuery client."""
-        self._client.close()
+        if self.client:
+            self.client.close()
-def _bq_param_type(value: Any) -> str:
-    """Map a Python value to a BigQuery query parameter type.
+# ---------------------------------------------------------------------------
+# Utility functions
+# ---------------------------------------------------------------------------
-    Args:
-        value: A Python value.
-    Returns:
-        BigQuery parameter type string.
-    """
-    import datetime
-    from decimal import Decimal
+def _json_default(o: Any) -> Any:
+    """JSON serializer for types not handled by the default encoder."""
+    if isinstance(o, (datetime.date, datetime.datetime)):
+        return o.isoformat()
+    if isinstance(o, Decimal):
+        return str(o)
+    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
+def _py_to_bq_type(value: Any) -> str:
+    """Infer a BigQuery field type from a Python value."""
+    if isinstance(value, bool):
+        return "BOOLEAN"
+    if isinstance(value, int):
+        return "INTEGER"
+    if isinstance(value, float):
+        return "FLOAT"
+    if isinstance(value, Decimal):
+        return "NUMERIC"
+    if isinstance(value, datetime.datetime):
+        return "TIMESTAMP"
+    if isinstance(value, datetime.date):
+        return "DATE"
+    if isinstance(value, bytes):
+        return "BYTES"
+    return "STRING"
+def _bq_to_py_type(value: Any) -> str:
+    """Map a Python value to a BigQuery query parameter type."""
     if isinstance(value, bool):
         return "BOOL"
     if isinstance(value, int):

interloper_google_cloud-0.3.0/src/interloper_google_cloud/connection.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Google Cloud connection resource for service account credentials."""
+from __future__ import annotations
+import json
+from interloper.connection import Connection, connection
+from interloper.resource.fields import JsonField
+from pydantic import field_validator
+from pydantic_settings import SettingsConfigDict
+@connection(
+    key="google_cloud_connection",
+    name="Google Cloud",
+    icon="devicon:googlecloud",
+    tags=["Cloud"],
+)
+class GoogleCloudConnection(Connection):
+    """Connection resource holding Google Cloud credentials."""
+    model_config = SettingsConfigDict(env_prefix="google_cloud_")
+    service_account_key: str = JsonField()
+    @field_validator("service_account_key", mode="before")
+    @classmethod
+    def _serialize_key(cls, v: object) -> object:
+        if isinstance(v, dict):
+            return json.dumps(v)
+        return v

interloper_google_cloud-0.2.0/README.md DELETED Viewed

@@ -1,3 +0,0 @@
-# interloper-google-cloud
-Google Cloud IO managers for Interloper.

interloper_google_cloud-0.2.0/src/interloper_google_cloud/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-"""Interloper Google Cloud integration for BigQuery IO."""
-from interloper_google_cloud.io import BigQueryIO
-__all__ = [
-    "BigQueryIO",
-]

interloper_google_cloud-0.2.0/src/interloper_google_cloud/io/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-"""Google Cloud IO managers for reading and writing to BigQuery."""
-from interloper_google_cloud.io.bigquery import BigQueryIO
-__all__ = [
-    "BigQueryIO",
-]

interloper-google-cloud 0.2.0__tar.gz → 0.3.0__tar.gz

interloper-google-cloud 0.2.0__tar.gz → 0.3.0__tar.gz