apache-airflow-providers-google 16.1.0__py3-none-any.whl → 17.0.0rc1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (66)
  1. airflow/providers/google/__init__.py +1 -1
  2. airflow/providers/google/ads/hooks/ads.py +1 -5
  3. airflow/providers/google/cloud/hooks/bigquery.py +1 -130
  4. airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
  5. airflow/providers/google/cloud/hooks/cloud_run.py +1 -1
  6. airflow/providers/google/cloud/hooks/cloud_sql.py +5 -5
  7. airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +1 -1
  8. airflow/providers/google/cloud/hooks/dataflow.py +0 -85
  9. airflow/providers/google/cloud/hooks/datafusion.py +1 -1
  10. airflow/providers/google/cloud/hooks/dataprep.py +1 -4
  11. airflow/providers/google/cloud/hooks/dataproc.py +68 -70
  12. airflow/providers/google/cloud/hooks/gcs.py +3 -5
  13. airflow/providers/google/cloud/hooks/kubernetes_engine.py +2 -2
  14. airflow/providers/google/cloud/hooks/looker.py +1 -5
  15. airflow/providers/google/cloud/hooks/stackdriver.py +10 -8
  16. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +4 -4
  17. airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
  18. airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +7 -0
  19. airflow/providers/google/cloud/links/kubernetes_engine.py +3 -0
  20. airflow/providers/google/cloud/log/gcs_task_handler.py +2 -2
  21. airflow/providers/google/cloud/log/stackdriver_task_handler.py +1 -1
  22. airflow/providers/google/cloud/openlineage/mixins.py +7 -7
  23. airflow/providers/google/cloud/operators/automl.py +1 -1
  24. airflow/providers/google/cloud/operators/bigquery.py +8 -609
  25. airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
  26. airflow/providers/google/cloud/operators/cloud_sql.py +1 -5
  27. airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +2 -2
  28. airflow/providers/google/cloud/operators/dataproc.py +1 -1
  29. airflow/providers/google/cloud/operators/dlp.py +2 -2
  30. airflow/providers/google/cloud/operators/kubernetes_engine.py +4 -4
  31. airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
  32. airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +7 -1
  33. airflow/providers/google/cloud/operators/vertex_ai/ray.py +7 -5
  34. airflow/providers/google/cloud/operators/vision.py +1 -1
  35. airflow/providers/google/cloud/sensors/dataflow.py +23 -6
  36. airflow/providers/google/cloud/sensors/datafusion.py +2 -2
  37. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +1 -2
  38. airflow/providers/google/cloud/transfers/gcs_to_local.py +3 -1
  39. airflow/providers/google/cloud/transfers/oracle_to_gcs.py +9 -9
  40. airflow/providers/google/cloud/triggers/bigquery.py +11 -13
  41. airflow/providers/google/cloud/triggers/cloud_build.py +1 -1
  42. airflow/providers/google/cloud/triggers/cloud_run.py +1 -1
  43. airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +1 -1
  44. airflow/providers/google/cloud/triggers/datafusion.py +1 -1
  45. airflow/providers/google/cloud/triggers/dataproc.py +10 -9
  46. airflow/providers/google/cloud/triggers/kubernetes_engine.py +45 -27
  47. airflow/providers/google/cloud/triggers/mlengine.py +1 -1
  48. airflow/providers/google/cloud/triggers/pubsub.py +1 -1
  49. airflow/providers/google/cloud/utils/credentials_provider.py +1 -1
  50. airflow/providers/google/common/auth_backend/google_openid.py +2 -2
  51. airflow/providers/google/common/hooks/base_google.py +2 -6
  52. airflow/providers/google/common/utils/id_token_credentials.py +2 -2
  53. airflow/providers/google/get_provider_info.py +19 -16
  54. airflow/providers/google/leveldb/hooks/leveldb.py +1 -5
  55. airflow/providers/google/marketing_platform/hooks/display_video.py +47 -3
  56. airflow/providers/google/marketing_platform/links/analytics_admin.py +1 -1
  57. airflow/providers/google/marketing_platform/operators/display_video.py +64 -15
  58. airflow/providers/google/marketing_platform/sensors/display_video.py +9 -2
  59. airflow/providers/google/version_compat.py +10 -3
  60. {apache_airflow_providers_google-16.1.0.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/METADATA +106 -100
  61. {apache_airflow_providers_google-16.1.0.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/RECORD +63 -62
  62. airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
  63. airflow/providers/google/cloud/links/life_sciences.py +0 -30
  64. airflow/providers/google/cloud/operators/life_sciences.py +0 -118
  65. {apache_airflow_providers_google-16.1.0.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/WHEEL +0 -0
  66. {apache_airflow_providers_google-16.1.0.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/entry_points.txt +0 -0
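
The detailed hunks below are from airflow/providers/google/cloud/operators/bigquery.py, where the deprecated BigQueryCreateEmptyTableOperator and BigQueryCreateExternalTableOperator are removed; their @deprecated decorators point to BigQueryCreateTableOperator as the replacement. A minimal migration sketch, not part of the packaged diff, is shown here; the task id, project, dataset, table, and schema values are illustrative, and the keyword arguments assume BigQueryCreateTableOperator's table_resource-based signature:

    from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateTableOperator

    # Schema and other table properties are passed as a single BigQuery Table REST resource.
    create_table = BigQueryCreateTableOperator(
        task_id="create_employees_table",
        project_id="my-gcp-project",
        dataset_id="ODS",
        table_id="Employees",
        table_resource={
            "schema": {
                "fields": [
                    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
                    {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
                ]
            },
        },
        gcp_conn_id="google_cloud_default",
    )

In general, settings previously passed as keyword arguments on the removed operators (schema fields, external source URIs, CSV options, and so on) map onto fields of the table_resource dict, following the BigQuery Table resource layout used by the removed operators themselves.
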
@@ -34,7 +34,7 @@ from google.cloud.bigquery.table import RowIterator, Table, TableListItem, Table

 from airflow.configuration import conf
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
-from airflow.providers.common.sql.operators.sql import (  # type: ignore[attr-defined] # for _parse_boolean
+from airflow.providers.common.sql.operators.sql import (  # for _parse_boolean
     SQLCheckOperator,
     SQLColumnCheckOperator,
     SQLIntervalCheckOperator,
@@ -59,7 +59,6 @@ from airflow.providers.google.cloud.triggers.bigquery import (
     BigQueryValueCheckTrigger,
 )
 from airflow.providers.google.cloud.utils.bigquery import convert_job_id
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 from airflow.utils.helpers import exactly_one

@@ -116,8 +115,10 @@ class _BigQueryDbHookMixin:
             impersonation_chain=self.impersonation_chain,
             labels=self.labels,
         )
+
+        # mypy assuming project_id is read only, as project_id is a property in GoogleBaseHook.
         if self.project_id:
-            hook.project_id = self.project_id
+            hook.project_id = self.project_id  # type:ignore[misc]
         return hook


@@ -309,9 +310,7 @@ class BigQueryCheckOperator(
         if not records:
             raise AirflowException(f"The following query returned zero rows: {self.sql}")
         if not all(records):
-            self._raise_exception(  # type: ignore[attr-defined]
-                f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}"
-            )
+            self._raise_exception(f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}")

     def execute_complete(self, context: Context, event: dict[str, Any]) -> None:
         """
@@ -428,7 +427,7 @@ class BigQueryValueCheckOperator(
             nowait=True,
         )

-    def execute(self, context: Context) -> None:  # type: ignore[override]
+    def execute(self, context: Context) -> None:
         if not self.deferrable:
             super().execute(context=context)
         else:
@@ -1156,7 +1155,7 @@ class BigQueryGetDataOperator(GoogleCloudBaseOperator, _BigQueryOperatorsEncrypt
                 "BigQueryHook.list_rows() returns iterator when return_iterator is False (default)"
             )
         self.log.info("Total extracted rows: %s", len(rows))
-
+        table_data: list[dict[str, Any]] | list[Any]
         if self.as_dict:
             table_data = [dict(row) for row in rows]
         else:
@@ -1375,606 +1374,6 @@ class BigQueryCreateTableOperator(GoogleCloudBaseOperator):
         return OperatorLineage(outputs=[output_dataset])


-@deprecated(
-    planned_removal_date="July 30, 2025",
-    use_instead="airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class BigQueryCreateEmptyTableOperator(GoogleCloudBaseOperator):
-    """
-    Creates a new table in the specified BigQuery dataset, optionally with schema.
-
-    The schema to be used for the BigQuery table may be specified in one of
-    two ways. You may either directly pass the schema fields in, or you may
-    point the operator to a Google Cloud Storage object name. The object in
-    Google Cloud Storage must be a JSON file with the schema fields in it.
-    You can also create a table without schema.
-
-    .. seealso::
-        For more information on how to use this operator, take a look at the guide:
-        :ref:`howto/operator:BigQueryCreateEmptyTableOperator`
-
-    :param project_id: The project to create the table into. (templated)
-    :param dataset_id: The dataset to create the table into. (templated)
-    :param table_id: The Name of the table to be created. (templated)
-    :param table_resource: Table resource as described in documentation:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-        If provided all other parameters are ignored. (templated)
-    :param schema_fields: If set, the schema field list as defined here:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-        **Example**::
-
-            schema_fields = [
-                {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-            ]
-
-    :param gcs_schema_object: Full path to the JSON file containing
-        schema (templated). For
-        example: ``gs://test-bucket/dir1/dir2/employee_schema.json``
-    :param time_partitioning: configure optional time partitioning fields i.e.
-        partition by field, type and expiration as per API specifications.
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning
-    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud and
-        interact with the Bigquery service.
-    :param google_cloud_storage_conn_id: (Optional) The connection ID used to connect to Google Cloud.
-        and interact with the Google Cloud Storage service.
-    :param labels: a dictionary containing labels for the table, passed to BigQuery
-
-        **Example (with schema JSON in GCS)**::
-
-            CreateTable = BigQueryCreateEmptyTableOperator(
-                task_id="BigQueryCreateEmptyTableOperator_task",
-                dataset_id="ODS",
-                table_id="Employees",
-                project_id="internal-gcp-project",
-                gcs_schema_object="gs://schema-bucket/employee_schema.json",
-                gcp_conn_id="airflow-conn-id",
-                google_cloud_storage_conn_id="airflow-conn-id",
-            )
-
-        **Corresponding Schema file** (``employee_schema.json``)::
-
-            [
-                {"mode": "NULLABLE", "name": "emp_name", "type": "STRING"},
-                {"mode": "REQUIRED", "name": "salary", "type": "INTEGER"},
-            ]
-
-        **Example (with schema in the DAG)**::
-
-            CreateTable = BigQueryCreateEmptyTableOperator(
-                task_id="BigQueryCreateEmptyTableOperator_task",
-                dataset_id="ODS",
-                table_id="Employees",
-                project_id="internal-gcp-project",
-                schema_fields=[
-                    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                    {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-                ],
-                gcp_conn_id="airflow-conn-id-account",
-                google_cloud_storage_conn_id="airflow-conn-id",
-            )
-
-    :param view: (Optional) A dictionary containing definition for the view.
-        If set, it will create a view instead of a table:
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ViewDefinition
-    :param materialized_view: (Optional) The materialized view definition.
-    :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
-
-        .. code-block:: python
-
-            encryption_configuration = {
-                "kmsKeyName": "projects/PROJECT/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY",
-            }
-    :param location: The location used for the operation.
-    :param cluster_fields: (Optional) The fields used for clustering.
-        BigQuery supports clustering for both partitioned and
-        non-partitioned tables.
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#clustering.fields
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    :param if_exists: What should Airflow do if the table exists. If set to `log`, the TI will be passed to
-        success and an error message will be logged. Set to `ignore` to ignore the error, set to `fail` to
-        fail the TI, and set to `skip` to skip it.
-    :param exists_ok: Deprecated - use `if_exists="ignore"` instead.
-    """
-
-    template_fields: Sequence[str] = (
-        "dataset_id",
-        "table_id",
-        "table_resource",
-        "project_id",
-        "gcs_schema_object",
-        "labels",
-        "view",
-        "materialized_view",
-        "impersonation_chain",
-    )
-    template_fields_renderers = {"table_resource": "json", "materialized_view": "json"}
-    ui_color = BigQueryUIColors.TABLE.value
-    operator_extra_links = (BigQueryTableLink(),)
-
-    def __init__(
-        self,
-        *,
-        dataset_id: str,
-        table_id: str,
-        table_resource: dict[str, Any] | None = None,
-        project_id: str = PROVIDE_PROJECT_ID,
-        schema_fields: list | None = None,
-        gcs_schema_object: str | None = None,
-        time_partitioning: dict | None = None,
-        gcp_conn_id: str = "google_cloud_default",
-        google_cloud_storage_conn_id: str = "google_cloud_default",
-        labels: dict | None = None,
-        view: dict | None = None,
-        materialized_view: dict | None = None,
-        encryption_configuration: dict | None = None,
-        location: str | None = None,
-        cluster_fields: list[str] | None = None,
-        impersonation_chain: str | Sequence[str] | None = None,
-        if_exists: str = "log",
-        bigquery_conn_id: str | None = None,
-        exists_ok: bool | None = None,
-        **kwargs,
-    ) -> None:
-        if bigquery_conn_id:
-            warnings.warn(
-                "The bigquery_conn_id parameter has been deprecated. Use the gcp_conn_id parameter instead.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            gcp_conn_id = bigquery_conn_id
-
-        super().__init__(**kwargs)
-
-        self.project_id = project_id
-        self.dataset_id = dataset_id
-        self.table_id = table_id
-        self.schema_fields = schema_fields
-        self.gcs_schema_object = gcs_schema_object
-        self.gcp_conn_id = gcp_conn_id
-        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
-        self.time_partitioning = time_partitioning or {}
-        self.labels = labels
-        self.view = view
-        self.materialized_view = materialized_view
-        self.encryption_configuration = encryption_configuration
-        self.location = location
-        self.cluster_fields = cluster_fields
-        self.table_resource = table_resource
-        self.impersonation_chain = impersonation_chain
-        self._table: Table | None = None
-        if exists_ok is not None:
-            warnings.warn(
-                "`exists_ok` parameter is deprecated, please use `if_exists`",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            self.if_exists = IfExistAction.IGNORE if exists_ok else IfExistAction.LOG
-        else:
-            self.if_exists = IfExistAction(if_exists)
-
-    def execute(self, context: Context) -> None:
-        bq_hook = BigQueryHook(
-            gcp_conn_id=self.gcp_conn_id,
-            location=self.location,
-            impersonation_chain=self.impersonation_chain,
-        )
-
-        if not self.schema_fields and self.gcs_schema_object:
-            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
-            gcs_hook = GCSHook(
-                gcp_conn_id=self.google_cloud_storage_conn_id,
-                impersonation_chain=self.impersonation_chain,
-            )
-            schema_fields_string = gcs_hook.download_as_byte_array(gcs_bucket, gcs_object).decode("utf-8")
-            schema_fields = json.loads(schema_fields_string)
-        else:
-            schema_fields = self.schema_fields
-
-        try:
-            self.log.info("Creating table")
-            # Save table as attribute for further use by OpenLineage
-            self._table = bq_hook.create_empty_table(
-                project_id=self.project_id,
-                dataset_id=self.dataset_id,
-                table_id=self.table_id,
-                schema_fields=schema_fields,
-                time_partitioning=self.time_partitioning,
-                cluster_fields=self.cluster_fields,
-                labels=self.labels,
-                view=self.view,
-                materialized_view=self.materialized_view,
-                encryption_configuration=self.encryption_configuration,
-                table_resource=self.table_resource,
-                exists_ok=self.if_exists == IfExistAction.IGNORE,
-            )
-            if self._table:
-                persist_kwargs = {
-                    "context": context,
-                    "project_id": self._table.to_api_repr()["tableReference"]["projectId"],
-                    "dataset_id": self._table.to_api_repr()["tableReference"]["datasetId"],
-                    "table_id": self._table.to_api_repr()["tableReference"]["tableId"],
-                }
-                self.log.info(
-                    "Table %s.%s.%s created successfully",
-                    self._table.project,
-                    self._table.dataset_id,
-                    self._table.table_id,
-                )
-            else:
-                raise AirflowException("Table creation failed.")
-        except Conflict:
-            error_msg = f"Table {self.dataset_id}.{self.table_id} already exists."
-            if self.if_exists == IfExistAction.LOG:
-                self.log.info(error_msg)
-                persist_kwargs = {
-                    "context": context,
-                    "project_id": self.project_id or bq_hook.project_id,
-                    "dataset_id": self.dataset_id,
-                    "table_id": self.table_id,
-                }
-            elif self.if_exists == IfExistAction.FAIL:
-                raise AirflowException(error_msg)
-            else:
-                raise AirflowSkipException(error_msg)
-
-        BigQueryTableLink.persist(**persist_kwargs)
-
-    def get_openlineage_facets_on_complete(self, _):
-        """Implement _on_complete as we will use table resource returned by create method."""
-        from airflow.providers.common.compat.openlineage.facet import Dataset
-        from airflow.providers.google.cloud.openlineage.utils import (
-            BIGQUERY_NAMESPACE,
-            get_facets_from_bq_table,
-        )
-        from airflow.providers.openlineage.extractors import OperatorLineage
-
-        if not self._table:
-            self.log.debug("OpenLineage did not find `self._table` attribute.")
-            return OperatorLineage()
-
-        output_dataset = Dataset(
-            namespace=BIGQUERY_NAMESPACE,
-            name=f"{self._table.project}.{self._table.dataset_id}.{self._table.table_id}",
-            facets=get_facets_from_bq_table(self._table),
-        )
-
-        return OperatorLineage(outputs=[output_dataset])
-
-
-@deprecated(
-    planned_removal_date="July 30, 2025",
-    use_instead="airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class BigQueryCreateExternalTableOperator(GoogleCloudBaseOperator):
-    """
-    Create a new external table with data from Google Cloud Storage.
-
-    The schema to be used for the BigQuery table may be specified in one of
-    two ways. You may either directly pass the schema fields in, or you may
-    point the operator to a Google Cloud Storage object name. The object in
-    Google Cloud Storage must be a JSON file with the schema fields in it.
-
-    .. seealso::
-        For more information on how to use this operator, take a look at the guide:
-        :ref:`howto/operator:BigQueryCreateExternalTableOperator`
-
-    :param bucket: The bucket to point the external table to. (templated)
-    :param source_objects: List of Google Cloud Storage URIs to point
-        table to. If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI.
-    :param destination_project_dataset_table: The dotted ``(<project>.)<dataset>.<table>``
-        BigQuery table to load data into (templated). If ``<project>`` is not included,
-        project will be the project defined in the connection json.
-    :param schema_fields: If set, the schema field list as defined here:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-        **Example**::
-
-            schema_fields = [
-                {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-            ]
-
-        Should not be set when source_format is 'DATASTORE_BACKUP'.
-    :param table_resource: Table resource as described in documentation:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-        If provided all other parameters are ignored. External schema from object will be resolved.
-    :param schema_object: If set, a GCS object path pointing to a .json file that
-        contains the schema for the table. (templated)
-    :param gcs_schema_bucket: GCS bucket name where the schema JSON is stored (templated).
-        The default value is self.bucket.
-    :param source_format: File format of the data.
-    :param autodetect: Try to detect schema and format options automatically.
-        The schema_fields and schema_object options will be honored when specified explicitly.
-        https://cloud.google.com/bigquery/docs/schema-detect#schema_auto-detection_for_external_data_sources
-    :param compression: (Optional) The compression type of the data source.
-        Possible values include GZIP and NONE.
-        The default value is NONE.
-        This setting is ignored for Google Cloud Bigtable,
-        Google Cloud Datastore backups and Avro formats.
-    :param skip_leading_rows: Number of rows to skip when loading from a CSV.
-    :param field_delimiter: The delimiter to use for the CSV.
-    :param max_bad_records: The maximum number of bad records that BigQuery can
-        ignore when running the job.
-    :param quote_character: The value that is used to quote data sections in a CSV file.
-    :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false).
-    :param allow_jagged_rows: Accept rows that are missing trailing optional columns.
-        The missing values are treated as nulls. If false, records with missing trailing
-        columns are treated as bad records, and if there are too many bad records, an
-        invalid error is returned in the job result. Only applicable to CSV, ignored
-        for other formats.
-    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud and
-        interact with the Bigquery service.
-    :param google_cloud_storage_conn_id: (Optional) The connection ID used to connect to Google Cloud
-        and interact with the Google Cloud Storage service.
-    :param src_fmt_configs: configure optional fields specific to the source format
-    :param labels: a dictionary containing labels for the table, passed to BigQuery
-    :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
-
-        .. code-block:: python
-
-            encryption_configuration = {
-                "kmsKeyName": "projects/PROJECT/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY",
-            }
-    :param location: The location used for the operation.
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    """
-
-    template_fields: Sequence[str] = (
-        "bucket",
-        "source_objects",
-        "schema_object",
-        "gcs_schema_bucket",
-        "destination_project_dataset_table",
-        "labels",
-        "table_resource",
-        "impersonation_chain",
-    )
-    template_fields_renderers = {"table_resource": "json"}
-    ui_color = BigQueryUIColors.TABLE.value
-    operator_extra_links = (BigQueryTableLink(),)
-
-    def __init__(
-        self,
-        *,
-        bucket: str | None = None,
-        source_objects: list[str] | None = None,
-        destination_project_dataset_table: str | None = None,
-        table_resource: dict[str, Any] | None = None,
-        schema_fields: list | None = None,
-        schema_object: str | None = None,
-        gcs_schema_bucket: str | None = None,
-        source_format: str | None = None,
-        autodetect: bool = False,
-        compression: str | None = None,
-        skip_leading_rows: int | None = None,
-        field_delimiter: str | None = None,
-        max_bad_records: int = 0,
-        quote_character: str | None = None,
-        allow_quoted_newlines: bool = False,
-        allow_jagged_rows: bool = False,
-        gcp_conn_id: str = "google_cloud_default",
-        google_cloud_storage_conn_id: str = "google_cloud_default",
-        src_fmt_configs: dict | None = None,
-        labels: dict | None = None,
-        encryption_configuration: dict | None = None,
-        location: str | None = None,
-        impersonation_chain: str | Sequence[str] | None = None,
-        bigquery_conn_id: str | None = None,
-        **kwargs,
-    ) -> None:
-        if bigquery_conn_id:
-            warnings.warn(
-                "The bigquery_conn_id parameter has been deprecated. Use the gcp_conn_id parameter instead.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            gcp_conn_id = bigquery_conn_id
-
-        super().__init__(**kwargs)
-
-        self.table_resource = table_resource
-        self.bucket = bucket or ""
-        self.source_objects = source_objects or []
-        self.schema_object = schema_object or None
-        self.gcs_schema_bucket = gcs_schema_bucket or ""
-        self.destination_project_dataset_table = destination_project_dataset_table or ""
-
-        # BQ config
-        kwargs_passed = any(
-            [
-                destination_project_dataset_table,
-                schema_fields,
-                source_format,
-                compression,
-                skip_leading_rows,
-                field_delimiter,
-                max_bad_records,
-                autodetect,
-                quote_character,
-                allow_quoted_newlines,
-                allow_jagged_rows,
-                src_fmt_configs,
-                labels,
-                encryption_configuration,
-            ]
-        )
-
-        if not table_resource:
-            warnings.warn(
-                "Passing table parameters via keywords arguments will be deprecated. "
-                "Please provide table definition using `table_resource` parameter.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            if not bucket:
-                raise ValueError("`bucket` is required when not using `table_resource`.")
-            if not gcs_schema_bucket:
-                gcs_schema_bucket = bucket
-            if not source_objects:
-                raise ValueError("`source_objects` is required when not using `table_resource`.")
-            if not source_format:
-                source_format = "CSV"
-            if not compression:
-                compression = "NONE"
-            if not skip_leading_rows:
-                skip_leading_rows = 0
-            if not field_delimiter:
-                field_delimiter = ","
-            if not destination_project_dataset_table:
-                raise ValueError(
-                    "`destination_project_dataset_table` is required when not using `table_resource`."
-                )
-            self.bucket = bucket
-            self.source_objects = source_objects
-            self.schema_object = schema_object
-            self.gcs_schema_bucket = gcs_schema_bucket
-            self.destination_project_dataset_table = destination_project_dataset_table
-            self.schema_fields = schema_fields
-            self.source_format = source_format
-            self.compression = compression
-            self.skip_leading_rows = skip_leading_rows
-            self.field_delimiter = field_delimiter
-            self.table_resource = None
-        else:
-            pass
-
-        if table_resource and kwargs_passed:
-            raise ValueError("You provided both `table_resource` and exclusive keywords arguments.")
-
-        self.max_bad_records = max_bad_records
-        self.quote_character = quote_character
-        self.allow_quoted_newlines = allow_quoted_newlines
-        self.allow_jagged_rows = allow_jagged_rows
-        self.gcp_conn_id = gcp_conn_id
-        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
-        self.autodetect = autodetect
-
-        self.src_fmt_configs = src_fmt_configs or {}
-        self.labels = labels
-        self.encryption_configuration = encryption_configuration
-        self.location = location
-        self.impersonation_chain = impersonation_chain
-        self._table: Table | None = None
-
-    def execute(self, context: Context) -> None:
-        bq_hook = BigQueryHook(
-            gcp_conn_id=self.gcp_conn_id,
-            location=self.location,
-            impersonation_chain=self.impersonation_chain,
-        )
-        if self.table_resource:
-            # Save table as attribute for further use by OpenLineage
-            self._table = bq_hook.create_empty_table(
-                table_resource=self.table_resource,
-            )
-            if self._table:
-                BigQueryTableLink.persist(
-                    context=context,
-                    dataset_id=self._table.dataset_id,
-                    project_id=self._table.project,
-                    table_id=self._table.table_id,
-                )
-            return
-
-        if not self.schema_fields and self.schema_object and self.source_format != "DATASTORE_BACKUP":
-            gcs_hook = GCSHook(
-                gcp_conn_id=self.google_cloud_storage_conn_id,
-                impersonation_chain=self.impersonation_chain,
-            )
-            schema_fields = json.loads(
-                gcs_hook.download(self.gcs_schema_bucket, self.schema_object).decode("utf-8")
-            )
-        else:
-            schema_fields = self.schema_fields
-
-        source_uris = [f"gs://{self.bucket}/{source_object}" for source_object in self.source_objects]
-
-        project_id, dataset_id, table_id = bq_hook.split_tablename(
-            table_input=self.destination_project_dataset_table,
-            default_project_id=bq_hook.project_id or "",
-        )
-
-        external_data_configuration = {
-            "source_uris": source_uris,
-            "source_format": self.source_format,
-            "autodetect": self.autodetect,
-            "compression": self.compression,
-            "maxBadRecords": self.max_bad_records,
-        }
-        if self.source_format == "CSV":
-            external_data_configuration["csvOptions"] = {
-                "fieldDelimiter": self.field_delimiter,
-                "skipLeadingRows": self.skip_leading_rows,
-                "quote": self.quote_character,
-                "allowQuotedNewlines": self.allow_quoted_newlines,
-                "allowJaggedRows": self.allow_jagged_rows,
-            }
-
-        table_resource = {
-            "tableReference": {
-                "projectId": project_id,
-                "datasetId": dataset_id,
-                "tableId": table_id,
-            },
-            "labels": self.labels,
-            "schema": {"fields": schema_fields},
-            "externalDataConfiguration": external_data_configuration,
-            "location": self.location,
-            "encryptionConfiguration": self.encryption_configuration,
-        }
-
-        # Save table as attribute for further use by OpenLineage
-        self._table = bq_hook.create_empty_table(table_resource=table_resource)
-        if self._table:
-            BigQueryTableLink.persist(
-                context=context,
-                dataset_id=self._table.dataset_id,
-                project_id=self._table.project,
-                table_id=self._table.table_id,
-            )
-
-    def get_openlineage_facets_on_complete(self, _):
-        """Implement _on_complete as we will use table resource returned by create method."""
-        from airflow.providers.common.compat.openlineage.facet import Dataset
-        from airflow.providers.google.cloud.openlineage.utils import (
-            BIGQUERY_NAMESPACE,
-            get_facets_from_bq_table,
-        )
-        from airflow.providers.openlineage.extractors import OperatorLineage
-
-        output_dataset = Dataset(
-            namespace=BIGQUERY_NAMESPACE,
-            name=f"{self._table.project}.{self._table.dataset_id}.{self._table.table_id}",
-            facets=get_facets_from_bq_table(self._table),
-        )
-
-        return OperatorLineage(outputs=[output_dataset])
-
-
 class BigQueryDeleteDatasetOperator(GoogleCloudBaseOperator):
     """
     Delete an existing dataset from your Project in BigQuery.
@@ -3039,7 +2438,7 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera

         if self.project_id:
             job_id_path = convert_job_id(
-                job_id=self.job_id,  # type: ignore[arg-type]
+                job_id=self.job_id,
                 project_id=self.project_id,
                 location=self.location,
             )