apache-airflow-providers-google 16.1.0rc1-py3-none-any.whl → 17.0.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +1 -1
- airflow/providers/google/ads/hooks/ads.py +1 -5
- airflow/providers/google/cloud/hooks/bigquery.py +1 -130
- airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
- airflow/providers/google/cloud/hooks/cloud_run.py +1 -1
- airflow/providers/google/cloud/hooks/cloud_sql.py +5 -5
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +1 -1
- airflow/providers/google/cloud/hooks/dataflow.py +0 -85
- airflow/providers/google/cloud/hooks/datafusion.py +1 -1
- airflow/providers/google/cloud/hooks/dataprep.py +1 -4
- airflow/providers/google/cloud/hooks/dataproc.py +68 -70
- airflow/providers/google/cloud/hooks/gcs.py +3 -5
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +2 -2
- airflow/providers/google/cloud/hooks/looker.py +1 -5
- airflow/providers/google/cloud/hooks/stackdriver.py +10 -8
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +4 -4
- airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +7 -0
- airflow/providers/google/cloud/links/kubernetes_engine.py +3 -0
- airflow/providers/google/cloud/log/gcs_task_handler.py +2 -2
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +1 -1
- airflow/providers/google/cloud/openlineage/mixins.py +7 -7
- airflow/providers/google/cloud/operators/automl.py +1 -1
- airflow/providers/google/cloud/operators/bigquery.py +8 -609
- airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
- airflow/providers/google/cloud/operators/cloud_sql.py +1 -5
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +2 -2
- airflow/providers/google/cloud/operators/dataproc.py +1 -1
- airflow/providers/google/cloud/operators/dlp.py +2 -2
- airflow/providers/google/cloud/operators/kubernetes_engine.py +4 -4
- airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +7 -1
- airflow/providers/google/cloud/operators/vertex_ai/ray.py +7 -5
- airflow/providers/google/cloud/operators/vision.py +1 -1
- airflow/providers/google/cloud/sensors/dataflow.py +23 -6
- airflow/providers/google/cloud/sensors/datafusion.py +2 -2
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +1 -2
- airflow/providers/google/cloud/transfers/gcs_to_local.py +3 -1
- airflow/providers/google/cloud/transfers/oracle_to_gcs.py +9 -9
- airflow/providers/google/cloud/triggers/bigquery.py +11 -13
- airflow/providers/google/cloud/triggers/cloud_build.py +1 -1
- airflow/providers/google/cloud/triggers/cloud_run.py +1 -1
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +1 -1
- airflow/providers/google/cloud/triggers/datafusion.py +1 -1
- airflow/providers/google/cloud/triggers/dataproc.py +10 -9
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +45 -27
- airflow/providers/google/cloud/triggers/mlengine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +1 -1
- airflow/providers/google/cloud/utils/credentials_provider.py +1 -1
- airflow/providers/google/common/auth_backend/google_openid.py +2 -2
- airflow/providers/google/common/hooks/base_google.py +2 -6
- airflow/providers/google/common/utils/id_token_credentials.py +2 -2
- airflow/providers/google/get_provider_info.py +19 -16
- airflow/providers/google/leveldb/hooks/leveldb.py +1 -5
- airflow/providers/google/marketing_platform/hooks/display_video.py +47 -3
- airflow/providers/google/marketing_platform/links/analytics_admin.py +1 -1
- airflow/providers/google/marketing_platform/operators/display_video.py +64 -15
- airflow/providers/google/marketing_platform/sensors/display_video.py +9 -2
- airflow/providers/google/version_compat.py +10 -3
- {apache_airflow_providers_google-16.1.0rc1.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/METADATA +99 -93
- {apache_airflow_providers_google-16.1.0rc1.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/RECORD +63 -62
- airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
- airflow/providers/google/cloud/links/life_sciences.py +0 -30
- airflow/providers/google/cloud/operators/life_sciences.py +0 -118
- {apache_airflow_providers_google-16.1.0rc1.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-16.1.0rc1.dist-info → apache_airflow_providers_google-17.0.0rc1.dist-info}/entry_points.txt +0 -0
Diff for airflow/providers/google/cloud/operators/bigquery.py (+8 -609):

@@ -34,7 +34,7 @@ from google.cloud.bigquery.table import RowIterator, Table, TableListItem, Table
 
 from airflow.configuration import conf
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
-from airflow.providers.common.sql.operators.sql import (  #
+from airflow.providers.common.sql.operators.sql import (  # for _parse_boolean
     SQLCheckOperator,
     SQLColumnCheckOperator,
     SQLIntervalCheckOperator,
@@ -59,7 +59,6 @@ from airflow.providers.google.cloud.triggers.bigquery import (
     BigQueryValueCheckTrigger,
 )
 from airflow.providers.google.cloud.utils.bigquery import convert_job_id
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 from airflow.utils.helpers import exactly_one
 
@@ -116,8 +115,10 @@ class _BigQueryDbHookMixin:
             impersonation_chain=self.impersonation_chain,
             labels=self.labels,
         )
+
+        # mypy assuming project_id is read only, as project_id is a property in GoogleBaseHook.
         if self.project_id:
-            hook.project_id = self.project_id
+            hook.project_id = self.project_id  # type:ignore[misc]
         return hook
 
 
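The `# type:ignore[misc]` added above is a targeted suppression: mypy reports assignment to what it sees as a read-only property (project_id is a property on GoogleBaseHook) under error code [misc]. A minimal standalone sketch of that diagnostic, with hypothetical names rather than provider code:

    from __future__ import annotations


    class Hook:
        @property
        def project_id(self) -> str | None:  # property without a setter, so mypy treats it as read-only
            return None


    def configure(hook: Hook, project_id: str) -> None:
        # Without the comment, mypy reports: Property "project_id" defined in "Hook" is read-only  [misc]
        hook.project_id = project_id  # type: ignore[misc]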
@@ -309,9 +310,7 @@ class BigQueryCheckOperator(
         if not records:
             raise AirflowException(f"The following query returned zero rows: {self.sql}")
         if not all(records):
-            self._raise_exception(
-                f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}"
-            )
+            self._raise_exception(f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}")
 
     def execute_complete(self, context: Context, event: dict[str, Any]) -> None:
         """
@@ -428,7 +427,7 @@ class BigQueryValueCheckOperator(
             nowait=True,
         )
 
-    def execute(self, context: Context) -> None:
+    def execute(self, context: Context) -> None:
         if not self.deferrable:
             super().execute(context=context)
         else:
@@ -1156,7 +1155,7 @@ class BigQueryGetDataOperator(GoogleCloudBaseOperator, _BigQueryOperatorsEncrypt
                 "BigQueryHook.list_rows() returns iterator when return_iterator is False (default)"
             )
         self.log.info("Total extracted rows: %s", len(rows))
-
+        table_data: list[dict[str, Any]] | list[Any]
         if self.as_dict:
             table_data = [dict(row) for row in rows]
         else:
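The new `table_data: list[dict[str, Any]] | list[Any]` line pre-declares the variable's union type so that both branches of the following if/else type-check. A standalone sketch of the pattern, with hypothetical names rather than provider code:

    from __future__ import annotations

    from typing import Any


    def collect(rows: list[tuple[str, int]], as_dict: bool) -> list[dict[str, Any]] | list[tuple[str, int]]:
        # Declared up front; otherwise mypy pins the type from the first branch and
        # flags the assignment in the second branch as incompatible.
        table_data: list[dict[str, Any]] | list[tuple[str, int]]
        if as_dict:
            table_data = [{"name": name, "value": value} for name, value in rows]
        else:
            table_data = rows
        return table_data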
@@ -1375,606 +1374,6 @@ class BigQueryCreateTableOperator(GoogleCloudBaseOperator):
         return OperatorLineage(outputs=[output_dataset])
 
 
-@deprecated(
-    planned_removal_date="July 30, 2025",
-    use_instead="airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class BigQueryCreateEmptyTableOperator(GoogleCloudBaseOperator):
-    """
-    Creates a new table in the specified BigQuery dataset, optionally with schema.
-
-    The schema to be used for the BigQuery table may be specified in one of
-    two ways. You may either directly pass the schema fields in, or you may
-    point the operator to a Google Cloud Storage object name. The object in
-    Google Cloud Storage must be a JSON file with the schema fields in it.
-    You can also create a table without schema.
-
-    .. seealso::
-        For more information on how to use this operator, take a look at the guide:
-        :ref:`howto/operator:BigQueryCreateEmptyTableOperator`
-
-    :param project_id: The project to create the table into. (templated)
-    :param dataset_id: The dataset to create the table into. (templated)
-    :param table_id: The Name of the table to be created. (templated)
-    :param table_resource: Table resource as described in documentation:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-        If provided all other parameters are ignored. (templated)
-    :param schema_fields: If set, the schema field list as defined here:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-        **Example**::
-
-            schema_fields = [
-                {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-            ]
-
-    :param gcs_schema_object: Full path to the JSON file containing
-        schema (templated). For
-        example: ``gs://test-bucket/dir1/dir2/employee_schema.json``
-    :param time_partitioning: configure optional time partitioning fields i.e.
-        partition by field, type and expiration as per API specifications.
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning
-    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud and
-        interact with the Bigquery service.
-    :param google_cloud_storage_conn_id: (Optional) The connection ID used to connect to Google Cloud.
-        and interact with the Google Cloud Storage service.
-    :param labels: a dictionary containing labels for the table, passed to BigQuery
-
-        **Example (with schema JSON in GCS)**::
-
-            CreateTable = BigQueryCreateEmptyTableOperator(
-                task_id="BigQueryCreateEmptyTableOperator_task",
-                dataset_id="ODS",
-                table_id="Employees",
-                project_id="internal-gcp-project",
-                gcs_schema_object="gs://schema-bucket/employee_schema.json",
-                gcp_conn_id="airflow-conn-id",
-                google_cloud_storage_conn_id="airflow-conn-id",
-            )
-
-        **Corresponding Schema file** (``employee_schema.json``)::
-
-            [
-                {"mode": "NULLABLE", "name": "emp_name", "type": "STRING"},
-                {"mode": "REQUIRED", "name": "salary", "type": "INTEGER"},
-            ]
-
-        **Example (with schema in the DAG)**::
-
-            CreateTable = BigQueryCreateEmptyTableOperator(
-                task_id="BigQueryCreateEmptyTableOperator_task",
-                dataset_id="ODS",
-                table_id="Employees",
-                project_id="internal-gcp-project",
-                schema_fields=[
-                    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                    {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-                ],
-                gcp_conn_id="airflow-conn-id-account",
-                google_cloud_storage_conn_id="airflow-conn-id",
-            )
-
-    :param view: (Optional) A dictionary containing definition for the view.
-        If set, it will create a view instead of a table:
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ViewDefinition
-    :param materialized_view: (Optional) The materialized view definition.
-    :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
-
-        .. code-block:: python
-
-            encryption_configuration = {
-                "kmsKeyName": "projects/PROJECT/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY",
-            }
-    :param location: The location used for the operation.
-    :param cluster_fields: (Optional) The fields used for clustering.
-        BigQuery supports clustering for both partitioned and
-        non-partitioned tables.
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#clustering.fields
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    :param if_exists: What should Airflow do if the table exists. If set to `log`, the TI will be passed to
-        success and an error message will be logged. Set to `ignore` to ignore the error, set to `fail` to
-        fail the TI, and set to `skip` to skip it.
-    :param exists_ok: Deprecated - use `if_exists="ignore"` instead.
-    """
-
-    template_fields: Sequence[str] = (
-        "dataset_id",
-        "table_id",
-        "table_resource",
-        "project_id",
-        "gcs_schema_object",
-        "labels",
-        "view",
-        "materialized_view",
-        "impersonation_chain",
-    )
-    template_fields_renderers = {"table_resource": "json", "materialized_view": "json"}
-    ui_color = BigQueryUIColors.TABLE.value
-    operator_extra_links = (BigQueryTableLink(),)
-
-    def __init__(
-        self,
-        *,
-        dataset_id: str,
-        table_id: str,
-        table_resource: dict[str, Any] | None = None,
-        project_id: str = PROVIDE_PROJECT_ID,
-        schema_fields: list | None = None,
-        gcs_schema_object: str | None = None,
-        time_partitioning: dict | None = None,
-        gcp_conn_id: str = "google_cloud_default",
-        google_cloud_storage_conn_id: str = "google_cloud_default",
-        labels: dict | None = None,
-        view: dict | None = None,
-        materialized_view: dict | None = None,
-        encryption_configuration: dict | None = None,
-        location: str | None = None,
-        cluster_fields: list[str] | None = None,
-        impersonation_chain: str | Sequence[str] | None = None,
-        if_exists: str = "log",
-        bigquery_conn_id: str | None = None,
-        exists_ok: bool | None = None,
-        **kwargs,
-    ) -> None:
-        if bigquery_conn_id:
-            warnings.warn(
-                "The bigquery_conn_id parameter has been deprecated. Use the gcp_conn_id parameter instead.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            gcp_conn_id = bigquery_conn_id
-
-        super().__init__(**kwargs)
-
-        self.project_id = project_id
-        self.dataset_id = dataset_id
-        self.table_id = table_id
-        self.schema_fields = schema_fields
-        self.gcs_schema_object = gcs_schema_object
-        self.gcp_conn_id = gcp_conn_id
-        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
-        self.time_partitioning = time_partitioning or {}
-        self.labels = labels
-        self.view = view
-        self.materialized_view = materialized_view
-        self.encryption_configuration = encryption_configuration
-        self.location = location
-        self.cluster_fields = cluster_fields
-        self.table_resource = table_resource
-        self.impersonation_chain = impersonation_chain
-        self._table: Table | None = None
-        if exists_ok is not None:
-            warnings.warn(
-                "`exists_ok` parameter is deprecated, please use `if_exists`",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            self.if_exists = IfExistAction.IGNORE if exists_ok else IfExistAction.LOG
-        else:
-            self.if_exists = IfExistAction(if_exists)
-
-    def execute(self, context: Context) -> None:
-        bq_hook = BigQueryHook(
-            gcp_conn_id=self.gcp_conn_id,
-            location=self.location,
-            impersonation_chain=self.impersonation_chain,
-        )
-
-        if not self.schema_fields and self.gcs_schema_object:
-            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
-            gcs_hook = GCSHook(
-                gcp_conn_id=self.google_cloud_storage_conn_id,
-                impersonation_chain=self.impersonation_chain,
-            )
-            schema_fields_string = gcs_hook.download_as_byte_array(gcs_bucket, gcs_object).decode("utf-8")
-            schema_fields = json.loads(schema_fields_string)
-        else:
-            schema_fields = self.schema_fields
-
-        try:
-            self.log.info("Creating table")
-            # Save table as attribute for further use by OpenLineage
-            self._table = bq_hook.create_empty_table(
-                project_id=self.project_id,
-                dataset_id=self.dataset_id,
-                table_id=self.table_id,
-                schema_fields=schema_fields,
-                time_partitioning=self.time_partitioning,
-                cluster_fields=self.cluster_fields,
-                labels=self.labels,
-                view=self.view,
-                materialized_view=self.materialized_view,
-                encryption_configuration=self.encryption_configuration,
-                table_resource=self.table_resource,
-                exists_ok=self.if_exists == IfExistAction.IGNORE,
-            )
-            if self._table:
-                persist_kwargs = {
-                    "context": context,
-                    "project_id": self._table.to_api_repr()["tableReference"]["projectId"],
-                    "dataset_id": self._table.to_api_repr()["tableReference"]["datasetId"],
-                    "table_id": self._table.to_api_repr()["tableReference"]["tableId"],
-                }
-                self.log.info(
-                    "Table %s.%s.%s created successfully",
-                    self._table.project,
-                    self._table.dataset_id,
-                    self._table.table_id,
-                )
-            else:
-                raise AirflowException("Table creation failed.")
-        except Conflict:
-            error_msg = f"Table {self.dataset_id}.{self.table_id} already exists."
-            if self.if_exists == IfExistAction.LOG:
-                self.log.info(error_msg)
-                persist_kwargs = {
-                    "context": context,
-                    "project_id": self.project_id or bq_hook.project_id,
-                    "dataset_id": self.dataset_id,
-                    "table_id": self.table_id,
-                }
-            elif self.if_exists == IfExistAction.FAIL:
-                raise AirflowException(error_msg)
-            else:
-                raise AirflowSkipException(error_msg)
-
-        BigQueryTableLink.persist(**persist_kwargs)
-
-    def get_openlineage_facets_on_complete(self, _):
-        """Implement _on_complete as we will use table resource returned by create method."""
-        from airflow.providers.common.compat.openlineage.facet import Dataset
-        from airflow.providers.google.cloud.openlineage.utils import (
-            BIGQUERY_NAMESPACE,
-            get_facets_from_bq_table,
-        )
-        from airflow.providers.openlineage.extractors import OperatorLineage
-
-        if not self._table:
-            self.log.debug("OpenLineage did not find `self._table` attribute.")
-            return OperatorLineage()
-
-        output_dataset = Dataset(
-            namespace=BIGQUERY_NAMESPACE,
-            name=f"{self._table.project}.{self._table.dataset_id}.{self._table.table_id}",
-            facets=get_facets_from_bq_table(self._table),
-        )
-
-        return OperatorLineage(outputs=[output_dataset])
-
-
-@deprecated(
-    planned_removal_date="July 30, 2025",
-    use_instead="airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class BigQueryCreateExternalTableOperator(GoogleCloudBaseOperator):
-    """
-    Create a new external table with data from Google Cloud Storage.
-
-    The schema to be used for the BigQuery table may be specified in one of
-    two ways. You may either directly pass the schema fields in, or you may
-    point the operator to a Google Cloud Storage object name. The object in
-    Google Cloud Storage must be a JSON file with the schema fields in it.
-
-    .. seealso::
-        For more information on how to use this operator, take a look at the guide:
-        :ref:`howto/operator:BigQueryCreateExternalTableOperator`
-
-    :param bucket: The bucket to point the external table to. (templated)
-    :param source_objects: List of Google Cloud Storage URIs to point
-        table to. If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI.
-    :param destination_project_dataset_table: The dotted ``(<project>.)<dataset>.<table>``
-        BigQuery table to load data into (templated). If ``<project>`` is not included,
-        project will be the project defined in the connection json.
-    :param schema_fields: If set, the schema field list as defined here:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-        **Example**::
-
-            schema_fields = [
-                {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-            ]
-
-        Should not be set when source_format is 'DATASTORE_BACKUP'.
-    :param table_resource: Table resource as described in documentation:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-        If provided all other parameters are ignored. External schema from object will be resolved.
-    :param schema_object: If set, a GCS object path pointing to a .json file that
-        contains the schema for the table. (templated)
-    :param gcs_schema_bucket: GCS bucket name where the schema JSON is stored (templated).
-        The default value is self.bucket.
-    :param source_format: File format of the data.
-    :param autodetect: Try to detect schema and format options automatically.
-        The schema_fields and schema_object options will be honored when specified explicitly.
-        https://cloud.google.com/bigquery/docs/schema-detect#schema_auto-detection_for_external_data_sources
-    :param compression: (Optional) The compression type of the data source.
-        Possible values include GZIP and NONE.
-        The default value is NONE.
-        This setting is ignored for Google Cloud Bigtable,
-        Google Cloud Datastore backups and Avro formats.
-    :param skip_leading_rows: Number of rows to skip when loading from a CSV.
-    :param field_delimiter: The delimiter to use for the CSV.
-    :param max_bad_records: The maximum number of bad records that BigQuery can
-        ignore when running the job.
-    :param quote_character: The value that is used to quote data sections in a CSV file.
-    :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false).
-    :param allow_jagged_rows: Accept rows that are missing trailing optional columns.
-        The missing values are treated as nulls. If false, records with missing trailing
-        columns are treated as bad records, and if there are too many bad records, an
-        invalid error is returned in the job result. Only applicable to CSV, ignored
-        for other formats.
-    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud and
-        interact with the Bigquery service.
-    :param google_cloud_storage_conn_id: (Optional) The connection ID used to connect to Google Cloud
-        and interact with the Google Cloud Storage service.
-    :param src_fmt_configs: configure optional fields specific to the source format
-    :param labels: a dictionary containing labels for the table, passed to BigQuery
-    :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
-
-        .. code-block:: python
-
-            encryption_configuration = {
-                "kmsKeyName": "projects/PROJECT/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY",
-            }
-    :param location: The location used for the operation.
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    """
-
-    template_fields: Sequence[str] = (
-        "bucket",
-        "source_objects",
-        "schema_object",
-        "gcs_schema_bucket",
-        "destination_project_dataset_table",
-        "labels",
-        "table_resource",
-        "impersonation_chain",
-    )
-    template_fields_renderers = {"table_resource": "json"}
-    ui_color = BigQueryUIColors.TABLE.value
-    operator_extra_links = (BigQueryTableLink(),)
-
-    def __init__(
-        self,
-        *,
-        bucket: str | None = None,
-        source_objects: list[str] | None = None,
-        destination_project_dataset_table: str | None = None,
-        table_resource: dict[str, Any] | None = None,
-        schema_fields: list | None = None,
-        schema_object: str | None = None,
-        gcs_schema_bucket: str | None = None,
-        source_format: str | None = None,
-        autodetect: bool = False,
-        compression: str | None = None,
-        skip_leading_rows: int | None = None,
-        field_delimiter: str | None = None,
-        max_bad_records: int = 0,
-        quote_character: str | None = None,
-        allow_quoted_newlines: bool = False,
-        allow_jagged_rows: bool = False,
-        gcp_conn_id: str = "google_cloud_default",
-        google_cloud_storage_conn_id: str = "google_cloud_default",
-        src_fmt_configs: dict | None = None,
-        labels: dict | None = None,
-        encryption_configuration: dict | None = None,
-        location: str | None = None,
-        impersonation_chain: str | Sequence[str] | None = None,
-        bigquery_conn_id: str | None = None,
-        **kwargs,
-    ) -> None:
-        if bigquery_conn_id:
-            warnings.warn(
-                "The bigquery_conn_id parameter has been deprecated. Use the gcp_conn_id parameter instead.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            gcp_conn_id = bigquery_conn_id
-
-        super().__init__(**kwargs)
-
-        self.table_resource = table_resource
-        self.bucket = bucket or ""
-        self.source_objects = source_objects or []
-        self.schema_object = schema_object or None
-        self.gcs_schema_bucket = gcs_schema_bucket or ""
-        self.destination_project_dataset_table = destination_project_dataset_table or ""
-
-        # BQ config
-        kwargs_passed = any(
-            [
-                destination_project_dataset_table,
-                schema_fields,
-                source_format,
-                compression,
-                skip_leading_rows,
-                field_delimiter,
-                max_bad_records,
-                autodetect,
-                quote_character,
-                allow_quoted_newlines,
-                allow_jagged_rows,
-                src_fmt_configs,
-                labels,
-                encryption_configuration,
-            ]
-        )
-
-        if not table_resource:
-            warnings.warn(
-                "Passing table parameters via keywords arguments will be deprecated. "
-                "Please provide table definition using `table_resource` parameter.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            if not bucket:
-                raise ValueError("`bucket` is required when not using `table_resource`.")
-            if not gcs_schema_bucket:
-                gcs_schema_bucket = bucket
-            if not source_objects:
-                raise ValueError("`source_objects` is required when not using `table_resource`.")
-            if not source_format:
-                source_format = "CSV"
-            if not compression:
-                compression = "NONE"
-            if not skip_leading_rows:
-                skip_leading_rows = 0
-            if not field_delimiter:
-                field_delimiter = ","
-            if not destination_project_dataset_table:
-                raise ValueError(
-                    "`destination_project_dataset_table` is required when not using `table_resource`."
-                )
-            self.bucket = bucket
-            self.source_objects = source_objects
-            self.schema_object = schema_object
-            self.gcs_schema_bucket = gcs_schema_bucket
-            self.destination_project_dataset_table = destination_project_dataset_table
-            self.schema_fields = schema_fields
-            self.source_format = source_format
-            self.compression = compression
-            self.skip_leading_rows = skip_leading_rows
-            self.field_delimiter = field_delimiter
-            self.table_resource = None
-        else:
-            pass
-
-        if table_resource and kwargs_passed:
-            raise ValueError("You provided both `table_resource` and exclusive keywords arguments.")
-
-        self.max_bad_records = max_bad_records
-        self.quote_character = quote_character
-        self.allow_quoted_newlines = allow_quoted_newlines
-        self.allow_jagged_rows = allow_jagged_rows
-        self.gcp_conn_id = gcp_conn_id
-        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
-        self.autodetect = autodetect
-
-        self.src_fmt_configs = src_fmt_configs or {}
-        self.labels = labels
-        self.encryption_configuration = encryption_configuration
-        self.location = location
-        self.impersonation_chain = impersonation_chain
-        self._table: Table | None = None
-
-    def execute(self, context: Context) -> None:
-        bq_hook = BigQueryHook(
-            gcp_conn_id=self.gcp_conn_id,
-            location=self.location,
-            impersonation_chain=self.impersonation_chain,
-        )
-        if self.table_resource:
-            # Save table as attribute for further use by OpenLineage
-            self._table = bq_hook.create_empty_table(
-                table_resource=self.table_resource,
-            )
-            if self._table:
-                BigQueryTableLink.persist(
-                    context=context,
-                    dataset_id=self._table.dataset_id,
-                    project_id=self._table.project,
-                    table_id=self._table.table_id,
-                )
-            return
-
-        if not self.schema_fields and self.schema_object and self.source_format != "DATASTORE_BACKUP":
-            gcs_hook = GCSHook(
-                gcp_conn_id=self.google_cloud_storage_conn_id,
-                impersonation_chain=self.impersonation_chain,
-            )
-            schema_fields = json.loads(
-                gcs_hook.download(self.gcs_schema_bucket, self.schema_object).decode("utf-8")
-            )
-        else:
-            schema_fields = self.schema_fields
-
-        source_uris = [f"gs://{self.bucket}/{source_object}" for source_object in self.source_objects]
-
-        project_id, dataset_id, table_id = bq_hook.split_tablename(
-            table_input=self.destination_project_dataset_table,
-            default_project_id=bq_hook.project_id or "",
-        )
-
-        external_data_configuration = {
-            "source_uris": source_uris,
-            "source_format": self.source_format,
-            "autodetect": self.autodetect,
-            "compression": self.compression,
-            "maxBadRecords": self.max_bad_records,
-        }
-        if self.source_format == "CSV":
-            external_data_configuration["csvOptions"] = {
-                "fieldDelimiter": self.field_delimiter,
-                "skipLeadingRows": self.skip_leading_rows,
-                "quote": self.quote_character,
-                "allowQuotedNewlines": self.allow_quoted_newlines,
-                "allowJaggedRows": self.allow_jagged_rows,
-            }
-
-        table_resource = {
-            "tableReference": {
-                "projectId": project_id,
-                "datasetId": dataset_id,
-                "tableId": table_id,
-            },
-            "labels": self.labels,
-            "schema": {"fields": schema_fields},
-            "externalDataConfiguration": external_data_configuration,
-            "location": self.location,
-            "encryptionConfiguration": self.encryption_configuration,
-        }
-
-        # Save table as attribute for further use by OpenLineage
-        self._table = bq_hook.create_empty_table(table_resource=table_resource)
-        if self._table:
-            BigQueryTableLink.persist(
-                context=context,
-                dataset_id=self._table.dataset_id,
-                project_id=self._table.project,
-                table_id=self._table.table_id,
-            )
-
-    def get_openlineage_facets_on_complete(self, _):
-        """Implement _on_complete as we will use table resource returned by create method."""
-        from airflow.providers.common.compat.openlineage.facet import Dataset
-        from airflow.providers.google.cloud.openlineage.utils import (
-            BIGQUERY_NAMESPACE,
-            get_facets_from_bq_table,
-        )
-        from airflow.providers.openlineage.extractors import OperatorLineage
-
-        output_dataset = Dataset(
-            namespace=BIGQUERY_NAMESPACE,
-            name=f"{self._table.project}.{self._table.dataset_id}.{self._table.table_id}",
-            facets=get_facets_from_bq_table(self._table),
-        )
-
-        return OperatorLineage(outputs=[output_dataset])
-
-
 class BigQueryDeleteDatasetOperator(GoogleCloudBaseOperator):
     """
     Delete an existing dataset from your Project in BigQuery.
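The two classes removed above were already deprecated, and their decorators named airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator as the replacement. A hedged migration sketch for the schema-in-DAG example from the removed docstring, assuming BigQueryCreateTableOperator takes the table definition as a Table REST resource via `table_resource` (verify the exact parameter list against the 17.0.0 operator before relying on this):

    # Hedged sketch only; not taken from this diff.
    from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateTableOperator

    create_table = BigQueryCreateTableOperator(
        task_id="create_employees_table",
        project_id="internal-gcp-project",
        dataset_id="ODS",
        table_id="Employees",
        table_resource={
            "schema": {
                "fields": [
                    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
                    {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
                ]
            }
        },
        gcp_conn_id="airflow-conn-id-account",
    )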
@@ -3039,7 +2438,7 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
 
         if self.project_id:
             job_id_path = convert_job_id(
-                job_id=self.job_id,
+                job_id=self.job_id,
                 project_id=self.project_id,
                 location=self.location,
             )