apache-airflow-providers-google 10.12.0__py3-none-any.whl → 10.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. airflow/providers/google/__init__.py +3 -3
  2. airflow/providers/google/cloud/fs/gcs.py +16 -13
  3. airflow/providers/google/cloud/hooks/bigquery_dts.py +2 -1
  4. airflow/providers/google/cloud/hooks/cloud_build.py +2 -1
  5. airflow/providers/google/cloud/hooks/cloud_composer.py +4 -3
  6. airflow/providers/google/cloud/hooks/compute_ssh.py +18 -6
  7. airflow/providers/google/cloud/hooks/dataflow.py +61 -3
  8. airflow/providers/google/cloud/hooks/dataplex.py +2 -1
  9. airflow/providers/google/cloud/hooks/dataproc.py +19 -18
  10. airflow/providers/google/cloud/hooks/gcs.py +10 -6
  11. airflow/providers/google/cloud/hooks/pubsub.py +3 -2
  12. airflow/providers/google/cloud/log/gcs_task_handler.py +2 -39
  13. airflow/providers/google/cloud/log/stackdriver_task_handler.py +2 -11
  14. airflow/providers/google/cloud/operators/bigquery.py +47 -47
  15. airflow/providers/google/cloud/operators/cloud_composer.py +1 -1
  16. airflow/providers/google/cloud/operators/cloud_run.py +3 -3
  17. airflow/providers/google/cloud/operators/dataflow.py +6 -0
  18. airflow/providers/google/cloud/operators/dataplex.py +530 -1
  19. airflow/providers/google/cloud/operators/dataproc.py +11 -11
  20. airflow/providers/google/cloud/operators/gcs.py +90 -15
  21. airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -3
  22. airflow/providers/google/cloud/operators/pubsub.py +47 -55
  23. airflow/providers/google/cloud/secrets/secret_manager.py +22 -1
  24. airflow/providers/google/cloud/sensors/cloud_composer.py +14 -1
  25. airflow/providers/google/cloud/sensors/dataplex.py +118 -0
  26. airflow/providers/google/cloud/sensors/gcs.py +10 -1
  27. airflow/providers/google/cloud/transfers/adls_to_gcs.py +5 -5
  28. airflow/providers/google/cloud/transfers/gcs_to_gcs.py +42 -42
  29. airflow/providers/google/cloud/transfers/mssql_to_gcs.py +9 -9
  30. airflow/providers/google/cloud/triggers/cloud_run.py +7 -7
  31. airflow/providers/google/cloud/triggers/dataplex.py +82 -0
  32. airflow/providers/google/cloud/triggers/dataproc.py +2 -5
  33. airflow/providers/google/cloud/triggers/gcs.py +13 -3
  34. airflow/providers/google/cloud/triggers/kubernetes_engine.py +3 -1
  35. airflow/providers/google/common/hooks/base_google.py +6 -4
  36. airflow/providers/google/get_provider_info.py +14 -13
  37. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/METADATA +30 -30
  38. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/RECORD +40 -40
  39. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/WHEEL +0 -0
  40. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/entry_points.txt +0 -0
@@ -22,7 +22,10 @@ import time
 from typing import TYPE_CHECKING, Any, Sequence
 
 from airflow.exceptions import AirflowException
-from airflow.providers.google.cloud.triggers.dataplex import DataplexDataQualityJobTrigger
+from airflow.providers.google.cloud.triggers.dataplex import (
+    DataplexDataProfileJobTrigger,
+    DataplexDataQualityJobTrigger,
+)
 
 if TYPE_CHECKING:
     from google.protobuf.field_mask_pb2 import FieldMask
@@ -1204,6 +1207,532 @@ class DataplexGetDataQualityScanResultOperator(GoogleCloudBaseOperator):
         return job
 
 
+class DataplexCreateOrUpdateDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Creates a DataScan Data Profile resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param body: Required. The Request body contains an instance of DataScan.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param update_mask: Mask of fields to update.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+
+    :return: Dataplex data profile id
+    """
+
+    template_fields = ("project_id", "data_scan_id", "body", "impersonation_chain")
+    template_fields_renderers = {"body": "json"}
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        body: dict[str, Any] | DataScan,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        update_mask: dict | FieldMask | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.body = body
+        self.update_mask = update_mask
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context):
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        self.log.info("Creating Dataplex Data Profile scan %s", self.data_scan_id)
+        try:
+            operation = hook.create_data_scan(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                body=self.body,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+            hook.wait_for_operation(timeout=self.timeout, operation=operation)
+            self.log.info("Dataplex Data Profile scan %s created successfully!", self.data_scan_id)
+        except AlreadyExists:
+            self.log.info("Dataplex Data Profile scan already exists: %s", {self.data_scan_id})
+
+            operation = hook.update_data_scan(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                body=self.body,
+                update_mask=self.update_mask,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+            hook.wait_for_operation(timeout=self.timeout, operation=operation)
+            self.log.info("Dataplex Data Profile scan %s updated successfully!", self.data_scan_id)
+        except GoogleAPICallError as e:
+            raise AirflowException(f"Error creating Data Profile scan {self.data_scan_id}", e)
+
+        return self.data_scan_id
+
+
+class DataplexGetDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Gets a DataScan DataProfile resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+
+    :return: Dataplex data profile
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context):
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        self.log.info("Retrieving the details of Dataplex Data Profile scan %s", self.data_scan_id)
+        data_profile_scan = hook.get_data_scan(
+            project_id=self.project_id,
+            region=self.region,
+            data_scan_id=self.data_scan_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+        return DataScan.to_dict(data_profile_scan)
+
+
+class DataplexDeleteDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Deletes a DataScan DataProfile resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :return: None
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context) -> None:
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        self.log.info("Deleting Dataplex Data Profile Scan: %s", self.data_scan_id)
+
+        operation = hook.delete_data_scan(
+            project_id=self.project_id,
+            region=self.region,
+            data_scan_id=self.data_scan_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        hook.wait_for_operation(timeout=self.timeout, operation=operation)
+        self.log.info("Dataplex Data Profile scan %s deleted successfully!", self.data_scan_id)
+
+
+class DataplexRunDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Runs an on-demand execution of a DataScan Data Profile Scan.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param asynchronous: Flag informing that the Dataplex job should be run asynchronously.
+        This is useful for submitting long-running jobs and
+        waiting on them asynchronously using the DataplexDataProfileJobStatusSensor
+    :param result_timeout: Value in seconds for which operator will wait for the Data Profile scan result
+        when the flag `asynchronous = False`.
+        Throws exception if there is no result found after specified amount of seconds.
+    :param polling_interval_seconds: time in seconds between polling for job completion.
+        The value is considered only when running in deferrable mode. Must be greater than 0.
+    :param deferrable: Run operator in the deferrable mode.
+
+    :return: Dataplex Data Profile scan job id.
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        asynchronous: bool = False,
+        result_timeout: float = 60.0 * 10,
+        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+        polling_interval_seconds: int = 10,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.asynchronous = asynchronous
+        self.result_timeout = result_timeout
+        self.deferrable = deferrable
+        self.polling_interval_seconds = polling_interval_seconds
+
+    def execute(self, context: Context) -> dict:
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        result = hook.run_data_scan(
+            project_id=self.project_id,
+            region=self.region,
+            data_scan_id=self.data_scan_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        job_id = result.job.name.split("/")[-1]
+
+        if self.deferrable:
+            if self.asynchronous:
+                raise AirflowException(
+                    "Both asynchronous and deferrable parameters were passed. Please, provide only one."
+                )
+            self.defer(
+                trigger=DataplexDataProfileJobTrigger(
+                    job_id=job_id,
+                    data_scan_id=self.data_scan_id,
+                    project_id=self.project_id,
+                    region=self.region,
+                    gcp_conn_id=self.gcp_conn_id,
+                    impersonation_chain=self.impersonation_chain,
+                    polling_interval_seconds=self.polling_interval_seconds,
+                ),
+                method_name="execute_complete",
+            )
+        if not self.asynchronous:
+            job = hook.wait_for_data_scan_job(
+                job_id=job_id,
+                data_scan_id=self.data_scan_id,
+                project_id=self.project_id,
+                region=self.region,
+                result_timeout=self.result_timeout,
+            )
+
+            if job.state == DataScanJob.State.FAILED:
+                raise AirflowException(f"Data Profile job failed: {job_id}")
+            if job.state == DataScanJob.State.SUCCEEDED:
+                self.log.info("Data Profile job executed successfully.")
+            else:
+                self.log.info("Data Profile job execution returned status: %s", job.status)
+
+        return job_id
+
+    def execute_complete(self, context, event=None) -> None:
+        """
+        Callback for when the trigger fires - returns immediately.
+
+        Relies on trigger to throw an exception, otherwise it assumes execution was
+        successful.
+        """
+        job_state = event["job_state"]
+        job_id = event["job_id"]
+        if job_state == DataScanJob.State.FAILED:
+            raise AirflowException(f"Job failed:\n{job_id}")
+        if job_state == DataScanJob.State.CANCELLED:
+            raise AirflowException(f"Job was cancelled:\n{job_id}")
+        if job_state == DataScanJob.State.SUCCEEDED:
+            self.log.info("Data Profile job executed successfully.")
+        return job_id
+
+
+class DataplexGetDataProfileScanResultOperator(GoogleCloudBaseOperator):
+    """
+    Gets a DataScan Data Profile Job resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param job_id: Optional. Data Profile scan job identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
+        ``retry`` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param wait_for_results: Flag indicating whether to wait for the result of a job execution
+        or to return the job in its current state.
+    :param result_timeout: Value in seconds for which operator will wait for the Data Profile scan result
+        when the flag `wait_for_results = True`.
+        Throws exception if there is no result found after specified amount of seconds.
+
+    :return: Dict representing DataScanJob.
+        When the job completes with a successful status, information about the Data Profile result
+        is available.
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        job_id: str | None = None,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        wait_for_results: bool = True,
+        result_timeout: float = 60.0 * 10,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.job_id = job_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.wait_for_results = wait_for_results
+        self.result_timeout = result_timeout
+
+    def execute(self, context: Context) -> dict:
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+        # fetch the last job
+        if not self.job_id:
+            jobs = hook.list_data_scan_jobs(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+            job_ids = [DataScanJob.to_dict(job) for job in jobs]
+            if not job_ids:
+                raise AirflowException("There are no jobs, you should create one before.")
+            job_id = job_ids[0]["name"]
+            self.job_id = job_id.split("/")[-1]
+
+        if self.wait_for_results:
+            job = hook.wait_for_data_scan_job(
+                job_id=self.job_id,
+                data_scan_id=self.data_scan_id,
+                project_id=self.project_id,
+                region=self.region,
+                result_timeout=self.result_timeout,
+            )
+        else:
+            job = hook.get_data_scan_job(
+                project_id=self.project_id,
+                region=self.region,
+                job_id=self.job_id,
+                data_scan_id=self.data_scan_id,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+        if job.state == DataScanJob.State.SUCCEEDED:
+            self.log.info("Data Profile job executed successfully")
+        else:
+            self.log.info("Data Profile job execution returned status: %s", job.state)
+
+        result = DataScanJob.to_dict(job)
+        result["state"] = DataScanJob.State(result["state"]).name
+
+        return result
+
+    def execute_complete(self, context, event=None) -> None:
+        """
+        Callback for when the trigger fires - returns immediately.
+
+        Relies on trigger to throw an exception, otherwise it assumes execution was
+        successful.
+        """
+        job_state = event["job_state"]
+        job_id = event["job_id"]
+        job = event["job"]
+        if job_state == DataScanJob.State.FAILED:
+            raise AirflowException(f"Job failed:\n{job_id}")
+        if job_state == DataScanJob.State.CANCELLED:
+            raise AirflowException(f"Job was cancelled:\n{job_id}")
+        if job_state == DataScanJob.State.SUCCEEDED:
+            self.log.info("Data Profile job executed successfully")
+        else:
+            self.log.info("Data Profile job execution returned status: %s", job_state)
+
+        return job
+
+
 class DataplexCreateZoneOperator(GoogleCloudBaseOperator):
     """
     Creates a Zone resource within a Lake.
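
The hunk above adds five Data Profile scan operators to operators/dataplex.py. As rough orientation, the following is a minimal, hypothetical DAG sketch of how they might be chained; the project, region, scan id, and DataScan body are illustrative placeholders and not part of this diff, and the body shape is only assumed here — the operator names and constructor parameters are the ones shown in the diff above.

# Hypothetical usage sketch (not part of this diff): chaining the new Data Profile
# scan operators. Project, region, scan id, and the DataScan body are placeholders;
# the body shape is assumed, not taken from this release.
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.dataplex import (
    DataplexCreateOrUpdateDataProfileScanOperator,
    DataplexDeleteDataProfileScanOperator,
    DataplexGetDataProfileScanResultOperator,
    DataplexRunDataProfileScanOperator,
)

PROJECT_ID = "my-project"         # placeholder
REGION = "us-central1"            # placeholder
DATA_SCAN_ID = "my-data-profile"  # placeholder

# Minimal DataScan body pointing the profile scan at a BigQuery table (assumed shape).
EXAMPLE_SCAN_BODY = {
    "data": {
        "resource": f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/my_dataset/tables/my_table"
    },
    "data_profile_spec": {},
}

with DAG("example_dataplex_data_profile", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    create_scan = DataplexCreateOrUpdateDataProfileScanOperator(
        task_id="create_data_profile_scan",
        project_id=PROJECT_ID,
        region=REGION,
        data_scan_id=DATA_SCAN_ID,
        body=EXAMPLE_SCAN_BODY,
    )
    run_scan = DataplexRunDataProfileScanOperator(
        task_id="run_data_profile_scan",
        project_id=PROJECT_ID,
        region=REGION,
        data_scan_id=DATA_SCAN_ID,
    )
    get_result = DataplexGetDataProfileScanResultOperator(
        task_id="get_data_profile_scan_result",
        project_id=PROJECT_ID,
        region=REGION,
        data_scan_id=DATA_SCAN_ID,
    )
    delete_scan = DataplexDeleteDataProfileScanOperator(
        task_id="delete_data_profile_scan",
        project_id=PROJECT_ID,
        region=REGION,
        data_scan_id=DATA_SCAN_ID,
    )
    create_scan >> run_scan >> get_result >> delete_scan

Setting deferrable=True on DataplexRunDataProfileScanOperator would instead hand polling off to the new DataplexDataProfileJobTrigger imported in the first hunk.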
@@ -63,6 +63,7 @@ from airflow.utils import timezone
 
 if TYPE_CHECKING:
     from google.api_core import operation
+    from google.api_core.retry_async import AsyncRetry
     from google.protobuf.duration_pb2 import Duration
     from google.protobuf.field_mask_pb2 import FieldMask
 
@@ -511,9 +512,7 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
     The operator will wait until the creation is successful or an error occurs
     in the creation process.
 
-    If the cluster already exists and ``use_if_exists`` is True, the operator will:
-
-    If the cluster already exists and ``use_if_exists`` is True then the operator will:
+    If the cluster already exists and ``use_if_exists`` is True, then the operator will:
     - if cluster state is ERROR then delete it if specified and raise error
     - if cluster state is CREATING wait for it and then check for ERROR state
     - if cluster state is DELETING wait for it and then create new cluster
@@ -592,7 +591,7 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         request_id: str | None = None,
         delete_on_error: bool = True,
         use_if_exists: bool = True,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float = 1 * 60 * 60,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -985,7 +984,7 @@ class DataprocDeleteClusterOperator(GoogleCloudBaseOperator):
         project_id: str | None = None,
         cluster_uuid: str | None = None,
         request_id: str | None = None,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float = 1 * 60 * 60,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -1891,7 +1890,7 @@ class DataprocInstantiateWorkflowTemplateOperator(GoogleCloudBaseOperator):
         version: int | None = None,
         request_id: str | None = None,
         parameters: dict[str, str] | None = None,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float | None = None,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -2152,7 +2151,7 @@ class DataprocSubmitJobOperator(GoogleCloudBaseOperator):
         Service Account Token Creator IAM role to the directly preceding identity, with first
         account from the list granting this role to the originating account (templated).
     :param asynchronous: Flag to return after submitting the job to the Dataproc API.
-        This is useful for submitting long running jobs and
+        This is useful for submitting long-running jobs and
        waiting on them asynchronously using the DataprocJobSensor
     :param deferrable: Run operator in the deferrable mode
     :param polling_interval_seconds: time in seconds between polling for job completion.
@@ -2267,10 +2266,11 @@ class DataprocSubmitJobOperator(GoogleCloudBaseOperator):
         """
         job_state = event["job_state"]
         job_id = event["job_id"]
+        job = event["job"]
         if job_state == JobStatus.State.ERROR:
-            raise AirflowException(f"Job failed:\n{job_id}")
+            raise AirflowException(f"Job {job_id} failed:\n{job}")
         if job_state == JobStatus.State.CANCELLED:
-            raise AirflowException(f"Job was cancelled:\n{job_id}")
+            raise AirflowException(f"Job {job_id} was cancelled:\n{job}")
         self.log.info("%s completed successfully.", self.task_id)
         return job_id
 
@@ -2340,7 +2340,7 @@ class DataprocUpdateClusterOperator(GoogleCloudBaseOperator):
         region: str,
         request_id: str | None = None,
         project_id: str | None = None,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float | None = None,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -2480,7 +2480,7 @@ class DataprocCreateBatchOperator(GoogleCloudBaseOperator):
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
         impersonation_chain: str | Sequence[str] | None = None,
-        result_retry: Retry | _MethodDefault = DEFAULT,
+        result_retry: AsyncRetry | _MethodDefault = DEFAULT,
         asynchronous: bool = False,
         deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
         polling_interval_seconds: int = 5,
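
The @@ -2267,10 +2266,11 hunk above changes DataprocSubmitJobOperator.execute_complete, the callback used in deferrable mode, so that failure and cancellation messages now include the full job payload delivered by the trigger event. For orientation, a minimal, hypothetical sketch of running that operator in deferrable mode follows; the project, region, cluster name, and GCS URI are placeholders and not part of this diff.

# Hypothetical sketch (not part of this diff): DataprocSubmitJobOperator in deferrable
# mode, the path whose execute_complete error messages now include the job payload.
# Project, region, cluster name, and the GCS file URI are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator

PYSPARK_JOB = {
    "reference": {"project_id": "my-project"},                              # placeholder
    "placement": {"cluster_name": "example-cluster"},                       # placeholder
    "pyspark_job": {"main_python_file_uri": "gs://my-bucket/jobs/job.py"},  # placeholder
}

with DAG("example_dataproc_submit_job", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    submit_job = DataprocSubmitJobOperator(
        task_id="submit_job",
        project_id="my-project",  # placeholder
        region="us-central1",     # placeholder
        job=PYSPARK_JOB,
        deferrable=True,
    )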