apache-airflow-providers-google 10.12.0rc1__py3-none-any.whl → 10.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/cloud/fs/gcs.py +16 -13
- airflow/providers/google/cloud/hooks/bigquery_dts.py +2 -1
- airflow/providers/google/cloud/hooks/cloud_build.py +2 -1
- airflow/providers/google/cloud/hooks/cloud_composer.py +4 -3
- airflow/providers/google/cloud/hooks/compute_ssh.py +18 -6
- airflow/providers/google/cloud/hooks/dataflow.py +61 -3
- airflow/providers/google/cloud/hooks/dataplex.py +2 -1
- airflow/providers/google/cloud/hooks/dataproc.py +19 -18
- airflow/providers/google/cloud/hooks/gcs.py +10 -6
- airflow/providers/google/cloud/hooks/pubsub.py +3 -2
- airflow/providers/google/cloud/log/gcs_task_handler.py +2 -39
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +2 -11
- airflow/providers/google/cloud/operators/bigquery.py +47 -47
- airflow/providers/google/cloud/operators/cloud_composer.py +1 -1
- airflow/providers/google/cloud/operators/cloud_run.py +3 -3
- airflow/providers/google/cloud/operators/dataflow.py +6 -0
- airflow/providers/google/cloud/operators/dataplex.py +530 -1
- airflow/providers/google/cloud/operators/dataproc.py +11 -11
- airflow/providers/google/cloud/operators/gcs.py +90 -15
- airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -3
- airflow/providers/google/cloud/operators/pubsub.py +47 -55
- airflow/providers/google/cloud/secrets/secret_manager.py +22 -1
- airflow/providers/google/cloud/sensors/cloud_composer.py +14 -1
- airflow/providers/google/cloud/sensors/dataplex.py +118 -0
- airflow/providers/google/cloud/sensors/gcs.py +10 -1
- airflow/providers/google/cloud/transfers/adls_to_gcs.py +5 -5
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +42 -42
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +9 -9
- airflow/providers/google/cloud/triggers/cloud_run.py +7 -7
- airflow/providers/google/cloud/triggers/dataplex.py +82 -0
- airflow/providers/google/cloud/triggers/dataproc.py +2 -5
- airflow/providers/google/cloud/triggers/gcs.py +13 -3
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +3 -1
- airflow/providers/google/common/hooks/base_google.py +6 -4
- airflow/providers/google/get_provider_info.py +14 -13
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/METADATA +31 -31
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/RECORD +40 -40
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/operators/dataplex.py

@@ -22,7 +22,10 @@ import time
 from typing import TYPE_CHECKING, Any, Sequence
 
 from airflow.exceptions import AirflowException
-from airflow.providers.google.cloud.triggers.dataplex import DataplexDataQualityJobTrigger
+from airflow.providers.google.cloud.triggers.dataplex import (
+    DataplexDataProfileJobTrigger,
+    DataplexDataQualityJobTrigger,
+)
 
 if TYPE_CHECKING:
     from google.protobuf.field_mask_pb2 import FieldMask
@@ -1204,6 +1207,532 @@ class DataplexGetDataQualityScanResultOperator(GoogleCloudBaseOperator):
         return job
 
 
+class DataplexCreateOrUpdateDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Creates a DataScan Data Profile resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param body: Required. The Request body contains an instance of DataScan.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param update_mask: Mask of fields to update.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+
+    :return: Dataplex data profile id
+    """
+
+    template_fields = ("project_id", "data_scan_id", "body", "impersonation_chain")
+    template_fields_renderers = {"body": "json"}
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        body: dict[str, Any] | DataScan,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        update_mask: dict | FieldMask | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.body = body
+        self.update_mask = update_mask
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context):
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        self.log.info("Creating Dataplex Data Profile scan %s", self.data_scan_id)
+        try:
+            operation = hook.create_data_scan(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                body=self.body,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+            hook.wait_for_operation(timeout=self.timeout, operation=operation)
+            self.log.info("Dataplex Data Profile scan %s created successfully!", self.data_scan_id)
+        except AlreadyExists:
+            self.log.info("Dataplex Data Profile scan already exists: %s", {self.data_scan_id})
+
+            operation = hook.update_data_scan(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                body=self.body,
+                update_mask=self.update_mask,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+            hook.wait_for_operation(timeout=self.timeout, operation=operation)
+            self.log.info("Dataplex Data Profile scan %s updated successfully!", self.data_scan_id)
+        except GoogleAPICallError as e:
+            raise AirflowException(f"Error creating Data Profile scan {self.data_scan_id}", e)
+
+        return self.data_scan_id
+
+
+class DataplexGetDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Gets a DataScan DataProfile resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+
+    :return: Dataplex data profile
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context):
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        self.log.info("Retrieving the details of Dataplex Data Profile scan %s", self.data_scan_id)
+        data_profile_scan = hook.get_data_scan(
+            project_id=self.project_id,
+            region=self.region,
+            data_scan_id=self.data_scan_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+        return DataScan.to_dict(data_profile_scan)
+
+
+class DataplexDeleteDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Deletes a DataScan DataProfile resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :return: None
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context) -> None:
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        self.log.info("Deleting Dataplex Data Profile Scan: %s", self.data_scan_id)
+
+        operation = hook.delete_data_scan(
+            project_id=self.project_id,
+            region=self.region,
+            data_scan_id=self.data_scan_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        hook.wait_for_operation(timeout=self.timeout, operation=operation)
+        self.log.info("Dataplex Data Profile scan %s deleted successfully!", self.data_scan_id)
+
+
+class DataplexRunDataProfileScanOperator(GoogleCloudBaseOperator):
+    """
+    Runs an on-demand execution of a DataScan Data Profile Scan.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete.
+        Note that if `retry` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param asynchronous: Flag informing that the Dataplex job should be run asynchronously.
+        This is useful for submitting long-running jobs and
+        waiting on them asynchronously using the DataplexDataProfileJobStatusSensor
+    :param result_timeout: Value in seconds for which operator will wait for the Data Profile scan result
+        when the flag `asynchronous = False`.
+        Throws exception if there is no result found after specified amount of seconds.
+    :param polling_interval_seconds: time in seconds between polling for job completion.
+        The value is considered only when running in deferrable mode. Must be greater than 0.
+    :param deferrable: Run operator in the deferrable mode.
+
+    :return: Dataplex Data Profile scan job id.
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        asynchronous: bool = False,
+        result_timeout: float = 60.0 * 10,
+        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+        polling_interval_seconds: int = 10,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.asynchronous = asynchronous
+        self.result_timeout = result_timeout
+        self.deferrable = deferrable
+        self.polling_interval_seconds = polling_interval_seconds
+
+    def execute(self, context: Context) -> dict:
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        result = hook.run_data_scan(
+            project_id=self.project_id,
+            region=self.region,
+            data_scan_id=self.data_scan_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        job_id = result.job.name.split("/")[-1]
+
+        if self.deferrable:
+            if self.asynchronous:
+                raise AirflowException(
+                    "Both asynchronous and deferrable parameters were passed. Please, provide only one."
+                )
+            self.defer(
+                trigger=DataplexDataProfileJobTrigger(
+                    job_id=job_id,
+                    data_scan_id=self.data_scan_id,
+                    project_id=self.project_id,
+                    region=self.region,
+                    gcp_conn_id=self.gcp_conn_id,
+                    impersonation_chain=self.impersonation_chain,
+                    polling_interval_seconds=self.polling_interval_seconds,
+                ),
+                method_name="execute_complete",
+            )
+        if not self.asynchronous:
+            job = hook.wait_for_data_scan_job(
+                job_id=job_id,
+                data_scan_id=self.data_scan_id,
+                project_id=self.project_id,
+                region=self.region,
+                result_timeout=self.result_timeout,
+            )
+
+            if job.state == DataScanJob.State.FAILED:
+                raise AirflowException(f"Data Profile job failed: {job_id}")
+            if job.state == DataScanJob.State.SUCCEEDED:
+                self.log.info("Data Profile job executed successfully.")
+            else:
+                self.log.info("Data Profile job execution returned status: %s", job.status)
+
+        return job_id
+
+    def execute_complete(self, context, event=None) -> None:
+        """
+        Callback for when the trigger fires - returns immediately.
+
+        Relies on trigger to throw an exception, otherwise it assumes execution was
+        successful.
+        """
+        job_state = event["job_state"]
+        job_id = event["job_id"]
+        if job_state == DataScanJob.State.FAILED:
+            raise AirflowException(f"Job failed:\n{job_id}")
+        if job_state == DataScanJob.State.CANCELLED:
+            raise AirflowException(f"Job was cancelled:\n{job_id}")
+        if job_state == DataScanJob.State.SUCCEEDED:
+            self.log.info("Data Profile job executed successfully.")
+        return job_id
+
+
+class DataplexGetDataProfileScanResultOperator(GoogleCloudBaseOperator):
+    """
+    Gets a DataScan Data Profile Job resource.
+
+    :param project_id: Required. The ID of the Google Cloud project that the lake belongs to.
+    :param region: Required. The ID of the Google Cloud region that the lake belongs to.
+    :param data_scan_id: Required. Data Profile scan identifier.
+    :param job_id: Optional. Data Profile scan job identifier.
+    :param api_version: The version of the api that will be requested for example 'v1'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
+        ``retry`` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param wait_for_results: Flag indicating whether to wait for the result of a job execution
+        or to return the job in its current state.
+    :param result_timeout: Value in seconds for which operator will wait for the Data Profile scan result
+        when the flag `wait_for_results = True`.
+        Throws exception if there is no result found after specified amount of seconds.
+
+    :return: Dict representing DataScanJob.
+        When the job completes with a successful status, information about the Data Profile result
+        is available.
+    """
+
+    template_fields = ("project_id", "data_scan_id", "impersonation_chain")
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        job_id: str | None = None,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        wait_for_results: bool = True,
+        result_timeout: float = 60.0 * 10,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.job_id = job_id
+        self.api_version = api_version
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.wait_for_results = wait_for_results
+        self.result_timeout = result_timeout
+
+    def execute(self, context: Context) -> dict:
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+        # fetch the last job
+        if not self.job_id:
+            jobs = hook.list_data_scan_jobs(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+            job_ids = [DataScanJob.to_dict(job) for job in jobs]
+            if not job_ids:
+                raise AirflowException("There are no jobs, you should create one before.")
+            job_id = job_ids[0]["name"]
+            self.job_id = job_id.split("/")[-1]
+
+        if self.wait_for_results:
+            job = hook.wait_for_data_scan_job(
+                job_id=self.job_id,
+                data_scan_id=self.data_scan_id,
+                project_id=self.project_id,
+                region=self.region,
+                result_timeout=self.result_timeout,
+            )
+        else:
+            job = hook.get_data_scan_job(
+                project_id=self.project_id,
+                region=self.region,
+                job_id=self.job_id,
+                data_scan_id=self.data_scan_id,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+        if job.state == DataScanJob.State.SUCCEEDED:
+            self.log.info("Data Profile job executed successfully")
+        else:
+            self.log.info("Data Profile job execution returned status: %s", job.state)
+
+        result = DataScanJob.to_dict(job)
+        result["state"] = DataScanJob.State(result["state"]).name
+
+        return result
+
+    def execute_complete(self, context, event=None) -> None:
+        """
+        Callback for when the trigger fires - returns immediately.
+
+        Relies on trigger to throw an exception, otherwise it assumes execution was
+        successful.
+        """
+        job_state = event["job_state"]
+        job_id = event["job_id"]
+        job = event["job"]
+        if job_state == DataScanJob.State.FAILED:
+            raise AirflowException(f"Job failed:\n{job_id}")
+        if job_state == DataScanJob.State.CANCELLED:
+            raise AirflowException(f"Job was cancelled:\n{job_id}")
+        if job_state == DataScanJob.State.SUCCEEDED:
+            self.log.info("Data Profile job executed successfully")
+        else:
+            self.log.info("Data Profile job execution returned status: %s", job_state)
+
+        return job
+
+
 class DataplexCreateZoneOperator(GoogleCloudBaseOperator):
     """
     Creates a Zone resource within a Lake.
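The five operators added above compose into a full scan lifecycle: create or update the scan, run it, fetch the result, and clean up. A minimal DAG sketch for reference (not part of the package; the project, region, scan ID, and BigQuery resource path are placeholders, and the body is only a schematic DataScan payload):

    from datetime import datetime

    from airflow import DAG
    from airflow.providers.google.cloud.operators.dataplex import (
        DataplexCreateOrUpdateDataProfileScanOperator,
        DataplexDeleteDataProfileScanOperator,
        DataplexGetDataProfileScanResultOperator,
        DataplexRunDataProfileScanOperator,
    )

    PROJECT_ID = "my-project"  # placeholder
    REGION = "us-central1"  # placeholder
    DATA_SCAN_ID = "my-data-profile-scan"  # placeholder
    # Schematic DataScan body; consult the DataScan resource for the full schema.
    EXAMPLE_BODY = {
        "data": {"resource": f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/my_ds/tables/my_table"},
        "data_profile_spec": {},
    }

    with DAG(dag_id="dataplex_data_profile", start_date=datetime(2024, 1, 1), schedule=None):
        create_scan = DataplexCreateOrUpdateDataProfileScanOperator(
            task_id="create_data_profile_scan",
            project_id=PROJECT_ID,
            region=REGION,
            data_scan_id=DATA_SCAN_ID,
            body=EXAMPLE_BODY,
        )
        run_scan = DataplexRunDataProfileScanOperator(
            task_id="run_data_profile_scan",
            project_id=PROJECT_ID,
            region=REGION,
            data_scan_id=DATA_SCAN_ID,
            deferrable=True,  # hands the wait over to DataplexDataProfileJobTrigger
        )
        get_result = DataplexGetDataProfileScanResultOperator(
            task_id="get_data_profile_scan_result",
            project_id=PROJECT_ID,
            region=REGION,
            data_scan_id=DATA_SCAN_ID,  # job_id omitted: the operator falls back to the latest job
        )
        delete_scan = DataplexDeleteDataProfileScanOperator(
            task_id="delete_data_profile_scan",
            project_id=PROJECT_ID,
            region=REGION,
            data_scan_id=DATA_SCAN_ID,
        )
        create_scan >> run_scan >> get_result >> delete_scan

Per the implementations above, run_scan returns the scan job id, and get_result returns the DataScanJob as a dict with its state converted to a human-readable name.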
airflow/providers/google/cloud/operators/dataproc.py

@@ -63,6 +63,7 @@ from airflow.utils import timezone
 
 if TYPE_CHECKING:
     from google.api_core import operation
+    from google.api_core.retry_async import AsyncRetry
     from google.protobuf.duration_pb2 import Duration
     from google.protobuf.field_mask_pb2 import FieldMask
 
@@ -511,9 +512,7 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
     The operator will wait until the creation is successful or an error occurs
     in the creation process.
 
-    If the cluster already exists and ``use_if_exists`` is True, the operator will:
-
-    If the cluster already exists and ``use_if_exists`` is True then the operator will:
+    If the cluster already exists and ``use_if_exists`` is True, then the operator will:
     - if cluster state is ERROR then delete it if specified and raise error
     - if cluster state is CREATING wait for it and then check for ERROR state
     - if cluster state is DELETING wait for it and then create new cluster
@@ -592,7 +591,7 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         request_id: str | None = None,
         delete_on_error: bool = True,
         use_if_exists: bool = True,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float = 1 * 60 * 60,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
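With this change the `retry` argument on these Dataproc operators is typed as `AsyncRetry` (matching the new TYPE_CHECKING import above), so an explicit policy would be built from `google.api_core.retry_async` rather than `google.api_core.retry`. A minimal sketch; the backoff numbers are illustrative, not provider defaults:

    from google.api_core.retry_async import AsyncRetry

    # Illustrative policy: first retry after 1 s, exponential backoff capped at
    # 60 s between attempts, and an overall 600 s budget.
    cluster_retry = AsyncRetry(initial=1.0, maximum=60.0, multiplier=2.0, timeout=600.0)

The object is then passed as `retry=cluster_retry` to `DataprocCreateClusterOperator`, and likewise to the delete, instantiate-workflow-template, and update operators changed in the hunks below.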
@@ -985,7 +984,7 @@ class DataprocDeleteClusterOperator(GoogleCloudBaseOperator):
         project_id: str | None = None,
         cluster_uuid: str | None = None,
         request_id: str | None = None,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float = 1 * 60 * 60,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -1891,7 +1890,7 @@ class DataprocInstantiateWorkflowTemplateOperator(GoogleCloudBaseOperator):
         version: int | None = None,
         request_id: str | None = None,
         parameters: dict[str, str] | None = None,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float | None = None,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -2152,7 +2151,7 @@ class DataprocSubmitJobOperator(GoogleCloudBaseOperator):
         Service Account Token Creator IAM role to the directly preceding identity, with first
         account from the list granting this role to the originating account (templated).
     :param asynchronous: Flag to return after submitting the job to the Dataproc API.
-        This is useful for submitting long running jobs and
+        This is useful for submitting long-running jobs and
         waiting on them asynchronously using the DataprocJobSensor
     :param deferrable: Run operator in the deferrable mode
     :param polling_interval_seconds: time in seconds between polling for job completion.
@@ -2267,10 +2266,11 @@ class DataprocSubmitJobOperator(GoogleCloudBaseOperator):
         """
         job_state = event["job_state"]
         job_id = event["job_id"]
+        job = event["job"]
         if job_state == JobStatus.State.ERROR:
-            raise AirflowException(f"Job failed:\n{job_id}")
+            raise AirflowException(f"Job {job_id} failed:\n{job}")
         if job_state == JobStatus.State.CANCELLED:
-            raise AirflowException(f"Job was cancelled:\n{job_id}")
+            raise AirflowException(f"Job {job_id} was cancelled:\n{job}")
         self.log.info("%s completed successfully.", self.task_id)
         return job_id
 
@@ -2340,7 +2340,7 @@ class DataprocUpdateClusterOperator(GoogleCloudBaseOperator):
         region: str,
         request_id: str | None = None,
         project_id: str | None = None,
-        retry: Retry | _MethodDefault = DEFAULT,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
         timeout: float | None = None,
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
@@ -2480,7 +2480,7 @@ class DataprocCreateBatchOperator(GoogleCloudBaseOperator):
         metadata: Sequence[tuple[str, str]] = (),
         gcp_conn_id: str = "google_cloud_default",
         impersonation_chain: str | Sequence[str] | None = None,
-        result_retry: Retry | _MethodDefault = DEFAULT,
+        result_retry: AsyncRetry | _MethodDefault = DEFAULT,
         asynchronous: bool = False,
         deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
         polling_interval_seconds: int = 5,
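`DataprocCreateBatchOperator` gets the same typing on its `result_retry` argument. A hedged usage sketch combining it with the deferrable parameters visible in this hunk (project, region, batch ID, and the batch spec are placeholders; the spec mirrors Google's stock SparkPi example):

    from google.api_core.retry_async import AsyncRetry

    from airflow.providers.google.cloud.operators.dataproc import DataprocCreateBatchOperator

    create_batch = DataprocCreateBatchOperator(
        task_id="create_batch",
        project_id="my-project",  # placeholder
        region="us-central1",  # placeholder
        batch_id="example-batch",  # placeholder
        batch={
            "spark_batch": {
                "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
                "main_class": "org.apache.spark.examples.SparkPi",
            }
        },
        # Retry policy for fetching the batch result; values are illustrative.
        result_retry=AsyncRetry(initial=1.0, maximum=30.0, multiplier=2.0, timeout=600.0),
        deferrable=True,
        polling_interval_seconds=5,
    )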