acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +41 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/redshift.py +17 -0
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/graph/client.py

@@ -22,6 +22,7 @@ from typing import (
     Union,
 )
 
+import progressbar
 from avro.schema import RecordSchema
 from pydantic import BaseModel
 from requests.models import HTTPError

@@ -504,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "limit": limit,
             "filter": filter,
         }
-        end_point = f"{self.
+        end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
         resp: Dict = self._post_generic(end_point, query_body)
 
         values: Optional[List] = resp.get("value", {}).get("values")

@@ -524,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_entity_raw(
         self, entity_urn: str, aspects: Optional[List[str]] = None
     ) -> Dict:
-        endpoint: str = f"{self.
+        endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
         if aspects is not None:
             assert aspects, "if provided, aspects must be a non-empty list"
             endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"

@@ -654,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
     @property
     def _search_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/entities?action=search"
 
     @property
     def _relationships_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/openapi/relationships/v1/"
 
     @property
     def _aspect_count_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/aspects?action=getCount"
 
     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""

@@ -1209,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         operation_name: Optional[str] = None,
         format_exception: bool = True,
     ) -> Dict:
-        url = f"{self.
+        url = f"{self._gms_server}/api/graphql"
 
         body: Dict = {
             "query": query,
@@ -1434,40 +1435,82 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects
 
+    def get_kafka_consumer_offsets(
+        self,
+    ) -> dict:
+        """
+        Get Kafka consumer offsets from the DataHub API.
+
+        Args:
+            graph (DataHubGraph): The DataHub graph client
+
+        """
+        urls = {
+            "mcp": f"{self.config.server}/openapi/operations/kafka/mcp/consumer/offsets",
+            "mcl": f"{self.config.server}/openapi/operations/kafka/mcl/consumer/offsets",
+            "mcl-timeseries": f"{self.config.server}/openapi/operations/kafka/mcl-timeseries/consumer/offsets",
+        }
+
+        params = {"skipCache": "true", "detailed": "true"}
+        results = {}
+        for key, url in urls.items():
+            response = self._get_generic(url=url, params=params)
+            results[key] = response
+            if "errors" in response:
+                logger.error(f"Error: {response['errors']}")
+        return results
+
+    def _restore_index_call(self, payload_obj: dict) -> None:
+        result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        logger.debug(f"Restore indices result: {result}")
+
     def restore_indices(
         self,
-        urn_pattern: str,
+        urn_pattern: Optional[str] = None,
         aspect: Optional[str] = None,
         start: Optional[int] = None,
        batch_size: Optional[int] = None,
-
+        file: Optional[str] = None,
+    ) -> None:
         """Restore the indices for a given urn or urn-like pattern.
 
         Args:
-            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs. If not provided, will restore indices from the file.
             aspect: Optional aspect string to restore indices for a specific aspect.
-            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
-            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0. Ignored in case file is provided.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10. Ignored in case file is provided.
+            file: Optional file path to a file containing URNs to restore indices for.
 
         Returns:
            A string containing the result of the restore indices operation. This format is subject to change.
         """
-
-
+        payload_obj = {}
+        if file is not None:
+            with open(file) as f:
+                for urn in progressbar.progressbar(f.readlines()):
+                    urn = urn.strip()
+                    if "%" in urn:
+                        payload_obj["urnLike"] = urn
+                    else:
+                        payload_obj["urn"] = urn
+                    if aspect is not None:
+                        payload_obj["aspect"] = aspect
+                    self._restore_index_call(payload_obj)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-            return result
+            if urn_pattern is not None:
+                if "%" in urn_pattern:
+                    payload_obj["urnLike"] = urn_pattern
+                else:
+                    payload_obj["urn"] = urn_pattern
+            if aspect is not None:
+                payload_obj["aspect"] = aspect
+            if start is not None:
+                payload_obj["start"] = start
+            if batch_size is not None:
+                payload_obj["batchSize"] = batch_size
+            self._restore_index_call(payload_obj)
 
     @functools.lru_cache
     def _make_schema_resolver(
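The new get_kafka_consumer_offsets helper and the file-based mode of restore_indices are both thin client-side wrappers around GMS endpoints. A minimal sketch of how they might be called; the server URL, file path, and aspect name are illustrative only:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    # New in 1.1.0.5: restore indices for every URN listed in a file
    # (one URN per line, with % acting as a wildcard) instead of a single pattern.
    graph.restore_indices(file="/tmp/urns_to_restore.txt", aspect="schemaMetadata")

    # Also new: inspect Kafka consumer offsets for the MCP/MCL consumers.
    offsets = graph.get_kafka_consumer_offsets()
    print(offsets["mcp"])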
@@ -1533,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: str = DEFAULT_ENV,
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-
+        override_dialect: Optional[str] = None,
     ) -> "SqlParsingResult":
         from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
 

@@ -1547,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-
+            override_dialect=override_dialect,
         )
 
     def create_tag(self, tag_name: str) -> str:
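These two hunks thread a new override_dialect argument from the graph client's SQL-parsing helper into sqlglot_lineage. A rough sketch of the underlying call, with the query, platform, and dialect chosen arbitrarily:

    from datahub.sql_parsing.schema_resolver import SchemaResolver
    from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage

    # Without a populated schema resolver the result is best-effort,
    # but it is enough to see the new parameter in action.
    resolver = SchemaResolver(platform="trino")

    result = sqlglot_lineage(
        "SELECT id, amount FROM sales.orders",
        schema_resolver=resolver,
        default_db="analytics",
        # Force a specific sqlglot dialect instead of inferring it
        # from the resolver's platform.
        override_dialect="trino",
    )
    print(result.in_tables)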
@@ -1774,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Accept": "application/json",
             "Content-Type": "application/json",
         }
-        url = f"{self.
+        url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
         response = self._session.post(url, data=json.dumps(payload), headers=headers)
         response.raise_for_status()
 

@@ -1831,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Content-Type": "application/json",
         }
 
-        url = f"{self.
+        url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
         if with_system_metadata:
             url += "?systemMetadata=true"
 
datahub/ingestion/run/pipeline.py

@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,

@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()
 
-        with
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:

@@ -340,6 +347,44 @@ class Pipeline:
             except Exception as e:
                 logger.warning("Reporting failed on start", exc_info=e)
 
+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+            """)
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )
+
     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:
             try:
@@ -396,6 +441,7 @@ class Pipeline:
             return False
 
     def run(self) -> None:
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray

@@ -502,7 +548,7 @@ class Pipeline:
                 self._handle_uncaught_pipeline_exception(exc)
             finally:
                 clear_global_warnings()
-
+                self.sink.flush()
                 self._notify_reporters_on_ingestion_completion()
 
     def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:

@@ -578,11 +624,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
datahub/ingestion/sink/datahub_rest.py

@@ -5,6 +5,7 @@ import functools
 import logging
 import os
 import threading
+import time
 import uuid
 from enum import auto
 from typing import List, Optional, Tuple, Union

@@ -346,6 +347,17 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             RecordEnvelope(item, metadata={}), NoopWriteCallback()
         )
 
+    def flush(self) -> None:
+        """Wait for all pending records to be written."""
+        i = 0
+        while self.report.pending_requests > 0:
+            time.sleep(0.1)
+            i += 1
+            if i % 1000 == 0:
+                logger.info(
+                    f"Waiting for {self.report.pending_requests} records to be written"
+                )
+
     def close(self):
         with self.report.main_thread_blocking_timer:
             self.executor.shutdown()
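flush() complements the pipeline change above: the run's finally block now drains the sink before reporters are notified, by polling report.pending_requests. A minimal sketch of driving the sink directly, assuming a reachable DataHub instance at the illustrative URL; the dataset URN is made up:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
    from datahub.ingestion.api.sink import NoopWriteCallback
    from datahub.ingestion.sink.datahub_rest import DatahubRestSink
    from datahub.metadata.schema_classes import StatusClass

    sink = DatahubRestSink.create(
        {"server": "http://localhost:8080"},
        PipelineContext(run_id="flush-demo"),
    )

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,demo.table,PROD)",
        aspect=StatusClass(removed=False),
    )
    sink.write_record_async(RecordEnvelope(mcp, metadata={}), NoopWriteCallback())

    # Writes run on a background executor; block until the pending-request
    # counter drains before inspecting the report or exiting.
    sink.flush()
    print(sink.get_report())
    sink.close()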
datahub/ingestion/source/abs/source.py

@@ -533,7 +533,7 @@ class ABSSource(StatefulIngestionSourceBase):
         )
         path_spec.sample_files = False
         for obj in container_client.list_blobs(
-
+            name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
         ):
             abs_path = self.create_abs_path(obj.name)
             logger.debug(f"Path: {abs_path}")
datahub/ingestion/source/aws/glue.py

@@ -269,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
datahub/ingestion/source/azure/azure_common.py

@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_credentials(
datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,

@@ -44,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,

@@ -77,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,

@@ -242,7 +251,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
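The new _warn_deprecated_configs check nudges BigQuery users toward fully qualified schema patterns. A hedged sketch of the recommended config shape, with illustrative project and dataset names:

    # Fragment of a BigQuery source config, expressed as a Python dict.
    bigquery_source_config = {
        "project_ids": ["my-gcp-project"],
        # Match against "<database_name>.<schema_name>" as the warning suggests,
        # and opt in to the future default explicitly.
        "match_fully_qualified_names": True,
        "schema_pattern": {
            "allow": ["my-gcp-project\\.analytics_.*"],
        },
    }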
@@ -271,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with
-                f"*: {QUERIES_EXTRACTION}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor,
+            ):
                 self.report.queries_extractor = queries_extractor.report
                 yield from queries_extractor.get_workunits_internal()
         else:
datahub/ingestion/source/bigquery_v2/profiler.py

@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,

@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py

@@ -45,12 +45,12 @@ SELECT
     tos.OPTION_VALUE as comment,
     t.is_insertable_into,
     t.ddl,
-    ts.row_count,
+    ts.row_count as row_count,
     ts.size_bytes as bytes,
     p.num_partitions,
     p.max_partition_id,
-    p.active_billable_bytes,
-    p.long_term_billable_bytes,
+    p.active_billable_bytes as active_billable_bytes,
+    IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
     REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
     REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 
datahub/ingestion/source/cassandra/cassandra.py

@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with
-                f"{keyspace_name}: {PROFILING}"
-
-
-
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
datahub/ingestion/source/common/subtypes.py

@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes

@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"

@@ -52,6 +59,8 @@ class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"

@@ -74,6 +83,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 

@@ -116,3 +128,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f' {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()