acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/tag_entities.py (new file)
@@ -0,0 +1,292 @@
+import logging
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+    PlatformResourceRepository,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+    PlatformResourceSearchFields,
+)
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.search_utils import ElasticDocumentQuery
+from datahub.utilities.urns.urn import Urn
+
+logger = logging.getLogger(__name__)
+
+
+class LakeFormationTagSyncContext(BaseModel):
+    # it is intentionally empty
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+
+
+class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+    """
+    A LakeFormationTag is a unique identifier for a Lakeformation tag.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str]
+    catalog: Optional[str] = None
+    exists_in_lake_formation: bool = False
+    persisted: bool = False
+
+    def __hash__(self) -> int:
+        return hash(self.to_platform_resource_key().id)
+
+    # this is a hack to make sure the property is a string and not private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "LakeFormationTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="glue",
+            resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+            if self.catalog
+            else f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def from_tag(
+        cls,
+        tag: LakeFormationTag,
+        platform_instance: Optional[str],
+        platform_resource_repository: PlatformResourceRepository,
+        catalog: Optional[str] = None,
+        exists_in_lake_formation: bool = False,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+        """
+
+        existing_platform_resource = cls.search_by_urn(
+            tag.to_datahub_tag_urn().urn(),
+            platform_resource_repository=platform_resource_repository,
+            tag_sync_context=LakeFormationTagSyncContext(
+                platform_instance=platform_instance,
+                catalog=catalog,
+            ),
+        )
+        if existing_platform_resource:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=tag.key,
+            tag_value=tag.value if tag.value is not None else None,
+            platform_instance=platform_instance,
+            exists_in_lake_formation=exists_in_lake_formation,
+            catalog=catalog,
+            persisted=False,
+        )
+
+    @classmethod
+    def search_by_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> Optional["LakeFormationTagPlatformResourceId"]:
+        mapped_tags = [
+            t
+            for t in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+                )
+            )
+        ]
+        logger.info(
+            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+        )
+        if len(mapped_tags) > 0:
+            for platform_resource in mapped_tags:
+                if (
+                    platform_resource.resource_info
+                    and platform_resource.resource_info.value
+                ):
+                    lake_formation_tag_platform_resource = (
+                        LakeFormationTagPlatformResource(
+                            **platform_resource.resource_info.value.as_pydantic_object(
+                                LakeFormationTagPlatformResource
+                            ).dict()
+                        )
+                    )
+                    if (
+                        lake_formation_tag_platform_resource.id.platform_instance
+                        == tag_sync_context.platform_instance
+                        and lake_formation_tag_platform_resource.id.catalog
+                        == tag_sync_context.catalog
+                    ):
+                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
+                        lake_formation_tag_id.exists_in_lake_formation = True
+                        lake_formation_tag_id.persisted = True
+                        return lake_formation_tag_id
+                else:
+                    logger.warning(
+                        f"Platform resource {platform_resource} does not have a resource_info value"
+                    )
+                    continue
+
+        # If we reach here, it means we did not find a mapped tag for the URN
+        logger.info(
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+        )
+        return None
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
+        """
+        # First we check if we already have a mapped platform resource for this
+        # urn that is of the type UnityCatalogTagPlatformResource
+        # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
+        # Else, we need to generate a new UnityCatalogTagPlatformResourceId
+        existing_platform_resource_id = cls.search_by_urn(
+            urn, platform_resource_repository, tag_sync_context
+        )
+        if existing_platform_resource_id:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+            )
+            return existing_platform_resource_id
+
+        # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
+        new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+        if new_tag_id:
+            # we then check if this tag has already been ingested as a platform
+            # resource in the platform resource repository
+            resource_key = platform_resource_repository.get(
+                new_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                logger.info(
+                    f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                )
+                new_tag_id.exists_in_lake_formation = (
+                    True  # TODO: Check if this is a safe assumption
+                )
+            return new_tag_id
+        raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
+
+    @classmethod
+    def generate_tag_id(
+        cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+    ) -> "LakeFormationTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+        return new_tag_id
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+    ) -> "LakeFormationTagPlatformResourceId":
+        tag = LakeFormationTag.from_urn(tag_urn)
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value),
+            platform_instance=tag_sync_context.platform_instance,
+            catalog=tag_sync_context.catalog,
+            exists_in_lake_formation=False,
+        )
+
+
+class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: LakeFormationTagPlatformResourceId
+    allowed_values: Optional[List[str]]
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def get_from_datahub(
+        cls,
+        lake_formation_tag_id: LakeFormationTagPlatformResourceId,
+        platform_resource_repository: PlatformResourceRepository,
+        managed_by_datahub: bool = False,
+    ) -> "LakeFormationTagPlatformResource":
+        # Search for linked DataHub URNs
+        platform_resources = [
+            r
+            for r in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (
+                        PlatformResourceSearchFields.PRIMARY_KEY,
+                        f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
+                    ),
+                )
+            )
+        ]
+        for platform_resource in platform_resources:
+            if (
+                platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                lf_tag = LakeFormationTagPlatformResource(
+                    **platform_resource.resource_info.value.as_pydantic_object(
+                        LakeFormationTagPlatformResource
+                    ).dict()
+                )
+                if (
+                    lf_tag.id.platform_instance
+                    == lake_formation_tag_id.platform_instance
+                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
+                ):
+                    return lf_tag
+        return cls(
+            id=lake_formation_tag_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
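A minimal usage sketch (not part of the package diff) of how the new id model builds its platform-resource key; the field names come from the class definition above, and the tag key/value are placeholder examples:

from datahub.ingestion.source.aws.tag_entities import LakeFormationTagPlatformResourceId

tag_id = LakeFormationTagPlatformResourceId(
    tag_key="pii",
    tag_value="true",
    platform_instance=None,
    catalog="prod_catalog",
)
key = tag_id.to_platform_resource_key()
# Per to_platform_resource_key(): platform="glue",
# resource_type="LakeFormationTagPlatformResource",
# primary_key="prod_catalog.pii:true" (catalog-prefixed because catalog is set).
print(key.primary_key)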
datahub/ingestion/source/azure/azure_common.py
@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_credentials(
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -44,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
@@ -77,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -99,6 +108,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -241,7 +251,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
@@ -270,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with
-                f"*: {QUERIES_EXTRACTION}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor,
+            ):
                 self.report.queries_extractor = queries_extractor.report
                 yield from queries_extractor.get_workunits_internal()
         else:
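The new `_warn_deprecated_configs` check fires when `match_fully_qualified_names` is explicitly disabled while a non-default `schema_pattern` is set. A minimal sketch (not from the package) of the config shape the warning steers users toward, using the `AllowDenyPattern` import added above; the project and dataset names are placeholders:

from datahub.configuration.common import AllowDenyPattern

# With match_fully_qualified_names=True, schema_pattern entries should match
# fully qualified names of the form "<database_name>.<schema_name>".
recommended_config = {
    "match_fully_qualified_names": True,
    "schema_pattern": AllowDenyPattern(allow=[r"my-project\.analytics_.*"]),
}
# Leaving match_fully_qualified_names=False with a non-default schema_pattern
# now triggers report_warning() at the start of get_workunits_internal().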
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -286,6 +286,7 @@ class BigQuerySchemaGenerator:
         yield from gen_database_container(
             database=database,
             name=database,
+            qualified_name=database,
             sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
@@ -332,6 +333,7 @@ class BigQuerySchemaGenerator:
         yield from gen_schema_container(
             database=project_id,
             schema=dataset,
+            qualified_name=f"{project_id}.{dataset}",
             sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
datahub/ingestion/source/bigquery_v2/common.py
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
         )
 
     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email
+        return make_user_urn(user_email)
 
     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
datahub/ingestion/source/bigquery_v2/profiler.py
@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py
@@ -45,12 +45,12 @@ SELECT
     tos.OPTION_VALUE as comment,
     t.is_insertable_into,
     t.ddl,
-    ts.row_count,
+    ts.row_count as row_count,
     ts.size_bytes as bytes,
     p.num_partitions,
     p.max_partition_id,
-    p.active_billable_bytes,
-    p.long_term_billable_bytes,
+    p.active_billable_bytes as active_billable_bytes,
+    IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
     REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
     REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
datahub/ingestion/source/cassandra/cassandra.py
@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with
-                f"{keyspace_name}: {PROFILING}"
-
-
-
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
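A generic sketch (not from the package) of the pattern adopted above: grouping several context managers in one parenthesized `with` block (syntax officially supported on Python 3.10+), so the reporting stage and the thread pool share a single block. `nullcontext` stands in for `self.report.new_stage(...)`:

from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext

with (
    nullcontext("keyspace_a: Profiling"),  # stand-in for self.report.new_stage(...)
    ThreadPoolExecutor(max_workers=2) as executor,
):
    # submit work against the shared executor while the stage is open
    futures = [executor.submit(pow, n, 2) for n in range(4)]
    results = [f.result() for f in futures]
print(results)  # [0, 1, 4, 9]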
datahub/ingestion/source/common/subtypes.py
@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -52,6 +59,8 @@ class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
@@ -74,6 +83,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -116,3 +128,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f' {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
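A small sketch (not part of the diff) of what the dynamically generated enum exposes, based only on the members shown above; the generated SourceCapabilityModifier copies member names and values from the contributing subtype enums, which is how the BigQuery CONTAINERS capability decorator earlier in this diff can reference BIGQUERY_PROJECT and BIGQUERY_DATASET:

from datahub.ingestion.source.common.subtypes import (
    BIContainerSubTypes,
    SourceCapabilityModifier,
)

# TABLEAU_PROJECT = "Project" was added to BIContainerSubTypes above, so the
# generated enum carries the same member with the same value.
assert SourceCapabilityModifier.TABLEAU_PROJECT.value == "Project"
assert (
    SourceCapabilityModifier.TABLEAU_PROJECT.value
    == BIContainerSubTypes.TABLEAU_PROJECT.value
)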