acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/tag_entities.py (new file)
@@ -0,0 +1,292 @@
+ import logging
+ from typing import List, Optional
+
+ from pydantic import BaseModel
+
+ from datahub.api.entities.external.external_entities import (
+     ExternalEntity,
+     ExternalEntityId,
+     LinkedResourceSet,
+     PlatformResourceRepository,
+ )
+ from datahub.api.entities.external.lake_formation_external_entites import (
+     LakeFormationTag,
+ )
+ from datahub.api.entities.platformresource.platform_resource import (
+     PlatformResource,
+     PlatformResourceKey,
+     PlatformResourceSearchFields,
+ )
+ from datahub.metadata.urns import TagUrn
+ from datahub.utilities.search_utils import ElasticDocumentQuery
+ from datahub.utilities.urns.urn import Urn
+
+ logger = logging.getLogger(__name__)
+
+
+ class LakeFormationTagSyncContext(BaseModel):
+     # it is intentionally empty
+     platform_instance: Optional[str] = None
+     catalog: Optional[str] = None
+
+
+ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+     """
+     A LakeFormationTag is a unique identifier for a Lakeformation tag.
+     """
+
+     tag_key: str
+     tag_value: Optional[str] = None
+     platform_instance: Optional[str]
+     catalog: Optional[str] = None
+     exists_in_lake_formation: bool = False
+     persisted: bool = False
+
+     def __hash__(self) -> int:
+         return hash(self.to_platform_resource_key().id)
+
+     # this is a hack to make sure the property is a string and not private pydantic field
+     @staticmethod
+     def _RESOURCE_TYPE() -> str:
+         return "LakeFormationTagPlatformResource"
+
+     def to_platform_resource_key(self) -> PlatformResourceKey:
+         return PlatformResourceKey(
+             platform="glue",
+             resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+             primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+             if self.catalog
+             else f"{self.tag_key}:{self.tag_value}",
+             platform_instance=self.platform_instance,
+         )
+
+     @classmethod
+     def from_tag(
+         cls,
+         tag: LakeFormationTag,
+         platform_instance: Optional[str],
+         platform_resource_repository: PlatformResourceRepository,
+         catalog: Optional[str] = None,
+         exists_in_lake_formation: bool = False,
+     ) -> "LakeFormationTagPlatformResourceId":
+         """
+         Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+         """
+
+         existing_platform_resource = cls.search_by_urn(
+             tag.to_datahub_tag_urn().urn(),
+             platform_resource_repository=platform_resource_repository,
+             tag_sync_context=LakeFormationTagSyncContext(
+                 platform_instance=platform_instance,
+                 catalog=catalog,
+             ),
+         )
+         if existing_platform_resource:
+             logger.info(
+                 f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+             )
+             return existing_platform_resource
+
+         return LakeFormationTagPlatformResourceId(
+             tag_key=tag.key,
+             tag_value=tag.value if tag.value is not None else None,
+             platform_instance=platform_instance,
+             exists_in_lake_formation=exists_in_lake_formation,
+             catalog=catalog,
+             persisted=False,
+         )
+
+     @classmethod
+     def search_by_urn(
+         cls,
+         urn: str,
+         platform_resource_repository: PlatformResourceRepository,
+         tag_sync_context: LakeFormationTagSyncContext,
+     ) -> Optional["LakeFormationTagPlatformResourceId"]:
+         mapped_tags = [
+             t
+             for t in platform_resource_repository.search_by_filter(
+                 ElasticDocumentQuery.create_from(
+                     (
+                         PlatformResourceSearchFields.RESOURCE_TYPE,
+                         str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                     ),
+                     (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+                 )
+             )
+         ]
+         logger.info(
+             f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+         )
+         if len(mapped_tags) > 0:
+             for platform_resource in mapped_tags:
+                 if (
+                     platform_resource.resource_info
+                     and platform_resource.resource_info.value
+                 ):
+                     lake_formation_tag_platform_resource = (
+                         LakeFormationTagPlatformResource(
+                             **platform_resource.resource_info.value.as_pydantic_object(
+                                 LakeFormationTagPlatformResource
+                             ).dict()
+                         )
+                     )
+                     if (
+                         lake_formation_tag_platform_resource.id.platform_instance
+                         == tag_sync_context.platform_instance
+                         and lake_formation_tag_platform_resource.id.catalog
+                         == tag_sync_context.catalog
+                     ):
+                         lake_formation_tag_id = lake_formation_tag_platform_resource.id
+                         lake_formation_tag_id.exists_in_lake_formation = True
+                         lake_formation_tag_id.persisted = True
+                         return lake_formation_tag_id
+                 else:
+                     logger.warning(
+                         f"Platform resource {platform_resource} does not have a resource_info value"
+                     )
+                     continue
+
+         # If we reach here, it means we did not find a mapped tag for the URN
+         logger.info(
+             f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+         )
+         return None
+
+     @classmethod
+     def from_datahub_urn(
+         cls,
+         urn: str,
+         platform_resource_repository: PlatformResourceRepository,
+         tag_sync_context: LakeFormationTagSyncContext,
+     ) -> "LakeFormationTagPlatformResourceId":
+         """
+         Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
+         """
+         # First we check if we already have a mapped platform resource for this
+         # urn that is of the type UnityCatalogTagPlatformResource
+         # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
+         # Else, we need to generate a new UnityCatalogTagPlatformResourceId
+         existing_platform_resource_id = cls.search_by_urn(
+             urn, platform_resource_repository, tag_sync_context
+         )
+         if existing_platform_resource_id:
+             logger.info(
+                 f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+             )
+             return existing_platform_resource_id
+
+         # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
+         new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+         if new_tag_id:
+             # we then check if this tag has already been ingested as a platform
+             # resource in the platform resource repository
+             resource_key = platform_resource_repository.get(
+                 new_tag_id.to_platform_resource_key()
+             )
+             if resource_key:
+                 logger.info(
+                     f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                 )
+                 new_tag_id.exists_in_lake_formation = (
+                     True  # TODO: Check if this is a safe assumption
+                 )
+             return new_tag_id
+         raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
+
+     @classmethod
+     def generate_tag_id(
+         cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+     ) -> "LakeFormationTagPlatformResourceId":
+         parsed_urn = Urn.from_string(urn)
+         entity_type = parsed_urn.entity_type
+         if entity_type == "tag":
+             new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                 TagUrn.from_string(urn), tag_sync_context
+             )
+         else:
+             raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+         return new_tag_id
+
+     @classmethod
+     def from_datahub_tag(
+         cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+     ) -> "LakeFormationTagPlatformResourceId":
+         tag = LakeFormationTag.from_urn(tag_urn)
+
+         return LakeFormationTagPlatformResourceId(
+             tag_key=str(tag.key),
+             tag_value=str(tag.value),
+             platform_instance=tag_sync_context.platform_instance,
+             catalog=tag_sync_context.catalog,
+             exists_in_lake_formation=False,
+         )
+
+
+ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+     datahub_urns: LinkedResourceSet
+     managed_by_datahub: bool
+     id: LakeFormationTagPlatformResourceId
+     allowed_values: Optional[List[str]]
+
+     def get_id(self) -> ExternalEntityId:
+         return self.id
+
+     def is_managed_by_datahub(self) -> bool:
+         return self.managed_by_datahub
+
+     def datahub_linked_resources(self) -> LinkedResourceSet:
+         return self.datahub_urns
+
+     def as_platform_resource(self) -> PlatformResource:
+         return PlatformResource.create(
+             key=self.id.to_platform_resource_key(),
+             secondary_keys=[u for u in self.datahub_urns.urns],
+             value=self,
+         )
+
+     @classmethod
+     def get_from_datahub(
+         cls,
+         lake_formation_tag_id: LakeFormationTagPlatformResourceId,
+         platform_resource_repository: PlatformResourceRepository,
+         managed_by_datahub: bool = False,
+     ) -> "LakeFormationTagPlatformResource":
+         # Search for linked DataHub URNs
+         platform_resources = [
+             r
+             for r in platform_resource_repository.search_by_filter(
+                 ElasticDocumentQuery.create_from(
+                     (
+                         PlatformResourceSearchFields.RESOURCE_TYPE,
+                         str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                     ),
+                     (
+                         PlatformResourceSearchFields.PRIMARY_KEY,
+                         f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
+                     ),
+                 )
+             )
+         ]
+         for platform_resource in platform_resources:
+             if (
+                 platform_resource.resource_info
+                 and platform_resource.resource_info.value
+             ):
+                 lf_tag = LakeFormationTagPlatformResource(
+                     **platform_resource.resource_info.value.as_pydantic_object(
+                         LakeFormationTagPlatformResource
+                     ).dict()
+                 )
+                 if (
+                     lf_tag.id.platform_instance
+                     == lake_formation_tag_id.platform_instance
+                     and lf_tag.id.catalog == lake_formation_tag_id.catalog
+                 ):
+                     return lf_tag
+         return cls(
+             id=lake_formation_tag_id,
+             datahub_urns=LinkedResourceSet(urns=[]),
+             managed_by_datahub=managed_by_datahub,
+             allowed_values=None,
+         )
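
The new module above wires AWS Lake Formation tags into DataHub's platform-resource bookkeeping: the resource id locates the tag (key, optional value, catalog, platform instance), and the resource records which DataHub URNs it is linked to. A minimal usage sketch follows; it assumes a PlatformResourceRepository can be built directly from a DataHubGraph client, and the server URL and tag URN are illustrative, not taken from this diff.

from datahub.api.entities.external.external_entities import PlatformResourceRepository
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.ingestion.source.aws.tag_entities import (
    LakeFormationTagPlatformResource,
    LakeFormationTagPlatformResourceId,
    LakeFormationTagSyncContext,
)

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
repo = PlatformResourceRepository(graph)  # assumed constructor; see external_entities.py

context = LakeFormationTagSyncContext(platform_instance="prod", catalog=None)

# Resolve an existing mapping for a DataHub tag URN, or mint a new id for it.
tag_id = LakeFormationTagPlatformResourceId.from_datahub_urn(
    "urn:li:tag:pii",  # hypothetical tag
    platform_resource_repository=repo,
    tag_sync_context=context,
)

# Load any previously persisted mapping (or an empty one) and serialize it back
# as a PlatformResource that can be emitted to DataHub.
resource = LakeFormationTagPlatformResource.get_from_datahub(tag_id, repo)
platform_resource = resource.as_platform_resource()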
datahub/ingestion/source/azure/azure_common.py
@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
      def get_blob_service_client(self):
          return BlobServiceClient(
              account_url=f"https://{self.account_name}.blob.core.windows.net",
-             credential=f"{self.get_credentials()}",
+             credential=self.get_credentials(),
          )

      def get_data_lake_service_client(self) -> DataLakeServiceClient:
          return DataLakeServiceClient(
              account_url=f"https://{self.account_name}.dfs.core.windows.net",
-             credential=f"{self.get_credentials()}",
+             credential=self.get_credentials(),
          )

      def get_credentials(
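
Though small, this change is behavioural: when get_credentials() returns a credential object (for example a ClientSecretCredential) rather than an account key or SAS token string, the old f-string handed the Azure SDK the object's string representation instead of the object itself. A brief illustration, with placeholder values:

from azure.identity import ClientSecretCredential

cred = ClientSecretCredential("my-tenant-id", "my-client-id", "my-client-secret")

before = f"{cred}"  # just the object's repr, unusable as a credential
after = cred        # the TokenCredential that BlobServiceClient/DataLakeServiceClient expect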
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -4,6 +4,7 @@ import logging
  import os
  from typing import Iterable, List, Optional

+ from datahub.configuration.common import AllowDenyPattern
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.decorators import (
      SupportStatus,
@@ -44,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
      BigQueryQueriesExtractorConfig,
  )
  from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
  from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
  from datahub.ingestion.source.state.redundant_run_skip_handler import (
      RedundantLineageRunSkipHandler,
@@ -77,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
      supported=False,
  )
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+     SourceCapability.CONTAINERS,
+     "Enabled by default",
+     subtype_modifier=[
+         SourceCapabilityModifier.BIGQUERY_PROJECT,
+         SourceCapabilityModifier.BIGQUERY_DATASET,
+     ],
+ )
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
  @capability(
      SourceCapability.DATA_PROFILING,
@@ -99,6 +108,7 @@ def cleanup(config: BigQueryV2Config) -> None:
      SourceCapability.PARTITION_SUPPORT,
      "Enabled by default, partition keys and clustering keys are supported.",
  )
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
  class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
      def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
          super().__init__(config, ctx)
@@ -241,7 +251,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
              ).workunit_processor,
          ]

+     def _warn_deprecated_configs(self):
+         if (
+             self.config.match_fully_qualified_names is not None
+             and not self.config.match_fully_qualified_names
+             and self.config.schema_pattern is not None
+             and self.config.schema_pattern != AllowDenyPattern.allow_all()
+         ):
+             self.report.report_warning(
+                 message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                 "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                 "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                 context="Config option deprecation warning",
+                 title="Config option deprecation warning",
+             )
+
      def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         self._warn_deprecated_configs()
          projects = get_projects(
              self.bq_schema_extractor.schema_api,
              self.report,
@@ -270,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
              ):
                  return

-             with self.report.new_stage(
-                 f"*: {QUERIES_EXTRACTION}"
-             ), BigQueryQueriesExtractor(
-                 connection=self.config.get_bigquery_client(),
-                 schema_api=self.bq_schema_extractor.schema_api,
-                 config=BigQueryQueriesExtractorConfig(
-                     window=self.config,
-                     user_email_pattern=self.config.usage.user_email_pattern,
-                     include_lineage=self.config.include_table_lineage,
-                     include_usage_statistics=self.config.include_usage_statistics,
-                     include_operations=self.config.usage.include_operational_stats,
-                     include_queries=self.config.include_queries,
-                     include_query_usage_statistics=self.config.include_query_usage_statistics,
-                     top_n_queries=self.config.usage.top_n_queries,
-                     region_qualifiers=self.config.region_qualifiers,
-                 ),
-                 structured_report=self.report,
-                 filters=self.filters,
-                 identifiers=self.identifiers,
-                 schema_resolver=self.sql_parser_schema_resolver,
-                 discovered_tables=self.bq_schema_extractor.table_refs,
-             ) as queries_extractor:
+             with (
+                 self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                 BigQueryQueriesExtractor(
+                     connection=self.config.get_bigquery_client(),
+                     schema_api=self.bq_schema_extractor.schema_api,
+                     config=BigQueryQueriesExtractorConfig(
+                         window=self.config,
+                         user_email_pattern=self.config.usage.user_email_pattern,
+                         include_lineage=self.config.include_table_lineage,
+                         include_usage_statistics=self.config.include_usage_statistics,
+                         include_operations=self.config.usage.include_operational_stats,
+                         include_queries=self.config.include_queries,
+                         include_query_usage_statistics=self.config.include_query_usage_statistics,
+                         top_n_queries=self.config.usage.top_n_queries,
+                         region_qualifiers=self.config.region_qualifiers,
+                     ),
+                     structured_report=self.report,
+                     filters=self.filters,
+                     identifiers=self.identifiers,
+                     schema_resolver=self.sql_parser_schema_resolver,
+                     discovered_tables=self.bq_schema_extractor.table_refs,
+                 ) as queries_extractor,
+             ):
                  self.report.queries_extractor = queries_extractor.report
                  yield from queries_extractor.get_workunits_internal()
          else:
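
For reference, the new _warn_deprecated_configs check only fires when match_fully_qualified_names is explicitly set to False while schema_pattern has been customised. A quick sketch of that condition, with an illustrative pattern:

from datahub.configuration.common import AllowDenyPattern

match_fully_qualified_names = False  # explicitly set by the user
schema_pattern = AllowDenyPattern(allow=["myproject.marketing"])  # hypothetical pattern

# Mirrors the condition inside _warn_deprecated_configs() above.
should_warn = (
    match_fully_qualified_names is not None
    and not match_fully_qualified_names
    and schema_pattern is not None
    and schema_pattern != AllowDenyPattern.allow_all()
)
print(should_warn)  # True -> the deprecation warning is reported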
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -342,7 +342,7 @@ class BigQueryV2Config(
      )

      use_queries_v2: bool = Field(
-         default=False,
+         default=True,
          description="If enabled, uses the new queries extractor to extract queries from bigquery.",
      )
      include_queries: bool = Field(
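
With use_queries_v2 now defaulting to True, the newer queries extractor runs out of the box; recipes that relied on the old behaviour must opt out explicitly. A hedged sketch of the override, with the source config expressed as the Python dict a recipe's source section deserialises into (the project id is illustrative):

source = {
    "type": "bigquery",
    "config": {
        "project_ids": ["my-gcp-project"],  # illustrative
        "use_queries_v2": False,  # restore the previous default
    },
}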
datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -94,3 +94,4 @@ class BigQueryQueriesSource(Source):
      def close(self) -> None:
          self.queries_extractor.close()
          self.connection.close()
+         super().close()
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -286,6 +286,7 @@ class BigQuerySchemaGenerator:
          yield from gen_database_container(
              database=database,
              name=database,
+             qualified_name=database,
              sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
              domain_registry=self.domain_registry,
              domain_config=self.config.domain,
@@ -332,6 +333,7 @@ class BigQuerySchemaGenerator:
          yield from gen_schema_container(
              database=project_id,
              schema=dataset,
+             qualified_name=f"{project_id}.{dataset}",
              sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
              domain_registry=self.domain_registry,
              domain_config=self.config.domain,
datahub/ingestion/source/bigquery_v2/common.py
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
          )

      def gen_user_urn(self, user_email: str) -> str:
-         return make_user_urn(user_email.split("@")[0])
+         return make_user_urn(user_email)

      def make_data_platform_urn(self) -> str:
          return make_data_platform_urn(self.platform)
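
Note that dropping the split("@") changes BigQuery user identities: user URNs are now keyed by the full email address rather than the local part. Roughly, with an illustrative address:

from datahub.emitter.mce_builder import make_user_urn

email = "jdoe@example.com"

old_urn = make_user_urn(email.split("@")[0])  # urn:li:corpuser:jdoe
new_urn = make_user_urn(email)  # urn:li:corpuser:jdoe@example.com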
datahub/ingestion/source/bigquery_v2/profiler.py
@@ -189,6 +189,7 @@ WHERE

          if len(profile_requests) == 0:
              return
+
          yield from self.generate_profile_workunits(
              profile_requests,
              max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
              db_name, schema_name, bq_table, self.config.profiling.partition_datetime
          )

-         if partition is None and bq_table.partition_info:
+         # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+         if partition is None and bq_table.partition_info and bq_table.rows_count:
              self.report.report_warning(
                  title="Profile skipped for partitioned table",
-                 message="profile skipped as partitioned table is empty or partition id or type was invalid",
+                 message="profile skipped as partition id or type was invalid",
                  context=profile_request.pretty_name,
              )
              return None
datahub/ingestion/source/bigquery_v2/queries.py
@@ -45,12 +45,12 @@ SELECT
    tos.OPTION_VALUE as comment,
    t.is_insertable_into,
    t.ddl,
-   ts.row_count,
+   ts.row_count as row_count,
    ts.size_bytes as bytes,
    p.num_partitions,
    p.max_partition_id,
-   p.active_billable_bytes,
-   p.long_term_billable_bytes,
+   p.active_billable_bytes as active_billable_bytes,
+   IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
    REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
    REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base

datahub/ingestion/source/cassandra/cassandra.py
@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
  @capability(
      SourceCapability.DELETION_DETECTION,
-     "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+     "Enabled by default via stateful ingestion",
      supported=True,
  )
  class CassandraSource(StatefulIngestionSourceBase):
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,11 +70,12 @@ class CassandraProfiler:
      ) -> Iterable[MetadataWorkUnit]:
          for keyspace_name in cassandra_data.keyspaces:
              tables = cassandra_data.tables.get(keyspace_name, [])
-             with self.report.new_stage(
-                 f"{keyspace_name}: {PROFILING}"
-             ), ThreadPoolExecutor(
-                 max_workers=self.config.profiling.max_workers
-             ) as executor:
+             with (
+                 self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                 ThreadPoolExecutor(
+                     max_workers=self.config.profiling.max_workers
+                 ) as executor,
+             ):
                  future_to_dataset = {
                      executor.submit(
                          self.generate_profile,
datahub/ingestion/source/common/subtypes.py
@@ -1,5 +1,10 @@
+ import logging
+ from typing import Any, Dict
+
  from datahub.utilities.str_enum import StrEnum

+ logger = logging.getLogger(__name__)
+

  class DatasetSubTypes(StrEnum):
      # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
      NEO4J_RELATIONSHIP = "Neo4j Relationship"
      SNOWFLAKE_STREAM = "Snowflake Stream"
      API_ENDPOINT = "API Endpoint"
+     SLACK_CHANNEL = "Slack Channel"
+     PROJECTIONS = "Projections"

      # TODO: Create separate entity...
      NOTEBOOK = "Notebook"
@@ -52,6 +59,8 @@ class BIContainerSubTypes(StrEnum):
      LOOKER_FOLDER = "Folder"
      LOOKML_PROJECT = "LookML Project"
      LOOKML_MODEL = "LookML Model"
+     TABLEAU_SITE = "Site"
+     TABLEAU_PROJECT = "Project"
      TABLEAU_WORKBOOK = "Workbook"
      POWERBI_DATASET = "Semantic Model"
      POWERBI_DATASET_TABLE = "Table"
@@ -74,6 +83,9 @@ class JobContainerSubTypes(StrEnum):


  class BIAssetSubTypes(StrEnum):
+     DASHBOARD = "Dashboard"
+     CHART = "Chart"
+
      # Generic SubTypes
      REPORT = "Report"

@@ -116,3 +128,36 @@ class MLAssetSubTypes(StrEnum):
      VERTEX_PIPELINE = "Pipeline Job"
      VERTEX_PIPELINE_TASK = "Pipeline Task"
      VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+ def create_source_capability_modifier_enum():
+     all_values: Dict[str, Any] = {}
+     source_enums = [
+         DatasetSubTypes,
+         DatasetContainerSubTypes,
+         BIContainerSubTypes,
+         FlowContainerSubTypes,
+         JobContainerSubTypes,
+         BIAssetSubTypes,
+         MLAssetSubTypes,
+     ]
+
+     for enum_class in source_enums:
+         for member in enum_class:  # type: ignore[var-annotated]
+             if member.name in all_values:
+                 logger.debug(
+                     f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                 )
+                 continue
+             all_values[member.name] = member.value
+
+     enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+     for name, value in all_values.items():
+         enum_code += f' {name} = "{value}"\n'
+
+     exec(enum_code, globals())
+     return globals()["SourceCapabilityModifier"]
+
+
+ # This will have all values from the enums above
+ SourceCapabilityModifier = create_source_capability_modifier_enum()
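
Because SourceCapabilityModifier is assembled by copying members name-for-name and value-for-value from the subtype enums above, its members compare equal to the originals; the updated @capability(..., subtype_modifier=[...]) calls earlier in this diff (see the BigQuery source hunk) pass these members in. A small sketch:

from datahub.ingestion.source.common.subtypes import (
    DatasetContainerSubTypes,
    SourceCapabilityModifier,
)

# StrEnum members are str subclasses, so equal values compare equal across enums.
assert SourceCapabilityModifier.BIGQUERY_DATASET == DatasetContainerSubTypes.BIGQUERY_DATASET
print(list(SourceCapabilityModifier)[:5])  # first few merged members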