acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (161)
  1. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/METADATA +2378 -2380
  2. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/RECORD +161 -161
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/delete_cli.py +16 -2
  12. datahub/cli/docker_cli.py +6 -6
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  24. datahub/ingestion/api/report.py +1 -2
  25. datahub/ingestion/api/source_helpers.py +1 -1
  26. datahub/ingestion/extractor/json_schema_util.py +3 -3
  27. datahub/ingestion/extractor/schema_util.py +3 -5
  28. datahub/ingestion/fs/s3_fs.py +3 -3
  29. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  30. datahub/ingestion/graph/client.py +4 -6
  31. datahub/ingestion/run/pipeline.py +8 -7
  32. datahub/ingestion/run/pipeline_config.py +3 -3
  33. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  34. datahub/ingestion/source/abs/source.py +19 -8
  35. datahub/ingestion/source/aws/glue.py +11 -11
  36. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  37. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  38. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  39. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  40. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  42. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  43. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  44. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  45. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  46. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  47. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  48. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  49. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  50. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  51. datahub/ingestion/source/csv_enricher.py +29 -29
  52. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  53. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  54. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  55. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  56. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  57. datahub/ingestion/source/elastic_search.py +4 -4
  58. datahub/ingestion/source/fivetran/config.py +4 -0
  59. datahub/ingestion/source/fivetran/fivetran.py +15 -5
  60. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
  61. datahub/ingestion/source/gcs/gcs_source.py +5 -3
  62. datahub/ingestion/source/ge_data_profiler.py +4 -5
  63. datahub/ingestion/source/ge_profiling_config.py +3 -3
  64. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  65. datahub/ingestion/source/identity/azure_ad.py +3 -3
  66. datahub/ingestion/source/identity/okta.py +3 -3
  67. datahub/ingestion/source/kafka/kafka.py +11 -9
  68. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  69. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  70. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  71. datahub/ingestion/source/looker/looker_common.py +19 -19
  72. datahub/ingestion/source/looker/looker_config.py +3 -3
  73. datahub/ingestion/source/looker/looker_source.py +25 -25
  74. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  75. datahub/ingestion/source/looker/looker_usage.py +5 -7
  76. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  77. datahub/ingestion/source/looker/lookml_source.py +13 -15
  78. datahub/ingestion/source/looker/view_upstream.py +5 -5
  79. datahub/ingestion/source/mlflow.py +4 -4
  80. datahub/ingestion/source/mongodb.py +6 -4
  81. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  82. datahub/ingestion/source/nifi.py +24 -26
  83. datahub/ingestion/source/openapi.py +9 -9
  84. datahub/ingestion/source/powerbi/config.py +12 -12
  85. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  87. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  88. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  89. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  90. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  91. datahub/ingestion/source/redshift/config.py +3 -3
  92. datahub/ingestion/source/redshift/query.py +77 -47
  93. datahub/ingestion/source/redshift/redshift.py +12 -12
  94. datahub/ingestion/source/redshift/usage.py +8 -8
  95. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  96. datahub/ingestion/source/s3/source.py +1 -1
  97. datahub/ingestion/source/salesforce.py +26 -25
  98. datahub/ingestion/source/schema/json_schema.py +1 -1
  99. datahub/ingestion/source/sigma/sigma.py +3 -3
  100. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  101. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  102. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  103. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  104. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  105. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  106. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  107. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  108. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  109. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  110. datahub/ingestion/source/sql/athena.py +1 -3
  111. datahub/ingestion/source/sql/clickhouse.py +8 -14
  112. datahub/ingestion/source/sql/oracle.py +1 -3
  113. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  114. datahub/ingestion/source/sql/teradata.py +16 -3
  115. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  116. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  117. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  118. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  119. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  120. datahub/ingestion/source/tableau/tableau.py +48 -49
  121. datahub/ingestion/source/unity/config.py +3 -1
  122. datahub/ingestion/source/unity/proxy.py +1 -1
  123. datahub/ingestion/source/unity/source.py +3 -3
  124. datahub/ingestion/source/unity/usage.py +3 -1
  125. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  126. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  127. datahub/ingestion/source/usage/usage_common.py +1 -1
  128. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  129. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  130. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  131. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  132. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  133. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  134. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  135. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  136. datahub/lite/duckdb_lite.py +12 -10
  137. datahub/metadata/_schema_classes.py +1 -1
  138. datahub/metadata/schema.avsc +6 -2
  139. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  140. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  141. datahub/secret/secret_common.py +14 -8
  142. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  143. datahub/sql_parsing/schema_resolver.py +5 -10
  144. datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
  145. datahub/sql_parsing/sqlglot_lineage.py +5 -4
  146. datahub/sql_parsing/sqlglot_utils.py +3 -2
  147. datahub/telemetry/stats.py +1 -2
  148. datahub/testing/mcp_diff.py +1 -1
  149. datahub/utilities/file_backed_collections.py +10 -10
  150. datahub/utilities/hive_schema_to_avro.py +2 -2
  151. datahub/utilities/logging_manager.py +2 -2
  152. datahub/utilities/lossy_collections.py +3 -3
  153. datahub/utilities/mapping.py +3 -3
  154. datahub/utilities/serialized_lru_cache.py +3 -1
  155. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  156. datahub/utilities/sqllineage_patch.py +1 -1
  157. datahub/utilities/stats_collections.py +3 -1
  158. datahub/utilities/urns/urn_iter.py +2 -2
  159. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/WHEEL +0 -0
  160. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/entry_points.txt +0 -0
  161. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/dbt/dbt_cloud.py

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """

  _DBT_FIELDS_BY_TYPE = {
  "models": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
- { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  dependsOn
  materializedType
  """,
  "seeds": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
- { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  """,
  "sources": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
  identifier
  sourceName
  sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
  loader
  """,
  "snapshots": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
- { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  parentsSources {{
  uniqueId
  }}
@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
  }}
  """,
  "tests": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
  state
  columnName
  status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
  res = response.json()
  if "errors" in res:
  raise ValueError(
- f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
+ f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
  )
  data = res["data"]
  except JSONDecodeError as e:

datahub/ingestion/source/dbt/dbt_common.py

@@ -506,16 +506,18 @@ class DBTNode:
  materialization: Optional[str] # table, view, ephemeral, incremental, snapshot
  # see https://docs.getdbt.com/reference/artifacts/manifest-json
  catalog_type: Optional[str]
- missing_from_catalog: bool # indicates if the node was missing from the catalog.json
+ missing_from_catalog: (
+ bool # indicates if the node was missing from the catalog.json
+ )

  owner: Optional[str]

  columns: List[DBTColumn] = field(default_factory=list)
  upstream_nodes: List[str] = field(default_factory=list) # list of upstream dbt_name
  upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
- raw_sql_parsing_result: Optional[
- SqlParsingResult
- ] = None # only set for nodes that don't depend on ephemeral models
+ raw_sql_parsing_result: Optional[SqlParsingResult] = (
+ None # only set for nodes that don't depend on ephemeral models
+ )
  cll_debug_info: Optional[SqlParsingDebugInfo] = None

  meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
  "platform": DBT_PLATFORM,
  "name": node.dbt_name,
  "instance": self.config.platform_instance,
+ # Ideally we'd include the env unconditionally. However, we started out
+ # not including env in the guid, so we need to maintain backwards compatibility
+ # with existing PROD assertions.
  **(
- # Ideally we'd include the env unconditionally. However, we started out
- # not including env in the guid, so we need to maintain backwards compatibility
- # with existing PROD assertions.
  {"env": self.config.env}
  if self.config.env != mce_builder.DEFAULT_ENV
  and self.config.include_env_in_assertion_guid

datahub/ingestion/source/dremio/dremio_api.py

@@ -181,7 +181,7 @@ class DremioAPIOperations:
  return

  # On-prem Dremio authentication (PAT or Basic Auth)
- for retry in range(1, self._retry_count + 1):
+ for _ in range(1, self._retry_count + 1):
  try:
  if connection_args.authentication_method == "PAT":
  self.session.headers.update(
@@ -191,9 +191,9 @@ class DremioAPIOperations:
  )
  return
  else:
- assert (
- connection_args.username and connection_args.password
- ), "Username and password are required for authentication"
+ assert connection_args.username and connection_args.password, (
+ "Username and password are required for authentication"
+ )
  host = connection_args.hostname
  port = connection_args.port
  protocol = "https" if connection_args.tls else "http"
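
Most of the hunks in this diff are mechanical restyling rather than behavior changes: multi-line assert statements are rewritten so that the message, not the condition, carries the parentheses, and long subscripted or annotated assignments are rewritten so that the right-hand side is parenthesized. A minimal sketch of the assert pattern, with placeholder values so it runs standalone (illustrative Python, not code from the package):

    # Placeholder values for the sketch.
    username, password = "admin", "secret"

    # Old layout: the condition is wrapped in parentheses and the message
    # trails the closing parenthesis.
    assert (
        username and password
    ), "Username and password are required for authentication"

    # New layout: the condition stays on one line and the message is
    # parenthesized instead.
    assert username and password, (
        "Username and password are required for authentication"
    )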

datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
  Add a new source type if not in the map (e.g., Dremio ARP).
  """
  dremio_source_type = dremio_source_type.upper()
- DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
- dremio_source_type
- ] = datahub_source_type
+ DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+ datahub_source_type
+ )

  if category:
  if category.lower() == "file_object_storage":

datahub/ingestion/source/elastic_search.py

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:

  @staticmethod
  def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
- type_class: Optional[
- Type
- ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
- elastic_column_type
+ type_class: Optional[Type] = (
+ ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+ elastic_column_type
+ )
  )
  if type_class is None:
  logger.warning(

datahub/ingestion/source/fivetran/config.py

@@ -167,6 +167,10 @@ class PlatformDetail(ConfigModel):
  description="The database that all assets produced by this connector belong to. "
  "For destinations, this defaults to the fivetran log config's database.",
  )
+ include_schema_in_urn: bool = pydantic.Field(
+ default=True,
+ description="Include schema in the dataset URN. In some cases, the schema is not relevant to the dataset URN and Fivetran sets it to the source and destination table names in the connector.",
+ )


  class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

datahub/ingestion/source/fivetran/fivetran.py

@@ -119,21 +119,31 @@ class FivetranSource(StatefulIngestionSourceBase):
  )

  for lineage in connector.lineage:
+ source_table = (
+ lineage.source_table
+ if source_details.include_schema_in_urn
+ else lineage.source_table.split(".", 1)[1]
+ )
  input_dataset_urn = DatasetUrn.create_from_ids(
  platform_id=source_details.platform,
  table_name=(
- f"{source_details.database.lower()}.{lineage.source_table}"
+ f"{source_details.database.lower()}.{source_table}"
  if source_details.database
- else lineage.source_table
+ else source_table
  ),
  env=source_details.env,
  platform_instance=source_details.platform_instance,
  )
  input_dataset_urn_list.append(input_dataset_urn)

+ destination_table = (
+ lineage.destination_table
+ if destination_details.include_schema_in_urn
+ else lineage.destination_table.split(".", 1)[1]
+ )
  output_dataset_urn = DatasetUrn.create_from_ids(
  platform_id=destination_details.platform,
- table_name=f"{destination_details.database.lower()}.{lineage.destination_table}",
+ table_name=f"{destination_details.database.lower()}.{destination_table}",
  env=destination_details.env,
  platform_instance=destination_details.platform_instance,
  )
@@ -176,12 +186,12 @@ class FivetranSource(StatefulIngestionSourceBase):
  **{
  f"source.{k}": str(v)
  for k, v in source_details.dict().items()
- if v is not None
+ if v is not None and not isinstance(v, bool)
  },
  **{
  f"destination.{k}": str(v)
  for k, v in destination_details.dict().items()
- if v is not None
+ if v is not None and not isinstance(v, bool)
  },
  )

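The fivetran hunks above add an include_schema_in_urn option on PlatformDetail and use it to optionally drop the schema prefix from source and destination table names before dataset URNs are built. A minimal sketch of that table-name logic (the helper name below is hypothetical; the split behavior mirrors the added code):

    # Hypothetical helper mirroring the logic added in fivetran.py: when
    # include_schema_in_urn is False, everything before the first "." is
    # treated as the schema and dropped.
    def table_name_for_urn(table: str, include_schema_in_urn: bool) -> str:
        return table if include_schema_in_urn else table.split(".", 1)[1]

    assert table_name_for_urn("public.orders", include_schema_in_urn=True) == "public.orders"
    assert table_name_for_urn("public.orders", include_schema_in_urn=False) == "orders"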

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -155,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
  current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
  self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
  if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
- self.report.sample_hard_deleted_aspects_by_type[
- entity_type
- ] = LossyList()
+ self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+ LossyList()
+ )
  self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

  def delete_entity(self, urn: str) -> None:

datahub/ingestion/source/gcs/gcs_source.py

@@ -88,6 +88,7 @@ class GCSSource(StatefulIngestionSourceBase):
  super().__init__(config, ctx)
  self.config = config
  self.report = GCSSourceReport()
+ self.platform: str = PLATFORM_GCS
  self.s3_source = self.create_equivalent_s3_source(ctx)

  @classmethod
@@ -135,14 +136,15 @@ class GCSSource(StatefulIngestionSourceBase):

  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
  config = self.create_equivalent_s3_config()
- return self.s3_source_overrides(S3Source(config, ctx))
+ return self.s3_source_overrides(S3Source(config, PipelineContext(ctx.run_id)))

  def s3_source_overrides(self, source: S3Source) -> S3Source:
  source.source_config.platform = PLATFORM_GCS

  source.is_s3_platform = lambda: True # type: ignore
- source.create_s3_path = lambda bucket_name, key: unquote(f"s3://{bucket_name}/{key}") # type: ignore
-
+ source.create_s3_path = lambda bucket_name, key: unquote( # type: ignore
+ f"s3://{bucket_name}/{key}"
+ )
  return source

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

datahub/ingestion/source/ge_data_profiler.py

@@ -1,3 +1,5 @@
+ from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
+
  import collections
  import concurrent.futures
  import contextlib
@@ -10,7 +12,6 @@ import threading
  import traceback
  import unittest.mock
  import uuid
- from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
  from functools import lru_cache
  from typing import (
  TYPE_CHECKING,
@@ -326,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:


  def _run_with_query_combiner(
- method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+ method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
  ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
  @functools.wraps(method)
  def inner(
@@ -1536,9 +1537,7 @@ def create_bigquery_temp_table(
  query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
  # In google-cloud-bigquery 3.15.0, the _query_job attribute was
  # made public and renamed to query_job.
- cursor.query_job
- if hasattr(cursor, "query_job")
- else cursor._query_job # type: ignore[attr-defined]
+ cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job # type: ignore[attr-defined]
  )
  assert query_job
  temp_destination_table = query_job.destination

datahub/ingestion/source/ge_profiling_config.py

@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
  )
  values[field_level_metric] = False

- assert (
- max_num_fields_to_profile is None
- ), f"{max_num_fields_to_profile_key} should be set to None"
+ assert max_num_fields_to_profile is None, (
+ f"{max_num_fields_to_profile_key} should be set to None"
+ )

  # Disable expensive queries.
  if values.get("turn_off_expensive_profiling_metrics"):

datahub/ingestion/source/iceberg/iceberg.py

@@ -296,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
  custom_properties["snapshot-id"] = str(
  table.current_snapshot().snapshot_id
  )
- custom_properties[
- "manifest-list"
- ] = table.current_snapshot().manifest_list
+ custom_properties["manifest-list"] = (
+ table.current_snapshot().manifest_list
+ )
  dataset_properties = DatasetPropertiesClass(
  name=table.name()[-1],
  description=table.metadata.properties.get("comment", None),

datahub/ingestion/source/identity/azure_ad.py

@@ -354,9 +354,9 @@ class AzureADSource(StatefulIngestionSourceBase):
  yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp)

  # Populate GroupMembership Aspects for CorpUsers
- datahub_corp_user_urn_to_group_membership: Dict[
- str, GroupMembershipClass
- ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+ datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+ defaultdict(lambda: GroupMembershipClass(groups=[]))
+ )
  if (
  self.config.ingest_group_membership
  and len(self.selected_azure_ad_groups) > 0

datahub/ingestion/source/identity/okta.py

@@ -344,9 +344,9 @@ class OktaSource(StatefulIngestionSourceBase):
  ).as_workunit()

  # Step 2: Populate GroupMembership Aspects for CorpUsers
- datahub_corp_user_urn_to_group_membership: Dict[
- str, GroupMembershipClass
- ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+ datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+ defaultdict(lambda: GroupMembershipClass(groups=[]))
+ )
  if self.config.ingest_group_membership and okta_groups is not None:
  # Fetch membership for each group.
  for okta_group in okta_groups:

datahub/ingestion/source/kafka/kafka.py

@@ -419,10 +419,10 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
  custom_props = self.build_custom_properties(
  topic, topic_detail, extra_topic_config
  )
- schema_name: Optional[
- str
- ] = self.schema_registry_client._get_subject_for_topic(
- topic, is_key_schema=False
+ schema_name: Optional[str] = (
+ self.schema_registry_client._get_subject_for_topic(
+ topic, is_key_schema=False
+ )
  )
  if schema_name is not None:
  custom_props["Schema Name"] = schema_name
@@ -610,11 +610,13 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

  def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]:
  logger.info("Fetching config details for all topics")
- configs: Dict[
- ConfigResource, concurrent.futures.Future
- ] = self.admin_client.describe_configs(
- resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics],
- request_timeout=self.source_config.connection.client_timeout_seconds,
+ configs: Dict[ConfigResource, concurrent.futures.Future] = (
+ self.admin_client.describe_configs(
+ resources=[
+ ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics
+ ],
+ request_timeout=self.source_config.connection.client_timeout_seconds,
+ )
  )
  logger.debug("Waiting for config details futures to complete")
  concurrent.futures.wait(configs.values())

datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -110,9 +110,8 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
  connector_manifest = self._get_connector_manifest(
  connector_name, connector_url
  )
- if (
- connector_manifest is None
- or not self.config.connector_patterns.allowed(connector_manifest.name)
+ if connector_manifest is None or not self.config.connector_patterns.allowed(
+ connector_manifest.name
  ):
  self.report.report_dropped(connector_name)
  continue

datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -199,9 +199,9 @@ class BigQuerySinkConnector(BaseConnector):
  transforms.append(transform)
  for key in self.connector_manifest.config.keys():
  if key.startswith(f"transforms.{name}."):
- transform[
- key.replace(f"transforms.{name}.", "")
- ] = self.connector_manifest.config[key]
+ transform[key.replace(f"transforms.{name}.", "")] = (
+ self.connector_manifest.config[key]
+ )

  if "defaultDataset" in connector_manifest.config:
  defaultDataset = connector_manifest.config["defaultDataset"]

datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -123,9 +123,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
  transforms.append(transform)
  for key in self.connector_manifest.config.keys():
  if key.startswith(f"transforms.{name}."):
- transform[
- key.replace(f"transforms.{name}.", "")
- ] = self.connector_manifest.config[key]
+ transform[key.replace(f"transforms.{name}.", "")] = (
+ self.connector_manifest.config[key]
+ )

  return self.JdbcParser(
  db_connection_url,

datahub/ingestion/source/looker/looker_common.py

@@ -596,9 +596,9 @@ class LookerUtil:

  @staticmethod
  def _extract_view_from_field(field: str) -> str:
- assert (
- field.count(".") == 1
- ), f"Error: A field must be prefixed by a view name, field is: {field}"
+ assert field.count(".") == 1, (
+ f"Error: A field must be prefixed by a view name, field is: {field}"
+ )
  return field.split(".")[0]

  @staticmethod
@@ -815,9 +815,9 @@ class LookerExplore:
  project_name: Optional[str] = None
  label: Optional[str] = None
  description: Optional[str] = None
- upstream_views: Optional[
- List[ProjectInclude]
- ] = None # captures the view name(s) this explore is derived from
+ upstream_views: Optional[List[ProjectInclude]] = (
+ None # captures the view name(s) this explore is derived from
+ )
  upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field(
  default_factory=dict
  ) # view_name is key and file_path is value. A single file may contains multiple views
@@ -889,7 +889,7 @@ class LookerExplore:
  upstream_views.extend(parsed_explore.upstream_views or [])
  else:
  logger.warning(
- f'Could not find extended explore {extended_explore} for explore {dict["name"]} in model {model_name}'
+ f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}"
  )
  else:
  # we only fallback to the view_names list if this is not an extended explore
@@ -903,7 +903,7 @@
  )
  if not info:
  logger.warning(
- f'Could not resolve view {view_name} for explore {dict["name"]} in model {model_name}'
+ f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}"
  )
  else:
  upstream_views.append(
@@ -935,9 +935,9 @@ class LookerExplore:
  try:
  explore = client.lookml_model_explore(model, explore_name)
  views: Set[str] = set()
- lkml_fields: List[
- LookmlModelExploreField
- ] = explore_field_set_to_lkml_fields(explore)
+ lkml_fields: List[LookmlModelExploreField] = (
+ explore_field_set_to_lkml_fields(explore)
+ )

  if explore.view_name is not None and explore.view_name != explore.name:
  # explore is not named after a view and is instead using a from field, which is modeled as view_name.
@@ -1034,9 +1034,9 @@ class LookerExplore:
  if measure_field.name is None:
  continue
  else:
- field_name_vs_raw_explore_field[
- measure_field.name
- ] = measure_field
+ field_name_vs_raw_explore_field[measure_field.name] = (
+ measure_field
+ )

  view_fields.append(
  ViewField(
@@ -1072,11 +1072,11 @@ class LookerExplore:
  if view_project_map:
  logger.debug(f"views and their projects: {view_project_map}")

- upstream_views_file_path: Dict[
- str, Optional[str]
- ] = create_upstream_views_file_path_map(
- lkml_fields=lkml_fields,
- view_names=views,
+ upstream_views_file_path: Dict[str, Optional[str]] = (
+ create_upstream_views_file_path_map(
+ lkml_fields=lkml_fields,
+ view_names=views,
+ )
  )
  if upstream_views_file_path:
  logger.debug(f"views and their file-paths: {upstream_views_file_path}")

datahub/ingestion/source/looker/looker_config.py

@@ -166,9 +166,9 @@ def _get_generic_definition(
  # e.g. spark1 or hive2 or druid_18
  platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0])

- assert (
- platform is not None
- ), f"Failed to extract a valid platform from connection {looker_connection}"
+ assert platform is not None, (
+ f"Failed to extract a valid platform from connection {looker_connection}"
+ )
  db = looker_connection.database
  schema = looker_connection.schema # ok for this to be None
  return platform, db, schema

datahub/ingestion/source/looker/looker_source.py

@@ -250,9 +250,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  @staticmethod
  def _extract_view_from_field(field: str) -> str:
- assert (
- field.count(".") == 1
- ), f"Error: A field must be prefixed by a view name, field is: {field}"
+ assert field.count(".") == 1, (
+ f"Error: A field must be prefixed by a view name, field is: {field}"
+ )
  return field.split(".")[0]

  def _get_views_from_fields(self, fields: List[str]) -> List[str]:
@@ -610,12 +610,12 @@
  def _create_platform_instance_aspect(
  self,
  ) -> DataPlatformInstance:
- assert (
- self.source_config.platform_name
- ), "Platform name is not set in the configuration."
- assert (
- self.source_config.platform_instance
- ), "Platform instance is not set in the configuration."
+ assert self.source_config.platform_name, (
+ "Platform name is not set in the configuration."
+ )
+ assert self.source_config.platform_instance, (
+ "Platform instance is not set in the configuration."
+ )

  return DataPlatformInstance(
  platform=builder.make_data_platform_urn(self.source_config.platform_name),
@@ -1016,9 +1016,9 @@
  yield from chart_events

  # Step 2: Emit metadata events for the Dashboard itself.
- chart_urns: Set[
- str
- ] = set() # Collect the unique child chart urns for dashboard input lineage.
+ chart_urns: Set[str] = (
+ set()
+ ) # Collect the unique child chart urns for dashboard input lineage.
  for chart_event in chart_events:
  chart_event_urn = self._extract_event_urn(chart_event)
  if chart_event_urn:
@@ -1538,20 +1538,20 @@
  }
  )

- dashboard_element: Optional[
- LookerDashboardElement
- ] = self._get_looker_dashboard_element(
- DashboardElement(
- id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes),
- # we add the "looks_" prefix to look.id.
- title=look.title,
- subtitle_text=look.description,
- look_id=look.id,
- dashboard_id=None, # As this is an independent look
- look=LookWithQuery(
- query=query, folder=look.folder, user_id=look.user_id
+ dashboard_element: Optional[LookerDashboardElement] = (
+ self._get_looker_dashboard_element(
+ DashboardElement(
+ id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes),
+ # we add the "looks_" prefix to look.id.
+ title=look.title,
+ subtitle_text=look.description,
+ look_id=look.id,
+ dashboard_id=None, # As this is an independent look
+ look=LookWithQuery(
+ query=query, folder=look.folder, user_id=look.user_id
+ ),
  ),
- ),
+ )
  )

  if dashboard_element is not None:

datahub/ingestion/source/looker/looker_template_language.py

@@ -33,9 +33,9 @@ logger = logging.getLogger(__name__)


  class SpecialVariable:
- SPECIAL_VARIABLE_PATTERN: ClassVar[
- str
- ] = r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+ SPECIAL_VARIABLE_PATTERN: ClassVar[str] = (
+ r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+ )
  liquid_variable: dict

  def __init__(self, liquid_variable):

datahub/ingestion/source/looker/looker_usage.py

@@ -257,9 +257,9 @@ class BaseStatGenerator(ABC):

  for row in rows:
  logger.debug(row)
- entity_stat_aspect[
- self.get_entity_stat_key(row)
- ] = self.to_entity_timeseries_stat_aspect(row)
+ entity_stat_aspect[self.get_entity_stat_key(row)] = (
+ self.to_entity_timeseries_stat_aspect(row)
+ )

  return entity_stat_aspect

@@ -385,10 +385,8 @@ class BaseStatGenerator(ABC):
  entity_rows: List[Dict] = self._execute_query(
  entity_query_with_filters, "entity_query"
  )
- entity_usage_stat: Dict[
- Tuple[str, str], Any
- ] = self._process_entity_timeseries_rows(
- entity_rows
+ entity_usage_stat: Dict[Tuple[str, str], Any] = (
+ self._process_entity_timeseries_rows(entity_rows)
  ) # Any type to pass mypy unbound Aspect type error

  user_wise_query_with_filters: LookerQuery = self._append_filters(