acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (168)
  1. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
  2. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/docker_cli.py +6 -6
  12. datahub/cli/ingest_cli.py +25 -15
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/entrypoints.py +6 -0
  24. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  25. datahub/ingestion/api/report.py +1 -2
  26. datahub/ingestion/api/source_helpers.py +1 -1
  27. datahub/ingestion/extractor/json_schema_util.py +3 -3
  28. datahub/ingestion/extractor/schema_util.py +3 -5
  29. datahub/ingestion/fs/s3_fs.py +3 -3
  30. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  31. datahub/ingestion/graph/client.py +4 -6
  32. datahub/ingestion/run/pipeline.py +8 -7
  33. datahub/ingestion/run/pipeline_config.py +3 -3
  34. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  35. datahub/ingestion/source/abs/source.py +19 -8
  36. datahub/ingestion/source/aws/glue.py +11 -11
  37. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  38. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  39. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  40. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  42. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  43. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  44. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  45. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  46. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  47. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  48. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  49. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  50. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  51. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  52. datahub/ingestion/source/csv_enricher.py +29 -29
  53. datahub/ingestion/source/datahub/config.py +4 -0
  54. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  55. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  56. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  57. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  58. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  59. datahub/ingestion/source/elastic_search.py +4 -4
  60. datahub/ingestion/source/gc/datahub_gc.py +1 -0
  61. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
  62. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  63. datahub/ingestion/source/ge_data_profiler.py +2 -5
  64. datahub/ingestion/source/ge_profiling_config.py +3 -3
  65. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  66. datahub/ingestion/source/identity/azure_ad.py +3 -3
  67. datahub/ingestion/source/identity/okta.py +3 -3
  68. datahub/ingestion/source/kafka/kafka.py +11 -9
  69. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  72. datahub/ingestion/source/looker/looker_common.py +19 -19
  73. datahub/ingestion/source/looker/looker_config.py +3 -3
  74. datahub/ingestion/source/looker/looker_source.py +25 -25
  75. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  76. datahub/ingestion/source/looker/looker_usage.py +5 -7
  77. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  78. datahub/ingestion/source/looker/lookml_source.py +13 -15
  79. datahub/ingestion/source/looker/view_upstream.py +5 -5
  80. datahub/ingestion/source/mlflow.py +4 -4
  81. datahub/ingestion/source/mode.py +5 -5
  82. datahub/ingestion/source/mongodb.py +6 -4
  83. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  84. datahub/ingestion/source/nifi.py +24 -26
  85. datahub/ingestion/source/openapi.py +9 -9
  86. datahub/ingestion/source/powerbi/config.py +12 -12
  87. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  88. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  89. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  90. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  91. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  92. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  93. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  94. datahub/ingestion/source/redshift/config.py +3 -3
  95. datahub/ingestion/source/redshift/redshift.py +12 -12
  96. datahub/ingestion/source/redshift/usage.py +8 -8
  97. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  98. datahub/ingestion/source/s3/source.py +1 -1
  99. datahub/ingestion/source/salesforce.py +26 -25
  100. datahub/ingestion/source/schema/json_schema.py +1 -1
  101. datahub/ingestion/source/sigma/sigma.py +3 -3
  102. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  103. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  104. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  105. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  106. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  107. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  108. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  109. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  110. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  112. datahub/ingestion/source/sql/athena.py +1 -3
  113. datahub/ingestion/source/sql/clickhouse.py +8 -14
  114. datahub/ingestion/source/sql/oracle.py +1 -3
  115. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  116. datahub/ingestion/source/sql/sql_types.py +0 -1
  117. datahub/ingestion/source/sql/teradata.py +16 -3
  118. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  119. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  120. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  121. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  122. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  123. datahub/ingestion/source/tableau/tableau.py +245 -101
  124. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  125. datahub/ingestion/source/unity/config.py +3 -1
  126. datahub/ingestion/source/unity/proxy.py +1 -1
  127. datahub/ingestion/source/unity/source.py +3 -3
  128. datahub/ingestion/source/unity/usage.py +3 -1
  129. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  130. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  131. datahub/ingestion/source/usage/usage_common.py +1 -1
  132. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  133. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  134. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  135. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  136. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  137. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  138. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  139. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  140. datahub/lite/duckdb_lite.py +12 -10
  141. datahub/metadata/_schema_classes.py +1 -1
  142. datahub/metadata/schema.avsc +6 -2
  143. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  144. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  145. datahub/secret/datahub_secrets_client.py +12 -21
  146. datahub/secret/secret_common.py +14 -8
  147. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  148. datahub/sql_parsing/schema_resolver.py +5 -10
  149. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  150. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  151. datahub/sql_parsing/sqlglot_utils.py +1 -1
  152. datahub/telemetry/stats.py +1 -2
  153. datahub/testing/mcp_diff.py +1 -1
  154. datahub/utilities/file_backed_collections.py +10 -10
  155. datahub/utilities/hive_schema_to_avro.py +2 -2
  156. datahub/utilities/logging_manager.py +2 -2
  157. datahub/utilities/lossy_collections.py +3 -3
  158. datahub/utilities/mapping.py +3 -3
  159. datahub/utilities/memory_footprint.py +3 -2
  160. datahub/utilities/serialized_lru_cache.py +3 -1
  161. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  162. datahub/utilities/sqllineage_patch.py +1 -1
  163. datahub/utilities/stats_collections.py +3 -1
  164. datahub/utilities/urns/_urn_base.py +28 -5
  165. datahub/utilities/urns/urn_iter.py +2 -2
  166. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
  167. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
  168. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/csv_enricher.py
@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_glossary_terms_work_unit(
-            entity_urn=entity_urn,
-            term_associations=term_associations,
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1
@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu
 
-        maybe_owners_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_owners_work_unit(
-            entity_urn=entity_urn,
-            owners=owners,
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu
 
-        maybe_domain_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_domain_work_unit(
-            entity_urn=entity_urn,
-            domain=domain,
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu
 
-        maybe_description_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_description_work_unit(
-            entity_urn=entity_urn,
-            description=description,
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1
@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
-            GlossaryTermAssociationClass
-        ] = sub_resource_row.term_associations
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0
@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False
 
-        current_editable_schema_metadata: Optional[
-            EditableSchemaMetadataClass
-        ] = None
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(
@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
             entity_urn = row["resource"]
             entity_type = Urn.from_string(row["resource"]).get_type()
 
-            term_associations: List[
-                GlossaryTermAssociationClass
-            ] = self.maybe_extract_glossary_terms(row)
+            term_associations: List[GlossaryTermAssociationClass] = (
+                self.maybe_extract_glossary_terms(row)
+            )
             tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
             owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)
 
datahub/ingestion/source/datahub/config.py
@@ -25,6 +25,10 @@ DEFAULT_EXCLUDE_ASPECTS = {
     "globalSettingsKey",
     "globalSettingsInfo",
     "testResults",
+    "dataHubExecutionRequestKey",
+    "dataHubExecutionRequestInput",
+    "dataHubExecutionRequestSignal",
+    "dataHubExecutionRequestResult",
 }
 
 
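The four dataHubExecutionRequest* aspects added above extend the default exclusion set used when ingesting metadata from another DataHub instance. A minimal sketch of how such an exclusion set is typically applied; the filter_aspect_rows helper is purely illustrative and not part of the package:

    from typing import Dict, Iterable, Iterator, Set

    DEFAULT_EXCLUDE_ASPECTS: Set[str] = {
        "globalSettingsKey",
        "globalSettingsInfo",
        "testResults",
        "dataHubExecutionRequestKey",
        "dataHubExecutionRequestInput",
        "dataHubExecutionRequestSignal",
        "dataHubExecutionRequestResult",
    }

    def filter_aspect_rows(
        rows: Iterable[Dict[str, str]],
        exclude: Set[str] = DEFAULT_EXCLUDE_ASPECTS,
    ) -> Iterator[Dict[str, str]]:
        # Skip rows whose aspect name is on the exclusion list.
        for row in rows:
            if row.get("aspect") not in exclude:
                yield row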
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
-                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
                     # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
                     # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
                 )
             except Exception as e:
                 logger.warning(
-                    f'Failed to parse metadata for {row["urn"]}: {e}', exc_info=True
+                    f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
                 )
                 self.report.num_database_parse_errors += 1
                 self.report.database_parse_errors.setdefault(
datahub/ingestion/source/dbt/dbt_cloud.py
@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """
 
 _DBT_FIELDS_BY_TYPE = {
     "models": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         dependsOn
         materializedType
     """,
     "seeds": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
     """,
     "sources": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
         identifier
         sourceName
         sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
         loader
     """,
     "snapshots": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         parentsSources {{
             uniqueId
         }}
@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
         }}
     """,
     "tests": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
         state
         columnName
         status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             res = response.json()
             if "errors" in res:
                 raise ValueError(
-                    f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
+                    f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
                 )
             data = res["data"]
         except JSONDecodeError as e:
datahub/ingestion/source/dbt/dbt_common.py
@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog: bool  # indicates if the node was missing from the catalog.json
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )
 
     owner: Optional[str]
 
     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-        SqlParsingResult
-    ] = None  # only set for nodes that don't depend on ephemeral models
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None
 
     meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 "platform": DBT_PLATFORM,
                 "name": node.dbt_name,
                 "instance": self.config.platform_instance,
+                # Ideally we'd include the env unconditionally. However, we started out
+                # not including env in the guid, so we need to maintain backwards compatibility
+                # with existing PROD assertions.
                 **(
-                    # Ideally we'd include the env unconditionally. However, we started out
-                    # not including env in the guid, so we need to maintain backwards compatibility
-                    # with existing PROD assertions.
                     {"env": self.config.env}
                     if self.config.env != mce_builder.DEFAULT_ENV
                     and self.config.include_env_in_assertion_guid
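The guid hunk above only moves the explanatory comment outside the `**(...)` expression. The underlying pattern, conditionally merging the env key into the guid dict so that guids for existing PROD assertions stay stable, can be sketched generically as follows (simplified, not the exact dbt source code):

    DEFAULT_ENV = "PROD"

    def assertion_guid_fields(name: str, env: str, include_env_in_guid: bool) -> dict:
        return {
            "platform": "dbt",
            "name": name,
            # Only add "env" when it differs from the default and the flag is enabled,
            # so guids computed for existing PROD assertions do not change.
            **({"env": env} if env != DEFAULT_ENV and include_env_in_guid else {}),
        }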
datahub/ingestion/source/dremio/dremio_api.py
@@ -181,7 +181,7 @@ class DremioAPIOperations:
             return
 
         # On-prem Dremio authentication (PAT or Basic Auth)
-        for retry in range(1, self._retry_count + 1):
+        for _ in range(1, self._retry_count + 1):
             try:
                 if connection_args.authentication_method == "PAT":
                     self.session.headers.update(
@@ -191,9 +191,9 @@ class DremioAPIOperations:
                     )
                     return
                 else:
-                    assert (
-                        connection_args.username and connection_args.password
-                    ), "Username and password are required for authentication"
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                     host = connection_args.hostname
                     port = connection_args.port
                     protocol = "https" if connection_args.tls else "http"
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py
@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-            dremio_source_type
-        ] = datahub_source_type
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )
 
         if category:
             if category.lower() == "file_object_storage":
datahub/ingestion/source/elastic_search.py
@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:
 
     @staticmethod
     def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            elastic_column_type
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/gc/datahub_gc.py
@@ -292,6 +292,7 @@ class DataHubGcSource(Source):
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
             if tokens == []:
+                # Due to a server bug we cannot rely on just total
                 break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -99,6 +99,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
+    num_calls_made: Dict[str, int] = field(default_factory=dict)
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
@@ -154,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-                entity_type
-            ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
@@ -242,6 +243,11 @@ class SoftDeletedEntitiesCleanup:
 
         while True:
             try:
+                if entity_type not in self.report.num_calls_made:
+                    self.report.num_calls_made[entity_type] = 1
+                else:
+                    self.report.num_calls_made[entity_type] += 1
+                self._print_report()
                 result = self.ctx.graph.execute_graphql(
                     graphql_query,
                     {
@@ -270,7 +276,13 @@ class SoftDeletedEntitiesCleanup:
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities or not scroll_across_entities.get("count"):
+            if not scroll_across_entities:
+                break
+            search_results = scroll_across_entities.get("searchResults")
+            count = scroll_across_entities.get("count")
+            if not count or not search_results:
+                # Due to a server bug we cannot rely on just count as it was returning response like this
+                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
                 break
             if entity_type == "DATA_PROCESS_INSTANCE":
                 # Temp workaround. See note in beginning of the function
@@ -282,7 +294,7 @@ class SoftDeletedEntitiesCleanup:
             self.report.num_entities_found[entity_type] += scroll_across_entities.get(
                 "count"
             )
-            for query in scroll_across_entities.get("searchResults"):
+            for query in search_results:
                 yield query["entity"]["urn"]
 
     def _get_urns(self) -> Iterable[str]:
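The cleanup changes above stop trusting `count` alone because the server can return a page like `{'count': 1, 'nextScrollId': None, 'searchResults': []}`. A simplified sketch of the defensive scroll loop; the client call and variable names are stand-ins for the real ones:

    from typing import Any, Dict, Iterable

    def iter_soft_deleted_urns(
        graph: Any, query: str, variables: Dict[str, Any]
    ) -> Iterable[str]:
        while True:
            result = graph.execute_graphql(query, variables)  # assumed client call
            scroll = result.get("scrollAcrossEntities")
            if not scroll:
                break
            count = scroll.get("count")
            search_results = scroll.get("searchResults")
            # A buggy server may report a non-zero count with an empty result list,
            # so require both before continuing.
            if not count or not search_results:
                break
            for hit in search_results:
                yield hit["entity"]["urn"]
            scroll_id = scroll.get("nextScrollId")
            if not scroll_id:
                break
            variables["scrollId"] = scroll_id  # hypothetical variable name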
datahub/ingestion/source/gcs/gcs_source.py
@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS
 
         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(f"s3://{bucket_name}/{key}")  # type: ignore
-
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
datahub/ingestion/source/ge_data_profiler.py
@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
         "get_column_max",
         "get_column_mean",
         "get_column_stdev",
-        "get_column_stdev",
         "get_column_nonnull_count",
         "get_column_unique_count",
     }
@@ -328,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:
 
 
 def _run_with_query_combiner(
-    method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+    method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
 ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
     @functools.wraps(method)
     def inner(
@@ -1538,9 +1537,7 @@ def create_bigquery_temp_table(
         query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
            # In google-cloud-bigquery 3.15.0, the _query_job attribute was
            # made public and renamed to query_job.
-            cursor.query_job
-            if hasattr(cursor, "query_job")
-            else cursor._query_job  # type: ignore[attr-defined]
+            cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job  # type: ignore[attr-defined]
         )
         assert query_job
         temp_destination_table = query_job.destination
datahub/ingestion/source/ge_profiling_config.py
@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
                 )
                 values[field_level_metric] = False
 
-            assert (
-                max_num_fields_to_profile is None
-            ), f"{max_num_fields_to_profile_key} should be set to None"
+            assert max_num_fields_to_profile is None, (
+                f"{max_num_fields_to_profile_key} should be set to None"
+            )
 
         # Disable expensive queries.
         if values.get("turn_off_expensive_profiling_metrics"):
datahub/ingestion/source/iceberg/iceberg.py
@@ -296,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["snapshot-id"] = str(
                 table.current_snapshot().snapshot_id
             )
-            custom_properties[
-                "manifest-list"
-            ] = table.current_snapshot().manifest_list
+            custom_properties["manifest-list"] = (
+                table.current_snapshot().manifest_list
+            )
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
datahub/ingestion/source/identity/azure_ad.py
@@ -354,9 +354,9 @@ class AzureADSource(StatefulIngestionSourceBase):
                 yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp)
 
         # Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if (
             self.config.ingest_group_membership
             and len(self.selected_azure_ad_groups) > 0
datahub/ingestion/source/identity/okta.py
@@ -344,9 +344,9 @@ class OktaSource(StatefulIngestionSourceBase):
             ).as_workunit()
 
         # Step 2: Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if self.config.ingest_group_membership and okta_groups is not None:
             # Fetch membership for each group.
             for okta_group in okta_groups:
datahub/ingestion/source/kafka/kafka.py
@@ -419,10 +419,10 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
         custom_props = self.build_custom_properties(
             topic, topic_detail, extra_topic_config
         )
-        schema_name: Optional[
-            str
-        ] = self.schema_registry_client._get_subject_for_topic(
-            topic, is_key_schema=False
+        schema_name: Optional[str] = (
+            self.schema_registry_client._get_subject_for_topic(
+                topic, is_key_schema=False
+            )
         )
         if schema_name is not None:
             custom_props["Schema Name"] = schema_name
@@ -610,11 +610,13 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
     def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]:
         logger.info("Fetching config details for all topics")
-        configs: Dict[
-            ConfigResource, concurrent.futures.Future
-        ] = self.admin_client.describe_configs(
-            resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics],
-            request_timeout=self.source_config.connection.client_timeout_seconds,
+        configs: Dict[ConfigResource, concurrent.futures.Future] = (
+            self.admin_client.describe_configs(
+                resources=[
+                    ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics
+                ],
+                request_timeout=self.source_config.connection.client_timeout_seconds,
+            )
         )
         logger.debug("Waiting for config details futures to complete")
         concurrent.futures.wait(configs.values())
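The Kafka hunk above only re-wraps the `describe_configs` call. The control flow, submitting one future per topic ConfigResource and then waiting on all of them, looks roughly like this (a sketch assuming confluent_kafka's AdminClient):

    import concurrent.futures
    from typing import Dict, List

    from confluent_kafka.admin import AdminClient, ConfigResource

    def fetch_topic_configurations(
        admin_client: AdminClient, topics: List[str], timeout: float = 30.0
    ) -> Dict[str, dict]:
        # describe_configs returns a dict mapping each ConfigResource to a Future.
        futures = admin_client.describe_configs(
            resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics],
            request_timeout=timeout,
        )
        concurrent.futures.wait(futures.values())
        # Each future resolves to the config entries for that topic.
        return {resource.name: future.result() for resource, future in futures.items()}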
datahub/ingestion/source/kafka_connect/kafka_connect.py
@@ -110,9 +110,8 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest = self._get_connector_manifest(
                 connector_name, connector_url
             )
-            if (
-                connector_manifest is None
-                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            if connector_manifest is None or not self.config.connector_patterns.allowed(
+                connector_manifest.name
             ):
                 self.report.report_dropped(connector_name)
                 continue
datahub/ingestion/source/kafka_connect/sink_connectors.py
@@ -199,9 +199,9 @@ class BigQuerySinkConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )
 
         if "defaultDataset" in connector_manifest.config:
             defaultDataset = connector_manifest.config["defaultDataset"]
datahub/ingestion/source/kafka_connect/source_connectors.py
@@ -123,9 +123,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )
 
         return self.JdbcParser(
             db_connection_url,
datahub/ingestion/source/looker/looker_common.py
@@ -596,9 +596,9 @@ class LookerUtil:
 
     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-            field.count(".") == 1
-        ), f"Error: A field must be prefixed by a view name, field is: {field}"
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]
 
     @staticmethod
@@ -815,9 +815,9 @@ class LookerExplore:
     project_name: Optional[str] = None
     label: Optional[str] = None
     description: Optional[str] = None
-    upstream_views: Optional[
-        List[ProjectInclude]
-    ] = None  # captures the view name(s) this explore is derived from
+    upstream_views: Optional[List[ProjectInclude]] = (
+        None  # captures the view name(s) this explore is derived from
+    )
     upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field(
         default_factory=dict
     )  # view_name is key and file_path is value. A single file may contains multiple views
@@ -889,7 +889,7 @@ class LookerExplore:
                     upstream_views.extend(parsed_explore.upstream_views or [])
                 else:
                     logger.warning(
-                        f'Could not find extended explore {extended_explore} for explore {dict["name"]} in model {model_name}'
+                        f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}"
                     )
             else:
                 # we only fallback to the view_names list if this is not an extended explore
@@ -903,7 +903,7 @@ class LookerExplore:
                     )
                     if not info:
                         logger.warning(
-                            f'Could not resolve view {view_name} for explore {dict["name"]} in model {model_name}'
+                            f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}"
                        )
                     else:
                         upstream_views.append(
@@ -935,9 +935,9 @@ class LookerExplore:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
-            lkml_fields: List[
-                LookmlModelExploreField
-            ] = explore_field_set_to_lkml_fields(explore)
+            lkml_fields: List[LookmlModelExploreField] = (
+                explore_field_set_to_lkml_fields(explore)
+            )
 
             if explore.view_name is not None and explore.view_name != explore.name:
                 # explore is not named after a view and is instead using a from field, which is modeled as view_name.
@@ -1034,9 +1034,9 @@ class LookerExplore:
                     if measure_field.name is None:
                         continue
                     else:
-                        field_name_vs_raw_explore_field[
-                            measure_field.name
-                        ] = measure_field
+                        field_name_vs_raw_explore_field[measure_field.name] = (
+                            measure_field
+                        )
 
                     view_fields.append(
                         ViewField(
@@ -1072,11 +1072,11 @@ class LookerExplore:
             if view_project_map:
                 logger.debug(f"views and their projects: {view_project_map}")
 
-            upstream_views_file_path: Dict[
-                str, Optional[str]
-            ] = create_upstream_views_file_path_map(
-                lkml_fields=lkml_fields,
-                view_names=views,
+            upstream_views_file_path: Dict[str, Optional[str]] = (
+                create_upstream_views_file_path_map(
+                    lkml_fields=lkml_fields,
+                    view_names=views,
+                )
             )
             if upstream_views_file_path:
                 logger.debug(f"views and their file-paths: {upstream_views_file_path}")
datahub/ingestion/source/looker/looker_config.py
@@ -166,9 +166,9 @@ def _get_generic_definition(
     # e.g. spark1 or hive2 or druid_18
     platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0])
 
-    assert (
-        platform is not None
-    ), f"Failed to extract a valid platform from connection {looker_connection}"
+    assert platform is not None, (
+        f"Failed to extract a valid platform from connection {looker_connection}"
+    )
     db = looker_connection.database
     schema = looker_connection.schema  # ok for this to be None
     return platform, db, schema