acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub may be problematic.

Files changed (161)
  1. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/METADATA +2378 -2380
  2. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/RECORD +161 -161
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/delete_cli.py +16 -2
  12. datahub/cli/docker_cli.py +6 -6
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  24. datahub/ingestion/api/report.py +1 -2
  25. datahub/ingestion/api/source_helpers.py +1 -1
  26. datahub/ingestion/extractor/json_schema_util.py +3 -3
  27. datahub/ingestion/extractor/schema_util.py +3 -5
  28. datahub/ingestion/fs/s3_fs.py +3 -3
  29. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  30. datahub/ingestion/graph/client.py +4 -6
  31. datahub/ingestion/run/pipeline.py +8 -7
  32. datahub/ingestion/run/pipeline_config.py +3 -3
  33. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  34. datahub/ingestion/source/abs/source.py +19 -8
  35. datahub/ingestion/source/aws/glue.py +11 -11
  36. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  37. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  38. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  39. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  40. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  42. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  43. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  44. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  45. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  46. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  47. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  48. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  49. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  50. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  51. datahub/ingestion/source/csv_enricher.py +29 -29
  52. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  53. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  54. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  55. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  56. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  57. datahub/ingestion/source/elastic_search.py +4 -4
  58. datahub/ingestion/source/fivetran/config.py +4 -0
  59. datahub/ingestion/source/fivetran/fivetran.py +15 -5
  60. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
  61. datahub/ingestion/source/gcs/gcs_source.py +5 -3
  62. datahub/ingestion/source/ge_data_profiler.py +4 -5
  63. datahub/ingestion/source/ge_profiling_config.py +3 -3
  64. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  65. datahub/ingestion/source/identity/azure_ad.py +3 -3
  66. datahub/ingestion/source/identity/okta.py +3 -3
  67. datahub/ingestion/source/kafka/kafka.py +11 -9
  68. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  69. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  70. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  71. datahub/ingestion/source/looker/looker_common.py +19 -19
  72. datahub/ingestion/source/looker/looker_config.py +3 -3
  73. datahub/ingestion/source/looker/looker_source.py +25 -25
  74. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  75. datahub/ingestion/source/looker/looker_usage.py +5 -7
  76. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  77. datahub/ingestion/source/looker/lookml_source.py +13 -15
  78. datahub/ingestion/source/looker/view_upstream.py +5 -5
  79. datahub/ingestion/source/mlflow.py +4 -4
  80. datahub/ingestion/source/mongodb.py +6 -4
  81. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  82. datahub/ingestion/source/nifi.py +24 -26
  83. datahub/ingestion/source/openapi.py +9 -9
  84. datahub/ingestion/source/powerbi/config.py +12 -12
  85. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  87. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  88. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  89. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  90. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  91. datahub/ingestion/source/redshift/config.py +3 -3
  92. datahub/ingestion/source/redshift/query.py +77 -47
  93. datahub/ingestion/source/redshift/redshift.py +12 -12
  94. datahub/ingestion/source/redshift/usage.py +8 -8
  95. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  96. datahub/ingestion/source/s3/source.py +1 -1
  97. datahub/ingestion/source/salesforce.py +26 -25
  98. datahub/ingestion/source/schema/json_schema.py +1 -1
  99. datahub/ingestion/source/sigma/sigma.py +3 -3
  100. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  101. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  102. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  103. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  104. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  105. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  106. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  107. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  108. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  109. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  110. datahub/ingestion/source/sql/athena.py +1 -3
  111. datahub/ingestion/source/sql/clickhouse.py +8 -14
  112. datahub/ingestion/source/sql/oracle.py +1 -3
  113. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  114. datahub/ingestion/source/sql/teradata.py +16 -3
  115. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  116. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  117. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  118. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  119. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  120. datahub/ingestion/source/tableau/tableau.py +48 -49
  121. datahub/ingestion/source/unity/config.py +3 -1
  122. datahub/ingestion/source/unity/proxy.py +1 -1
  123. datahub/ingestion/source/unity/source.py +3 -3
  124. datahub/ingestion/source/unity/usage.py +3 -1
  125. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  126. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  127. datahub/ingestion/source/usage/usage_common.py +1 -1
  128. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  129. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  130. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  131. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  132. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  133. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  134. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  135. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  136. datahub/lite/duckdb_lite.py +12 -10
  137. datahub/metadata/_schema_classes.py +1 -1
  138. datahub/metadata/schema.avsc +6 -2
  139. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  140. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  141. datahub/secret/secret_common.py +14 -8
  142. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  143. datahub/sql_parsing/schema_resolver.py +5 -10
  144. datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
  145. datahub/sql_parsing/sqlglot_lineage.py +5 -4
  146. datahub/sql_parsing/sqlglot_utils.py +3 -2
  147. datahub/telemetry/stats.py +1 -2
  148. datahub/testing/mcp_diff.py +1 -1
  149. datahub/utilities/file_backed_collections.py +10 -10
  150. datahub/utilities/hive_schema_to_avro.py +2 -2
  151. datahub/utilities/logging_manager.py +2 -2
  152. datahub/utilities/lossy_collections.py +3 -3
  153. datahub/utilities/mapping.py +3 -3
  154. datahub/utilities/serialized_lru_cache.py +3 -1
  155. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  156. datahub/utilities/sqllineage_patch.py +1 -1
  157. datahub/utilities/stats_collections.py +3 -1
  158. datahub/utilities/urns/urn_iter.py +2 -2
  159. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/WHEEL +0 -0
  160. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/entry_points.txt +0 -0
  161. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/top_level.txt +0 -0
@@ -797,61 +797,91 @@ class RedshiftServerlessQuery(RedshiftCommonQuery):
  db_name: str, start_time: datetime, end_time: datetime
  ) -> str:
  return """
- SELECT
- distinct cluster,
- target_schema,
- target_table,
- username,
- source_schema,
- source_table,
- query_text AS ddl,
- start_time AS timestamp
- FROM
- (
- SELECT
- sti.schema AS target_schema,
- sti.table AS target_table,
- sti.database AS cluster,
- qi.table_id AS target_table_id,
- qi.query_id AS query_id,
- qi.start_time AS start_time
- FROM
- SYS_QUERY_DETAIL qi
- JOIN
- SVV_TABLE_INFO sti on sti.table_id = qi.table_id
- WHERE
- start_time >= '{start_time}' and
- start_time < '{end_time}' and
- cluster = '{db_name}' and
- step_name = 'insert'
- ) AS target_tables
- JOIN
- (
+ WITH queries AS (
  SELECT
- sti.schema AS source_schema,
- sti.table AS source_table,
- qs.table_id AS source_table_id,
- qs.query_id AS query_id,
- sui.user_name AS username,
- LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+ sti.database as cluster,
+ sti.schema AS "schema",
+ sti.table AS "table",
+ qs.table_id AS table_id,
+ qs.query_id as query_id,
+ qs.step_name as step_name,
+ sui.user_name as username,
+ source,
+ MIN(qs.start_time) as "timestamp" -- multiple duplicate records with start_time increasing slightly by miliseconds
  FROM
  SYS_QUERY_DETAIL qs
  JOIN
  SVV_TABLE_INFO sti ON sti.table_id = qs.table_id
  LEFT JOIN
- SYS_QUERY_TEXT qt ON qt.query_id = qs.query_id
- LEFT JOIN
  SVV_USER_INFO sui ON qs.user_id = sui.user_id
  WHERE
- qs.step_name = 'scan' AND
- qs.source = 'Redshift(local)' AND
- qt.sequence < 16 AND -- See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext
- sti.database = '{db_name}' AND -- this was required to not retrieve some internal redshift tables, try removing to see what happens
- sui.user_name <> 'rdsdb' -- not entirely sure about this filter
- GROUP BY sti.schema, sti.table, qs.table_id, qs.query_id, sui.user_name
- ) AS source_tables ON target_tables.query_id = source_tables.query_id
- WHERE source_tables.source_table_id <> target_tables.target_table_id
- ORDER BY cluster, target_schema, target_table, start_time ASC
+ cluster = '{db_name}' AND
+ qs.user_id <> 1 AND -- this is user 'rdsdb'
+ qs.start_time >= '{start_time}' AND
+ qs.start_time < '{end_time}'
+ GROUP BY cluster, "schema", "table", qs.table_id, query_id, step_name, username, source -- to be sure we are not making duplicates ourselves the list of group by must match whatever we use in "group by" and "where" of subsequent queries ("cluster" is already set to single value in this query)
+ ),
+ unique_query_text AS (
+ SELECT
+ query_id,
+ sequence,
+ text
+ FROM (
+ SELECT
+ query_id,
+ "sequence",
+ text,
+ ROW_NUMBER() OVER (
+ PARTITION BY query_id, sequence
+ ) as rn
+ FROM SYS_QUERY_TEXT
+ )
+ WHERE rn = 1
+ ),
+ scan_queries AS (
+ SELECT
+ "schema" as source_schema,
+ "table" as source_table,
+ table_id as source_table_id,
+ queries.query_id as query_id,
+ username,
+ LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+ FROM
+ "queries" LEFT JOIN
+ unique_query_text qt ON qt.query_id = queries.query_id
+ WHERE
+ source = 'Redshift(local)' AND
+ step_name = 'scan' AND
+ qt.sequence < 16 -- truncating query to not exceed Redshift limit on LISTAGG function (each sequence has at most 4k characters, limit is 64k, divided by 4k gives 16, starts count from 0)
+ GROUP BY source_schema, source_table, source_table_id, queries.query_id, username
+ ),
+ insert_queries AS (
+ SELECT
+ "schema" as target_schema,
+ "table" as target_table,
+ table_id as target_table_id,
+ query_id,
+ cluster,
+ min("timestamp") as "timestamp"
+ FROM
+ queries
+ WHERE
+ step_name = 'insert'
+ GROUP BY cluster, target_schema, target_table, target_table_id, query_id
+ )
+ SELECT
+ cluster,
+ target_schema,
+ target_table,
+ username,
+ source_schema,
+ source_table,
+ query_text AS ddl,
+ "timestamp"
+ FROM scan_queries
+ JOIN insert_queries on insert_queries.query_id = scan_queries.query_id
+ WHERE source_table_id <> target_table_id
+ ORDER BY cluster, target_schema, target_table, "timestamp" ASC;
  """.format(
  # We need the original database name for filtering
  db_name=db_name,
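Note on the rewritten query above: it now deduplicates SYS_QUERY_TEXT rows per (query_id, sequence) with ROW_NUMBER() before LISTAGG, then joins the scan and insert CTEs on query_id to pair source and target tables. Below is a minimal Python analogy of that dedup-and-concatenate step; it is illustrative only and not part of the package (the helper name and row layout are assumptions).

    from collections import defaultdict
    from typing import Dict, List, Tuple

    def reassemble_query_text(rows: List[Tuple[int, int, str]]) -> Dict[int, str]:
        """Illustrative analogy of unique_query_text + LISTAGG: rows are
        (query_id, sequence, text) records that may contain duplicates."""
        # Keep one row per (query_id, sequence), like ROW_NUMBER() ... WHERE rn = 1.
        first_seen: Dict[Tuple[int, int], str] = {}
        for query_id, sequence, text in rows:
            first_seen.setdefault((query_id, sequence), text)
        # Concatenate chunks in sequence order; sequence < 16 mirrors the SQL comment
        # about staying under the 64k LISTAGG limit (16 chunks of ~4k characters).
        chunks: Dict[int, List[Tuple[int, str]]] = defaultdict(list)
        for (query_id, sequence), text in first_seen.items():
            if sequence < 16:
                chunks[query_id].append((sequence, text))
        return {qid: "".join(t for _, t in sorted(parts)) for qid, parts in chunks.items()}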
@@ -305,13 +305,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
  test_report.capability_report = {}
  try:
  RedshiftDataDictionary.get_schemas(connection, database=config.database)
- test_report.capability_report[
- SourceCapability.SCHEMA_METADATA
- ] = CapabilityReport(capable=True)
+ test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+ CapabilityReport(capable=True)
+ )
  except Exception as e:
- test_report.capability_report[
- SourceCapability.SCHEMA_METADATA
- ] = CapabilityReport(capable=False, failure_reason=str(e))
+ test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+ CapabilityReport(capable=False, failure_reason=str(e))
+ )

  except Exception as e:
  test_report.basic_connectivity = CapabilityReport(
@@ -947,9 +947,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
  def get_all_tables(
  self,
  ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]:
- all_tables: Dict[
- str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]
- ] = defaultdict(dict)
+ all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = (
+ defaultdict(dict)
+ )
  for db in set().union(self.db_tables, self.db_views):
  tables = self.db_tables.get(db, {})
  views = self.db_views.get(db, {})
@@ -967,9 +967,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
  all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
  ) -> Iterable[MetadataWorkUnit]:
  with PerfTimer() as timer:
- redundant_usage_run_skip_handler: Optional[
- RedundantUsageRunSkipHandler
- ] = None
+ redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+ None
+ )
  if self.config.enable_stateful_usage_ingestion:
  redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
  source=self,
@@ -199,10 +199,10 @@ class RedshiftUsageExtractor:
  end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
  database=self.config.database,
  )
- access_events_iterable: Iterable[
- RedshiftAccessEvent
- ] = self._gen_access_events_from_history_query(
- query, connection=self.connection, all_tables=all_tables
+ access_events_iterable: Iterable[RedshiftAccessEvent] = (
+ self._gen_access_events_from_history_query(
+ query, connection=self.connection, all_tables=all_tables
+ )
  )

  aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
@@ -225,10 +225,10 @@ class RedshiftUsageExtractor:
  start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
  end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
  )
- access_events_iterable: Iterable[
- RedshiftAccessEvent
- ] = self._gen_access_events_from_history_query(
- query, connection, all_tables=all_tables
+ access_events_iterable: Iterable[RedshiftAccessEvent] = (
+ self._gen_access_events_from_history_query(
+ query, connection, all_tables=all_tables
+ )
  )

  # Generate operation aspect work units from the access events
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
  if field_level_metric.startswith("include_field_"):
  values.setdefault(field_level_metric, False)

- assert (
- max_num_fields_to_profile is None
- ), f"{max_num_fields_to_profile_key} should be set to None"
+ assert max_num_fields_to_profile is None, (
+ f"{max_num_fields_to_profile_key} should be set to None"
+ )

  return values
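The assert change above, and many similar hunks throughout this diff, are mechanical reformatting: the condition stays on the assert line and only the long message is wrapped in parentheses, and long right-hand sides are parenthesized instead of splitting a subscript or type annotation across lines. This looks like an auto-formatter style update (attributing it to a specific Black/Ruff release is an assumption). A standalone illustration of the new assert layout, with invented names:

    # Illustration only; names and values are invented for the example.
    config_value = None
    config_key = "example_key"

    # New layout: the condition stays on the assert line and only the long
    # message is wrapped in parentheses.
    assert config_value is None, (
        f"{config_key} should be set to None"
    )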
@@ -1124,7 +1124,7 @@ class S3Source(StatefulIngestionSourceBase):
  table_data.table_path
  ].timestamp = table_data.timestamp

- for guid, table_data in table_dict.items():
+ for _, table_data in table_dict.items():
  yield from self.ingest_table(table_data, path_spec)

  if not self.source_config.is_profiling_enabled():
@@ -236,12 +236,12 @@ class SalesforceSource(Source):
  try:
  if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
  logger.debug("Access Token Provided in Config")
- assert (
- self.config.access_token is not None
- ), "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
- assert (
- self.config.instance_url is not None
- ), "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+ assert self.config.access_token is not None, (
+ "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
+ )
+ assert self.config.instance_url is not None, (
+ "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+ )

  self.sf = Salesforce(
  instance_url=self.config.instance_url,
@@ -250,15 +250,15 @@ class SalesforceSource(Source):
  )
  elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
  logger.debug("Username/Password Provided in Config")
- assert (
- self.config.username is not None
- ), "Config username is required for USERNAME_PASSWORD auth"
- assert (
- self.config.password is not None
- ), "Config password is required for USERNAME_PASSWORD auth"
- assert (
- self.config.security_token is not None
- ), "Config security_token is required for USERNAME_PASSWORD auth"
+ assert self.config.username is not None, (
+ "Config username is required for USERNAME_PASSWORD auth"
+ )
+ assert self.config.password is not None, (
+ "Config password is required for USERNAME_PASSWORD auth"
+ )
+ assert self.config.security_token is not None, (
+ "Config security_token is required for USERNAME_PASSWORD auth"
+ )

  self.sf = Salesforce(
  username=self.config.username,
@@ -269,15 +269,15 @@ class SalesforceSource(Source):

  elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
  logger.debug("Json Web Token provided in the config")
- assert (
- self.config.username is not None
- ), "Config username is required for JSON_WEB_TOKEN auth"
- assert (
- self.config.consumer_key is not None
- ), "Config consumer_key is required for JSON_WEB_TOKEN auth"
- assert (
- self.config.private_key is not None
- ), "Config private_key is required for JSON_WEB_TOKEN auth"
+ assert self.config.username is not None, (
+ "Config username is required for JSON_WEB_TOKEN auth"
+ )
+ assert self.config.consumer_key is not None, (
+ "Config consumer_key is required for JSON_WEB_TOKEN auth"
+ )
+ assert self.config.private_key is not None, (
+ "Config private_key is required for JSON_WEB_TOKEN auth"
+ )

  self.sf = Salesforce(
  username=self.config.username,
@@ -439,7 +439,8 @@ class SalesforceSource(Source):
  dataPlatformInstance = DataPlatformInstanceClass(
  builder.make_data_platform_urn(self.platform),
  instance=builder.make_dataplatform_instance_urn(
- self.platform, self.config.platform_instance # type:ignore
+ self.platform,
+ self.config.platform_instance, # type:ignore
  ),
  )

@@ -354,7 +354,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
  browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

  if os.path.isdir(self.config.path):
- for root, dirs, files in os.walk(self.config.path, topdown=False):
+ for root, _, files in os.walk(self.config.path, topdown=False):
  for file_name in [f for f in files if f.endswith(".json")]:
  try:
  yield from self._load_one_file(
@@ -477,9 +477,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
  upstream_dataset_urns
  and dataset_urn not in self.dataset_upstream_urn_mapping
  ):
- self.dataset_upstream_urn_mapping[
- dataset_urn
- ] = upstream_dataset_urns
+ self.dataset_upstream_urn_mapping[dataset_urn] = (
+ upstream_dataset_urns
+ )

  element_input_fields = [
  InputFieldClass(
@@ -126,9 +126,9 @@ class SigmaAPI:
  response.raise_for_status()
  response_dict = response.json()
  for workspace_dict in response_dict[Constant.ENTRIES]:
- self.workspaces[
- workspace_dict[Constant.WORKSPACEID]
- ] = Workspace.parse_obj(workspace_dict)
+ self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
+ Workspace.parse_obj(workspace_dict)
+ )
  if response_dict[Constant.NEXTPAGE]:
  url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
  else:
@@ -147,9 +147,9 @@ class SigmaAPI:
  response.raise_for_status()
  response_dict = response.json()
  for user_dict in response_dict[Constant.ENTRIES]:
- users[
- user_dict[Constant.MEMBERID]
- ] = f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+ users[user_dict[Constant.MEMBERID]] = (
+ f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+ )
  if response_dict[Constant.NEXTPAGE]:
  url = f"{members_url}&page={response_dict[Constant.NEXTPAGE]}"
  else:
@@ -327,10 +327,12 @@ class SigmaAPI:
  response.raise_for_status()
  for i, element_dict in enumerate(response.json()[Constant.ENTRIES]):
  if not element_dict.get(Constant.NAME):
- element_dict[Constant.NAME] = f"Element {i+1} of Page '{page.name}'"
- element_dict[
- Constant.URL
- ] = f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+ element_dict[Constant.NAME] = (
+ f"Element {i + 1} of Page '{page.name}'"
+ )
+ element_dict[Constant.URL] = (
+ f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+ )
  element = Element.parse_obj(element_dict)
  if (
  self.config.extract_lineage
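The SigmaAPI hunks above touch methods that page through the Sigma REST API by following a nextPage token. A simplified, hypothetical sketch of that pagination loop is shown below; the session handling, helper name, and lower-cased response keys are assumptions, not the package's actual client:

    import requests

    def fetch_all_entries(session: requests.Session, base_url: str) -> list:
        """Follow 'nextPage' tokens until the API stops returning one."""
        entries: list = []
        url = base_url
        while True:
            response = session.get(url)
            response.raise_for_status()
            payload = response.json()
            entries.extend(payload.get("entries", []))
            next_page = payload.get("nextPage")
            if not next_page:
                break
            # The diff appends the page token to the already-parameterized URL.
            url = f"{base_url}&page={next_page}"
        return entries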
@@ -384,18 +384,20 @@ class SnowflakeV2Config(
  assert all(
  consumer.platform_instance != share_details.platform_instance
  for consumer in share_details.consumers
- ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+ ), (
+ "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+ )

  databases_included_in_share.append(shared_db)
  databases_created_from_share.extend(share_details.consumers)

  for db_from_share in databases_created_from_share:
- assert (
- db_from_share not in databases_included_in_share
- ), "Database included in a share can not be present as consumer in any share."
- assert (
- databases_created_from_share.count(db_from_share) == 1
- ), "Same database can not be present as consumer in more than one share."
+ assert db_from_share not in databases_included_in_share, (
+ "Database included in a share can not be present as consumer in any share."
+ )
+ assert databases_created_from_share.count(db_from_share) == 1, (
+ "Same database can not be present as consumer in more than one share."
+ )

  return shares

@@ -250,9 +250,9 @@ class SnowflakeConnectionConfig(ConfigModel):
  if self.private_key is not None:
  pkey_bytes = self.private_key.replace("\\n", "\n").encode()
  else:
- assert (
- self.private_key_path
- ), "missing required private key path to read key from"
+ assert self.private_key_path, (
+ "missing required private key path to read key from"
+ )
  with open(self.private_key_path, "rb") as key:
  pkey_bytes = key.read()

@@ -284,9 +284,9 @@ class SnowflakeConnectionConfig(ConfigModel):
  return self.options

  def get_oauth_connection(self) -> NativeSnowflakeConnection:
- assert (
- self.oauth_config
- ), "oauth_config should be provided if using oauth based authentication"
+ assert self.oauth_config, (
+ "oauth_config should be provided if using oauth based authentication"
+ )
  generator = OAuthTokenGenerator(
  client_id=self.oauth_config.client_id,
  authority_url=self.oauth_config.authority_url,
@@ -623,7 +623,7 @@ fingerprinted_queries as (
  query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
  AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
  AND execution_status = 'SUCCESS'
- AND {users_filter or 'TRUE'}
+ AND {users_filter or "TRUE"}
  )
  , deduplicated_queries as (
  SELECT
@@ -651,7 +651,7 @@ fingerprinted_queries as (
  WHERE
  query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
  AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
- AND {users_filter or 'TRUE'}
+ AND {users_filter or "TRUE"}
  AND query_id IN (
  SELECT query_id FROM deduplicated_queries
  )
@@ -142,9 +142,9 @@ class _SnowflakeTagCache:
  )

  # self._table_tags[<database_name>][<schema_name>][<table_name>] = list of tags applied to table
- self._table_tags: Dict[
- str, Dict[str, Dict[str, List[SnowflakeTag]]]
- ] = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+ self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = (
+ defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+ )

  # self._column_tags[<database_name>][<schema_name>][<table_name>][<column_name>] = list of tags applied to column
  self._column_tags: Dict[
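For context, _SnowflakeTagCache stores tags in a triply nested mapping keyed by database, schema, and table; the reformatted annotation above only wraps the defaultdict factory in parentheses instead of splitting the Dict[...] type over several lines. A minimal standalone sketch of that nested-defaultdict pattern (illustrative; SnowflakeTag is replaced by a plain string here):

    from collections import defaultdict
    from typing import Dict, List

    # table_tags[<database>][<schema>][<table>] -> list of tag names
    table_tags: Dict[str, Dict[str, Dict[str, List[str]]]] = (
        defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    )

    table_tags["analytics"]["public"]["orders"].append("pii")
    print(table_tags["analytics"]["public"]["orders"])  # ['pii']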
@@ -194,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  config, self.data_dictionary, self.report
  )
  self.profiler: Optional[SnowflakeProfiler] = profiler
- self.snowsight_url_builder: Optional[
- SnowsightUrlBuilder
- ] = snowsight_url_builder
+ self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
+ snowsight_url_builder
+ )

  # These are populated as side-effects of get_workunits_internal.
  self.databases: List[SnowflakeDatabase] = []
@@ -267,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  )
  return None
  else:
- ischema_databases: List[
- SnowflakeDatabase
- ] = self.get_databases_from_ischema(databases)
+ ischema_databases: List[SnowflakeDatabase] = (
+ self.get_databases_from_ischema(databases)
+ )

  if len(ischema_databases) == 0:
  self.structured_reporter.failure(
@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
  table_name: Optional[str],
  ) -> List[SnowflakeTag]:
  if db_name not in self.tag_cache:
- self.tag_cache[
- db_name
- ] = self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+ self.tag_cache[db_name] = (
+ self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+ )

  if domain == SnowflakeObjectDomain.DATABASE:
  return self.tag_cache[db_name].get_database_tags(db_name)
@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
  temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
  if self.config.extract_tags == TagOption.without_lineage:
  if db_name not in self.tag_cache:
- self.tag_cache[
- db_name
- ] = self.data_dictionary.get_tags_for_database_without_propagation(
- db_name
+ self.tag_cache[db_name] = (
+ self.data_dictionary.get_tags_for_database_without_propagation(
+ db_name
+ )
  )
  temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
  table_name, schema_name, db_name
@@ -549,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
  ):
  # NOTE: Generated emails may be incorrect, as email may be different than
  # username@email_domain
- event_dict[
- "EMAIL"
- ] = f'{event_dict["USER_NAME"]}@{self.config.email_domain}'.lower()
+ event_dict["EMAIL"] = (
+ f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
+ )

  if not event_dict["EMAIL"]:
  self.report.rows_missing_email += 1
@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Repor
  class SnowflakeStructuredReportMixin(abc.ABC):
  @property
  @abc.abstractmethod
- def structured_reporter(self) -> SourceReport:
- ...
+ def structured_reporter(self) -> SourceReport: ...


  class SnowsightUrlBuilder:
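The one-line body above is simply the Ellipsis literal used as a stub for the abstract property, moved onto the signature line; this appears to be the same formatter-style change seen elsewhere in the diff (an assumption). A tiny standalone example with invented names and a plain str return type:

    import abc

    class Reporter(abc.ABC):
        @property
        @abc.abstractmethod
        def structured_reporter(self) -> str: ...  # the body is just the Ellipsis literal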
@@ -211,9 +211,9 @@ class SnowflakeV2Source(

  self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
  if self.config.include_usage_stats or self.config.include_operational_stats:
- redundant_usage_run_skip_handler: Optional[
- RedundantUsageRunSkipHandler
- ] = None
+ redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+ None
+ )
  if self.config.enable_stateful_usage_ingestion:
  redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
  source=self,
@@ -296,7 +296,16 @@ class SnowflakeV2Source(

  _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
  privileges: List[SnowflakePrivilege] = []
- capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore
+ capabilities: List[SourceCapability] = [
+ c.capability
+ for c in SnowflakeV2Source.get_capabilities() # type: ignore
+ if c.capability
+ not in (
+ SourceCapability.PLATFORM_INSTANCE,
+ SourceCapability.DOMAINS,
+ SourceCapability.DELETION_DETECTION,
+ )
+ ]

  cur = conn.query("select current_role()")
  current_role = [row["CURRENT_ROLE()"] for row in cur][0]
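The expanded comprehension above filters the source's declared capabilities down to those excluding PLATFORM_INSTANCE, DOMAINS, and DELETION_DETECTION (that this runs as part of the connection test is inferred from the surrounding context). A generic, self-contained sketch of the same filtering pattern, with an invented enum:

    from enum import Enum
    from typing import List

    class Capability(Enum):
        SCHEMA_METADATA = "schema_metadata"
        PLATFORM_INSTANCE = "platform_instance"
        DOMAINS = "domains"
        DELETION_DETECTION = "deletion_detection"

    EXCLUDED = (
        Capability.PLATFORM_INSTANCE,
        Capability.DOMAINS,
        Capability.DELETION_DETECTION,
    )

    testable: List[Capability] = [c for c in Capability if c not in EXCLUDED]
    print(testable)  # [<Capability.SCHEMA_METADATA: 'schema_metadata'>]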
@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
  return "\n".join([r for r in res])

  @typing.no_type_check
- def _get_column_type(
- self, type_: Union[str, Dict[str, Any]]
- ) -> TypeEngine: # noqa: C901
+ def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine: # noqa: C901
  """Derives the data type of the Athena column.

  This method is overwritten to extend the behavior of PyAthena.