acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (168)
  1. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
  2. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/docker_cli.py +6 -6
  12. datahub/cli/ingest_cli.py +25 -15
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/entrypoints.py +6 -0
  24. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  25. datahub/ingestion/api/report.py +1 -2
  26. datahub/ingestion/api/source_helpers.py +1 -1
  27. datahub/ingestion/extractor/json_schema_util.py +3 -3
  28. datahub/ingestion/extractor/schema_util.py +3 -5
  29. datahub/ingestion/fs/s3_fs.py +3 -3
  30. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  31. datahub/ingestion/graph/client.py +4 -6
  32. datahub/ingestion/run/pipeline.py +8 -7
  33. datahub/ingestion/run/pipeline_config.py +3 -3
  34. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  35. datahub/ingestion/source/abs/source.py +19 -8
  36. datahub/ingestion/source/aws/glue.py +11 -11
  37. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  38. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  39. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  40. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  42. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  43. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  44. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  45. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  46. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  47. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  48. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  49. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  50. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  51. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  52. datahub/ingestion/source/csv_enricher.py +29 -29
  53. datahub/ingestion/source/datahub/config.py +4 -0
  54. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  55. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  56. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  57. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  58. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  59. datahub/ingestion/source/elastic_search.py +4 -4
  60. datahub/ingestion/source/gc/datahub_gc.py +1 -0
  61. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
  62. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  63. datahub/ingestion/source/ge_data_profiler.py +2 -5
  64. datahub/ingestion/source/ge_profiling_config.py +3 -3
  65. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  66. datahub/ingestion/source/identity/azure_ad.py +3 -3
  67. datahub/ingestion/source/identity/okta.py +3 -3
  68. datahub/ingestion/source/kafka/kafka.py +11 -9
  69. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  72. datahub/ingestion/source/looker/looker_common.py +19 -19
  73. datahub/ingestion/source/looker/looker_config.py +3 -3
  74. datahub/ingestion/source/looker/looker_source.py +25 -25
  75. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  76. datahub/ingestion/source/looker/looker_usage.py +5 -7
  77. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  78. datahub/ingestion/source/looker/lookml_source.py +13 -15
  79. datahub/ingestion/source/looker/view_upstream.py +5 -5
  80. datahub/ingestion/source/mlflow.py +4 -4
  81. datahub/ingestion/source/mode.py +5 -5
  82. datahub/ingestion/source/mongodb.py +6 -4
  83. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  84. datahub/ingestion/source/nifi.py +24 -26
  85. datahub/ingestion/source/openapi.py +9 -9
  86. datahub/ingestion/source/powerbi/config.py +12 -12
  87. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  88. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  89. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  90. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  91. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  92. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  93. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  94. datahub/ingestion/source/redshift/config.py +3 -3
  95. datahub/ingestion/source/redshift/redshift.py +12 -12
  96. datahub/ingestion/source/redshift/usage.py +8 -8
  97. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  98. datahub/ingestion/source/s3/source.py +1 -1
  99. datahub/ingestion/source/salesforce.py +26 -25
  100. datahub/ingestion/source/schema/json_schema.py +1 -1
  101. datahub/ingestion/source/sigma/sigma.py +3 -3
  102. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  103. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  104. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  105. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  106. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  107. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  108. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  109. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  110. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  112. datahub/ingestion/source/sql/athena.py +1 -3
  113. datahub/ingestion/source/sql/clickhouse.py +8 -14
  114. datahub/ingestion/source/sql/oracle.py +1 -3
  115. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  116. datahub/ingestion/source/sql/sql_types.py +0 -1
  117. datahub/ingestion/source/sql/teradata.py +16 -3
  118. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  119. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  120. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  121. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  122. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  123. datahub/ingestion/source/tableau/tableau.py +245 -101
  124. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  125. datahub/ingestion/source/unity/config.py +3 -1
  126. datahub/ingestion/source/unity/proxy.py +1 -1
  127. datahub/ingestion/source/unity/source.py +3 -3
  128. datahub/ingestion/source/unity/usage.py +3 -1
  129. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  130. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  131. datahub/ingestion/source/usage/usage_common.py +1 -1
  132. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  133. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  134. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  135. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  136. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  137. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  138. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  139. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  140. datahub/lite/duckdb_lite.py +12 -10
  141. datahub/metadata/_schema_classes.py +1 -1
  142. datahub/metadata/schema.avsc +6 -2
  143. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  144. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  145. datahub/secret/datahub_secrets_client.py +12 -21
  146. datahub/secret/secret_common.py +14 -8
  147. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  148. datahub/sql_parsing/schema_resolver.py +5 -10
  149. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  150. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  151. datahub/sql_parsing/sqlglot_utils.py +1 -1
  152. datahub/telemetry/stats.py +1 -2
  153. datahub/testing/mcp_diff.py +1 -1
  154. datahub/utilities/file_backed_collections.py +10 -10
  155. datahub/utilities/hive_schema_to_avro.py +2 -2
  156. datahub/utilities/logging_manager.py +2 -2
  157. datahub/utilities/lossy_collections.py +3 -3
  158. datahub/utilities/mapping.py +3 -3
  159. datahub/utilities/memory_footprint.py +3 -2
  160. datahub/utilities/serialized_lru_cache.py +3 -1
  161. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  162. datahub/utilities/sqllineage_patch.py +1 -1
  163. datahub/utilities/stats_collections.py +3 -1
  164. datahub/utilities/urns/_urn_base.py +28 -5
  165. datahub/utilities/urns/urn_iter.py +2 -2
  166. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
  167. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
  168. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
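
Most of the hunks below are mechanical formatting changes: multi-line assert statements and wrapped assignments are reflowed so that the condition or assignment target stays on one line and only the message or right-hand side is parenthesized, plus a handful of small keyword-argument and lint cleanups. A minimal before/after sketch of the assert pattern, adapted from the first hunk below (this standalone function is illustrative only, not a copy of the packaged module):

def _extract_view_from_field(field: str) -> str:
    # Older wrapped style (the removed "-" lines in the hunks below):
    # assert (
    #     field.count(".") == 1
    # ), f"Error: A field must be prefixed by a view name, field is: {field}"
    #
    # Newer style (the added "+" lines): the condition stays on the assert line
    # and only the message is wrapped in parentheses. Behavior is identical.
    assert field.count(".") == 1, (
        f"Error: A field must be prefixed by a view name, field is: {field}"
    )
    return field.split(".")[0]

# Example: _extract_view_from_field("my_view.my_field") returns "my_view".
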
@@ -250,9 +250,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
 
  @staticmethod
  def _extract_view_from_field(field: str) -> str:
- assert (
- field.count(".") == 1
- ), f"Error: A field must be prefixed by a view name, field is: {field}"
+ assert field.count(".") == 1, (
+ f"Error: A field must be prefixed by a view name, field is: {field}"
+ )
  return field.split(".")[0]
 
  def _get_views_from_fields(self, fields: List[str]) -> List[str]:
@@ -610,12 +610,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  def _create_platform_instance_aspect(
  self,
  ) -> DataPlatformInstance:
- assert (
- self.source_config.platform_name
- ), "Platform name is not set in the configuration."
- assert (
- self.source_config.platform_instance
- ), "Platform instance is not set in the configuration."
+ assert self.source_config.platform_name, (
+ "Platform name is not set in the configuration."
+ )
+ assert self.source_config.platform_instance, (
+ "Platform instance is not set in the configuration."
+ )
 
  return DataPlatformInstance(
  platform=builder.make_data_platform_urn(self.source_config.platform_name),
@@ -1016,9 +1016,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  yield from chart_events
 
  # Step 2: Emit metadata events for the Dashboard itself.
- chart_urns: Set[
- str
- ] = set() # Collect the unique child chart urns for dashboard input lineage.
+ chart_urns: Set[str] = (
+ set()
+ ) # Collect the unique child chart urns for dashboard input lineage.
  for chart_event in chart_events:
  chart_event_urn = self._extract_event_urn(chart_event)
  if chart_event_urn:
@@ -1538,20 +1538,20 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  }
  )
 
- dashboard_element: Optional[
- LookerDashboardElement
- ] = self._get_looker_dashboard_element(
- DashboardElement(
- id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes),
- # we add the "looks_" prefix to look.id.
- title=look.title,
- subtitle_text=look.description,
- look_id=look.id,
- dashboard_id=None, # As this is an independent look
- look=LookWithQuery(
- query=query, folder=look.folder, user_id=look.user_id
+ dashboard_element: Optional[LookerDashboardElement] = (
+ self._get_looker_dashboard_element(
+ DashboardElement(
+ id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes),
+ # we add the "looks_" prefix to look.id.
+ title=look.title,
+ subtitle_text=look.description,
+ look_id=look.id,
+ dashboard_id=None, # As this is an independent look
+ look=LookWithQuery(
+ query=query, folder=look.folder, user_id=look.user_id
+ ),
  ),
- ),
+ )
  )
 
  if dashboard_element is not None:
@@ -33,9 +33,9 @@ logger = logging.getLogger(__name__)
 
 
  class SpecialVariable:
- SPECIAL_VARIABLE_PATTERN: ClassVar[
- str
- ] = r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+ SPECIAL_VARIABLE_PATTERN: ClassVar[str] = (
+ r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+ )
  liquid_variable: dict
 
  def __init__(self, liquid_variable):
@@ -257,9 +257,9 @@ class BaseStatGenerator(ABC):
 
  for row in rows:
  logger.debug(row)
- entity_stat_aspect[
- self.get_entity_stat_key(row)
- ] = self.to_entity_timeseries_stat_aspect(row)
+ entity_stat_aspect[self.get_entity_stat_key(row)] = (
+ self.to_entity_timeseries_stat_aspect(row)
+ )
 
  return entity_stat_aspect
 
@@ -385,10 +385,8 @@ class BaseStatGenerator(ABC):
  entity_rows: List[Dict] = self._execute_query(
  entity_query_with_filters, "entity_query"
  )
- entity_usage_stat: Dict[
- Tuple[str, str], Any
- ] = self._process_entity_timeseries_rows(
- entity_rows
+ entity_usage_stat: Dict[Tuple[str, str], Any] = (
+ self._process_entity_timeseries_rows(entity_rows)
  ) # Any type to pass mypy unbound Aspect type error
 
  user_wise_query_with_filters: LookerQuery = self._append_filters(
@@ -38,16 +38,16 @@ def merge_parent_and_child_fields(
  # Create a map field-name vs field
  child_field_map: dict = {}
  for field in child_fields:
- assert (
- NAME in field
- ), "A lookml view must have a name field" # name is required field of lookml field array
+ assert NAME in field, (
+ "A lookml view must have a name field"
+ ) # name is required field of lookml field array
 
  child_field_map[field[NAME]] = field
 
  for field in parent_fields:
- assert (
- NAME in field
- ), "A lookml view must have a name field" # name is required field of lookml field array
+ assert NAME in field, (
+ "A lookml view must have a name field"
+ ) # name is required field of lookml field array
 
  if field[NAME] in child_field_map:
  # Fields defined in the child view take higher precedence.
@@ -482,14 +482,14 @@ class LookMLSource(StatefulIngestionSourceBase):
  if self.source_config.project_name is not None:
  return self.source_config.project_name
 
- assert (
- self.looker_client is not None
- ), "Failed to find a configured Looker API client"
+ assert self.looker_client is not None, (
+ "Failed to find a configured Looker API client"
+ )
  try:
  model = self.looker_client.lookml_model(model_name, fields="project_name")
- assert (
- model.project_name is not None
- ), f"Failed to find a project name for model {model_name}"
+ assert model.project_name is not None, (
+ f"Failed to find a project name for model {model_name}"
+ )
  return model.project_name
  except SDKError:
  raise ValueError(
@@ -541,9 +541,9 @@ class LookMLSource(StatefulIngestionSourceBase):
  self.reporter.git_clone_latency = datetime.now() - start_time
  self.source_config.base_folder = checkout_dir.resolve()
 
- self.base_projects_folder[
- BASE_PROJECT_NAME
- ] = self.source_config.base_folder
+ self.base_projects_folder[BASE_PROJECT_NAME] = (
+ self.source_config.base_folder
+ )
 
  visited_projects: Set[str] = set()
 
@@ -641,9 +641,9 @@ class LookMLSource(StatefulIngestionSourceBase):
  repo_url=remote_project.url,
  )
 
- self.base_projects_folder[
- remote_project.name
- ] = p_checkout_dir.resolve()
+ self.base_projects_folder[remote_project.name] = (
+ p_checkout_dir.resolve()
+ )
  repo = p_cloner.get_last_repo_cloned()
  assert repo
  remote_git_info = GitInfo(
@@ -930,9 +930,7 @@ class LookMLSource(StatefulIngestionSourceBase):
  logger.warning(
  f"view {maybe_looker_view.id.view_name} from model {model_name}, connection {model.connection} was previously processed via model {prev_model_name}, connection {prev_model_connection} and will likely lead to incorrect lineage to the underlying tables"
  )
- if (
- not self.source_config.emit_reachable_views_only
- ):
+ if not self.source_config.emit_reachable_views_only:
  logger.warning(
  "Consider enabling the `emit_reachable_views_only` flag to handle this case."
  )
@@ -484,11 +484,11 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
  )
 
  def __get_upstream_dataset_urn(self) -> List[str]:
- current_view_id: Optional[
- LookerViewId
- ] = self.looker_view_id_cache.get_looker_view_id(
- view_name=self.view_context.name(),
- base_folder_path=self.view_context.base_folder_path,
+ current_view_id: Optional[LookerViewId] = (
+ self.looker_view_id_cache.get_looker_view_id(
+ view_name=self.view_context.name(),
+ base_folder_path=self.view_context.base_folder_path,
+ )
  )
 
  # Current view will always be present in cache. assert will silence the lint
@@ -172,10 +172,10 @@ class MLflowSource(Source):
  """
  Get all Registered Models in MLflow Model Registry.
  """
- registered_models: Iterable[
- RegisteredModel
- ] = self._traverse_mlflow_search_func(
- search_func=self.client.search_registered_models,
+ registered_models: Iterable[RegisteredModel] = (
+ self._traverse_mlflow_search_func(
+ search_func=self.client.search_registered_models,
+ )
  )
  return registered_models
 
@@ -893,11 +893,11 @@ class ModeSource(StatefulIngestionSourceBase):
  jinja_params[key] = parameters[key].get("default", "")
 
  normalized_query = re.sub(
- r"{% form %}(.*){% endform %}",
- "",
- query,
- 0,
- re.MULTILINE | re.DOTALL,
+ pattern=r"{% form %}(.*){% endform %}",
+ repl="",
+ string=query,
+ count=0,
+ flags=re.MULTILINE | re.DOTALL,
  )
 
  # Wherever we don't resolve the jinja params, we replace it with NULL
@@ -288,7 +288,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
 
  # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes
  self.mongo_client = MongoClient(
- self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options # type: ignore
+ self.config.connect_uri,
+ datetime_conversion="DATETIME_AUTO",
+ **options, # type: ignore
  )
 
  # This cheaply tests the connection. For details, see
@@ -470,9 +472,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
  )
  # Add this information to the custom properties so user can know they are looking at downsampled schema
  dataset_properties.customProperties["schema.downsampled"] = "True"
- dataset_properties.customProperties[
- "schema.totalFields"
- ] = f"{collection_schema_size}"
+ dataset_properties.customProperties["schema.totalFields"] = (
+ f"{collection_schema_size}"
+ )
 
  logger.debug(f"Size of collection fields = {len(collection_fields)}")
  # append each schema field (sort so output is consistent)
@@ -286,7 +286,7 @@ class Neo4jSource(Source):
  df = self.get_neo4j_metadata(
  "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
  )
- for index, row in df.iterrows():
+ for _, row in df.iterrows():
  try:
  yield MetadataWorkUnit(
  id=row["key"],
@@ -184,9 +184,9 @@ class NifiSourceConfig(EnvConfigMixin):
 
  @validator("site_url")
  def validator_site_url(cls, site_url: str) -> str:
- assert site_url.startswith(
- ("http://", "https://")
- ), "site_url must start with http:// or https://"
+ assert site_url.startswith(("http://", "https://")), (
+ "site_url must start with http:// or https://"
+ )
 
  if not site_url.endswith("/"):
  site_url = site_url + "/"
@@ -487,9 +487,7 @@ class NifiSource(Source):
  def get_report(self) -> SourceReport:
  return self.report
 
- def update_flow(
- self, pg_flow_dto: Dict, recursion_level: int = 0
- ) -> None: # noqa: C901
+ def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None: # noqa: C901
  """
  Update self.nifi_flow with contents of the input process group `pg_flow_dto`
  """
@@ -548,16 +546,16 @@ class NifiSource(Source):
  for inputPort in flow_dto.get("inputPorts", []):
  component = inputPort.get("component")
  if inputPort.get("allowRemoteAccess"):
- self.nifi_flow.remotely_accessible_ports[
- component.get("id")
- ] = NifiComponent(
- component.get("id"),
- component.get("name"),
- component.get("type"),
- component.get("parentGroupId"),
- NifiType.INPUT_PORT,
- comments=component.get("comments"),
- status=component.get("status", {}).get("runStatus"),
+ self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+ NifiComponent(
+ component.get("id"),
+ component.get("name"),
+ component.get("type"),
+ component.get("parentGroupId"),
+ NifiType.INPUT_PORT,
+ comments=component.get("comments"),
+ status=component.get("status", {}).get("runStatus"),
+ )
  )
  logger.debug(f"Adding remotely accessible port {component.get('id')}")
  else:
@@ -576,16 +574,16 @@ class NifiSource(Source):
  for outputPort in flow_dto.get("outputPorts", []):
  component = outputPort.get("component")
  if outputPort.get("allowRemoteAccess"):
- self.nifi_flow.remotely_accessible_ports[
- component.get("id")
- ] = NifiComponent(
- component.get("id"),
- component.get("name"),
- component.get("type"),
- component.get("parentGroupId"),
- NifiType.OUTPUT_PORT,
- comments=component.get("comments"),
- status=component.get("status", {}).get("runStatus"),
+ self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+ NifiComponent(
+ component.get("id"),
+ component.get("name"),
+ component.get("type"),
+ component.get("parentGroupId"),
+ NifiType.OUTPUT_PORT,
+ comments=component.get("comments"),
+ status=component.get("status", {}).get("runStatus"),
+ )
  )
  logger.debug(f"Adding remotely accessible port {component.get('id')}")
  else:
@@ -101,16 +101,16 @@ class OpenApiConfig(ConfigModel):
  # details there once, and then use that session for all requests.
  self.token = f"Bearer {self.bearer_token}"
  else:
- assert (
- "url_complement" in self.get_token.keys()
- ), "When 'request_type' is set to 'get', an url_complement is needed for the request."
+ assert "url_complement" in self.get_token.keys(), (
+ "When 'request_type' is set to 'get', an url_complement is needed for the request."
+ )
  if self.get_token["request_type"] == "get":
- assert (
- "{username}" in self.get_token["url_complement"]
- ), "we expect the keyword {username} to be present in the url"
- assert (
- "{password}" in self.get_token["url_complement"]
- ), "we expect the keyword {password} to be present in the url"
+ assert "{username}" in self.get_token["url_complement"], (
+ "we expect the keyword {username} to be present in the url"
+ )
+ assert "{password}" in self.get_token["url_complement"], (
+ "we expect the keyword {password} to be present in the url"
+ )
  url4req = self.get_token["url_complement"].replace(
  "{username}", self.username
  )
@@ -225,9 +225,9 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
  def default_for_dataset_type_mapping() -> Dict[str, str]:
  dict_: dict = {}
  for item in SupportedDataPlatform:
- dict_[
- item.value.powerbi_data_platform_name
- ] = item.value.datahub_data_platform_name
+ dict_[item.value.powerbi_data_platform_name] = (
+ item.value.datahub_data_platform_name
+ )
 
  return dict_
 
@@ -303,15 +303,15 @@ class PowerBiDashboardSourceConfig(
  # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
  # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
  # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
- dataset_type_mapping: Union[
- Dict[str, str], Dict[str, PlatformDetail]
- ] = pydantic.Field(
- default_factory=default_for_dataset_type_mapping,
- description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
- "DataHub supported datasources."
- "You can configured platform instance for dataset lineage. "
- "See Quickstart Recipe for mapping",
- hidden_from_docs=True,
+ dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
+ pydantic.Field(
+ default_factory=default_for_dataset_type_mapping,
+ description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+ "DataHub supported datasources."
+ "You can configured platform instance for dataset lineage. "
+ "See Quickstart Recipe for mapping",
+ hidden_from_docs=True,
+ )
  )
  # PowerBI datasource's server to platform instance mapping
  server_to_platform_instance: Dict[
@@ -128,17 +128,17 @@ def get_upstream_tables(
  reporter.m_query_parse_successes += 1
 
  try:
- lineage: List[
- datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
- ] = resolver.MQueryResolver(
- table=table,
- parse_tree=parse_tree,
- reporter=reporter,
- parameters=parameters,
- ).resolve_to_lineage(
- ctx=ctx,
- config=config,
- platform_instance_resolver=platform_instance_resolver,
+ lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = (
+ resolver.MQueryResolver(
+ table=table,
+ parse_tree=parse_tree,
+ reporter=reporter,
+ parameters=parameters,
+ ).resolve_to_lineage(
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
+ )
  )
 
  if lineage:
@@ -170,8 +170,7 @@ class AbstractLineage(ABC):
  logger.debug(f"Processing arguments {arguments}")
 
  if (
- len(arguments)
- >= 4 # [0] is warehouse FQDN.
+ len(arguments) >= 4 # [0] is warehouse FQDN.
  # [1] is endpoint, we are not using it.
  # [2] is "Catalog" key
  # [3] is catalog's value
@@ -215,16 +214,16 @@ class AbstractLineage(ABC):
  native_sql_parser.remove_special_characters(query)
  )
 
- parsed_result: Optional[
- "SqlParsingResult"
- ] = native_sql_parser.parse_custom_sql(
- ctx=self.ctx,
- query=query,
- platform=self.get_platform_pair().datahub_data_platform_name,
- platform_instance=platform_detail.platform_instance,
- env=platform_detail.env,
- database=database,
- schema=schema,
+ parsed_result: Optional["SqlParsingResult"] = (
+ native_sql_parser.parse_custom_sql(
+ ctx=self.ctx,
+ query=query,
+ platform=self.get_platform_pair().datahub_data_platform_name,
+ platform_instance=platform_detail.platform_instance,
+ env=platform_detail.env,
+ database=database,
+ schema=schema,
+ )
  )
 
  if parsed_result is None:
@@ -410,9 +409,9 @@ class DatabricksLineage(AbstractLineage):
  f"Processing Databrick data-access function detail {data_access_func_detail}"
  )
  table_detail: Dict[str, str] = {}
- temp_accessor: Optional[
- IdentifierAccessor
- ] = data_access_func_detail.identifier_accessor
+ temp_accessor: Optional[IdentifierAccessor] = (
+ data_access_func_detail.identifier_accessor
+ )
 
  while temp_accessor:
  # Condition to handle databricks M-query pattern where table, schema and database all are present in
@@ -647,11 +646,13 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
  db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore
  # Second is schema name
  schema_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore
+ IdentifierAccessor,
+ data_access_func_detail.identifier_accessor.next, # type: ignore
  ).items["Name"]
  # Third is table name
  table_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore
+ IdentifierAccessor,
+ data_access_func_detail.identifier_accessor.next.next, # type: ignore
  ).items["Name"]
 
  qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
@@ -768,10 +769,13 @@ class NativeQueryLineage(AbstractLineage):
  ): # database name is explicitly set
  return database
 
- return get_next_item( # database name is set in Name argument
- data_access_tokens, "Name"
- ) or get_next_item( # If both above arguments are not available, then try Catalog
- data_access_tokens, "Catalog"
+ return (
+ get_next_item( # database name is set in Name argument
+ data_access_tokens, "Name"
+ )
+ or get_next_item( # If both above arguments are not available, then try Catalog
+ data_access_tokens, "Catalog"
+ )
  )
 
  def create_lineage(
@@ -819,9 +823,7 @@ class NativeQueryLineage(AbstractLineage):
  values=tree_function.remove_whitespaces_from_list(
  tree_function.token_values(flat_argument_list[1])
  ),
- )[
- 0
- ] # Remove any whitespaces and double quotes character
+ )[0] # Remove any whitespaces and double quotes character
 
  server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
 
@@ -188,9 +188,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
  # - The inner function Table.TransformColumnTypes takes #"Removed Columns1"
  # (a table reference) as its first argument
  # - Its result is then passed as the first argument to Table.SplitColumn
- second_invoke_expression: Optional[
- Tree
- ] = tree_function.first_invoke_expression_func(first_argument)
+ second_invoke_expression: Optional[Tree] = (
+ tree_function.first_invoke_expression_func(first_argument)
+ )
  if second_invoke_expression:
  # 1. The First argument is function call
  # 2. That function's first argument references next table variable
@@ -304,14 +304,14 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
  logger.debug(v_statement.pretty())
  return None
 
- invoke_expression: Optional[
- Tree
- ] = tree_function.first_invoke_expression_func(rh_tree)
+ invoke_expression: Optional[Tree] = (
+ tree_function.first_invoke_expression_func(rh_tree)
+ )
 
  if invoke_expression is not None:
- result: Union[
- DataAccessFunctionDetail, List[str], None
- ] = self._process_invoke_expression(invoke_expression)
+ result: Union[DataAccessFunctionDetail, List[str], None] = (
+ self._process_invoke_expression(invoke_expression)
+ )
  if result is None:
  return None # No need to process some un-expected grammar found while processing invoke_expression
  if isinstance(result, DataAccessFunctionDetail):
@@ -368,9 +368,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
  return lineage
 
  # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail
- table_links: List[
- DataAccessFunctionDetail
- ] = self.create_data_access_functional_detail(output_variable)
+ table_links: List[DataAccessFunctionDetail] = (
+ self.create_data_access_functional_detail(output_variable)
+ )
 
  # Each item is data-access function
  for f_detail in table_links:
@@ -390,7 +390,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
  # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
  # & also pass additional information that will be need to generate lineage
- pattern_handler: (AbstractLineage) = supported_resolver.handler()(
+ pattern_handler: AbstractLineage = supported_resolver.handler()(
  ctx=ctx,
  table=self.table,
  config=config,
@@ -945,9 +945,9 @@ class Mapper:
  # Convert tiles to charts
  ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
  # Lets convert dashboard to datahub dashboard
- dashboard_mcps: List[
- MetadataChangeProposalWrapper
- ] = self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+ dashboard_mcps: List[MetadataChangeProposalWrapper] = (
+ self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+ )
 
  # Now add MCPs in sequence
  mcps.extend(ds_mcps)
@@ -1472,9 +1472,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
  def _get_dashboard_patch_work_unit(
  self, work_unit: MetadataWorkUnit
  ) -> Optional[MetadataWorkUnit]:
- dashboard_info_aspect: Optional[
- DashboardInfoClass
- ] = work_unit.get_aspect_of_type(DashboardInfoClass)
+ dashboard_info_aspect: Optional[DashboardInfoClass] = (
+ work_unit.get_aspect_of_type(DashboardInfoClass)
+ )
 
  if dashboard_info_aspect and self.source_config.patch_metadata:
  return convert_dashboard_info_to_patch(