acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (168)
  1. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
  2. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/docker_cli.py +6 -6
  12. datahub/cli/ingest_cli.py +25 -15
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/entrypoints.py +6 -0
  24. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  25. datahub/ingestion/api/report.py +1 -2
  26. datahub/ingestion/api/source_helpers.py +1 -1
  27. datahub/ingestion/extractor/json_schema_util.py +3 -3
  28. datahub/ingestion/extractor/schema_util.py +3 -5
  29. datahub/ingestion/fs/s3_fs.py +3 -3
  30. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  31. datahub/ingestion/graph/client.py +4 -6
  32. datahub/ingestion/run/pipeline.py +8 -7
  33. datahub/ingestion/run/pipeline_config.py +3 -3
  34. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  35. datahub/ingestion/source/abs/source.py +19 -8
  36. datahub/ingestion/source/aws/glue.py +11 -11
  37. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  38. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  39. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  40. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  42. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  43. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  44. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  45. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  46. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  47. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  48. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  49. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  50. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  51. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  52. datahub/ingestion/source/csv_enricher.py +29 -29
  53. datahub/ingestion/source/datahub/config.py +4 -0
  54. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  55. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  56. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  57. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  58. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  59. datahub/ingestion/source/elastic_search.py +4 -4
  60. datahub/ingestion/source/gc/datahub_gc.py +1 -0
  61. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
  62. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  63. datahub/ingestion/source/ge_data_profiler.py +2 -5
  64. datahub/ingestion/source/ge_profiling_config.py +3 -3
  65. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  66. datahub/ingestion/source/identity/azure_ad.py +3 -3
  67. datahub/ingestion/source/identity/okta.py +3 -3
  68. datahub/ingestion/source/kafka/kafka.py +11 -9
  69. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  72. datahub/ingestion/source/looker/looker_common.py +19 -19
  73. datahub/ingestion/source/looker/looker_config.py +3 -3
  74. datahub/ingestion/source/looker/looker_source.py +25 -25
  75. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  76. datahub/ingestion/source/looker/looker_usage.py +5 -7
  77. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  78. datahub/ingestion/source/looker/lookml_source.py +13 -15
  79. datahub/ingestion/source/looker/view_upstream.py +5 -5
  80. datahub/ingestion/source/mlflow.py +4 -4
  81. datahub/ingestion/source/mode.py +5 -5
  82. datahub/ingestion/source/mongodb.py +6 -4
  83. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  84. datahub/ingestion/source/nifi.py +24 -26
  85. datahub/ingestion/source/openapi.py +9 -9
  86. datahub/ingestion/source/powerbi/config.py +12 -12
  87. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  88. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  89. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  90. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  91. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  92. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  93. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  94. datahub/ingestion/source/redshift/config.py +3 -3
  95. datahub/ingestion/source/redshift/redshift.py +12 -12
  96. datahub/ingestion/source/redshift/usage.py +8 -8
  97. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  98. datahub/ingestion/source/s3/source.py +1 -1
  99. datahub/ingestion/source/salesforce.py +26 -25
  100. datahub/ingestion/source/schema/json_schema.py +1 -1
  101. datahub/ingestion/source/sigma/sigma.py +3 -3
  102. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  103. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  104. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  105. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  106. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  107. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  108. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  109. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  110. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  112. datahub/ingestion/source/sql/athena.py +1 -3
  113. datahub/ingestion/source/sql/clickhouse.py +8 -14
  114. datahub/ingestion/source/sql/oracle.py +1 -3
  115. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  116. datahub/ingestion/source/sql/sql_types.py +0 -1
  117. datahub/ingestion/source/sql/teradata.py +16 -3
  118. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  119. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  120. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  121. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  122. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  123. datahub/ingestion/source/tableau/tableau.py +245 -101
  124. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  125. datahub/ingestion/source/unity/config.py +3 -1
  126. datahub/ingestion/source/unity/proxy.py +1 -1
  127. datahub/ingestion/source/unity/source.py +3 -3
  128. datahub/ingestion/source/unity/usage.py +3 -1
  129. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  130. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  131. datahub/ingestion/source/usage/usage_common.py +1 -1
  132. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  133. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  134. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  135. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  136. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  137. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  138. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  139. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  140. datahub/lite/duckdb_lite.py +12 -10
  141. datahub/metadata/_schema_classes.py +1 -1
  142. datahub/metadata/schema.avsc +6 -2
  143. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  144. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  145. datahub/secret/datahub_secrets_client.py +12 -21
  146. datahub/secret/secret_common.py +14 -8
  147. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  148. datahub/sql_parsing/schema_resolver.py +5 -10
  149. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  150. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  151. datahub/sql_parsing/sqlglot_utils.py +1 -1
  152. datahub/telemetry/stats.py +1 -2
  153. datahub/testing/mcp_diff.py +1 -1
  154. datahub/utilities/file_backed_collections.py +10 -10
  155. datahub/utilities/hive_schema_to_avro.py +2 -2
  156. datahub/utilities/logging_manager.py +2 -2
  157. datahub/utilities/lossy_collections.py +3 -3
  158. datahub/utilities/mapping.py +3 -3
  159. datahub/utilities/memory_footprint.py +3 -2
  160. datahub/utilities/serialized_lru_cache.py +3 -1
  161. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  162. datahub/utilities/sqllineage_patch.py +1 -1
  163. datahub/utilities/stats_collections.py +3 -1
  164. datahub/utilities/urns/_urn_base.py +28 -5
  165. datahub/utilities/urns/urn_iter.py +2 -2
  166. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
  167. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
  168. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau.py

@@ -174,6 +174,8 @@ from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn

+DEFAULT_PAGE_SIZE = 10
+
 try:
     # On earlier versions of the tableauserverclient, the NonXMLResponseError
     # was thrown when reauthentication was necessary. We'll keep both exceptions
@@ -279,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
         return authentication

     def make_tableau_client(self, site: str) -> Server:
-        authentication: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.get_tableau_auth(site)
+        authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+            self.get_tableau_auth(site)
+        )
         try:
             server = Server(
                 self.connect_uri,
@@ -342,11 +344,140 @@ class PermissionIngestionConfig(ConfigModel):
     )


+class TableauPageSizeConfig(ConfigModel):
+    """
+    Configuration for setting page sizes for different Tableau metadata objects.
+
+    Some considerations:
+    - All have default values, so no setting is mandatory.
+    - In general, with the `effective_` properties, fine-grained page sizes that are not specifically set
+      fall back to `page_size` or are derived from `page_size`.
+
+    Measuring the impact of changing these values can be done by looking at the
+    `num_(filter_|paginated_)?queries_by_connection_type` metrics in the report.
+    """
+
+    page_size: int = Field(
+        default=DEFAULT_PAGE_SIZE,
+        description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
+    )
+
+    database_server_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of database servers to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_database_server_page_size(self) -> int:
+        return self.database_server_page_size or self.page_size
+
+    # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
+    # returns warnings like this:
+    # {
+    #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
+    #     'extensions': {
+    #         'severity': 'WARNING',
+    #         'code': 'NODE_LIMIT_EXCEEDED',
+    #         'properties': {
+    #             'nodeLimit': 20000
+    #         }
+    #     }
+    # }
+    # Reducing the page size for the workbook queries helps to avoid this.
+    workbook_page_size: Optional[int] = Field(
+        default=1,
+        description="[advanced] Number of workbooks to query at a time using the Tableau API; defaults to `1` and falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_workbook_page_size(self) -> int:
+        return self.workbook_page_size or self.page_size
+
+    sheet_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of sheets to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_sheet_page_size(self) -> int:
+        return self.sheet_page_size or self.page_size
+
+    dashboard_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of dashboards to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_dashboard_page_size(self) -> int:
+        return self.dashboard_page_size or self.page_size
+
+    embedded_datasource_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of embedded datasources to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_embedded_datasource_page_size(self) -> int:
+        return self.embedded_datasource_page_size or self.page_size
+
+    # Since the field upstream query was separated from the embedded datasource queries into an independent query,
+    # the number of queries increased significantly, and so did the execution time.
+    # To increase the batching and thus reduce the number of queries, we can increase the page size for that
+    # particular case.
+    #
+    # That's why, unless specifically set, we effectively use 10 times the page size as the default page size.
+    embedded_datasource_field_upstream_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of upstream fields to query at a time for embedded datasources using the Tableau API; falls back to `page_size` * 10 if not set.",
+    )
+
+    @property
+    def effective_embedded_datasource_field_upstream_page_size(self) -> int:
+        return self.embedded_datasource_field_upstream_page_size or self.page_size * 10
+
+    published_datasource_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of published datasources to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_published_datasource_page_size(self) -> int:
+        return self.published_datasource_page_size or self.page_size
+
+    published_datasource_field_upstream_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of upstream fields to query at a time for published datasources using the Tableau API; falls back to `page_size` * 10 if not set.",
+    )
+
+    @property
+    def effective_published_datasource_field_upstream_page_size(self) -> int:
+        return self.published_datasource_field_upstream_page_size or self.page_size * 10
+
+    custom_sql_table_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of custom SQL tables to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_custom_sql_table_page_size(self) -> int:
+        return self.custom_sql_table_page_size or self.page_size
+
+    database_table_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of database tables to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_database_table_page_size(self) -> int:
+        return self.database_table_page_size or self.page_size
+
+
 class TableauConfig(
     DatasetLineageProviderConfigBase,
     StatefulIngestionConfigBase,
     DatasetSourceConfigMixin,
     TableauConnectionConfig,
+    TableauPageSizeConfig,
 ):
     projects: Optional[List[str]] = Field(
         default=["default"],
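(For orientation, not part of the diff: a minimal sketch of how the `effective_*` fallbacks above resolve, assuming TableauPageSizeConfig can be imported and instantiated directly from this module; all values are illustrative.)

    from datahub.ingestion.source.tableau.tableau import TableauPageSizeConfig

    conf = TableauPageSizeConfig(page_size=50)
    assert conf.effective_sheet_page_size == 50         # unset -> falls back to page_size
    assert conf.effective_workbook_page_size == 1       # workbook_page_size defaults to 1
    assert conf.effective_embedded_datasource_field_upstream_page_size == 500  # unset -> page_size * 10

    conf = TableauPageSizeConfig(page_size=50, workbook_page_size=None)
    assert conf.effective_workbook_page_size == 50      # explicit None -> falls back to page_size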
@@ -396,29 +527,6 @@ class TableauConfig(
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )

-    page_size: int = Field(
-        default=10,
-        description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
-    )
-
-    # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
-    # returns warnings like this:
-    # {
-    #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
-    #     'extensions': {
-    #         'severity': 'WARNING',
-    #         'code': 'NODE_LIMIT_EXCEEDED',
-    #         'properties': {
-    #             'nodeLimit': 20000
-    #         }
-    #     }
-    # }
-    # Reducing the page size for the workbook queries helps to avoid this.
-    workbook_page_size: int = Field(
-        default=1,
-        description="[advanced] Number of workbooks to query at a time using the Tableau API.",
-    )
-
     env: str = Field(
         default=builder.DEFAULT_ENV,
         description="Environment to use in namespace when constructing URNs.",
@@ -527,7 +635,7 @@ class TableauConfig(
         project_path_pattern = values.get("project_path_pattern")
         if project_pattern is None and project_path_pattern is None and projects:
             logger.warning(
-                "projects is deprecated, please use " "project_path_pattern instead."
+                "projects is deprecated, please use project_path_pattern instead."
             )
             logger.info("Initializing project_pattern from projects")
             values["project_pattern"] = AllowDenyPattern(
@@ -600,18 +708,18 @@ class DatabaseTable:
     """

     urn: str
-    id: Optional[
-        str
-    ] = None  # is not None only for tables that came from Tableau metadata
+    id: Optional[str] = (
+        None  # is not None only for tables that came from Tableau metadata
+    )
     num_cols: Optional[int] = None

-    paths: Optional[
-        Set[str]
-    ] = None  # maintains all browse paths encountered for this table
+    paths: Optional[Set[str]] = (
+        None  # maintains all browse paths encountered for this table
+    )

-    parsed_columns: Optional[
-        Set[str]
-    ] = None  # maintains all columns encountered for this table during parsing SQL queries
+    parsed_columns: Optional[Set[str]] = (
+        None  # maintains all columns encountered for this table during parsing SQL queries
+    )

     def update_table(
         self,
@@ -700,6 +808,23 @@ class TableauSourceReport(
         default_factory=(lambda: defaultdict(int))
     )

+    # Counters for tracking the number of queries made via the get_connection_objects method,
+    # by connection type (a static and short set of keys):
+    # - num_queries_by_connection_type: total number of queries
+    # - num_filter_queries_by_connection_type: number of queries due to splitting query filters into pages
+    # - num_paginated_queries_by_connection_type: total number of queries due to Tableau pagination
+    # These counters are useful to understand the impact of changing the page size.
+
+    num_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+    num_filter_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+    num_paginated_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+

 def report_user_role(report: TableauSourceReport, server: Server) -> None:
     title: str = "Insufficient Permissions"
@@ -994,7 +1119,9 @@ class TableauSiteSource:
             return server_connection

         for database_server in self.get_connection_objects(
-            database_servers_graphql_query, c.DATABASE_SERVERS_CONNECTION
+            query=database_servers_graphql_query,
+            connection_type=c.DATABASE_SERVERS_CONNECTION,
+            page_size=self.config.effective_database_server_page_size,
         ):
             database_server_id = database_server.get(c.ID)
             server_connection = database_server.get(c.HOST_NAME)
@@ -1420,22 +1547,30 @@ class TableauSiteSource:
         self,
         query: str,
         connection_type: str,
+        page_size: int,
         query_filter: dict = {},
-        page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
         query_filter = optimize_query_filter(query_filter)

         # Calls the get_connection_object_page function to get the objects,
         # and automatically handles pagination.
-        page_size = page_size_override or self.config.page_size

         filter_pages = get_filter_pages(query_filter, page_size)
+        self.report.num_queries_by_connection_type[connection_type] += 1
+        self.report.num_filter_queries_by_connection_type[connection_type] += len(
+            filter_pages
+        )
+
         for filter_page in filter_pages:
             has_next_page = 1
             current_cursor: Optional[str] = None
             while has_next_page:
                 filter_: str = make_filter(filter_page)

+                self.report.num_paginated_queries_by_connection_type[
+                    connection_type
+                ] += 1
+
                 self.report.num_expected_tableau_metadata_queries += 1
                 (
                     connection_objects,
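(Editorial illustration, not part of the diff: how the three new counters relate for a single get_connection_objects call. The connection-type key and page counts below are made-up numbers; the increments mirror the lines above.)

    from collections import defaultdict

    num_queries = defaultdict(int)
    num_filter_queries = defaultdict(int)
    num_paginated_queries = defaultdict(int)

    connection_type = "sheetsConnection"  # hypothetical key
    filter_pages = 3                      # the ID filter was split into 3 chunks
    api_pages_per_chunk = 2               # Tableau pagination needed 2 round trips per chunk

    num_queries[connection_type] += 1
    num_filter_queries[connection_type] += filter_pages
    num_paginated_queries[connection_type] += filter_pages * api_pages_per_chunk

    # One logical query, 3 filter chunks, 6 actual API requests.
    assert (num_queries[connection_type],
            num_filter_queries[connection_type],
            num_paginated_queries[connection_type]) == (1, 3, 6)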
@@ -1463,10 +1598,10 @@ class TableauSiteSource:
             projects = {c.PROJECT_NAME_WITH_IN: project_names}

         for workbook in self.get_connection_objects(
-            workbook_graphql_query,
-            c.WORKBOOKS_CONNECTION,
-            projects,
-            page_size_override=self.config.workbook_page_size,
+            query=workbook_graphql_query,
+            connection_type=c.WORKBOOKS_CONNECTION,
+            query_filter=projects,
+            page_size=self.config.effective_workbook_page_size,
         ):
             # This check is needed as we are using projectNameWithin which return project as per project name so if
             # user want to ingest only nested project C from A->B->C then tableau might return more than one Project
@@ -1921,9 +2056,10 @@ class TableauSiteSource:

         custom_sql_connection = list(
             self.get_connection_objects(
-                custom_sql_graphql_query,
-                c.CUSTOM_SQL_TABLE_CONNECTION,
-                custom_sql_filter,
+                query=custom_sql_graphql_query,
+                connection_type=c.CUSTOM_SQL_TABLE_CONNECTION,
+                query_filter=custom_sql_filter,
+                page_size=self.config.effective_custom_sql_table_page_size,
             )
         )

@@ -2174,8 +2310,7 @@ class TableauSiteSource:
             c.EMBEDDED_DATA_SOURCE,
         ):
             logger.debug(
-                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
-                f"unsupported"
+                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
             )
             return None

@@ -2357,9 +2492,9 @@ class TableauSiteSource:
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
-        in_tables_schemas: Dict[
-            str, Set[str]
-        ] = transform_parsing_result_to_in_tables_schemas(parsing_result)
+        in_tables_schemas: Dict[str, Set[str]] = (
+            transform_parsing_result_to_in_tables_schemas(parsing_result)
+        )

         if not in_tables_schemas:
             logger.info("Unable to extract table schema from parsing result")
@@ -2632,6 +2767,7 @@ class TableauSiteSource:
         self,
         datasource: dict,
         field_upstream_query: str,
+        page_size: int,
     ) -> dict:
         # Collect field ids to fetch field upstreams
         field_ids: List[str] = []
@@ -2642,9 +2778,10 @@ class TableauSiteSource:
         # Fetch field upstreams and arrange them in map
         field_vs_upstream: Dict[str, dict] = {}
         for field_upstream in self.get_connection_objects(
-            field_upstream_query,
-            c.FIELDS_CONNECTION,
-            {c.ID_WITH_IN: field_ids},
+            query=field_upstream_query,
+            connection_type=c.FIELDS_CONNECTION,
+            query_filter={c.ID_WITH_IN: field_ids},
+            page_size=page_size,
         ):
             if field_upstream.get(c.ID):
                 field_id = field_upstream[c.ID]
@@ -2667,13 +2804,15 @@ class TableauSiteSource:
         datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}

         for datasource in self.get_connection_objects(
-            published_datasource_graphql_query,
-            c.PUBLISHED_DATA_SOURCES_CONNECTION,
-            datasource_filter,
+            query=published_datasource_graphql_query,
+            connection_type=c.PUBLISHED_DATA_SOURCES_CONNECTION,
+            query_filter=datasource_filter,
+            page_size=self.config.effective_published_datasource_page_size,
         ):
             datasource = self.update_datasource_for_field_upstream(
                 datasource=datasource,
                 field_upstream_query=datasource_upstream_fields_graphql_query,
+                page_size=self.config.effective_published_datasource_field_upstream_page_size,
             )

             yield from self.emit_datasource(datasource)
@@ -2689,11 +2828,12 @@ class TableauSiteSource:
             c.ID_WITH_IN: list(tableau_database_table_id_to_urn_map.keys())
         }

-        # Emmitting tables that came from Tableau metadata
+        # Emitting tables that came from Tableau metadata
         for tableau_table in self.get_connection_objects(
-            database_tables_graphql_query,
-            c.DATABASE_TABLES_CONNECTION,
-            tables_filter,
+            query=database_tables_graphql_query,
+            connection_type=c.DATABASE_TABLES_CONNECTION,
+            query_filter=tables_filter,
+            page_size=self.config.effective_database_table_page_size,
         ):
             database_table = self.database_tables[
                 tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
@@ -2882,9 +3022,10 @@ class TableauSiteSource:
         sheets_filter = {c.ID_WITH_IN: self.sheet_ids}

         for sheet in self.get_connection_objects(
-            sheet_graphql_query,
-            c.SHEETS_CONNECTION,
-            sheets_filter,
+            query=sheet_graphql_query,
+            connection_type=c.SHEETS_CONNECTION,
+            query_filter=sheets_filter,
+            page_size=self.config.effective_sheet_page_size,
         ):
             if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
                 yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
@@ -3202,9 +3343,10 @@ class TableauSiteSource:
         dashboards_filter = {c.ID_WITH_IN: self.dashboard_ids}

         for dashboard in self.get_connection_objects(
-            dashboard_graphql_query,
-            c.DASHBOARDS_CONNECTION,
-            dashboards_filter,
+            query=dashboard_graphql_query,
+            connection_type=c.DASHBOARDS_CONNECTION,
+            query_filter=dashboards_filter,
+            page_size=self.config.effective_dashboard_page_size,
         ):
             if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
                 yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
@@ -3349,13 +3491,15 @@ class TableauSiteSource:
         datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}

         for datasource in self.get_connection_objects(
-            embedded_datasource_graphql_query,
-            c.EMBEDDED_DATA_SOURCES_CONNECTION,
-            datasource_filter,
+            query=embedded_datasource_graphql_query,
+            connection_type=c.EMBEDDED_DATA_SOURCES_CONNECTION,
+            query_filter=datasource_filter,
+            page_size=self.config.effective_embedded_datasource_page_size,
         ):
             datasource = self.update_datasource_for_field_upstream(
                 datasource=datasource,
                 field_upstream_query=datasource_upstream_fields_graphql_query,
+                page_size=self.config.effective_embedded_datasource_field_upstream_page_size,
             )
             yield from self.emit_datasource(
                 datasource,
@@ -3414,25 +3558,25 @@ class TableauSiteSource:

             generated_project_keys.add(project_key.guid())

-            parent_project_key: Optional[
-                Union[ProjectKey, SiteKey]
-            ] = None  # It is going
+            parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+                None  # It is going
+            )
             # to be used as a parent container key for the current tableau project

             if project_.parent_id is not None:
                 # Go to the parent project as we need to generate container first for parent
                 parent_project_key = self.gen_project_key(project_.parent_id)

-                parent_tableau_project: Optional[
-                    TableauProject
-                ] = self.tableau_project_registry.get(project_.parent_id)
+                parent_tableau_project: Optional[TableauProject] = (
+                    self.tableau_project_registry.get(project_.parent_id)
+                )

                 if (
                     parent_tableau_project is None
                 ):  # It is not in project registry because of project_pattern
-                    assert (
-                        project_.parent_name
-                    ), f"project {project_.name} should not be null"
+                    assert project_.parent_name, (
+                        f"project {project_.name} should not be null"
+                    )
                     parent_tableau_project = TableauProject(
                         id=project_.parent_id,
                         name=project_.parent_name,
@@ -3460,7 +3604,7 @@ class TableauSiteSource:
                 parent_container_key=parent_project_key,
             )

-        for id_, project in self.tableau_project_registry.items():
+        for project in self.tableau_project_registry.values():
            logger.debug(
                 f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
             )
@@ -3524,16 +3668,16 @@ class TableauSiteSource:
         if self.config.extract_usage_stats:
             with PerfTimer() as timer:
                 self._populate_usage_stat_registry()
-            self.report.extract_usage_stats_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.extract_usage_stats_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         if self.config.permission_ingestion:
             with PerfTimer() as timer:
                 self._fetch_groups()
-            self.report.fetch_groups_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.fetch_groups_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         # Populate the map of database names and database hostnames to be used later to map
         # databases to platform instances.
@@ -3546,9 +3690,9 @@ class TableauSiteSource:

         with PerfTimer() as timer:
             self._populate_projects_registry()
-        self.report.populate_projects_registry_timer[
-            self.site_content_url
-        ] = timer.elapsed_seconds(digits=2)
+        self.report.populate_projects_registry_timer[self.site_content_url] = (
+            timer.elapsed_seconds(digits=2)
+        )

         if self.config.add_site_container:
             yield from self.emit_site_container()
@@ -3556,23 +3700,23 @@ class TableauSiteSource:

         with PerfTimer() as timer:
             yield from self.emit_workbooks()
-        self.report.emit_workbooks_timer[
-            self.site_content_url
-        ] = timer.elapsed_seconds(digits=2)
+        self.report.emit_workbooks_timer[self.site_content_url] = (
+            timer.elapsed_seconds(digits=2)
+        )

         if self.sheet_ids:
             with PerfTimer() as timer:
                 yield from self.emit_sheets()
-            self.report.emit_sheets_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.emit_sheets_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         if self.dashboard_ids:
             with PerfTimer() as timer:
                 yield from self.emit_dashboards()
-            self.report.emit_dashboards_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.emit_dashboards_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         if self.embedded_datasource_ids_being_used:
             with PerfTimer() as timer:
@@ -3598,6 +3742,6 @@ class TableauSiteSource:
         if self.database_tables:
             with PerfTimer() as timer:
                 yield from self.emit_upstream_tables()
-            self.report.emit_upstream_tables_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.emit_upstream_tables_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )
datahub/ingestion/source/tableau/tableau_common.py

@@ -642,8 +642,11 @@ class TableauUpstreamReference:

     @classmethod
     def create(
-        cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None
+        cls, d: Dict, default_schema_map: Optional[Dict[str, str]] = None
     ) -> "TableauUpstreamReference":
+        if d is None:
+            raise ValueError("TableauUpstreamReference.create: d is None")
+
         # Values directly from `table` object from Tableau
         database_dict = (
             d.get(c.DATABASE) or {}
@@ -717,7 +720,7 @@ class TableauUpstreamReference:
         # schema

         # TODO: Validate the startswith check. Currently required for our integration tests
-        if full_name is None or not full_name.startswith("["):
+        if full_name is None:
             return None
         return full_name.replace("[", "").replace("]", "").split(".")

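(Editorial illustration, not part of the diff: with the startswith("[") guard removed above, bracketed and unbracketed fullName values now take the same split path. Example values are made up.)

    full_name = "[my_db].[dbo].[orders]"  # hypothetical Tableau fullName
    assert full_name.replace("[", "").replace("]", "").split(".") == ["my_db", "dbo", "orders"]

    full_name = "my_db.dbo.orders"  # previously rejected by the startswith("[") check
    assert full_name.replace("[", "").replace("]", "").split(".") == ["my_db", "dbo", "orders"]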
datahub/ingestion/source/unity/config.py

@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )

     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field(  # type: ignore
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
datahub/ingestion/source/unity/proxy.py

@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None

datahub/ingestion/source/unity/source.py

@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )

         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
datahub/ingestion/source/unity/usage.py

@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                 query, table_info
             )
             for source_table in table_info.source_tables:
-                with self.report.usage_perf_report.aggregator_add_event_timer:
+                with (
+                    self.report.usage_perf_report.aggregator_add_event_timer
+                ):
                     self.usage_aggregator.aggregate_event(
                         resource=source_table,
                         start_time=query.start_time,
datahub/ingestion/source/usage/clickhouse_usage.py

@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )

         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)

             resource = (
-                f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                 f"{event.database}.{event.table}"
             )

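(Editorial note, not part of the diff: the reformatted f-string above is behaviorally equivalent to the old one; only the quoting style changed. A standalone check with made-up values follows.)

    platform_instance = "prod_ch"  # hypothetical config value
    database, table = "default", "events"
    resource = (
        f"{platform_instance + '.' if platform_instance else ''}"
        f"{database}.{table}"
    )
    assert resource == "prod_ch.default.events"

    platform_instance = None
    resource = (
        f"{platform_instance + '.' if platform_instance else ''}"
        f"{database}.{table}"
    )
    assert resource == "default.events"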
datahub/ingestion/source/usage/starburst_trino_usage.py

@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )

         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)