acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -71,7 +71,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
-    Source,
     StructuredLogLevel,
     TestableSource,
     TestConnectionReport,
@@ -118,6 +117,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -170,8 +170,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
+DEFAULT_PAGE_SIZE = 10
+
 try:
     # On earlier versions of the tableauserverclient, the NonXMLResponseError
     # was thrown when reauthentication was necessary. We'll keep both exceptions
@@ -277,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
         return authentication
 
     def make_tableau_client(self, site: str) -> Server:
-        authentication: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.get_tableau_auth(site)
+        authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+            self.get_tableau_auth(site)
+        )
         try:
             server = Server(
                 self.connect_uri,
@@ -340,11 +344,140 @@ class PermissionIngestionConfig(ConfigModel):
     )
 
 
+class TableauPageSizeConfig(ConfigModel):
+    """
+    Configuration for setting page sizes for different Tableau metadata objects.
+
+    Some considerations:
+    - All fields have default values, so no setting is mandatory.
+    - In general, the `effective_*` properties fall back to `page_size` (or a
+      multiple of it) when a fine-grained page size is not explicitly set.
+
+    The impact of changing these values can be measured by looking at the
+    `num_(filter_|paginated_)?queries_by_connection_type` metrics in the report.
+    """
+
+    page_size: int = Field(
+        default=DEFAULT_PAGE_SIZE,
+        description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
+    )
+
+    database_server_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of database servers to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_database_server_page_size(self) -> int:
+        return self.database_server_page_size or self.page_size
+
+    # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
+    # returns warnings like this:
+    # {
+    #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
+    #     'extensions': {
+    #         'severity': 'WARNING',
+    #         'code': 'NODE_LIMIT_EXCEEDED',
+    #         'properties': {
+    #             'nodeLimit': 20000
+    #         }
+    #     }
+    # }
+    # Reducing the page size for the workbook queries helps to avoid this.
+    workbook_page_size: Optional[int] = Field(
+        default=1,
+        description="[advanced] Number of workbooks to query at a time using the Tableau API; defaults to `1` and falls back to `page_size` if set to None.",
+    )
+
+    @property
+    def effective_workbook_page_size(self) -> int:
+        return self.workbook_page_size or self.page_size
+
+    sheet_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of sheets to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_sheet_page_size(self) -> int:
+        return self.sheet_page_size or self.page_size
+
+    dashboard_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of dashboards to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_dashboard_page_size(self) -> int:
+        return self.dashboard_page_size or self.page_size
+
+    embedded_datasource_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of embedded datasources to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_embedded_datasource_page_size(self) -> int:
+        return self.embedded_datasource_page_size or self.page_size
+
+    # Since the field upstream query was separated from the embedded datasource queries
+    # into an independent query, the number of queries increased significantly, and so
+    # did the execution time. To increase batching and thus reduce the number of queries,
+    # we can increase the page size for that particular case.
+    #
+    # That's why, unless specifically set, we effectively use 10 times the page size here.
+    embedded_datasource_field_upstream_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of upstream fields to query at a time for embedded datasources using the Tableau API; falls back to `page_size` * 10 if not set.",
+    )
+
+    @property
+    def effective_embedded_datasource_field_upstream_page_size(self) -> int:
+        return self.embedded_datasource_field_upstream_page_size or self.page_size * 10
+
+    published_datasource_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of published datasources to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_published_datasource_page_size(self) -> int:
+        return self.published_datasource_page_size or self.page_size
+
+    published_datasource_field_upstream_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of upstream fields to query at a time for published datasources using the Tableau API; falls back to `page_size` * 10 if not set.",
+    )
+
+    @property
+    def effective_published_datasource_field_upstream_page_size(self) -> int:
+        return self.published_datasource_field_upstream_page_size or self.page_size * 10
+
+    custom_sql_table_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of custom SQL tables to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_custom_sql_table_page_size(self) -> int:
+        return self.custom_sql_table_page_size or self.page_size
+
+    database_table_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of database tables to query at a time using the Tableau API; falls back to `page_size` if not set.",
+    )
+
+    @property
+    def effective_database_table_page_size(self) -> int:
+        return self.database_table_page_size or self.page_size
+
+
 class TableauConfig(
     DatasetLineageProviderConfigBase,
     StatefulIngestionConfigBase,
     DatasetSourceConfigMixin,
     TableauConnectionConfig,
+    TableauPageSizeConfig,
 ):
     projects: Optional[List[str]] = Field(
         default=["default"],
394
527
  description="Ingest details for tables external to (not embedded in) tableau as entities.",
395
528
  )
396
529
 
397
- page_size: int = Field(
398
- default=10,
399
- description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
400
- )
401
-
402
- # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
403
- # returns warnings like this:
404
- # {
405
- # 'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
406
- # 'extensions': {
407
- # 'severity': 'WARNING',
408
- # 'code': 'NODE_LIMIT_EXCEEDED',
409
- # 'properties': {
410
- # 'nodeLimit': 20000
411
- # }
412
- # }
413
- # }
414
- # Reducing the page size for the workbook queries helps to avoid this.
415
- workbook_page_size: int = Field(
416
- default=1,
417
- description="[advanced] Number of workbooks to query at a time using the Tableau API.",
418
- )
419
-
420
530
  env: str = Field(
421
531
  default=builder.DEFAULT_ENV,
422
532
  description="Environment to use in namespace when constructing URNs.",
@@ -525,7 +635,7 @@ class TableauConfig(
         project_path_pattern = values.get("project_path_pattern")
         if project_pattern is None and project_path_pattern is None and projects:
             logger.warning(
-                "projects is deprecated, please use " "project_path_pattern instead."
+                "projects is deprecated, please use project_path_pattern instead."
             )
             logger.info("Initializing project_pattern from projects")
             values["project_pattern"] = AllowDenyPattern(
@@ -598,18 +708,18 @@ class DatabaseTable:
     """
 
     urn: str
-    id: Optional[
-        str
-    ] = None  # is not None only for tables that came from Tableau metadata
+    id: Optional[str] = (
+        None  # is not None only for tables that came from Tableau metadata
+    )
     num_cols: Optional[int] = None
 
-    paths: Optional[
-        Set[str]
-    ] = None  # maintains all browse paths encountered for this table
+    paths: Optional[Set[str]] = (
+        None  # maintains all browse paths encountered for this table
+    )
 
-    parsed_columns: Optional[
-        Set[str]
-    ] = None  # maintains all columns encountered for this table during parsing SQL queries
+    parsed_columns: Optional[Set[str]] = (
+        None  # maintains all columns encountered for this table during parsing SQL queries
+    )
 
     def update_table(
         self,
@@ -643,12 +753,41 @@ class SiteIdContentUrl:
 
 
 @dataclass
-class TableauSourceReport(StaleEntityRemovalSourceReport):
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
@@ -660,6 +799,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
     logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
     last_authenticated_at: Optional[datetime] = None
 
     num_expected_tableau_metadata_queries: int = 0
@@ -668,6 +808,23 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
         default_factory=(lambda: defaultdict(int))
     )
 
+    # Counters tracking the number of queries made through get_connection_objects,
+    # keyed by connection type (a static, short set of keys):
+    # - num_queries_by_connection_type: total number of calls made
+    # - num_filter_queries_by_connection_type: number of queries caused by splitting query filters
+    # - num_paginated_queries_by_connection_type: total number of queries issued due to Tableau pagination
+    # These counters are useful to understand the impact of changing the page size.
+
+    num_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+    num_filter_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+    num_paginated_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+
 
 def report_user_role(report: TableauSourceReport, server: Server) -> None:
     title: str = "Insufficient Permissions"
@@ -771,11 +928,6 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
     def get_report(self) -> TableauSourceReport:
         return self.report
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = TableauConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
@@ -834,6 +986,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 platform=self.platform,
             )
             yield from site_source.ingest_tableau_site()
+
         except MetadataQueryException as md_exception:
             self.report.failure(
                 title="Failed to Retrieve Tableau Metadata",
@@ -966,7 +1119,9 @@ class TableauSiteSource:
             return server_connection
 
         for database_server in self.get_connection_objects(
-            database_servers_graphql_query, c.DATABASE_SERVERS_CONNECTION
+            query=database_servers_graphql_query,
+            connection_type=c.DATABASE_SERVERS_CONNECTION,
+            page_size=self.config.effective_database_server_page_size,
         ):
             database_server_id = database_server.get(c.ID)
             server_connection = database_server.get(c.HOST_NAME)
@@ -1392,22 +1547,30 @@
         self,
         query: str,
         connection_type: str,
+        page_size: int,
         query_filter: dict = {},
-        page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
        # and automatically handles pagination.
-        page_size = page_size_override or self.config.page_size
 
         filter_pages = get_filter_pages(query_filter, page_size)
+        self.report.num_queries_by_connection_type[connection_type] += 1
+        self.report.num_filter_queries_by_connection_type[connection_type] += len(
+            filter_pages
+        )
+
         for filter_page in filter_pages:
             has_next_page = 1
             current_cursor: Optional[str] = None
             while has_next_page:
                 filter_: str = make_filter(filter_page)
 
+                self.report.num_paginated_queries_by_connection_type[
+                    connection_type
+                ] += 1
+
                 self.report.num_expected_tableau_metadata_queries += 1
                 (
                     connection_objects,
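A rough sketch of how the three counters relate for a single call (assuming, as the usage above suggests, that `get_filter_pages` chunks an `ID_WITH_IN` filter into pages of at most `page_size` ids; the exact splitting is internal to `get_filter_pages`):

    import math

    ids = [f"id-{i}" for i in range(95)]  # hypothetical: 95 ids in an ID_WITH_IN filter
    page_size = 10
    filter_pages = math.ceil(len(ids) / page_size)
    print(filter_pages)  # 10

    # For one get_connection_objects call over these ids:
    #   num_queries_by_connection_type[ct]           += 1
    #   num_filter_queries_by_connection_type[ct]    += 10 (one per filter page)
    #   num_paginated_queries_by_connection_type[ct] += at least 10 (one request per
    #     filter page, plus extras wherever Tableau paginates within a page)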
@@ -1435,10 +1598,10 @@
         projects = {c.PROJECT_NAME_WITH_IN: project_names}
 
         for workbook in self.get_connection_objects(
-            workbook_graphql_query,
-            c.WORKBOOKS_CONNECTION,
-            projects,
-            page_size_override=self.config.workbook_page_size,
+            query=workbook_graphql_query,
+            connection_type=c.WORKBOOKS_CONNECTION,
+            query_filter=projects,
+            page_size=self.config.effective_workbook_page_size,
         ):
             # This check is needed as we are using projectNameWithin which return project as per project name so if
             # user want to ingest only nested project C from A->B->C then tableau might return more than one Project
@@ -1893,9 +2056,10 @@
 
         custom_sql_connection = list(
             self.get_connection_objects(
-                custom_sql_graphql_query,
-                c.CUSTOM_SQL_TABLE_CONNECTION,
-                custom_sql_filter,
+                query=custom_sql_graphql_query,
+                connection_type=c.CUSTOM_SQL_TABLE_CONNECTION,
+                query_filter=custom_sql_filter,
+                page_size=self.config.effective_custom_sql_table_page_size,
             )
         )
 
@@ -2146,8 +2310,7 @@
             c.EMBEDDED_DATA_SOURCE,
         ):
             logger.debug(
-                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
-                f"unsupported"
+                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
             )
             return None
 
@@ -2329,9 +2492,9 @@
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
-        in_tables_schemas: Dict[
-            str, Set[str]
-        ] = transform_parsing_result_to_in_tables_schemas(parsing_result)
+        in_tables_schemas: Dict[str, Set[str]] = (
+            transform_parsing_result_to_in_tables_schemas(parsing_result)
+        )
 
         if not in_tables_schemas:
             logger.info("Unable to extract table schema from parsing result")
@@ -2604,6 +2767,7 @@
         self,
         datasource: dict,
         field_upstream_query: str,
+        page_size: int,
     ) -> dict:
         # Collect field ids to fetch field upstreams
         field_ids: List[str] = []
@@ -2614,9 +2778,10 @@
         # Fetch field upstreams and arrange them in map
         field_vs_upstream: Dict[str, dict] = {}
         for field_upstream in self.get_connection_objects(
-            field_upstream_query,
-            c.FIELDS_CONNECTION,
-            {c.ID_WITH_IN: field_ids},
+            query=field_upstream_query,
+            connection_type=c.FIELDS_CONNECTION,
+            query_filter={c.ID_WITH_IN: field_ids},
+            page_size=page_size,
         ):
             if field_upstream.get(c.ID):
                 field_id = field_upstream[c.ID]
@@ -2639,13 +2804,15 @@
         datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
 
         for datasource in self.get_connection_objects(
-            published_datasource_graphql_query,
-            c.PUBLISHED_DATA_SOURCES_CONNECTION,
-            datasource_filter,
+            query=published_datasource_graphql_query,
+            connection_type=c.PUBLISHED_DATA_SOURCES_CONNECTION,
+            query_filter=datasource_filter,
+            page_size=self.config.effective_published_datasource_page_size,
         ):
             datasource = self.update_datasource_for_field_upstream(
                 datasource=datasource,
                 field_upstream_query=datasource_upstream_fields_graphql_query,
+                page_size=self.config.effective_published_datasource_field_upstream_page_size,
             )
 
             yield from self.emit_datasource(datasource)
@@ -2661,11 +2828,12 @@
             c.ID_WITH_IN: list(tableau_database_table_id_to_urn_map.keys())
         }
 
-        # Emmitting tables that came from Tableau metadata
+        # Emitting tables that came from Tableau metadata
         for tableau_table in self.get_connection_objects(
-            database_tables_graphql_query,
-            c.DATABASE_TABLES_CONNECTION,
-            tables_filter,
+            query=database_tables_graphql_query,
+            connection_type=c.DATABASE_TABLES_CONNECTION,
+            query_filter=tables_filter,
+            page_size=self.config.effective_database_table_page_size,
         ):
             database_table = self.database_tables[
                 tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
@@ -2854,9 +3022,10 @@
         sheets_filter = {c.ID_WITH_IN: self.sheet_ids}
 
         for sheet in self.get_connection_objects(
-            sheet_graphql_query,
-            c.SHEETS_CONNECTION,
-            sheets_filter,
+            query=sheet_graphql_query,
+            connection_type=c.SHEETS_CONNECTION,
+            query_filter=sheets_filter,
+            page_size=self.config.effective_sheet_page_size,
         ):
             if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
                 yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
@@ -3174,9 +3343,10 @@
         dashboards_filter = {c.ID_WITH_IN: self.dashboard_ids}
 
         for dashboard in self.get_connection_objects(
-            dashboard_graphql_query,
-            c.DASHBOARDS_CONNECTION,
-            dashboards_filter,
+            query=dashboard_graphql_query,
+            connection_type=c.DASHBOARDS_CONNECTION,
+            query_filter=dashboards_filter,
+            page_size=self.config.effective_dashboard_page_size,
         ):
             if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
                 yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
@@ -3321,13 +3491,15 @@
         datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
 
         for datasource in self.get_connection_objects(
-            embedded_datasource_graphql_query,
-            c.EMBEDDED_DATA_SOURCES_CONNECTION,
-            datasource_filter,
+            query=embedded_datasource_graphql_query,
+            connection_type=c.EMBEDDED_DATA_SOURCES_CONNECTION,
+            query_filter=datasource_filter,
+            page_size=self.config.effective_embedded_datasource_page_size,
         ):
             datasource = self.update_datasource_for_field_upstream(
                 datasource=datasource,
                 field_upstream_query=datasource_upstream_fields_graphql_query,
+                page_size=self.config.effective_embedded_datasource_field_upstream_page_size,
             )
             yield from self.emit_datasource(
                 datasource,
@@ -3386,25 +3558,25 @@
 
         generated_project_keys.add(project_key.guid())
 
-        parent_project_key: Optional[
-            Union[ProjectKey, SiteKey]
-        ] = None  # It is going
+        parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+            None  # It is going
+        )
         # to be used as a parent container key for the current tableau project
 
         if project_.parent_id is not None:
             # Go to the parent project as we need to generate container first for parent
             parent_project_key = self.gen_project_key(project_.parent_id)
 
-            parent_tableau_project: Optional[
-                TableauProject
-            ] = self.tableau_project_registry.get(project_.parent_id)
+            parent_tableau_project: Optional[TableauProject] = (
+                self.tableau_project_registry.get(project_.parent_id)
+            )
 
             if (
                 parent_tableau_project is None
             ):  # It is not in project registry because of project_pattern
-                assert (
-                    project_.parent_name
-                ), f"project {project_.name} should not be null"
+                assert project_.parent_name, (
+                    f"project {project_.name} should not be null"
+                )
                 parent_tableau_project = TableauProject(
                     id=project_.parent_id,
                     name=project_.parent_name,
@@ -3432,7 +3604,7 @@
             parent_container_key=parent_project_key,
         )
 
-        for id_, project in self.tableau_project_registry.items():
+        for project in self.tableau_project_registry.values():
             logger.debug(
                 f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
             )
@@ -3489,33 +3661,87 @@
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None
 
     def ingest_tableau_site(self):
-        # Initialise the dictionary to later look-up for chart and dashboard stat
-        if self.config.extract_usage_stats:
-            self._populate_usage_stat_registry()
-
-        if self.config.permission_ingestion:
-            self._fetch_groups()
-
-        # Populate the map of database names and database hostnames to be used later to map
-        # databases to platform instances.
-        if self.config.database_hostname_to_platform_instance_map:
-            self._populate_database_server_hostname_map()
-
-        self._populate_projects_registry()
-
-        if self.config.add_site_container:
-            yield from self.emit_site_container()
-        yield from self.emit_project_containers()
-        yield from self.emit_workbooks()
-        if self.sheet_ids:
-            yield from self.emit_sheets()
-        if self.dashboard_ids:
-            yield from self.emit_dashboards()
-        if self.embedded_datasource_ids_being_used:
-            yield from self.emit_embedded_datasources()
-        if self.datasource_ids_being_used:
-            yield from self.emit_published_datasources()
-        if self.custom_sql_ids_being_used:
-            yield from self.emit_custom_sql_datasources()
-        if self.database_tables:
-            yield from self.emit_upstream_tables()
+        with self.report.new_stage(
+            f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+        ):
+            # Initialise the dictionary to later look-up for chart and dashboard stat
+            if self.config.extract_usage_stats:
+                with PerfTimer() as timer:
+                    self._populate_usage_stat_registry()
+                    self.report.extract_usage_stats_timer[self.site_content_url] = (
+                        timer.elapsed_seconds(digits=2)
+                    )
+
+            if self.config.permission_ingestion:
+                with PerfTimer() as timer:
+                    self._fetch_groups()
+                    self.report.fetch_groups_timer[self.site_content_url] = (
+                        timer.elapsed_seconds(digits=2)
+                    )
+
+            # Populate the map of database names and database hostnames to be used later to map
+            # databases to platform instances.
+            if self.config.database_hostname_to_platform_instance_map:
+                with PerfTimer() as timer:
+                    self._populate_database_server_hostname_map()
+                    self.report.populate_database_server_hostname_map_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            with PerfTimer() as timer:
+                self._populate_projects_registry()
+                self.report.populate_projects_registry_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
+
+            if self.config.add_site_container:
+                yield from self.emit_site_container()
+            yield from self.emit_project_containers()
+
+            with PerfTimer() as timer:
+                yield from self.emit_workbooks()
+                self.report.emit_workbooks_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
+
+            if self.sheet_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_sheets()
+                    self.report.emit_sheets_timer[self.site_content_url] = (
+                        timer.elapsed_seconds(digits=2)
+                    )
+
+            if self.dashboard_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_dashboards()
+                    self.report.emit_dashboards_timer[self.site_content_url] = (
+                        timer.elapsed_seconds(digits=2)
+                    )
+
+            if self.embedded_datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_embedded_datasources()
+                    self.report.emit_embedded_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_published_datasources()
+                    self.report.emit_published_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.custom_sql_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_custom_sql_datasources()
+                    self.report.emit_custom_sql_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.database_tables:
+                with PerfTimer() as timer:
+                    yield from self.emit_upstream_tables()
+                    self.report.emit_upstream_tables_timer[self.site_content_url] = (
+                        timer.elapsed_seconds(digits=2)
+                    )
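The stage-timing pattern above is easy to exercise on its own. A minimal sketch (it assumes acryl-datahub 0.15.0.2 is installed; `PerfTimer` and `elapsed_seconds(digits=...)` are used exactly as in the diff, while the plain dict stands in for one of the report's timer fields):

    import time

    from datahub.utilities.perf_timer import PerfTimer

    timers: dict = {}
    with PerfTimer() as timer:
        time.sleep(0.25)  # stand-in for a step such as _populate_projects_registry()
        timers["my-site"] = timer.elapsed_seconds(digits=2)

    print(timers)  # e.g. {'my-site': 0.25}

Keying each timer by `site_content_url` yields one timing entry per site when multiple sites are ingested, and the `TopKDict` default factory presumably keeps those report fields bounded to the largest entries.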