acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/datahub_gc.py +1 -0
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +0 -1
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +245 -101
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau.py

@@ -174,6 +174,8 @@ from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn

+DEFAULT_PAGE_SIZE = 10
+
 try:
     # On earlier versions of the tableauserverclient, the NonXMLResponseError
     # was thrown when reauthentication was necessary. We'll keep both exceptions
@@ -279,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
         return authentication

     def make_tableau_client(self, site: str) -> Server:
-        authentication: Union[
+        authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+            self.get_tableau_auth(site)
+        )
         try:
             server = Server(
                 self.connect_uri,
@@ -342,11 +344,140 @@ class PermissionIngestionConfig(ConfigModel):
     )


+class TableauPageSizeConfig(ConfigModel):
+    """
+    Configuration for setting page sizes for different Tableau metadata objects.
+
+    Some considerations:
+    - All have default values, so no setting is mandatory.
+    - In general, with the `effective_` methods, if not specifically set fine-grained metrics fallback to `page_size`
+      or correlate with `page_size`.
+
+    Measuring the impact of changing these values can be done by looking at the
+    `num_(filter_|paginated_)?queries_by_connection_type` metrics in the report.
+    """
+
+    page_size: int = Field(
+        default=DEFAULT_PAGE_SIZE,
+        description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
+    )
+
+    database_server_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of database servers to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_database_server_page_size(self) -> int:
+        return self.database_server_page_size or self.page_size
+
+    # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
+    # returns warnings like this:
+    # {
+    #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
+    #     'extensions': {
+    #         'severity': 'WARNING',
+    #         'code': 'NODE_LIMIT_EXCEEDED',
+    #         'properties': {
+    #             'nodeLimit': 20000
+    #         }
+    #     }
+    # }
+    # Reducing the page size for the workbook queries helps to avoid this.
+    workbook_page_size: Optional[int] = Field(
+        default=1,
+        description="[advanced] Number of workbooks to query at a time using the Tableau API; defaults to `1` and fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_workbook_page_size(self) -> int:
+        return self.workbook_page_size or self.page_size
+
+    sheet_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of sheets to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_sheet_page_size(self) -> int:
+        return self.sheet_page_size or self.page_size
+
+    dashboard_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of dashboards to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_dashboard_page_size(self) -> int:
+        return self.dashboard_page_size or self.page_size
+
+    embedded_datasource_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of embedded datasources to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_embedded_datasource_page_size(self) -> int:
+        return self.embedded_datasource_page_size or self.page_size
+
+    # Since the field upstream query was separated from the embedded datasource queries into an independent query,
+    # the number of queries increased significantly and so the execution time.
+    # To increase the batching and so reduce the number of queries, we can increase the page size for that
+    # particular case.
+    #
+    # That's why unless specifically set, we will effectively use 10 times the page size as the default page size.
+    embedded_datasource_field_upstream_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of upstream fields to query at a time for embedded datasources using the Tableau API; fallbacks to `page_size` * 10 if not set.",
+    )
+
+    @property
+    def effective_embedded_datasource_field_upstream_page_size(self) -> int:
+        return self.embedded_datasource_field_upstream_page_size or self.page_size * 10
+
+    published_datasource_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of published datasources to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_published_datasource_page_size(self) -> int:
+        return self.published_datasource_page_size or self.page_size
+
+    published_datasource_field_upstream_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of upstream fields to query at a time for published datasources using the Tableau API; fallbacks to `page_size` * 10 if not set.",
+    )
+
+    @property
+    def effective_published_datasource_field_upstream_page_size(self) -> int:
+        return self.published_datasource_field_upstream_page_size or self.page_size * 10
+
+    custom_sql_table_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of custom sql datasources to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_custom_sql_table_page_size(self) -> int:
+        return self.custom_sql_table_page_size or self.page_size
+
+    database_table_page_size: Optional[int] = Field(
+        default=None,
+        description="[advanced] Number of database tables to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+    )
+
+    @property
+    def effective_database_table_page_size(self) -> int:
+        return self.database_table_page_size or self.page_size
+
+
 class TableauConfig(
     DatasetLineageProviderConfigBase,
     StatefulIngestionConfigBase,
     DatasetSourceConfigMixin,
     TableauConnectionConfig,
+    TableauPageSizeConfig,
 ):
     projects: Optional[List[str]] = Field(
         default=["default"],
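
The `effective_*` properties above all follow the same fallback rule: use the specific setting when it is set, otherwise fall back to `page_size` (or `page_size * 10` for the field-upstream queries). A minimal sketch of how the new config resolves page sizes; the import path is assumed from the file path shown in this diff:

```python
from datahub.ingestion.source.tableau.tableau import TableauPageSizeConfig

config = TableauPageSizeConfig(page_size=50)

# Unset fine-grained sizes fall back to page_size.
assert config.effective_sheet_page_size == 50
assert config.effective_database_table_page_size == 50

# Field-upstream queries batch more aggressively: page_size * 10.
assert config.effective_published_datasource_field_upstream_page_size == 500

# workbook_page_size keeps its own default of 1 unless overridden...
assert config.effective_workbook_page_size == 1
# ...and explicitly setting it to None makes it fall back to page_size as well.
assert TableauPageSizeConfig(page_size=50, workbook_page_size=None).effective_workbook_page_size == 50
```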
@@ -396,29 +527,6 @@ class TableauConfig(
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )

-    page_size: int = Field(
-        default=10,
-        description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
-    )
-
-    # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
-    # returns warnings like this:
-    # {
-    #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
-    #     'extensions': {
-    #         'severity': 'WARNING',
-    #         'code': 'NODE_LIMIT_EXCEEDED',
-    #         'properties': {
-    #             'nodeLimit': 20000
-    #         }
-    #     }
-    # }
-    # Reducing the page size for the workbook queries helps to avoid this.
-    workbook_page_size: int = Field(
-        default=1,
-        description="[advanced] Number of workbooks to query at a time using the Tableau API.",
-    )
-
     env: str = Field(
         default=builder.DEFAULT_ENV,
         description="Environment to use in namespace when constructing URNs.",
@@ -527,7 +635,7 @@ class TableauConfig(
         project_path_pattern = values.get("project_path_pattern")
         if project_pattern is None and project_path_pattern is None and projects:
             logger.warning(
-                "projects is deprecated, please use
+                "projects is deprecated, please use project_path_pattern instead."
             )
             logger.info("Initializing project_pattern from projects")
             values["project_pattern"] = AllowDenyPattern(
@@ -600,18 +708,18 @@ class DatabaseTable:
     """

     urn: str
-    id: Optional[
+    id: Optional[str] = (
+        None  # is not None only for tables that came from Tableau metadata
+    )
     num_cols: Optional[int] = None

-    paths: Optional[
+    paths: Optional[Set[str]] = (
+        None  # maintains all browse paths encountered for this table
+    )

-    parsed_columns: Optional[
+    parsed_columns: Optional[Set[str]] = (
+        None  # maintains all columns encountered for this table during parsing SQL queries
+    )

     def update_table(
         self,
@@ -700,6 +808,23 @@ class TableauSourceReport(
         default_factory=(lambda: defaultdict(int))
     )

+    # Counters for tracking the number of queries made to get_connection_objects method
+    # by connection type (static and short set of keys):
+    # - num_queries_by_connection_type: total number of queries
+    # - num_filter_queries_by_connection_type: number of paginated queries due to splitting query filters
+    # - num_paginated_queries_by_connection_type: total number of queries due to Tableau pagination
+    # These counters are useful to understand the impact of changing the page size.
+
+    num_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+    num_filter_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+    num_paginated_queries_by_connection_type: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
+

 def report_user_role(report: TableauSourceReport, server: Server) -> None:
     title: str = "Insufficient Permissions"
@@ -994,7 +1119,9 @@ class TableauSiteSource:
             return server_connection

         for database_server in self.get_connection_objects(
-            database_servers_graphql_query,
+            query=database_servers_graphql_query,
+            connection_type=c.DATABASE_SERVERS_CONNECTION,
+            page_size=self.config.effective_database_server_page_size,
         ):
             database_server_id = database_server.get(c.ID)
             server_connection = database_server.get(c.HOST_NAME)
@@ -1420,22 +1547,30 @@ class TableauSiteSource:
         self,
         query: str,
         connection_type: str,
+        page_size: int,
         query_filter: dict = {},
-        page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
         query_filter = optimize_query_filter(query_filter)

         # Calls the get_connection_object_page function to get the objects,
         # and automatically handles pagination.
-        page_size = page_size_override or self.config.page_size

         filter_pages = get_filter_pages(query_filter, page_size)
+        self.report.num_queries_by_connection_type[connection_type] += 1
+        self.report.num_filter_queries_by_connection_type[connection_type] += len(
+            filter_pages
+        )
+
         for filter_page in filter_pages:
             has_next_page = 1
             current_cursor: Optional[str] = None
             while has_next_page:
                 filter_: str = make_filter(filter_page)

+                self.report.num_paginated_queries_by_connection_type[
+                    connection_type
+                ] += 1
+
                 self.report.num_expected_tableau_metadata_queries += 1
                 (
                     connection_objects,
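
For a single `get_connection_objects` call, the three new report counters move in lockstep with the pagination logic above: one increment for the logical query, one per filter page, and one per fetched page. A rough illustration in plain Python, using a placeholder connection-type string rather than the real `c.*` constant:

```python
from collections import defaultdict

num_queries = defaultdict(int)
num_filter_queries = defaultdict(int)
num_paginated_queries = defaultdict(int)

def simulate_call(connection_type: str, num_filter_pages: int, pages_per_filter: int) -> None:
    # One logical query per get_connection_objects() call...
    num_queries[connection_type] += 1
    # ...split into one sub-query per filter page...
    num_filter_queries[connection_type] += num_filter_pages
    # ...each of which is paginated by the Tableau metadata API.
    num_paginated_queries[connection_type] += num_filter_pages * pages_per_filter

simulate_call("sheetsConnection", num_filter_pages=2, pages_per_filter=5)
assert num_queries["sheetsConnection"] == 1
assert num_filter_queries["sheetsConnection"] == 2
assert num_paginated_queries["sheetsConnection"] == 10
```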
@@ -1463,10 +1598,10 @@ class TableauSiteSource:
         projects = {c.PROJECT_NAME_WITH_IN: project_names}

         for workbook in self.get_connection_objects(
-            workbook_graphql_query,
-            c.WORKBOOKS_CONNECTION,
-            projects,
+            query=workbook_graphql_query,
+            connection_type=c.WORKBOOKS_CONNECTION,
+            query_filter=projects,
+            page_size=self.config.effective_workbook_page_size,
         ):
             # This check is needed as we are using projectNameWithin which return project as per project name so if
             # user want to ingest only nested project C from A->B->C then tableau might return more than one Project
@@ -1921,9 +2056,10 @@ class TableauSiteSource:

         custom_sql_connection = list(
             self.get_connection_objects(
-                custom_sql_graphql_query,
-                c.CUSTOM_SQL_TABLE_CONNECTION,
-                custom_sql_filter,
+                query=custom_sql_graphql_query,
+                connection_type=c.CUSTOM_SQL_TABLE_CONNECTION,
+                query_filter=custom_sql_filter,
+                page_size=self.config.effective_custom_sql_table_page_size,
             )
         )

@@ -2174,8 +2310,7 @@ class TableauSiteSource:
             c.EMBEDDED_DATA_SOURCE,
         ):
             logger.debug(
-                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
-                f"unsupported"
+                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
             )
             return None

@@ -2357,9 +2492,9 @@ class TableauSiteSource:
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
-        in_tables_schemas: Dict[
+        in_tables_schemas: Dict[str, Set[str]] = (
+            transform_parsing_result_to_in_tables_schemas(parsing_result)
+        )

         if not in_tables_schemas:
             logger.info("Unable to extract table schema from parsing result")
@@ -2632,6 +2767,7 @@ class TableauSiteSource:
         self,
         datasource: dict,
         field_upstream_query: str,
+        page_size: int,
     ) -> dict:
         # Collect field ids to fetch field upstreams
         field_ids: List[str] = []
@@ -2642,9 +2778,10 @@ class TableauSiteSource:
         # Fetch field upstreams and arrange them in map
         field_vs_upstream: Dict[str, dict] = {}
         for field_upstream in self.get_connection_objects(
-            field_upstream_query,
-            c.FIELDS_CONNECTION,
-            {c.ID_WITH_IN: field_ids},
+            query=field_upstream_query,
+            connection_type=c.FIELDS_CONNECTION,
+            query_filter={c.ID_WITH_IN: field_ids},
+            page_size=page_size,
         ):
             if field_upstream.get(c.ID):
                 field_id = field_upstream[c.ID]
@@ -2667,13 +2804,15 @@ class TableauSiteSource:
         datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}

         for datasource in self.get_connection_objects(
-            published_datasource_graphql_query,
-            c.PUBLISHED_DATA_SOURCES_CONNECTION,
-            datasource_filter,
+            query=published_datasource_graphql_query,
+            connection_type=c.PUBLISHED_DATA_SOURCES_CONNECTION,
+            query_filter=datasource_filter,
+            page_size=self.config.effective_published_datasource_page_size,
         ):
             datasource = self.update_datasource_for_field_upstream(
                 datasource=datasource,
                 field_upstream_query=datasource_upstream_fields_graphql_query,
+                page_size=self.config.effective_published_datasource_field_upstream_page_size,
             )

             yield from self.emit_datasource(datasource)
@@ -2689,11 +2828,12 @@ class TableauSiteSource:
             c.ID_WITH_IN: list(tableau_database_table_id_to_urn_map.keys())
         }

-        #
+        # Emitting tables that came from Tableau metadata
         for tableau_table in self.get_connection_objects(
-            database_tables_graphql_query,
-            c.DATABASE_TABLES_CONNECTION,
-            tables_filter,
+            query=database_tables_graphql_query,
+            connection_type=c.DATABASE_TABLES_CONNECTION,
+            query_filter=tables_filter,
+            page_size=self.config.effective_database_table_page_size,
         ):
             database_table = self.database_tables[
                 tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
@@ -2882,9 +3022,10 @@ class TableauSiteSource:
         sheets_filter = {c.ID_WITH_IN: self.sheet_ids}

         for sheet in self.get_connection_objects(
-            sheet_graphql_query,
-            c.SHEETS_CONNECTION,
-            sheets_filter,
+            query=sheet_graphql_query,
+            connection_type=c.SHEETS_CONNECTION,
+            query_filter=sheets_filter,
+            page_size=self.config.effective_sheet_page_size,
         ):
             if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
                 yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
@@ -3202,9 +3343,10 @@ class TableauSiteSource:
         dashboards_filter = {c.ID_WITH_IN: self.dashboard_ids}

         for dashboard in self.get_connection_objects(
-            dashboard_graphql_query,
-            c.DASHBOARDS_CONNECTION,
-            dashboards_filter,
+            query=dashboard_graphql_query,
+            connection_type=c.DASHBOARDS_CONNECTION,
+            query_filter=dashboards_filter,
+            page_size=self.config.effective_dashboard_page_size,
         ):
             if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
                 yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
@@ -3349,13 +3491,15 @@ class TableauSiteSource:
         datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}

         for datasource in self.get_connection_objects(
-            embedded_datasource_graphql_query,
-            c.EMBEDDED_DATA_SOURCES_CONNECTION,
-            datasource_filter,
+            query=embedded_datasource_graphql_query,
+            connection_type=c.EMBEDDED_DATA_SOURCES_CONNECTION,
+            query_filter=datasource_filter,
+            page_size=self.config.effective_embedded_datasource_page_size,
         ):
             datasource = self.update_datasource_for_field_upstream(
                 datasource=datasource,
                 field_upstream_query=datasource_upstream_fields_graphql_query,
+                page_size=self.config.effective_embedded_datasource_field_upstream_page_size,
             )
             yield from self.emit_datasource(
                 datasource,
@@ -3414,25 +3558,25 @@ class TableauSiteSource:

         generated_project_keys.add(project_key.guid())

-        parent_project_key: Optional[
+        parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+            None  # It is going
+        )
         # to be used as a parent container key for the current tableau project

         if project_.parent_id is not None:
             # Go to the parent project as we need to generate container first for parent
             parent_project_key = self.gen_project_key(project_.parent_id)

-            parent_tableau_project: Optional[
+            parent_tableau_project: Optional[TableauProject] = (
+                self.tableau_project_registry.get(project_.parent_id)
+            )

             if (
                 parent_tableau_project is None
             ):  # It is not in project registry because of project_pattern
-                assert (
-                    project_.
-                )
+                assert project_.parent_name, (
+                    f"project {project_.name} should not be null"
+                )
                 parent_tableau_project = TableauProject(
                     id=project_.parent_id,
                     name=project_.parent_name,
@@ -3460,7 +3604,7 @@ class TableauSiteSource:
                 parent_container_key=parent_project_key,
             )

-        for
+        for project in self.tableau_project_registry.values():
             logger.debug(
                 f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
             )
@@ -3524,16 +3668,16 @@ class TableauSiteSource:
         if self.config.extract_usage_stats:
             with PerfTimer() as timer:
                 self._populate_usage_stat_registry()
-                self.report.extract_usage_stats_timer[
+                self.report.extract_usage_stats_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.config.permission_ingestion:
             with PerfTimer() as timer:
                 self._fetch_groups()
-                self.report.fetch_groups_timer[
+                self.report.fetch_groups_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         # Populate the map of database names and database hostnames to be used later to map
         # databases to platform instances.
@@ -3546,9 +3690,9 @@ class TableauSiteSource:

         with PerfTimer() as timer:
             self._populate_projects_registry()
-            self.report.populate_projects_registry_timer[
+            self.report.populate_projects_registry_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         if self.config.add_site_container:
             yield from self.emit_site_container()
@@ -3556,23 +3700,23 @@ class TableauSiteSource:

             with PerfTimer() as timer:
                 yield from self.emit_workbooks()
-                self.report.emit_workbooks_timer[
+                self.report.emit_workbooks_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.sheet_ids:
             with PerfTimer() as timer:
                 yield from self.emit_sheets()
-                self.report.emit_sheets_timer[
+                self.report.emit_sheets_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.dashboard_ids:
             with PerfTimer() as timer:
                 yield from self.emit_dashboards()
-                self.report.emit_dashboards_timer[
+                self.report.emit_dashboards_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.embedded_datasource_ids_being_used:
             with PerfTimer() as timer:
@@ -3598,6 +3742,6 @@ class TableauSiteSource:
         if self.database_tables:
             with PerfTimer() as timer:
                 yield from self.emit_upstream_tables()
-                self.report.emit_upstream_tables_timer[
+                self.report.emit_upstream_tables_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
datahub/ingestion/source/tableau/tableau_common.py

@@ -642,8 +642,11 @@ class TableauUpstreamReference:

     @classmethod
     def create(
-        cls, d:
+        cls, d: Dict, default_schema_map: Optional[Dict[str, str]] = None
     ) -> "TableauUpstreamReference":
+        if d is None:
+            raise ValueError("TableauUpstreamReference.create: d is None")
+
         # Values directly from `table` object from Tableau
         database_dict = (
             d.get(c.DATABASE) or {}
@@ -717,7 +720,7 @@ class TableauUpstreamReference:
         # schema

         # TODO: Validate the startswith check. Currently required for our integration tests
-        if full_name is None
+        if full_name is None:
             return None

         return full_name.replace("[", "").replace("]", "").split(".")
datahub/ingestion/source/unity/config.py

@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )

     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
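
The `discriminator="method"` argument makes `profiling` a pydantic discriminated union: the value of the `method` key decides which config class gets instantiated. A self-contained sketch of the same pattern, with made-up class names rather than the actual Unity Catalog configs:

```python
from typing import Literal, Union

from pydantic import BaseModel, Field

class GEProfilingStub(BaseModel):
    method: Literal["ge"] = "ge"

class AnalyzeProfilingStub(BaseModel):
    method: Literal["analyze"] = "analyze"

class StubSourceConfig(BaseModel):
    profiling: Union[GEProfilingStub, AnalyzeProfilingStub] = Field(
        default=GEProfilingStub(), discriminator="method"
    )

# The "method" key selects the union member during parsing.
cfg = StubSourceConfig.parse_obj({"profiling": {"method": "analyze"}})
assert isinstance(cfg.profiling, AnalyzeProfilingStub)
```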
datahub/ingestion/source/unity/proxy.py

@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None

datahub/ingestion/source/unity/source.py

@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )

         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()

datahub/ingestion/source/unity/usage.py

@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                 query, table_info
             )
             for source_table in table_info.source_tables:
-                with
+                with (
+                    self.report.usage_perf_report.aggregator_add_event_timer
+                ):
                     self.usage_aggregator.aggregate_event(
                         resource=source_table,
                         start_time=query.start_time,

datahub/ingestion/source/usage/clickhouse_usage.py

@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )

         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)

             resource = (
-                f
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                 f"{event.database}.{event.table}"
             )

datahub/ingestion/source/usage/starburst_trino_usage.py

@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )

         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)