acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc3__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/METADATA +2440 -2440
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/RECORD +33 -33
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         if not self._should_ingest_usage():
             return

-        self.report.
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            if self.report.edition == SnowflakeEdition.STANDARD.value:
+                logger.info(
+                    "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
+                )
+                return

-
+            logger.info("Checking usage date ranges")

-
+            self._check_usage_date_ranges()

-
-
-
-
-
-
+            # If permission error, execution returns from here
+            if (
+                self.report.min_access_history_time is None
+                or self.report.max_access_history_time is None
+            ):
+                return

-
-
+            # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
+            # Now, we report the usage as well as operation metadata even if user email is absent

-
-
-
-
-
-
-
-
-
-
-
-
-
+            if self.config.include_usage_stats:
+                yield from auto_empty_dataset_usage_statistics(
+                    self._get_workunits_internal(discovered_datasets),
+                    config=BaseTimeWindowConfig(
+                        start_time=self.start_time,
+                        end_time=self.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
+                    dataset_urns={
+                        self.identifiers.gen_dataset_urn(dataset_identifier)
+                        for dataset_identifier in discovered_datasets
+                    },
+                )

-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            if self.config.include_operational_stats:
+                # Generate the operation workunits.
+                access_events = self._get_snowflake_history()
+                for event in access_events:
+                    yield from self._get_operation_aspect_work_unit(
+                        event, discovered_datasets
+                    )

-
-
-
-
-
-
+            if self.redundant_run_skip_handler:
+                # Update the checkpoint state for this run.
+                self.redundant_run_skip_handler.update_state(
+                    self.config.start_time,
+                    self.config.end_time,
+                    self.config.bucket_duration,
                 )

-        if self.redundant_run_skip_handler:
-            # Update the checkpoint state for this run.
-            self.redundant_run_skip_handler.update_state(
-                self.config.start_time,
-                self.config.end_time,
-                self.config.bucket_duration,
-            )
-
     def _get_workunits_internal(
         self, discovered_datasets: List[str]
     ) -> Iterable[MetadataWorkUnit]:

@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             )
             self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
             return
-        self.report.access_history_query_secs =
+        self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)

         for row in results:
             yield from self._process_snowflake_history_row(row)

@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
                 tz=timezone.utc
             )
-            self.report.access_history_range_query_secs =
-
+            self.report.access_history_range_query_secs = timer.elapsed_seconds(
+                digits=2
             )

     def _get_operation_aspect_work_unit(
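
The two hunks above switch the report timings to PerfTimer.elapsed_seconds(digits=2), the rounding keyword added in datahub/utilities/perf_timer.py in this release (see the file list). A minimal sketch of the pattern, assuming only the timer API shown in this diff; the report class and the timed work below are illustrative placeholders, not code from the package:

import time

from datahub.utilities.perf_timer import PerfTimer


class DemoReport:
    # Illustrative stand-in for a source report with a float timing field.
    access_history_query_secs: float = 0.0


report = DemoReport()
with PerfTimer() as timer:
    time.sleep(0.1)  # placeholder for the timed work, e.g. running the access-history query
    # digits=2 rounds the measured wall-clock seconds to two decimal places.
    report.access_history_query_secs = timer.elapsed_seconds(digits=2)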

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -480,8 +480,8 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
         )

-        self.report.
-
+        with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+            yield from schema_extractor.get_workunits_internal()

         databases = schema_extractor.databases

@@ -513,47 +513,46 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views

         if self.config.use_queries_v2:
-            self.report.
-
-
-            self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
-            schema_resolver = self.aggregator._schema_resolver
-
-            queries_extractor = SnowflakeQueriesExtractor(
-                connection=self.connection,
-                config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
-                    temporary_tables_pattern=self.config.temporary_tables_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_stats,
-                    include_operations=self.config.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    user_email_pattern=self.config.user_email_pattern,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
-                graph=self.ctx.graph,
-            )
+            with self.report.new_stage(f"*: {VIEW_PARSING}"):
+                yield from auto_workunit(self.aggregator.gen_metadata())

-
-
-
-
-
-
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                schema_resolver = self.aggregator._schema_resolver
+
+                queries_extractor = SnowflakeQueriesExtractor(
+                    connection=self.connection,
+                    config=SnowflakeQueriesExtractorConfig(
+                        window=self.config,
+                        temporary_tables_pattern=self.config.temporary_tables_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_stats,
+                        include_operations=self.config.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        user_email_pattern=self.config.user_email_pattern,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=schema_resolver,
+                    discovered_tables=discovered_datasets,
+                    graph=self.ctx.graph,
+                )
+
+                # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+                # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+                # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
+                queries_extractor.close()

         else:
             if self.lineage_extractor:
-                self.report.
-
-
-
-
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.lineage_extractor.add_time_based_lineage_to_aggregator(
+                        discovered_tables=discovered_tables,
+                        discovered_views=discovered_views,
+                    )

             # This would emit view and external table ddl lineage
             # as well as query lineage via lineage_extractor

datahub/ingestion/source/sql/teradata.py
@@ -878,7 +878,7 @@ ORDER by DataBaseName, TableName;

         urns = self.schema_resolver.get_urns()
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            self.report.
-
+            with self.report.new_stage("Audit log extraction"):
+                yield from self.get_audit_log_mcps(urns=urns)

         yield from self.builder.gen_workunits()

datahub/ingestion/source/tableau/tableau.py
@@ -118,6 +118,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,

@@ -170,6 +171,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn

 try:

@@ -643,12 +646,41 @@ class SiteIdContentUrl:


 @dataclass
-class TableauSourceReport(
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
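
The timer fields added to TableauSourceReport above are per-site dictionaries built with dataclass_field(default_factory=TopKDict), using the TopKDict import added in the earlier hunk. A minimal sketch of the same pattern on a toy report; the class and field names here are illustrative, not part of the diff:

from dataclasses import dataclass, field as dataclass_field
from typing import Dict

from datahub.utilities.stats_collections import TopKDict


@dataclass
class DemoTimerReport:
    # Maps a site content URL to the seconds spent in that phase. TopKDict is a
    # bounded dict used throughout DataHub reports so that large maps stay small.
    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)


report = DemoTimerReport()
report.emit_workbooks_timer["my-site"] = 12.34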

@@ -660,6 +692,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
     logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
     last_authenticated_at: Optional[datetime] = None

     num_expected_tableau_metadata_queries: int = 0

@@ -834,6 +867,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 platform=self.platform,
             )
             yield from site_source.ingest_tableau_site()
+
         except MetadataQueryException as md_exception:
             self.report.failure(
                 title="Failed to Retrieve Tableau Metadata",

@@ -3489,33 +3523,87 @@ class TableauSiteSource:
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None

     def ingest_tableau_site(self):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(
+            f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+        ):
+            # Initialise the dictionary to later look-up for chart and dashboard stat
+            if self.config.extract_usage_stats:
+                with PerfTimer() as timer:
+                    self._populate_usage_stat_registry()
+                    self.report.extract_usage_stats_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.permission_ingestion:
+                with PerfTimer() as timer:
+                    self._fetch_groups()
+                    self.report.fetch_groups_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            # Populate the map of database names and database hostnames to be used later to map
+            # databases to platform instances.
+            if self.config.database_hostname_to_platform_instance_map:
+                with PerfTimer() as timer:
+                    self._populate_database_server_hostname_map()
+                    self.report.populate_database_server_hostname_map_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            with PerfTimer() as timer:
+                self._populate_projects_registry()
+                self.report.populate_projects_registry_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.add_site_container:
+                yield from self.emit_site_container()
+            yield from self.emit_project_containers()
+
+            with PerfTimer() as timer:
+                yield from self.emit_workbooks()
+                self.report.emit_workbooks_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.sheet_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_sheets()
+                    self.report.emit_sheets_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.dashboard_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_dashboards()
+                    self.report.emit_dashboards_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.embedded_datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_embedded_datasources()
+                    self.report.emit_embedded_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_published_datasources()
+                    self.report.emit_published_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.custom_sql_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_custom_sql_datasources()
+                    self.report.emit_custom_sql_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.database_tables:
+                with PerfTimer() as timer:
+                    yield from self.emit_upstream_tables()
+                    self.report.emit_upstream_tables_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)

datahub/ingestion/source/unity/source.py
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()

         if self.config.include_ownership:
-            self.report.
-
-
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.
-
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()

         yield from self.process_metastores()

         yield from self.get_view_lineage()

         if self.config.include_notebooks:
-            self.report.
-
-
-
-
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu

         if self.config.include_usage_statistics:
-            self.report.
-
-
-
-
-
-
-
-
-
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")

     def build_service_principal_map(self) -> None:
         try:

@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue

-            self.report.
-
-
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)

-
+            self.report.schemas.processed(schema.id)

     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):

datahub/ingestion/source_report/ingestion_stage.py
@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional

 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict

@@ -22,25 +22,29 @@ PROFILING = "Profiling"

 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)

-
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None