acryl-datahub 0.15.0.2rc1__py3-none-any.whl → 0.15.0.2rc3__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/METADATA +2469 -2459
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/RECORD +34 -34
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/utilities/file_backed_collections.py +1 -1
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc1.dist-info → acryl_datahub-0.15.0.2rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/usage.py
@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                for query_hash, count in entry.query_freq
-            ]
-            yield make_usage_workunit(
-                bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                resource=BigQueryTableRef.from_string_name(entry.resource),
-                query_count=entry.query_count,
-                query_freq=query_freq,
-                user_freq=entry.user_freq,
-                column_freq=entry.column_freq,
-                bucket_duration=self.config.bucket_duration,
-                resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                top_n_queries=self.config.usage.top_n_queries,
-                format_sql_queries=self.config.usage.format_sql_queries,
-                queries_character_limit=self.config.usage.queries_character_limit,
-            )
-            self.report.num_usage_workunits_emitted += 1
-        except Exception as e:
-            self.report.warning(
-                message="Unable to generate usage statistics workunit",
-                context=f"{entry.timestamp}, {entry.resource}",
-                exc=e,
-            )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                        project_id
-                    )
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                     self.report_status(f"usage-extraction-{project_id}", False)
 
-            self.report.usage_extraction_sec[project_id] =
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )
 
     def _store_usage_event(
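The recurring change in this release replaces paired report_ingestion_stage_start(...) calls with a "with self.report.new_stage(...)" block, so a stage is closed and timed automatically when its block exits. The real implementation lives in datahub/ingestion/source_report/ingestion_stage.py and is not shown in this diff; the snippet below is only a minimal sketch of how such a context-manager-based stage reporter could work, using an invented class name (StageTimingReport) and invented field names.

import contextlib
import time
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageTimingReport:
    # Hypothetical sketch, not the DataHub implementation: records how long
    # each named stage took, keyed by stage name.
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextlib.contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # The stage is closed when the with-block exits, so callers no
            # longer need to start a sentinel "End" stage to stop the clock.
            self.ingestion_stage_durations[stage] = round(
                time.perf_counter() - start, 2
            )


report = StageTimingReport()
with report.new_stage("proj-1: Usage Extraction Ingestion"):
    pass  # run the extraction work for this stage here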
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
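The Cassandra profiler above fans per-table profiling out to a thread pool and yields workunits as futures complete. The following is a generic, self-contained illustration of that executor pattern using only the standard library; profile_table and profile_keyspace are invented stand-ins, not DataHub code.

from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from typing import Dict, Iterable, List


def profile_table(keyspace: str, table: str) -> List[str]:
    # Stand-in for the real per-table profiling work.
    return [f"profile:{keyspace}.{table}"]


def profile_keyspace(keyspace: str, tables: List[str]) -> Iterable[str]:
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future back to its table name so failures can be attributed.
        future_to_table: Dict[Future, str] = {
            executor.submit(profile_table, keyspace, t): t for t in tables
        }
        for future in as_completed(future_to_table):
            table = future_to_table[future]
            try:
                # Yield results in completion order, not submission order.
                yield from future.result()
            except Exception as exc:
                print(f"profiling failed for {keyspace}.{table}: {exc}")


if __name__ == "__main__":
    for workunit in profile_keyspace("ks1", ["users", "orders"]):
        print(workunit)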
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
|
|
|
54
54
|
else:
|
|
55
55
|
raise KeyError(f"Unknown entity {ent_type}.")
|
|
56
56
|
|
|
57
|
-
def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
|
|
58
|
-
self.report_ingestion_stage_start(f"{keyspace}: {stage}")
|
|
59
|
-
|
|
60
57
|
# TODO Need to create seperate common config for profiling report
|
|
61
58
|
profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
|
|
62
59
|
profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
|
|
datahub/ingestion/source/datahub/config.py
@@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
datahub/ingestion/source/datahub/datahub_source.py
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
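The new drop_duplicate_schema_fields option wires auto_fix_duplicate_schema_field_paths into the workunit-processor list only when the flag is set; otherwise a None entry stands in for "no extra processor". The helper's exact behavior is not part of this diff, so the sketch below is a rough, hypothetical illustration of what dropping duplicate schema fields by fieldPath could look like; drop_duplicate_field_paths is an invented name.

from typing import List, Set

from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass


def drop_duplicate_field_paths(
    schema_metadata: SchemaMetadataClass,
) -> SchemaMetadataClass:
    # Hypothetical sketch: keep the first occurrence of each fieldPath and
    # drop later duplicates, mirroring what server-side duplicate checking
    # would otherwise reject.
    seen: Set[str] = set()
    deduped: List[SchemaFieldClass] = []
    for schema_field in schema_metadata.fields:
        if schema_field.fieldPath not in seen:
            seen.add(schema_field.fieldPath)
            deduped.append(schema_field)
    schema_metadata.fields = deduped
    return schema_metadata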
datahub/ingestion/source/dremio/dremio_source.py
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/gc/datahub_gc.py
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
datahub/ingestion/source/looker/looker_config.py
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
         "For example: \n"
         "deny: \n"
-        " -
-        "This pattern will deny the ingestion of all dashboards and looks within the
+        " - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        " - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
         "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
     )
 
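The expanded folder_path_pattern docstring describes fully qualified Looker folder paths. Assuming the field uses DataHub's standard regex-based AllowDenyPattern (as its type annotation suggests), a programmatic example might look like the following; the specific allow/deny values are illustrative only.

from datahub.configuration.common import AllowDenyPattern

# Paths are the Looker folder hierarchy joined with slashes, e.g.
# "Shared/Customer Reports/Sales".
folder_path_pattern = AllowDenyPattern(
    allow=["Shared/sales.*"],
    deny=["Shared/deprecated.*"],
)

# Only dashboards under Shared/sales pass; Shared/deprecated is rejected.
assert folder_path_pattern.allowed("Shared/sales/EMEA")
assert not folder_path_pattern.allowed("Shared/deprecated/old_dashboard")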
datahub/ingestion/source/redshift/redshift.py
@@ -423,10 +423,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})
 
         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +462,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                     self.process_schemas(connection, database)
                 )
 
-                self.report.
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage_v2(
+                        connection=connection,
+                        database=database,
+                        lineage_extractor=lineage_extractor,
+                    )
 
                 all_tables = self.get_all_tables()
             else:
@@ -480,25 +480,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 or self.config.include_view_lineage
                 or self.config.include_copy_lineage
             ):
-                self.report.
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage(
+                        connection=connection, all_tables=all_tables, database=database
+                    )
 
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
         if self.config.include_usage_statistics:
+            with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                yield from self.extract_usage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
         if self.config.is_profiling_enabled():
-            self.report.
+            with self.report.new_stage(PROFILING):
+                profiler = RedshiftProfiler(
+                    config=self.config,
+                    report=self.report,
+                    state_handler=self.profiling_state_handler,
+                )
+                yield from profiler.get_workunits(self.db_tables)
 
     def process_schemas(self, connection, database):
         for schema in self.data_dictionary.get_schemas(
@@ -633,8 +633,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 logger.info("View processing disabled, skipping")
 
-            self.report.metadata_extraction_sec[report_key] =
+            self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+                digits=2
             )
 
     def _process_table(
@@ -986,9 +986,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
             yield from usage_extractor.get_usage_workunits(all_tables=all_tables)
 
-            self.report.usage_extraction_sec[database] =
-                timer.elapsed_seconds(), 2
-            )
+            self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
     def extract_lineage(
         self,
@@ -1011,8 +1009,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             database=database, connection=connection, all_tables=all_tables
         )
 
-        self.report.lineage_extraction_sec[f"{database}"] =
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
        )
        yield from self.generate_lineage(
            database, lineage_extractor=lineage_extractor
@@ -1042,8 +1040,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         yield from lineage_extractor.generate()
 
-        self.report.lineage_extraction_sec[f"{database}"] =
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
         )
 
         if self.redundant_lineage_run_skip_handler:
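Several call sites above also switch from wrapping timer.elapsed_seconds() in a separate round(..., 2) to passing timer.elapsed_seconds(digits=2). The updated PerfTimer lives in datahub/utilities/perf_timer.py and is not shown in this diff; the class below is a minimal, hypothetical stand-in (SimplePerfTimer) written on the assumption that digits simply controls rounding of the elapsed time.

import time
from typing import Optional


class SimplePerfTimer:
    # Hypothetical sketch, not DataHub's PerfTimer: a context manager that
    # measures wall-clock time and rounds the result to `digits` places.
    def __init__(self) -> None:
        self._start: Optional[float] = None
        self._end: Optional[float] = None

    def __enter__(self) -> "SimplePerfTimer":
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc_info: object) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self, digits: int = 1) -> float:
        assert self._start is not None, "timer was never started"
        # If still running, report time elapsed so far.
        end = self._end if self._end is not None else time.perf_counter()
        return round(end - self._start, digits)


with SimplePerfTimer() as timer:
    time.sleep(0.05)
print(timer.elapsed_seconds(digits=2))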
datahub/ingestion/source/redshift/usage.py
@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            self.report.
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
-        self.report.
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[
+                RedshiftAccessEvent
+            ] = self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
 
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu
 
     def _gen_operation_aspect_workunits(
         self,
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -166,6 +166,3 @@ class SnowflakeV2Report(
 
     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")
datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -216,21 +216,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         try:
             for snowflake_db in self.databases:
-                self.report.
+                with self.report.new_stage(
+                    f"{snowflake_db.name}: {METADATA_EXTRACTION}"
+                ):
+                    yield from self._process_database(snowflake_db)
 
-            self.report.
+            with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
+                discovered_tables: List[str] = [
+                    self.identifiers.get_dataset_identifier(
+                        table_name, schema.name, db.name
+                    )
+                    for db in self.databases
+                    for schema in db.schemas
+                    for table_name in schema.tables
+                ]
+                if self.aggregator:
+                    for entry in self._external_tables_ddl_lineage(discovered_tables):
+                        self.aggregator.add(entry)
 
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
@@ -332,8 +334,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         yield from self._process_db_schemas(snowflake_db, db_tables)
 
         if self.profiler and db_tables:
-            self.report.
+            with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(snowflake_db, db_tables)
 
     def _process_db_schemas(
         self,