acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/graph/client.py +4 -2
- datahub/ingestion/source/aws/glue.py +14 -1
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/s3/source.py +10 -14
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/metadata/_schema_classes.py +2 -2
- datahub/metadata/_urns/urn_defs.py +15 -15
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/redshift.py
@@ -423,10 +423,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.report_ingestion_stage_start(METADATA_EXTRACTION)
-        self.db_tables[database] = defaultdict()
-        self.db_views[database] = defaultdict()
-        self.db_schemas.setdefault(database, {})
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})
 
         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +462,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 self.process_schemas(connection, database)
             )
 
-            self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-            yield from self.extract_lineage_v2(
-                connection=connection,
-                database=database,
-                lineage_extractor=lineage_extractor,
-            )
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage_v2(
+                    connection=connection,
+                    database=database,
+                    lineage_extractor=lineage_extractor,
+                )
 
             all_tables = self.get_all_tables()
         else:
@@ -480,25 +480,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             or self.config.include_view_lineage
             or self.config.include_copy_lineage
         ):
-            self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-            yield from self.extract_lineage(
-                connection=connection, all_tables=all_tables, database=database
-            )
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
         if self.config.include_usage_statistics:
-            yield from self.extract_usage(
-                connection=connection, all_tables=all_tables, database=database
-            )
+            with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                yield from self.extract_usage(
+                    connection=connection, all_tables=all_tables, database=database
+                )
 
         if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start(PROFILING)
-            profiler = RedshiftProfiler(
-                config=self.config,
-                report=self.report,
-                state_handler=self.profiling_state_handler,
-            )
-            yield from profiler.get_workunits(self.db_tables)
+            with self.report.new_stage(PROFILING):
+                profiler = RedshiftProfiler(
+                    config=self.config,
+                    report=self.report,
+                    state_handler=self.profiling_state_handler,
+                )
+                yield from profiler.get_workunits(self.db_tables)
 
     def process_schemas(self, connection, database):
         for schema in self.data_dictionary.get_schemas(
@@ -633,8 +633,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         else:
             logger.info("View processing disabled, skipping")
 
-        self.report.metadata_extraction_sec[report_key] = round(
-            timer.elapsed_seconds(), 2
+        self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+            digits=2
         )
 
     def _process_table(
@@ -986,9 +986,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
             yield from usage_extractor.get_usage_workunits(all_tables=all_tables)
 
-            self.report.usage_extraction_sec[database] = round(
-                timer.elapsed_seconds(), 2
-            )
+            self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
     def extract_lineage(
         self,
@@ -1011,8 +1009,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 database=database, connection=connection, all_tables=all_tables
             )
 
-            self.report.lineage_extraction_sec[f"{database}"] = round(
-                timer.elapsed_seconds(), 2
+            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+                digits=2
             )
             yield from self.generate_lineage(
                 database, lineage_extractor=lineage_extractor
@@ -1042,8 +1040,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         yield from lineage_extractor.generate()
 
-        self.report.lineage_extraction_sec[f"{database}"] = round(
-            timer.elapsed_seconds(), 2
+        self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+            digits=2
         )
 
         if self.redundant_lineage_run_skip_handler:
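Every redshift.py hunk above follows the same pattern: a fire-and-forget self.report.report_ingestion_stage_start(STAGE) call becomes a with self.report.new_stage(STAGE): block, so a stage is opened and closed (and can be timed) by scope instead of by the next stage starting. The sketch below only illustrates that pattern and is not the shipped implementation; the real context manager lives in datahub/ingestion/source_report/ingestion_stage.py (changed +24 -20 in the file list), and the ingestion_stage_durations field name here is invented for the example.

import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageReport:
    # Hypothetical field; stands in for whatever the real report tracks per stage.
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        # Open the stage, hand control to the caller's block, and record the
        # elapsed time when the block exits, even if it raises.
        start = time.perf_counter()
        try:
            yield
        finally:
            self.ingestion_stage_durations[stage] = round(time.perf_counter() - start, 2)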
datahub/ingestion/source/redshift/usage.py
@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS)
-            with PerfTimer() as timer:
-                # Generate operation aspect workunits
-                yield from self._gen_operation_aspect_workunits(
-                    self.connection, all_tables
-                )
-                self.report.operational_metadata_extraction_sec[
-                    self.config.database
-                ] = round(timer.elapsed_seconds(), 2)
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION)
-        query: str = self.queries.usage_query(
-            start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            database=self.config.database,
-        )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection=self.connection, all_tables=all_tables
-        )
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[
+                RedshiftAccessEvent
+            ] = self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
 
-        aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
-            access_events_iterable
-        )
-        # Generate usage workunits from aggregated events.
-        for time_bucket in aggregated_events.values():
-            for aggregate in time_bucket.values():
-                wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
-                self.report.num_usage_workunits_emitted += 1
-                yield wu
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu
 
     def _gen_operation_aspect_workunits(
         self,
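Alongside the stage changes, the timing call sites in this file and in redshift.py move from round(timer.elapsed_seconds(), 2) to timer.elapsed_seconds(digits=2), matching the updated datahub/utilities/perf_timer.py (+11 -6 in the file list). Below is a rough sketch of a timer with that signature, for orientation only; the real class in datahub/utilities/perf_timer.py is more involved than this.

import time
from typing import Optional


class PerfTimer:
    # Minimal illustration of a context-manager timer whose elapsed_seconds()
    # accepts an optional digits argument for rounding.
    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None

    def __enter__(self) -> "PerfTimer":
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.end_time = time.perf_counter()

    def elapsed_seconds(self, digits: Optional[int] = None) -> float:
        # Usable both while the timer is still running and after the block exits.
        assert self.start_time is not None
        end = self.end_time if self.end_time is not None else time.perf_counter()
        elapsed = end - self.start_time
        return round(elapsed, digits) if digits is not None else elapsed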
datahub/ingestion/source/s3/source.py
@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket:
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """
@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.
 
         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-        for key, group in grouped_files:
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None
@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,
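get_folder_info now lists the objects once and delegates grouping to the new group_s3_objects_by_dirname helper imported from datahub/ingestion/source/aws/s3_util.py (+24 -1 in the file list), instead of sorting and running itertools.groupby on the key prefix inline. Here is a hedged sketch of what such a helper plausibly does; the HasKey protocol and the behaviour for keys without a "/" are assumptions, not the shipped code.

from collections import defaultdict
from typing import Dict, Iterable, List, Protocol


class HasKey(Protocol):
    key: str


def group_s3_objects_by_dirname(s3_objects: Iterable[HasKey]) -> Dict[str, List[HasKey]]:
    # Bucket each object under the directory part of its key, e.g.
    # "a/b/c.parquet" -> "a/b". Unlike itertools.groupby, no pre-sorting is needed.
    grouped: Dict[str, List[HasKey]] = defaultdict(list)
    for obj in s3_objects:
        dirname = obj.key.rsplit("/", 1)[0] if "/" in obj.key else obj.key
        grouped[dirname].append(obj)
    return grouped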
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -166,6 +166,3 @@ class SnowflakeV2Report(
 
     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")
datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -216,21 +216,23 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         try:
             for snowflake_db in self.databases:
-                self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
-                yield from self._process_database(snowflake_db)
+                with self.report.new_stage(
+                    f"{snowflake_db.name}: {METADATA_EXTRACTION}"
+                ):
+                    yield from self._process_database(snowflake_db)
 
-            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
-            discovered_tables: List[str] = [
-                self.identifiers.get_dataset_identifier(
-                    table_name, schema.name, db.name
-                )
-                for db in self.databases
-                for schema in db.schemas
-                for table_name in schema.tables
-            ]
-            if self.aggregator:
-                for entry in self._external_tables_ddl_lineage(discovered_tables):
-                    self.aggregator.add(entry)
+            with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
+                discovered_tables: List[str] = [
+                    self.identifiers.get_dataset_identifier(
+                        table_name, schema.name, db.name
+                    )
+                    for db in self.databases
+                    for schema in db.schemas
+                    for table_name in schema.tables
+                ]
+                if self.aggregator:
+                    for entry in self._external_tables_ddl_lineage(discovered_tables):
+                        self.aggregator.add(entry)
 
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
@@ -332,8 +334,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         yield from self._process_db_schemas(snowflake_db, db_tables)
 
         if self.profiler and db_tables:
-            self.report.set_ingestion_stage(snowflake_db.name, PROFILING)
-            yield from self.profiler.get_workunits(snowflake_db, db_tables)
+            with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(snowflake_db, db_tables)
 
     def _process_db_schemas(
         self,
datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -146,59 +146,58 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         if not self._should_ingest_usage():
             return
 
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
-        if self.report.edition == SnowflakeEdition.STANDARD.value:
-            logger.info(
-                "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
-            )
-            return
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            if self.report.edition == SnowflakeEdition.STANDARD.value:
+                logger.info(
+                    "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
+                )
+                return
 
-        logger.info("Checking usage date ranges")
+            logger.info("Checking usage date ranges")
 
-        self._check_usage_date_ranges()
+            self._check_usage_date_ranges()
 
-        # If permission error, execution returns from here
-        if (
-            self.report.min_access_history_time is None
-            or self.report.max_access_history_time is None
-        ):
-            return
+            # If permission error, execution returns from here
+            if (
+                self.report.min_access_history_time is None
+                or self.report.max_access_history_time is None
+            ):
+                return
 
-        # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
-        # Now, we report the usage as well as operation metadata even if user email is absent
+            # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
+            # Now, we report the usage as well as operation metadata even if user email is absent
 
-        if self.config.include_usage_stats:
-            yield from auto_empty_dataset_usage_statistics(
-                self._get_workunits_internal(discovered_datasets),
-                config=BaseTimeWindowConfig(
-                    start_time=self.start_time,
-                    end_time=self.end_time,
-                    bucket_duration=self.config.bucket_duration,
-                ),
-                dataset_urns={
-                    self.identifiers.gen_dataset_urn(dataset_identifier)
-                    for dataset_identifier in discovered_datasets
-                },
-            )
+            if self.config.include_usage_stats:
+                yield from auto_empty_dataset_usage_statistics(
+                    self._get_workunits_internal(discovered_datasets),
+                    config=BaseTimeWindowConfig(
+                        start_time=self.start_time,
+                        end_time=self.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
+                    dataset_urns={
+                        self.identifiers.gen_dataset_urn(dataset_identifier)
+                        for dataset_identifier in discovered_datasets
+                    },
+                )
 
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            if self.config.include_operational_stats:
+                # Generate the operation workunits.
+                access_events = self._get_snowflake_history()
+                for event in access_events:
+                    yield from self._get_operation_aspect_work_unit(
+                        event, discovered_datasets
+                    )
 
-        if self.config.include_operational_stats:
-            # Generate the operation workunits.
-            access_events = self._get_snowflake_history()
-            for event in access_events:
-                yield from self._get_operation_aspect_work_unit(
-                    event, discovered_datasets
+            if self.redundant_run_skip_handler:
+                # Update the checkpoint state for this run.
+                self.redundant_run_skip_handler.update_state(
+                    self.config.start_time,
+                    self.config.end_time,
+                    self.config.bucket_duration,
                 )
 
-        if self.redundant_run_skip_handler:
-            # Update the checkpoint state for this run.
-            self.redundant_run_skip_handler.update_state(
-                self.config.start_time,
-                self.config.end_time,
-                self.config.bucket_duration,
-            )
-
     def _get_workunits_internal(
         self, discovered_datasets: List[str]
     ) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +385,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             )
             self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
             return
-        self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
+        self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
 
         for row in results:
             yield from self._process_snowflake_history_row(row)
@@ -434,8 +433,8 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
                 tz=timezone.utc
             )
-            self.report.access_history_range_query_secs = round(
-                timer.elapsed_seconds(), 2
+            self.report.access_history_range_query_secs = timer.elapsed_seconds(
+                digits=2
             )
 
     def _get_operation_aspect_work_unit(
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -480,8 +480,8 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
         )
 
-        self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
-        yield from schema_extractor.get_workunits_internal()
+        with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+            yield from schema_extractor.get_workunits_internal()
 
         databases = schema_extractor.databases
 
@@ -513,47 +513,46 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views
 
         if self.config.use_queries_v2:
-            self.report.set_ingestion_stage("*", VIEW_PARSING)
-            yield from auto_workunit(self.aggregator.gen_metadata())
-
-            self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
-            schema_resolver = self.aggregator._schema_resolver
-
-            queries_extractor = SnowflakeQueriesExtractor(
-                connection=self.connection,
-                config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
-                    temporary_tables_pattern=self.config.temporary_tables_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_stats,
-                    include_operations=self.config.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    user_email_pattern=self.config.user_email_pattern,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
-                graph=self.ctx.graph,
-            )
+            with self.report.new_stage(f"*: {VIEW_PARSING}"):
+                yield from auto_workunit(self.aggregator.gen_metadata())
 
-            # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
-            # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
-            # it should be pretty straightforward to refactor this and only initialize the aggregator once.
-            self.report.queries_extractor = queries_extractor.report
-            yield from queries_extractor.get_workunits_internal()
-            queries_extractor.close()
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                schema_resolver = self.aggregator._schema_resolver
+
+                queries_extractor = SnowflakeQueriesExtractor(
+                    connection=self.connection,
+                    config=SnowflakeQueriesExtractorConfig(
+                        window=self.config,
+                        temporary_tables_pattern=self.config.temporary_tables_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_stats,
+                        include_operations=self.config.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        user_email_pattern=self.config.user_email_pattern,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=schema_resolver,
+                    discovered_tables=discovered_datasets,
+                    graph=self.ctx.graph,
+                )
+
+                # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+                # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+                # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
+                queries_extractor.close()
 
         else:
             if self.lineage_extractor:
-                self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
-                self.lineage_extractor.add_time_based_lineage_to_aggregator(
-                    discovered_tables=discovered_tables,
-                    discovered_views=discovered_views,
-                )
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.lineage_extractor.add_time_based_lineage_to_aggregator(
+                        discovered_tables=discovered_tables,
+                        discovered_views=discovered_views,
+                    )
 
             # This would emit view and external table ddl lineage
             # as well as query lineage via lineage_extractor
datahub/ingestion/source/sql/teradata.py
@@ -878,7 +878,7 @@ ORDER by DataBaseName, TableName;
 
         urns = self.schema_resolver.get_urns()
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Audit log extraction")
-            yield from self.get_audit_log_mcps(urns=urns)
+            with self.report.new_stage("Audit log extraction"):
+                yield from self.get_audit_log_mcps(urns=urns)
 
         yield from self.builder.gen_workunits()