acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/graph/client.py +4 -2
- datahub/ingestion/source/aws/glue.py +14 -1
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/s3/source.py +10 -14
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/metadata/_schema_classes.py +2 -2
- datahub/metadata/_urns/urn_defs.py +15 -15
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -118,6 +118,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -170,6 +171,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 try:
@@ -643,12 +646,41 @@ class SiteIdContentUrl:
 
 
 @dataclass
-class TableauSourceReport(StaleEntityRemovalSourceReport):
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
@@ -660,6 +692,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
     logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
     last_authenticated_at: Optional[datetime] = None
 
     num_expected_tableau_metadata_queries: int = 0
@@ -834,6 +867,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 platform=self.platform,
             )
             yield from site_source.ingest_tableau_site()
+
         except MetadataQueryException as md_exception:
             self.report.failure(
                 title="Failed to Retrieve Tableau Metadata",
@@ -3489,33 +3523,87 @@ class TableauSiteSource:
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None
 
     def ingest_tableau_site(self):
 [… 30 removed lines not rendered in this view]
+        with self.report.new_stage(
+            f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+        ):
+            # Initialise the dictionary to later look-up for chart and dashboard stat
+            if self.config.extract_usage_stats:
+                with PerfTimer() as timer:
+                    self._populate_usage_stat_registry()
+                    self.report.extract_usage_stats_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.permission_ingestion:
+                with PerfTimer() as timer:
+                    self._fetch_groups()
+                    self.report.fetch_groups_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            # Populate the map of database names and database hostnames to be used later to map
+            # databases to platform instances.
+            if self.config.database_hostname_to_platform_instance_map:
+                with PerfTimer() as timer:
+                    self._populate_database_server_hostname_map()
+                    self.report.populate_database_server_hostname_map_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            with PerfTimer() as timer:
+                self._populate_projects_registry()
+                self.report.populate_projects_registry_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.add_site_container:
+                yield from self.emit_site_container()
+            yield from self.emit_project_containers()
+
+            with PerfTimer() as timer:
+                yield from self.emit_workbooks()
+                self.report.emit_workbooks_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.sheet_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_sheets()
+                    self.report.emit_sheets_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.dashboard_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_dashboards()
+                    self.report.emit_dashboards_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.embedded_datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_embedded_datasources()
+                    self.report.emit_embedded_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_published_datasources()
+                    self.report.emit_published_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.custom_sql_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_custom_sql_datasources()
+                    self.report.emit_custom_sql_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.database_tables:
+                with PerfTimer() as timer:
+                    yield from self.emit_upstream_tables()
+                    self.report.emit_upstream_tables_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
datahub/ingestion/source/unity/source.py
CHANGED

@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.
 [… 14 removed lines not rendered in this view]
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()
 
         if self.config.include_ownership:
-            self.report.
 [… 2 removed lines not rendered in this view]
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.
 [… 1 removed line not rendered in this view]
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()
 
         yield from self.process_metastores()
 
         yield from self.get_view_lineage()
 
         if self.config.include_notebooks:
-            self.report.
 [… 4 removed lines not rendered in this view]
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu
 
         if self.config.include_usage_statistics:
-            self.report.
 [… 9 removed lines not rendered in this view]
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
 
 [… 16 removed lines not rendered in this view]
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")
 
     def build_service_principal_map(self) -> None:
         try:
@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             self.report.schemas.dropped(schema.id)
             continue
 
-            self.report.
 [… 2 removed lines not rendered in this view]
+        with self.report.new_stage(f"Ingest schema {schema.id}"):
+            yield from self.gen_schema_containers(schema)
+            yield from self.process_tables(schema)
 
 [… 1 removed line not rendered in this view]
+        self.report.schemas.processed(schema.id)
 
     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):
datahub/ingestion/source_report/ingestion_stage.py
CHANGED

@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@ PROFILING = "Profiling"
 
 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
 
 [… 13 removed lines not rendered in this view]
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None
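The removed report_ingestion_stage_start pattern is replaced by an ordinary context manager. A minimal standalone sketch of the new API follows; in the sources above the report class is mixed into the full source report, and the sleep is only a stand-in for real work:

import time

from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

report = IngestionStageReport()

with report.new_stage("Profiling"):
    time.sleep(0.1)  # placeholder for the work done inside the stage

# On exit the stage duration (seconds, rounded to 2 digits) is recorded under a
# key of the form "<stage name> at <UTC start timestamp>".
print(report.ingestion_stage_durations)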
datahub/metadata/_schema_classes.py
CHANGED

@@ -14262,7 +14262,7 @@ class DataFlowKeyClass(_Aspect):
 
 
     ASPECT_NAME = 'dataFlowKey'
-    ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
+    ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataFlowKey")
 
     def __init__(self,
@@ -14678,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
 
 
     ASPECT_NAME = 'dataJobKey'
-    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
+    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
 
     def __init__(self,
datahub/metadata/_urns/urn_defs.py
CHANGED

@@ -10,7 +10,7 @@
 
 # This file contains classes corresponding to entity URNs.
 
-from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
+from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
 
 import functools
 from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -213,7 +213,7 @@ class SchemaFieldUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "schemaField"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, parent: str, field_path: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, parent: Union["Urn", str], field_path: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
             field_path = UrnEncoder.encode_string(field_path)
@@ -604,7 +604,7 @@ class DataJobUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataJob"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, flow: str, job_id: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, flow: Union["DataFlowUrn", str], job_id: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
             job_id = UrnEncoder.encode_string(job_id)
@@ -1435,10 +1435,10 @@ class DataPlatformInstanceUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataPlatformInstance"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, platform: str, instance: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], instance: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             instance = UrnEncoder.encode_string(instance)
 
         # Validation logic.
@@ -1678,10 +1678,10 @@ class DatasetUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataset"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
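To illustrate the loosened constructor typing above, the sketch below builds the same dataset URN from a plain platform string and from a DataPlatformUrn instance; it assumes the top-level datahub.metadata.urns re-export that ships with the wheel:

from datahub.metadata.urns import DataPlatformUrn, DatasetUrn

# Previously the platform argument was typed as a plain string; after this change
# a DataPlatformUrn instance is also accepted and coerced to its URN string.
urn_from_str = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")
urn_from_urn = DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table", env="PROD")
assert urn_from_str.urn() == urn_from_urn.urn()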
@@ -1771,10 +1771,10 @@ class MlModelUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModel"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1889,10 +1889,10 @@ class MlModelDeploymentUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModelDeployment"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1953,10 +1953,10 @@ class MlFeatureTableUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlFeatureTable"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, platform: str, name: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
 
         # Validation logic.
@@ -2385,10 +2385,10 @@ class MlModelGroupUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModelGroup"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
datahub/utilities/perf_timer.py
CHANGED

@@ -57,7 +57,7 @@ class PerfTimer(AbstractContextManager):
         self.finish()
         return None
 
-    def elapsed_seconds(self) -> float:
+    def elapsed_seconds(self, digits: int = 4) -> float:
         """
         Returns the elapsed time in seconds.
         """
@@ -65,11 +65,18 @@ class PerfTimer(AbstractContextManager):
             return self._past_active_time
 
         if self.end_time is None:
 [… 1 removed line not rendered in this view]
+            elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time)
         else:
 [… 1 removed line not rendered in this view]
+            elapsed = (self.end_time - self.start_time) + self._past_active_time
+
+        return round(elapsed, digits)
 
     def assert_timer_is_running(self) -> None:
+        if not self.is_running():
+            self._error_state = True
+            logger.warning("Did you forget to start the timer ?")
+
+    def is_running(self) -> bool:
         """
         Returns true if timer is in running state.
         Timer is in NOT in running state if
@@ -77,9 +84,7 @@ class PerfTimer(AbstractContextManager):
         2. it is in paused state.
         3. it had been started and finished in the past but not started again.
         """
 [… 1 removed line not rendered in this view]
-        self._error_state = True
-        logger.warning("Did you forget to start the timer ?")
+        return self.start_time is not None and not self.paused and self.end_time is None
 
     def __repr__(self) -> str:
         return repr(self.as_obj())
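A small usage sketch of the updated PerfTimer API shown above; the sleep is a stand-in for timed work:

import time

from datahub.utilities.perf_timer import PerfTimer

with PerfTimer() as timer:
    time.sleep(0.25)  # stand-in for a unit of ingestion work
    # elapsed_seconds() now rounds its result; digits=2 matches how the stage
    # and Tableau timers above record their durations.
    print(timer.elapsed_seconds(digits=2))

print(timer.is_running())  # False once the timer has finished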
{acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL
RENAMED
File without changes

{acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt
RENAMED
File without changes

{acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt
RENAMED
File without changes