acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (44)
  1. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
  2. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
  5. datahub/cli/specific/structuredproperties_cli.py +84 -0
  6. datahub/ingestion/api/source.py +2 -0
  7. datahub/ingestion/graph/client.py +4 -2
  8. datahub/ingestion/source/aws/glue.py +14 -1
  9. datahub/ingestion/source/aws/s3_util.py +24 -1
  10. datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
  11. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  12. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
  13. datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
  14. datahub/ingestion/source/bigquery_v2/usage.py +57 -57
  15. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  16. datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
  17. datahub/ingestion/source/datahub/config.py +6 -0
  18. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  19. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  20. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  21. datahub/ingestion/source/gc/datahub_gc.py +10 -14
  22. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  23. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
  24. datahub/ingestion/source/looker/looker_config.py +8 -3
  25. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  26. datahub/ingestion/source/redshift/redshift.py +32 -34
  27. datahub/ingestion/source/redshift/usage.py +29 -29
  28. datahub/ingestion/source/s3/source.py +10 -14
  29. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  30. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
  31. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
  32. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
  33. datahub/ingestion/source/sql/teradata.py +2 -2
  34. datahub/ingestion/source/tableau/tableau.py +119 -31
  35. datahub/ingestion/source/unity/source.py +71 -71
  36. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  37. datahub/metadata/_schema_classes.py +2 -2
  38. datahub/metadata/_urns/urn_defs.py +15 -15
  39. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  40. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  41. datahub/utilities/perf_timer.py +11 -6
  42. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
  43. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
  44. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau.py
@@ -118,6 +118,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -170,6 +171,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn

 try:
@@ -643,12 +646,41 @@ class SiteIdContentUrl:


 @dataclass
-class TableauSourceReport(StaleEntityRemovalSourceReport):
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
+    emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+        default_factory=TopKDict
+    )
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
@@ -660,6 +692,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
     logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
     last_authenticated_at: Optional[datetime] = None

     num_expected_tableau_metadata_queries: int = 0
@@ -834,6 +867,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                     platform=self.platform,
                 )
                 yield from site_source.ingest_tableau_site()
+
         except MetadataQueryException as md_exception:
             self.report.failure(
                 title="Failed to Retrieve Tableau Metadata",
@@ -3489,33 +3523,87 @@ class TableauSiteSource:
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None

     def ingest_tableau_site(self):
-        # Initialise the dictionary to later look-up for chart and dashboard stat
-        if self.config.extract_usage_stats:
-            self._populate_usage_stat_registry()
-
-        if self.config.permission_ingestion:
-            self._fetch_groups()
-
-        # Populate the map of database names and database hostnames to be used later to map
-        # databases to platform instances.
-        if self.config.database_hostname_to_platform_instance_map:
-            self._populate_database_server_hostname_map()
-
-        self._populate_projects_registry()
-
-        if self.config.add_site_container:
-            yield from self.emit_site_container()
-        yield from self.emit_project_containers()
-        yield from self.emit_workbooks()
-        if self.sheet_ids:
-            yield from self.emit_sheets()
-        if self.dashboard_ids:
-            yield from self.emit_dashboards()
-        if self.embedded_datasource_ids_being_used:
-            yield from self.emit_embedded_datasources()
-        if self.datasource_ids_being_used:
-            yield from self.emit_published_datasources()
-        if self.custom_sql_ids_being_used:
-            yield from self.emit_custom_sql_datasources()
-        if self.database_tables:
-            yield from self.emit_upstream_tables()
+        with self.report.new_stage(
+            f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+        ):
+            # Initialise the dictionary to later look-up for chart and dashboard stat
+            if self.config.extract_usage_stats:
+                with PerfTimer() as timer:
+                    self._populate_usage_stat_registry()
+                    self.report.extract_usage_stats_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.permission_ingestion:
+                with PerfTimer() as timer:
+                    self._fetch_groups()
+                    self.report.fetch_groups_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            # Populate the map of database names and database hostnames to be used later to map
+            # databases to platform instances.
+            if self.config.database_hostname_to_platform_instance_map:
+                with PerfTimer() as timer:
+                    self._populate_database_server_hostname_map()
+                    self.report.populate_database_server_hostname_map_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            with PerfTimer() as timer:
+                self._populate_projects_registry()
+                self.report.populate_projects_registry_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.config.add_site_container:
+                yield from self.emit_site_container()
+            yield from self.emit_project_containers()
+
+            with PerfTimer() as timer:
+                yield from self.emit_workbooks()
+                self.report.emit_workbooks_timer[
+                    self.site_content_url
+                ] = timer.elapsed_seconds(digits=2)
+
+            if self.sheet_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_sheets()
+                    self.report.emit_sheets_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.dashboard_ids:
+                with PerfTimer() as timer:
+                    yield from self.emit_dashboards()
+                    self.report.emit_dashboards_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.embedded_datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_embedded_datasources()
+                    self.report.emit_embedded_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.datasource_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_published_datasources()
+                    self.report.emit_published_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.custom_sql_ids_being_used:
+                with PerfTimer() as timer:
+                    yield from self.emit_custom_sql_datasources()
+                    self.report.emit_custom_sql_datasources_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
+
+            if self.database_tables:
+                with PerfTimer() as timer:
+                    yield from self.emit_upstream_tables()
+                    self.report.emit_upstream_tables_timer[
+                        self.site_content_url
+                    ] = timer.elapsed_seconds(digits=2)
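Note: every phase in the hunk above follows the same pattern, namely running the work inside a PerfTimer context and recording the rounded duration into the per-site timer dict on the report. A minimal sketch of that pattern, assuming only the released acryl-datahub wheel is installed and using time.sleep as a stand-in for the real phase work:

    import time

    from datahub.utilities.perf_timer import PerfTimer

    with PerfTimer() as timer:
        time.sleep(0.2)  # stand-in for e.g. _populate_usage_stat_registry()

    # elapsed_seconds() gained a `digits` parameter in this release and rounds the result
    print(timer.elapsed_seconds(digits=2))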
datahub/ingestion/source/unity/source.py
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.report_ingestion_stage_start("Ingestion Setup")
-        wait_on_warehouse = None
-        if self.config.include_hive_metastore:
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Can take several minutes, so start now and wait later
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
-                )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()

         if self.config.include_ownership:
-            self.report.report_ingestion_stage_start("Ingest service principals")
-            self.build_service_principal_map()
-            self.build_groups_map()
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Ingest notebooks")
-            yield from self.process_notebooks()
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()

         yield from self.process_metastores()

         yield from self.get_view_lineage()

         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Notebook lineage")
-            for notebook in self.notebooks.values():
-                wu = self._gen_notebook_lineage(notebook)
-                if wu:
-                    yield wu
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu

         if self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Ingest usage")
-            usage_extractor = UnityCatalogUsageExtractor(
-                config=self.config,
-                report=self.report,
-                proxy=self.unity_catalog_api_proxy,
-                table_urn_builder=self.gen_dataset_urn,
-                user_urn_builder=self.gen_user_urn,
-            )
-            yield from usage_extractor.get_usage_workunits(
-                self.table_refs | self.view_refs
-            )
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
+                )

         if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
-                )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()

-            self.report.report_ingestion_stage_start("Profiling")
-            if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
-                yield from UnityCatalogAnalyzeProfiler(
-                    self.config.profiling,
-                    self.report,
-                    self.unity_catalog_api_proxy,
-                    self.gen_dataset_urn,
-                ).get_workunits(self.table_refs)
-            elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
-                yield from UnityCatalogGEProfiler(
-                    sql_common_config=self.config,
-                    profiling_config=self.config.profiling,
-                    report=self.report,
-                ).get_workunits(list(self.tables.values()))
-            else:
-                raise ValueError("Unknown profiling config method")
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")

     def build_service_principal_map(self) -> None:
         try:
@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue

-            self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}")
-            yield from self.gen_schema_containers(schema)
-            yield from self.process_tables(schema)
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)

-            self.report.schemas.processed(schema.id)
+                self.report.schemas.processed(schema.id)

     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):
datahub/ingestion/source_report/ingestion_stage.py
@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional

 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@ PROFILING = "Profiling"

 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)

-    _timer: Optional[PerfTimer] = field(
-        default=None, init=False, repr=False, compare=False
-    )
-
-    def report_ingestion_stage_start(self, stage: str) -> None:
-        if self._timer:
-            elapsed = round(self._timer.elapsed_seconds(), 2)
-            logger.info(
-                f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds",
-                stacklevel=2,
-            )
-            if self.ingestion_stage:
-                self.ingestion_stage_durations[self.ingestion_stage] = elapsed
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None
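Note: the report API changes from a start-only call (report_ingestion_stage_start) to a context manager, so each stage's duration is recorded automatically when its with block exits. A minimal usage sketch, assuming the released wheel is installed:

    from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

    report = IngestionStageReport()

    with report.new_stage("Profiling"):
        pass  # stand-in for the work done during this stage

    # keys look like "Profiling at <UTC timestamp>"; values are seconds rounded to 2 digits
    print(report.ingestion_stage_durations)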
datahub/metadata/_schema_classes.py
@@ -14262,7 +14262,7 @@ class DataFlowKeyClass(_Aspect):


     ASPECT_NAME = 'dataFlowKey'
-    ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
+    ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataFlowKey")

     def __init__(self,
@@ -14678,7 +14678,7 @@ class DataJobKeyClass(_Aspect):


     ASPECT_NAME = 'dataJobKey'
-    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
+    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")

     def __init__(self,
datahub/metadata/_urns/urn_defs.py
@@ -10,7 +10,7 @@

 # This file contains classes corresponding to entity URNs.

-from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
+from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union

 import functools
 from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -213,7 +213,7 @@ class SchemaFieldUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "schemaField"
     _URN_PARTS: ClassVar[int] = 2

-    def __init__(self, parent: str, field_path: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, parent: Union["Urn", str], field_path: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
             field_path = UrnEncoder.encode_string(field_path)
@@ -604,7 +604,7 @@ class DataJobUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataJob"
     _URN_PARTS: ClassVar[int] = 2

-    def __init__(self, flow: str, job_id: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, flow: Union["DataFlowUrn", str], job_id: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
             job_id = UrnEncoder.encode_string(job_id)
@@ -1435,10 +1435,10 @@ class DataPlatformInstanceUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataPlatformInstance"
     _URN_PARTS: ClassVar[int] = 2

-    def __init__(self, platform: str, instance: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], instance: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             instance = UrnEncoder.encode_string(instance)

         # Validation logic.
@@ -1678,10 +1678,10 @@ class DatasetUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataset"
     _URN_PARTS: ClassVar[int] = 3

-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1771,10 +1771,10 @@ class MlModelUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModel"
     _URN_PARTS: ClassVar[int] = 3

-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1889,10 +1889,10 @@ class MlModelDeploymentUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModelDeployment"
     _URN_PARTS: ClassVar[int] = 3

-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1953,10 +1953,10 @@ class MlFeatureTableUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlFeatureTable"
     _URN_PARTS: ClassVar[int] = 2

-    def __init__(self, platform: str, name: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)

         # Validation logic.
@@ -2385,10 +2385,10 @@ class MlModelGroupUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModelGroup"
     _URN_PARTS: ClassVar[int] = 3

-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
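Note: the net effect of these signature changes is that URN constructors which previously accepted only a platform string now also accept an already-built DataPlatformUrn (and, likewise, SchemaFieldUrn/DataJobUrn accept a parent Urn object), coercing either form to the same encoded URN. A small sketch, assuming the released wheel is installed and that the datahub.metadata.urns re-export module is available in this version:

    from datahub.metadata.urns import DataPlatformUrn, DatasetUrn

    by_name = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")
    by_urn = DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table", env="PROD")

    # both spellings should coerce to the same dataset URN
    assert by_name.urn() == by_urn.urn()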
datahub/metadata/schemas/DataFlowKey.avsc
@@ -17,6 +17,7 @@
     "glossaryTerms",
     "institutionalMemory",
     "dataPlatformInstance",
+    "container",
     "browsePathsV2",
     "structuredProperties",
     "incidentsSummary",

datahub/metadata/schemas/DataJobKey.avsc
@@ -20,6 +20,7 @@
     "glossaryTerms",
     "institutionalMemory",
     "dataPlatformInstance",
+    "container",
     "browsePathsV2",
     "structuredProperties",
     "forms",
datahub/utilities/perf_timer.py
@@ -57,7 +57,7 @@ class PerfTimer(AbstractContextManager):
         self.finish()
         return None

-    def elapsed_seconds(self) -> float:
+    def elapsed_seconds(self, digits: int = 4) -> float:
         """
         Returns the elapsed time in seconds.
         """
@@ -65,11 +65,18 @@ class PerfTimer(AbstractContextManager):
             return self._past_active_time

         if self.end_time is None:
-            return (time.perf_counter() - self.start_time) + (self._past_active_time)
+            elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time)
         else:
-            return (self.end_time - self.start_time) + self._past_active_time
+            elapsed = (self.end_time - self.start_time) + self._past_active_time
+
+        return round(elapsed, digits)

     def assert_timer_is_running(self) -> None:
+        if not self.is_running():
+            self._error_state = True
+            logger.warning("Did you forget to start the timer ?")
+
+    def is_running(self) -> bool:
         """
         Returns true if timer is in running state.
         Timer is in NOT in running state if
@@ -77,9 +84,7 @@ class PerfTimer(AbstractContextManager):
         2. it is in paused state.
         3. it had been started and finished in the past but not started again.
         """
-        if self.start_time is None or self.paused or self.end_time:
-            self._error_state = True
-            logger.warning("Did you forget to start the timer ?")
+        return self.start_time is not None and not self.paused and self.end_time is None

     def __repr__(self) -> str:
         return repr(self.as_obj())
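Note: the running-state check that previously lived only inside assert_timer_is_running() (and set _error_state as a side effect) is now exposed as a non-mutating is_running() predicate that the assert helper delegates to. A short sketch of the resulting behaviour, assuming the released wheel is installed:

    from datahub.utilities.perf_timer import PerfTimer

    timer = PerfTimer()
    print(timer.is_running())                # False: never started

    timer.start()
    print(timer.is_running())                # True while started and not paused/finished
    print(timer.elapsed_seconds(digits=2))   # also works on a running timer, rounded

    timer.finish()
    print(timer.is_running())                # False again once finished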