acryl-datahub 0.15.0.1rc10__py3-none-any.whl → 0.15.0.1rc12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (33):
  1. {acryl_datahub-0.15.0.1rc10.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/METADATA +2376 -2380
  2. {acryl_datahub-0.15.0.1rc10.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/RECORD +33 -33
  3. datahub/__init__.py +1 -1
  4. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  5. datahub/configuration/common.py +2 -5
  6. datahub/emitter/mce_builder.py +17 -1
  7. datahub/emitter/mcp_builder.py +2 -7
  8. datahub/emitter/mcp_patch_builder.py +2 -2
  9. datahub/emitter/rest_emitter.py +2 -2
  10. datahub/ingestion/api/closeable.py +3 -3
  11. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  12. datahub/ingestion/api/report.py +4 -1
  13. datahub/ingestion/api/sink.py +4 -3
  14. datahub/ingestion/api/source_helpers.py +2 -6
  15. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  16. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  17. datahub/ingestion/source/gc/dataprocess_cleanup.py +19 -6
  18. datahub/ingestion/source/s3/source.py +1 -1
  19. datahub/ingestion/source/sql/hive.py +15 -0
  20. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  21. datahub/ingestion/source/sql/mssql/source.py +1 -1
  22. datahub/ingestion/source/sql/sql_common.py +41 -102
  23. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  24. datahub/ingestion/source/sql/sql_report.py +2 -0
  25. datahub/ingestion/source/state/checkpoint.py +2 -1
  26. datahub/ingestion/source/tableau/tableau.py +14 -6
  27. datahub/ingestion/source/unity/proxy.py +8 -27
  28. datahub/metadata/_urns/urn_defs.py +168 -168
  29. datahub/utilities/time.py +8 -3
  30. datahub/utilities/urns/_urn_base.py +5 -7
  31. {acryl_datahub-0.15.0.1rc10.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/WHEEL +0 -0
  32. {acryl_datahub-0.15.0.1rc10.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/entry_points.txt +0 -0
  33. {acryl_datahub-0.15.0.1rc10.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -170,6 +170,8 @@ class DataProcessCleanupReport(SourceReport):
     sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0


 class DataProcessCleanup:
@@ -265,13 +267,17 @@ class DataProcessCleanup:
                 self.report.report_failure(
                     f"Exception while deleting DPI: {e}", exc=e
                 )
-            if deleted_count_last_n % self.config.batch_size == 0:
+            if (
+                deleted_count_last_n % self.config.batch_size == 0
+                and deleted_count_last_n > 0
+            ):
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)

-        logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")

     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph
@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

-            if deleted_count_retention % self.config.batch_size == 0:
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )
@@ -393,6 +402,7 @@ class DataProcessCleanup:
         scrollAcrossEntities = result.get("scrollAcrossEntities")
         if not scrollAcrossEntities:
             raise ValueError("Missing scrollAcrossEntities in response")
+        self.report.num_data_flows_found += scrollAcrossEntities.get("count")
         logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")

         scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph

         dataFlows: Dict[str, DataFlowEntity] = {}
-        for flow in self.get_data_flows():
-            dataFlows[flow.urn] = flow
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow

         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None
@@ -443,6 +454,7 @@ class DataProcessCleanup:
         if not scrollAcrossEntities:
             raise ValueError("Missing scrollAcrossEntities in response")

+        self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
         logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")

         scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -481,7 +493,8 @@ class DataProcessCleanup:

             previous_scroll_id = scroll_id

-        logger.info(f"Deleted {deleted_jobs} DataJobs")
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
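
Note on the repeated `and deleted_count_last_n > 0` / `and deleted_count_retention > 0` guards: the deletion counters only advance on successful deletes, so `count % batch_size == 0` is also true while the counter is still zero, and the old code could log "Deleted 0 DPIs". A minimal sketch of the pattern, with an illustrative batch size and delete callback standing in for the real config and GraphQL calls:

    import logging

    logger = logging.getLogger(__name__)
    BATCH_SIZE = 10  # stand-in for config.batch_size


    def delete_dpis(dpi_urns, delete_fn):
        deleted_count = 0
        for urn in dpi_urns:
            try:
                delete_fn(urn)  # stand-in for the hard-delete call
                deleted_count += 1
            except Exception as e:
                logger.warning(f"Failed to delete {urn}: {e}")
            # The counter only advances on success, so it can still be 0 here;
            # the extra check keeps per-batch progress logs meaningful.
            if deleted_count % BATCH_SIZE == 0 and deleted_count > 0:
                logger.info(f"Deleted {deleted_count} DPIs")
        # Final summary only when something was actually deleted.
        if deleted_count > 0:
            logger.info(f"Deleted {deleted_count} DPIs in total")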

datahub/ingestion/source/s3/source.py

@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()

     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.3")
+        os.environ.setdefault("SPARK_VERSION", "3.5")
        spark_version = os.environ["SPARK_VERSION"]

         # Importing here to avoid Deequ dependency for non profiling use cases
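
The Spark default bump only affects the fallback: `os.environ.setdefault` writes the value only when the variable is not already set, so a user who exports SPARK_VERSION (for example, to keep an older Spark for profiling) retains their own value. A quick illustration of the stdlib behavior (values are illustrative):

    import os

    os.environ["SPARK_VERSION"] = "3.3"            # e.g. pinned by the user
    os.environ.setdefault("SPARK_VERSION", "3.5")  # no-op, key already set
    print(os.environ["SPARK_VERSION"])             # -> 3.3

    del os.environ["SPARK_VERSION"]
    os.environ.setdefault("SPARK_VERSION", "3.5")  # applies the new default
    print(os.environ["SPARK_VERSION"])             # -> 3.5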

datahub/ingestion/source/sql/hive.py

@@ -838,3 +838,18 @@ class HiveSource(TwoTierSQLAlchemySource):
                 entityUrn=dataset_urn,
                 aspect=view_properties_aspect,
             ).as_workunit()
+
+        if view_definition and self.config.include_view_lineage:
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )

datahub/ingestion/source/sql/hive_metastore.py

@@ -123,6 +123,10 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )

+    include_view_lineage: bool = Field(
+        default=False, description="", hidden_from_docs=True
+    )
+
     include_catalog_name_in_ids: bool = Field(
         default=False,
         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
@@ -160,6 +164,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
+@capability(
+    SourceCapability.LINEAGE_COARSE, "View lineage is not supported", supported=False
+)
 class HiveMetastoreSource(SQLAlchemySource):
     """
     This plugin extracts the following:

datahub/ingestion/source/sql/mssql/source.py

@@ -724,7 +724,7 @@ class SQLServerSource(SQLAlchemySource):
         ):
             yield from auto_workunit(
                 generate_procedure_lineage(
-                    schema_resolver=self.schema_resolver,
+                    schema_resolver=self.get_schema_resolver(),
                     procedure=procedure,
                     procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
                     is_temp_table=self.is_temp_table,

datahub/ingestion/source/sql/sql_common.py

@@ -11,7 +11,6 @@ from typing import (
     Dict,
     Iterable,
     List,
-    MutableMapping,
     Optional,
     Set,
     Tuple,
@@ -36,7 +35,6 @@ from datahub.emitter.mce_builder import (
     make_tag_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import capability
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
@@ -79,7 +77,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
@@ -106,17 +103,11 @@ from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     SubTypesClass,
     TagAssociationClass,
-    UpstreamClass,
     ViewPropertiesClass,
 )
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.telemetry import telemetry
-from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
@@ -347,17 +338,19 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )

         self.views_failed_parsing: Set[str] = set()
-        self.schema_resolver: SchemaResolver = SchemaResolver(
+
+        self.discovered_datasets: Set[str] = set()
+        self.aggregator = SqlParsingAggregator(
             platform=self.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
+            graph=self.ctx.graph,
+            generate_lineage=self.include_lineage,
+            generate_usage_statistics=False,
+            generate_operations=False,
+            eager_graph_load=False,
         )
-        self.discovered_datasets: Set[str] = set()
-        self._view_definition_cache: MutableMapping[str, str]
-        if self.config.use_file_backed_cache:
-            self._view_definition_cache = FileBackedDict[str]()
-        else:
-            self._view_definition_cache = {}
+        self.report.sql_aggregator = self.aggregator.report

     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
@@ -572,36 +565,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 profile_requests, profiler, platform=self.platform
             )

-        if self.config.include_view_lineage:
-            yield from self.get_view_lineage()
-
-    def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
-        builder = SqlParsingBuilder(
-            generate_lineage=True,
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self._view_definition_cache.keys():
-            # TODO: Ensure that the lineage generated from the view definition
-            # matches the dataset_name.
-            view_definition = self._view_definition_cache[dataset_name]
-            result = self._run_sql_parser(
-                dataset_name,
-                view_definition,
-                self.schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-            else:
-                self.views_failed_parsing.add(dataset_name)
-        yield from builder.gen_workunits()
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.aggregator.gen_metadata():
+            yield mcp.as_workunit()

     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -760,16 +726,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
         dataset_snapshot.aspects.append(dataset_properties)

-        if self.config.include_table_location_lineage and location_urn:
-            external_upstream_table = UpstreamClass(
-                dataset=location_urn,
-                type=DatasetLineageTypeClass.COPY,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_snapshot.urn,
-                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
-            ).as_workunit()
-
         extra_tags = self.get_extra_tags(inspector, schema, table)
         pk_constraints: dict = inspector.get_pk_constraint(table, schema)
         partitions: Optional[List[str]] = self.get_partitions(inspector, schema, table)
@@ -795,7 +751,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

         dataset_snapshot.aspects.append(schema_metadata)
         if self._save_schema_to_resolver():
-            self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)

@@ -815,6 +771,13 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             ),
         )

+        if self.config.include_table_location_lineage and location_urn:
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=location_urn,
+                downstream_urn=dataset_snapshot.urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
+
         if self.config.domain:
             assert self.domain_registry
             yield from get_domain_wu(
@@ -1089,6 +1052,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             self.config.platform_instance,
             self.config.env,
         )
+
         try:
             columns = inspector.get_columns(view, schema)
         except KeyError:
@@ -1108,7 +1072,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             canonical_schema=schema_fields,
         )
         if self._save_schema_to_resolver():
-            self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)

         description, properties, _ = self.get_table_properties(inspector, schema, view)
@@ -1117,7 +1081,18 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         view_definition = self._get_view_definition(inspector, schema, view)
         properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
-            self._view_definition_cache[dataset_name] = view_definition
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )

         dataset_snapshot = DatasetSnapshot(
             urn=dataset_urn,
@@ -1169,48 +1144,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             hasattr(self.config, "include_lineage") and self.config.include_lineage
         )

-    def _run_sql_parser(
-        self, view_identifier: str, query: str, schema_resolver: SchemaResolver
-    ) -> Optional[SqlParsingResult]:
-        try:
-            database, schema = self.get_db_schema(view_identifier)
-        except ValueError:
-            logger.warning(f"Invalid view identifier: {view_identifier}")
-            return None
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=database,
-            default_schema=schema,
-        )
-        view_urn = make_dataset_urn_with_platform_instance(
-            self.platform,
-            view_identifier,
-            self.config.platform_instance,
-            self.config.env,
-        )
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_identifier}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-            if raw_lineage.out_tables != [view_urn]:
-                self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
+    @property
+    def include_lineage(self):
+        return self.config.include_view_lineage

     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
         database, schema, _view = dataset_identifier.split(".", 2)
@@ -1411,5 +1347,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             schema=schema, table=table, partition=partition, custom_sql=custom_sql
         )

+    def get_schema_resolver(self) -> SchemaResolver:
+        return self.aggregator._schema_resolver
+
     def get_report(self):
         return self.report
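
Taken together, the sql_common.py changes swap the source-local SchemaResolver, the view-definition cache, and the per-view sqlglot parsing for a single SqlParsingAggregator: schemas are registered and view definitions queued while tables and views are scanned, and lineage is emitted in one pass at the end of get_workunits. A rough, self-contained sketch of that flow using only the aggregator calls visible in this diff (the platform, urn, and SQL text below are illustrative; graph and eager_graph_load mirror the constructor arguments shown above):

    from datahub.emitter.mce_builder import make_dataset_urn
    from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

    aggregator = SqlParsingAggregator(
        platform="postgres",            # illustrative; the source passes self.platform
        platform_instance=None,
        env="PROD",
        graph=None,                     # the source passes self.ctx.graph
        generate_lineage=True,          # tied to include_view_lineage via the new property
        generate_usage_statistics=False,
        generate_operations=False,
        eager_graph_load=False,
    )

    view_urn = make_dataset_urn("postgres", "db1.schema1.my_view", "PROD")

    # During extraction: schemas go to register_schema(...) and each view body is
    # handed over as it is discovered, instead of being cached for a second pass.
    aggregator.add_view_definition(
        view_urn=view_urn,
        view_definition="CREATE VIEW my_view AS SELECT id, name FROM base_table",
        default_db="db1",
        default_schema="schema1",
    )

    # At the end of get_workunits the source flushes everything:
    for mcp in aggregator.gen_metadata():
        print(mcp.entityUrn, mcp.aspectName)

The MSSQL stored-procedure lineage keeps working through the new get_schema_resolver() accessor, which exposes the aggregator's internal resolver in place of the removed self.schema_resolver.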

datahub/ingestion/source/sql/sql_generic_profiler.py

@@ -7,7 +7,10 @@ from typing import Dict, Iterable, List, Optional, Union, cast
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.reflection import Inspector

-from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mce_builder import (
+    make_dataset_urn_with_platform_instance,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.ge_data_profiler import (
@@ -245,11 +248,7 @@ class GenericProfiler:
             # If profiling state exists we have to carry over to the new state
             self.state_handler.add_to_state(dataset_urn, last_profiled)

-        threshold_time: Optional[datetime] = (
-            datetime.fromtimestamp(last_profiled / 1000, timezone.utc)
-            if last_profiled
-            else None
-        )
+        threshold_time: Optional[datetime] = parse_ts_millis(last_profiled)
         if (
             not threshold_time
             and self.config.profiling.profile_if_updated_since_days is not None
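
parse_ts_millis is the new helper in datahub/emitter/mce_builder.py (the +17 -1 entry in the file list) that this file, checkpoint.py, and unity/proxy.py switch to. The diff does not show its body, but judging from the inline code it replaces, it performs the usual epoch-milliseconds-to-UTC conversion and passes None through. A sketch of that behavior, not the packaged implementation:

    from datetime import datetime, timezone
    from typing import Optional


    def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
        # Mirrors the pattern previously inlined at the call sites:
        # datetime.fromtimestamp(ts / 1000, tz=timezone.utc) if ts else None
        if ts is None:
            return None
        return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)


    print(parse_ts_millis(1735689600000))  # 2025-01-01 00:00:00+00:00
    print(parse_ts_millis(None))           # None

How the packaged helper treats a literal 0 is not visible in this diff; note that some call sites in unity/proxy.py keep an explicit `if obj.created_at else None` guard.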

datahub/ingestion/source/sql/sql_report.py

@@ -5,6 +5,7 @@ from datahub.ingestion.glossary.classification_mixin import ClassificationReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
@@ -52,6 +53,7 @@ class SQLSourceReport(
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+    sql_aggregator: Optional[SqlAggregatorReport] = None

     def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
         """

datahub/ingestion/source/state/checkpoint.py

@@ -12,6 +12,7 @@ from typing import Callable, Generic, Optional, Type, TypeVar
 import pydantic

 from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.metadata.schema_classes import (
     DatahubIngestionCheckpointClass,
     IngestionCheckpointStateClass,
@@ -144,7 +145,7 @@ class Checkpoint(Generic[StateType]):
             )
             logger.info(
                 f"Successfully constructed last checkpoint state for job {job_name} "
-                f"with timestamp {datetime.fromtimestamp(checkpoint_aspect.timestampMillis/1000, tz=timezone.utc)}"
+                f"with timestamp {parse_ts_millis(checkpoint_aspect.timestampMillis)}"
             )
             return checkpoint
         return None

datahub/ingestion/source/tableau/tableau.py

@@ -186,6 +186,15 @@ try:
 except ImportError:
     REAUTHENTICATE_ERRORS = (NonXMLResponseError,)

+RETRIABLE_ERROR_CODES = [
+    408,  # Request Timeout
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+]
+
 logger: logging.Logger = logging.getLogger(__name__)

 # Replace / with |
@@ -287,7 +296,7 @@ class TableauConnectionConfig(ConfigModel):
             max_retries=Retry(
                 total=self.max_retries,
                 backoff_factor=1,
-                status_forcelist=[429, 500, 502, 503, 504],
+                status_forcelist=RETRIABLE_ERROR_CODES,
             )
         )
         server._session.mount("http://", adapter)
@@ -911,10 +920,7 @@ class TableauSiteSource:
         return f"/{self.config.env.lower()}{self.no_env_browse_prefix}"

     def _re_authenticate(self) -> None:
-        self.report.info(
-            message="Re-authenticating to Tableau",
-            context=f"site='{self.site_content_url}'",
-        )
+        logger.info(f"Re-authenticating to Tableau site '{self.site_content_url}'")
         # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
         # so we need to recreate the Tableau Server object
         self.server = self.config.make_tableau_client(self.site_content_url)
@@ -1212,9 +1218,11 @@ class TableauSiteSource:

         except InternalServerError as ise:
             # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry.
-            if ise.code == 504:
+            # Extended with other retryable errors.
+            if ise.code in RETRIABLE_ERROR_CODES:
                 if retries_remaining <= 0:
                     raise ise
+                logger.info(f"Retrying query due to error {ise.code}")
                 return self.get_connection_object_page(
                     query=query,
                     connection_type=connection_type,
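
The new RETRIABLE_ERROR_CODES list does double duty: it feeds urllib3's Retry via status_forcelist for transport-level retries on the mounted adapters, and it widens the manual retry in get_connection_object_page beyond the original 504-only check. A small standalone sketch of the adapter half, using the same Retry parameters as the diff (the session and URL are illustrative):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    RETRIABLE_ERROR_CODES = [408, 429, 500, 502, 503, 504]

    session = requests.Session()
    adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,                 # stand-in for config.max_retries
            backoff_factor=1,        # exponential backoff between attempts
            status_forcelist=RETRIABLE_ERROR_CODES,
        )
    )
    # Mounted on both schemes, as the Tableau connection setup does.
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Requests through this session now retry transparently on the listed codes:
    # response = session.get("https://tableau.example.com/api/...")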

datahub/ingestion/source/unity/proxy.py

@@ -4,7 +4,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclass

 import dataclasses
 import logging
-from datetime import datetime, timezone
+from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch

@@ -27,6 +27,7 @@ from databricks.sdk.service.sql import (
 from databricks.sdk.service.workspace import ObjectType

 import datahub
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,
@@ -211,16 +212,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             id=obj.object_id,
             path=obj.path,
             language=obj.language,
-            created_at=(
-                datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
-                if obj.created_at
-                else None
-            ),
-            modified_at=(
-                datetime.fromtimestamp(obj.modified_at / 1000, tz=timezone.utc)
-                if obj.modified_at
-                else None
-            ),
+            created_at=parse_ts_millis(obj.created_at),
+            modified_at=parse_ts_millis(obj.modified_at),
         )

     def query_history(
@@ -452,17 +445,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             properties=obj.properties or {},
             owner=obj.owner,
             generation=obj.generation,
-            created_at=(
-                datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
-                if obj.created_at
-                else None
-            ),
+            created_at=(parse_ts_millis(obj.created_at) if obj.created_at else None),
             created_by=obj.created_by,
-            updated_at=(
-                datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc)
-                if obj.updated_at
-                else None
-            ),
+            updated_at=(parse_ts_millis(obj.updated_at) if obj.updated_at else None),
             updated_by=obj.updated_by,
             table_id=obj.table_id,
             comment=obj.comment,
@@ -500,12 +485,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             query_id=info.query_id,
             query_text=info.query_text,
             statement_type=info.statement_type,
-            start_time=datetime.fromtimestamp(
-                info.query_start_time_ms / 1000, tz=timezone.utc
-            ),
-            end_time=datetime.fromtimestamp(
-                info.query_end_time_ms / 1000, tz=timezone.utc
-            ),
+            start_time=parse_ts_millis(info.query_start_time_ms),
+            end_time=parse_ts_millis(info.query_end_time_ms),
             user_id=info.user_id,
             user_name=info.user_name,
             executed_as_user_id=info.executed_as_user_id,