acryl-datahub 0.15.0.1rc11__py3-none-any.whl → 0.15.0.1rc12__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/METADATA +2381 -2385
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/RECORD +33 -33
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/configuration/common.py +2 -5
- datahub/emitter/mce_builder.py +17 -1
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +2 -2
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +19 -6
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/sql/hive.py +15 -0
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +1 -4
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc12.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -170,6 +170,8 @@ class DataProcessCleanupReport(SourceReport):
     sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0


 class DataProcessCleanup:

@@ -265,13 +267,17 @@ class DataProcessCleanup:
                         self.report.report_failure(
                             f"Exception while deleting DPI: {e}", exc=e
                         )
-                if
+                if (
+                    deleted_count_last_n % self.config.batch_size == 0
+                    and deleted_count_last_n > 0
+                ):
                     logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                     if self.config.delay:
                         logger.info(f"Sleeping for {self.config.delay} seconds")
                         time.sleep(self.config.delay)

-
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")

     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph

@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

-            if
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )

@@ -393,6 +402,7 @@ class DataProcessCleanup:
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
+            self.report.num_data_flows_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")

             scroll_id = scrollAcrossEntities.get("nextScrollId")

@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph

         dataFlows: Dict[str, DataFlowEntity] = {}
-
-
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow

         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None

@@ -443,6 +454,7 @@ class DataProcessCleanup:
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")

+            self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")

             scroll_id = scrollAcrossEntities.get("nextScrollId")

@@ -481,7 +493,8 @@ class DataProcessCleanup:

             previous_scroll_id = scroll_id

-
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
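The `if (...)` rewrites above change when batch progress is logged: the modulo check alone fired even when nothing had been deleted yet, so both hunks add a `> 0` guard, and the final summary line is now emitted only when at least one entity was actually deleted. A self-contained sketch of the same pattern (the function and parameter names below are illustrative, not taken from the package):

# Minimal sketch of the batch-progress guard introduced above; delete_in_batches,
# batch_size, and delay are stand-ins for the source's config-driven equivalents.
import logging
import time

logger = logging.getLogger(__name__)

def delete_in_batches(urns, delete_fn, batch_size=100, delay=0.0):
    deleted = 0
    for urn in urns:
        delete_fn(urn)
        deleted += 1
        # Only log/sleep on non-empty, completed batches (the new "> 0" guard).
        if deleted % batch_size == 0 and deleted > 0:
            logger.info(f"Deleted {deleted} entities so far")
            if delay:
                logger.info(f"Sleeping for {delay} seconds")
                time.sleep(delay)
    # Final summary only when something was actually deleted.
    if deleted > 0:
        logger.info(f"Deleted {deleted} entities in total")
    return deleted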
datahub/ingestion/source/s3/source.py

@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()

     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.
+        os.environ.setdefault("SPARK_VERSION", "3.5")
         spark_version = os.environ["SPARK_VERSION"]

         # Importing here to avoid Deequ dependency for non profiling use cases
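Because the default is applied with `os.environ.setdefault`, the bump to "3.5" only changes the fallback; a `SPARK_VERSION` that is already exported in the environment still wins. A quick standard-library illustration:

# os.environ.setdefault only writes the value when the variable is unset,
# so an explicitly exported SPARK_VERSION takes precedence over the default.
import os

os.environ.pop("SPARK_VERSION", None)
os.environ.setdefault("SPARK_VERSION", "3.5")
print(os.environ["SPARK_VERSION"])  # -> 3.5 (default applied)

os.environ["SPARK_VERSION"] = "3.3"
os.environ.setdefault("SPARK_VERSION", "3.5")
print(os.environ["SPARK_VERSION"])  # -> 3.3 (user-provided value wins)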
datahub/ingestion/source/sql/hive.py

@@ -838,3 +838,18 @@ class HiveSource(TwoTierSQLAlchemySource):
             entityUrn=dataset_urn,
             aspect=view_properties_aspect,
         ).as_workunit()
+
+        if view_definition and self.config.include_view_lineage:
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )
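Instead of parsing view DDL inline, the Hive source now hands the definition to the shared SQL parsing aggregator and lets it emit lineage later. A hedged sketch of that hand-off, using only the aggregator calls visible in this diff (add_view_definition, gen_metadata) and assuming the remaining SqlParsingAggregator constructor arguments keep their defaults; the URN and SQL literals are illustrative, not from the package:

# Hedged sketch of the aggregator hand-off shown above. Only the method names
# and keyword arguments come from this diff; everything else is illustrative.
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

aggregator = SqlParsingAggregator(
    platform="hive",
    generate_lineage=True,
    generate_usage_statistics=False,
    generate_operations=False,
)

aggregator.add_view_definition(
    view_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.my_view,PROD)",
    view_definition="CREATE VIEW db.my_view AS SELECT id, name FROM db.my_table",
    default_db="db",
    default_schema=None,
)

# Lineage (and any other aggregated aspects) are emitted at the end as MCPs,
# which the source wraps with .as_workunit() in get_workunits.
for mcp in aggregator.gen_metadata():
    print(mcp)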
datahub/ingestion/source/sql/hive_metastore.py

@@ -123,6 +123,10 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )

+    include_view_lineage: bool = Field(
+        default=False, description="", hidden_from_docs=True
+    )
+
     include_catalog_name_in_ids: bool = Field(
         default=False,
         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",

@@ -160,6 +164,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
+@capability(
+    SourceCapability.LINEAGE_COARSE, "View lineage is not supported", supported=False
+)
 class HiveMetastoreSource(SQLAlchemySource):
     """
     This plugin extracts the following:
datahub/ingestion/source/sql/mssql/source.py

@@ -724,7 +724,7 @@ class SQLServerSource(SQLAlchemySource):
         ):
             yield from auto_workunit(
                 generate_procedure_lineage(
-                    schema_resolver=self.
+                    schema_resolver=self.get_schema_resolver(),
                     procedure=procedure,
                     procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
                     is_temp_table=self.is_temp_table,
datahub/ingestion/source/sql/sql_common.py

@@ -11,7 +11,6 @@ from typing import (
     Dict,
     Iterable,
     List,
-    MutableMapping,
     Optional,
     Set,
     Tuple,

@@ -36,7 +35,6 @@ from datahub.emitter.mce_builder import (
     make_tag_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import capability
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage

@@ -79,7 +77,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (

@@ -106,17 +103,11 @@ from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     SubTypesClass,
     TagAssociationClass,
-    UpstreamClass,
     ViewPropertiesClass,
 )
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.telemetry import telemetry
-from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
@@ -347,17 +338,19 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )

         self.views_failed_parsing: Set[str] = set()
-
+
+        self.discovered_datasets: Set[str] = set()
+        self.aggregator = SqlParsingAggregator(
             platform=self.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
+            graph=self.ctx.graph,
+            generate_lineage=self.include_lineage,
+            generate_usage_statistics=False,
+            generate_operations=False,
+            eager_graph_load=False,
         )
-        self.
-        self._view_definition_cache: MutableMapping[str, str]
-        if self.config.use_file_backed_cache:
-            self._view_definition_cache = FileBackedDict[str]()
-        else:
-            self._view_definition_cache = {}
+        self.report.sql_aggregator = self.aggregator.report

     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
@@ -572,36 +565,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 profile_requests, profiler, platform=self.platform
             )

-
-
-
-    def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
-        builder = SqlParsingBuilder(
-            generate_lineage=True,
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self._view_definition_cache.keys():
-            # TODO: Ensure that the lineage generated from the view definition
-            # matches the dataset_name.
-            view_definition = self._view_definition_cache[dataset_name]
-            result = self._run_sql_parser(
-                dataset_name,
-                view_definition,
-                self.schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-            else:
-                self.views_failed_parsing.add(dataset_name)
-        yield from builder.gen_workunits()
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.aggregator.gen_metadata():
+            yield mcp.as_workunit()

     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -760,16 +726,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
         dataset_snapshot.aspects.append(dataset_properties)

-        if self.config.include_table_location_lineage and location_urn:
-            external_upstream_table = UpstreamClass(
-                dataset=location_urn,
-                type=DatasetLineageTypeClass.COPY,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_snapshot.urn,
-                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
-            ).as_workunit()
-
         extra_tags = self.get_extra_tags(inspector, schema, table)
         pk_constraints: dict = inspector.get_pk_constraint(table, schema)
         partitions: Optional[List[str]] = self.get_partitions(inspector, schema, table)

@@ -795,7 +751,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

         dataset_snapshot.aspects.append(schema_metadata)
         if self._save_schema_to_resolver():
-            self.
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)

@@ -815,6 +771,13 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             ),
         )

+        if self.config.include_table_location_lineage and location_urn:
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=location_urn,
+                downstream_urn=dataset_snapshot.urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
+
         if self.config.domain:
             assert self.domain_registry
             yield from get_domain_wu(
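The table path follows the same pattern as the view path: discovered schemas are registered with the aggregator (replacing the old direct SchemaResolver usage), and the hand-built UpstreamClass/UpstreamLineage MCP for location-based lineage is replaced by add_known_lineage_mapping. A hedged sketch of that flow, restricted to the method names and keyword arguments shown in this diff; the URNs are made up for illustration and the remaining constructor arguments are assumed to keep their defaults:

# Hedged sketch of the table-side aggregator wiring introduced in this diff.
from datahub.metadata.schema_classes import DatasetLineageTypeClass
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

aggregator = SqlParsingAggregator(
    platform="postgres",
    generate_lineage=True,
    generate_usage_statistics=False,
    generate_operations=False,
    eager_graph_load=False,
)

table_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,db.public.events,PROD)"
location_urn = "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/events,PROD)"

# Where the source previously pushed schemas into its own SchemaResolver, it
# now calls aggregator.register_schema(dataset_urn, schema_metadata) with the
# SchemaMetadataClass it just built (omitted here to keep the sketch short).

# Location-based (COPY) lineage is recorded as a known mapping instead of an
# explicit UpstreamLineage aspect.
aggregator.add_known_lineage_mapping(
    upstream_urn=location_urn,
    downstream_urn=table_urn,
    lineage_type=DatasetLineageTypeClass.COPY,
)

# Everything is emitted at the end of ingestion.
for mcp in aggregator.gen_metadata():
    print(mcp)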
@@ -1089,6 +1052,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             self.config.platform_instance,
             self.config.env,
         )
+
         try:
             columns = inspector.get_columns(view, schema)
         except KeyError:

@@ -1108,7 +1072,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             canonical_schema=schema_fields,
         )
         if self._save_schema_to_resolver():
-            self.
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)

         description, properties, _ = self.get_table_properties(inspector, schema, view)

@@ -1117,7 +1081,18 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         view_definition = self._get_view_definition(inspector, schema, view)
         properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
-
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )

         dataset_snapshot = DatasetSnapshot(
             urn=dataset_urn,

@@ -1169,48 +1144,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             hasattr(self.config, "include_lineage") and self.config.include_lineage
         )

-
-
-
-        try:
-            database, schema = self.get_db_schema(view_identifier)
-        except ValueError:
-            logger.warning(f"Invalid view identifier: {view_identifier}")
-            return None
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=database,
-            default_schema=schema,
-        )
-        view_urn = make_dataset_urn_with_platform_instance(
-            self.platform,
-            view_identifier,
-            self.config.platform_instance,
-            self.config.env,
-        )
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_identifier}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-            if raw_lineage.out_tables != [view_urn]:
-                self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
+    @property
+    def include_lineage(self):
+        return self.config.include_view_lineage

     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
         database, schema, _view = dataset_identifier.split(".", 2)
@@ -1411,5 +1347,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             schema=schema, table=table, partition=partition, custom_sql=custom_sql
         )

+    def get_schema_resolver(self) -> SchemaResolver:
+        return self.aggregator._schema_resolver
+
     def get_report(self):
         return self.report
datahub/ingestion/source/sql/sql_generic_profiler.py

@@ -7,7 +7,10 @@ from typing import Dict, Iterable, List, Optional, Union, cast
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.reflection import Inspector

-from datahub.emitter.mce_builder import
+from datahub.emitter.mce_builder import (
+    make_dataset_urn_with_platform_instance,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.ge_data_profiler import (

@@ -245,11 +248,7 @@ class GenericProfiler:
                 # If profiling state exists we have to carry over to the new state
                 self.state_handler.add_to_state(dataset_urn, last_profiled)

-        threshold_time: Optional[datetime] = (
-            datetime.fromtimestamp(last_profiled / 1000, timezone.utc)
-            if last_profiled
-            else None
-        )
+        threshold_time: Optional[datetime] = parse_ts_millis(last_profiled)
         if (
             not threshold_time
             and self.config.profiling.profile_if_updated_since_days is not None
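Several files in this release (this profiler, checkpoint.py, and unity/proxy.py below) swap the repeated "datetime.fromtimestamp(ms / 1000, tz=timezone.utc) if ms else None" expression for parse_ts_millis from datahub.emitter.mce_builder (the mce_builder.py +17 -1 entry in the file list). The helper's actual implementation is not shown in this diff; the sketch below is only an equivalent reading of the inline code it replaces:

# Hedged sketch of what parse_ts_millis appears to replace, inferred from the
# datetime.fromtimestamp(...) expressions removed in this diff. This is not the
# library's implementation, just an equivalent reading of the old inline code.
from datetime import datetime, timezone
from typing import Optional


def parse_ts_millis_sketch(ts_millis: Optional[float]) -> Optional[datetime]:
    # The removed inline code treated falsy timestamps (None or 0) as "no timestamp".
    if not ts_millis:
        return None
    return datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc)


print(parse_ts_millis_sketch(None))           # None
print(parse_ts_millis_sketch(1735689600000))  # 2025-01-01 00:00:00+00:00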
datahub/ingestion/source/sql/sql_report.py

@@ -5,6 +5,7 @@ from datahub.ingestion.glossary.classification_mixin import ClassificationReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

@@ -52,6 +53,7 @@ class SQLSourceReport(
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+    sql_aggregator: Optional[SqlAggregatorReport] = None

     def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
         """
datahub/ingestion/source/state/checkpoint.py

@@ -12,6 +12,7 @@ from typing import Callable, Generic, Optional, Type, TypeVar
 import pydantic

 from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.metadata.schema_classes import (
     DatahubIngestionCheckpointClass,
     IngestionCheckpointStateClass,

@@ -144,7 +145,7 @@ class Checkpoint(Generic[StateType]):
                 )
                 logger.info(
                     f"Successfully constructed last checkpoint state for job {job_name} "
-                    f"with timestamp {
+                    f"with timestamp {parse_ts_millis(checkpoint_aspect.timestampMillis)}"
                 )
                 return checkpoint
         return None
datahub/ingestion/source/tableau/tableau.py

@@ -920,10 +920,7 @@ class TableauSiteSource:
         return f"/{self.config.env.lower()}{self.no_env_browse_prefix}"

     def _re_authenticate(self) -> None:
-
-            message="Re-authenticating to Tableau",
-            context=f"site='{self.site_content_url}'",
-        )
+        logger.info(f"Re-authenticating to Tableau site '{self.site_content_url}'")
         # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
         # so we need to recreate the Tableau Server object
         self.server = self.config.make_tableau_client(self.site_content_url)
datahub/ingestion/source/unity/proxy.py

@@ -4,7 +4,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclass

 import dataclasses
 import logging
-from datetime import datetime
+from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch

@@ -27,6 +27,7 @@ from databricks.sdk.service.sql import (
 from databricks.sdk.service.workspace import ObjectType

 import datahub
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,

@@ -211,16 +212,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             id=obj.object_id,
             path=obj.path,
             language=obj.language,
-            created_at=(
-
-                if obj.created_at
-                else None
-            ),
-            modified_at=(
-                datetime.fromtimestamp(obj.modified_at / 1000, tz=timezone.utc)
-                if obj.modified_at
-                else None
-            ),
+            created_at=parse_ts_millis(obj.created_at),
+            modified_at=parse_ts_millis(obj.modified_at),
         )

     def query_history(

@@ -452,17 +445,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             properties=obj.properties or {},
             owner=obj.owner,
             generation=obj.generation,
-            created_at=(
-                datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
-                if obj.created_at
-                else None
-            ),
+            created_at=(parse_ts_millis(obj.created_at) if obj.created_at else None),
             created_by=obj.created_by,
-            updated_at=(
-                datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc)
-                if obj.updated_at
-                else None
-            ),
+            updated_at=(parse_ts_millis(obj.updated_at) if obj.updated_at else None),
             updated_by=obj.updated_by,
             table_id=obj.table_id,
             comment=obj.comment,

@@ -500,12 +485,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             query_id=info.query_id,
             query_text=info.query_text,
             statement_type=info.statement_type,
-            start_time=
-
-            ),
-            end_time=datetime.fromtimestamp(
-                info.query_end_time_ms / 1000, tz=timezone.utc
-            ),
+            start_time=parse_ts_millis(info.query_start_time_ms),
+            end_time=parse_ts_millis(info.query_end_time_ms),
             user_id=info.user_id,
             user_name=info.user_name,
             executed_as_user_id=info.executed_as_user_id,