acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/hive_metastore.py

@@ -123,6 +123,10 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )

+    include_view_lineage: bool = Field(
+        default=False, description="", hidden_from_docs=True
+    )
+
     include_catalog_name_in_ids: bool = Field(
         default=False,
         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
@@ -160,6 +164,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
+@capability(
+    SourceCapability.LINEAGE_COARSE, "View lineage is not supported", supported=False
+)
 class HiveMetastoreSource(SQLAlchemySource):
     """
     This plugin extracts the following:
datahub/ingestion/source/sql/mssql/job_models.py

@@ -1,11 +1,17 @@
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

-from datahub.emitter.mce_builder import
+from datahub.emitter.mce_builder import (
+    make_data_flow_urn,
+    make_data_job_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+)
 from datahub.metadata.schema_classes import (
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
+    DataPlatformInstanceClass,
 )


@@ -204,6 +210,18 @@ class MSSQLDataJob:
             status=self.status,
         )

+    @property
+    def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
+        if self.entity.flow.platform_instance:
+            return DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.entity.flow.orchestrator),
+                instance=make_dataplatform_instance_urn(
+                    platform=self.entity.flow.orchestrator,
+                    instance=self.entity.flow.platform_instance,
+                ),
+            )
+        return None
+

 @dataclass
 class MSSQLDataFlow:
@@ -238,3 +256,14 @@ class MSSQLDataFlow:
             customProperties=self.flow_properties,
             externalUrl=self.external_url,
         )
+
+    @property
+    def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
+        if self.entity.platform_instance:
+            return DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.entity.orchestrator),
+                instance=make_dataplatform_instance_urn(
+                    self.entity.orchestrator, self.entity.platform_instance
+                ),
+            )
+        return None
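The new `as_maybe_platform_instance_aspect` properties only produce an aspect when a `platform_instance` is configured; otherwise they return `None` and the callers in `source.py` below simply skip emitting it. A minimal sketch of the aspect they build, assuming a hypothetical instance name `my_sql_server` (the URN formats in the comments follow the standard DataHub URN builders):

```python
# Illustrative only: "mssql" is the orchestrator/platform and "my_sql_server"
# is an assumed platform_instance value; neither comes from this diff.
from datahub.emitter.mce_builder import (
    make_data_platform_urn,
    make_dataplatform_instance_urn,
)
from datahub.metadata.schema_classes import DataPlatformInstanceClass

aspect = DataPlatformInstanceClass(
    platform=make_data_platform_urn("mssql"),
    instance=make_dataplatform_instance_urn("mssql", "my_sql_server"),
)

print(aspect.platform)  # urn:li:dataPlatform:mssql
print(aspect.instance)  # urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my_sql_server)
```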
datahub/ingestion/source/sql/mssql/source.py

@@ -639,6 +639,13 @@ class SQLServerSource(SQLAlchemySource):
            aspect=data_job.as_datajob_info_aspect,
        ).as_workunit()

+        data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect
+        if data_platform_instance_aspect:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_platform_instance_aspect,
+            ).as_workunit()
+
        if include_lineage:
            yield MetadataChangeProposalWrapper(
                entityUrn=data_job.urn,
@@ -654,6 +661,13 @@ class SQLServerSource(SQLAlchemySource):
            entityUrn=data_flow.urn,
            aspect=data_flow.as_dataflow_info_aspect,
        ).as_workunit()
+
+        data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect
+        if data_platform_instance_aspect:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_platform_instance_aspect,
+            ).as_workunit()
        # TODO: Add SubType when it appear

    def get_inspectors(self) -> Iterable[Inspector]:
@@ -710,7 +724,7 @@ class SQLServerSource(SQLAlchemySource):
        ):
            yield from auto_workunit(
                generate_procedure_lineage(
-                    schema_resolver=self.
+                    schema_resolver=self.get_schema_resolver(),
                    procedure=procedure,
                    procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
                    is_temp_table=self.is_temp_table,
datahub/ingestion/source/sql/sql_common.py

@@ -11,7 +11,6 @@ from typing import (
     Dict,
     Iterable,
     List,
-    MutableMapping,
     Optional,
     Set,
     Tuple,
@@ -36,7 +35,6 @@ from datahub.emitter.mce_builder import (
     make_tag_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import capability
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
@@ -79,7 +77,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
@@ -106,17 +103,11 @@ from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     SubTypesClass,
     TagAssociationClass,
-    UpstreamClass,
     ViewPropertiesClass,
 )
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.telemetry import telemetry
-from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
@@ -347,17 +338,19 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )

         self.views_failed_parsing: Set[str] = set()
-
+
+        self.discovered_datasets: Set[str] = set()
+        self.aggregator = SqlParsingAggregator(
             platform=self.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
+            graph=self.ctx.graph,
+            generate_lineage=self.include_lineage,
+            generate_usage_statistics=False,
+            generate_operations=False,
+            eager_graph_load=False,
         )
-        self.
-        self._view_definition_cache: MutableMapping[str, str]
-        if self.config.use_file_backed_cache:
-            self._view_definition_cache = FileBackedDict[str]()
-        else:
-            self._view_definition_cache = {}
+        self.report.sql_aggregator = self.aggregator.report

     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
@@ -572,36 +565,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 profile_requests, profiler, platform=self.platform
             )

-
-
-
-    def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
-        builder = SqlParsingBuilder(
-            generate_lineage=True,
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self._view_definition_cache.keys():
-            # TODO: Ensure that the lineage generated from the view definition
-            # matches the dataset_name.
-            view_definition = self._view_definition_cache[dataset_name]
-            result = self._run_sql_parser(
-                dataset_name,
-                view_definition,
-                self.schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-            else:
-                self.views_failed_parsing.add(dataset_name)
-        yield from builder.gen_workunits()
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.aggregator.gen_metadata():
+            yield mcp.as_workunit()

     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -760,16 +726,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
         dataset_snapshot.aspects.append(dataset_properties)

-        if self.config.include_table_location_lineage and location_urn:
-            external_upstream_table = UpstreamClass(
-                dataset=location_urn,
-                type=DatasetLineageTypeClass.COPY,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_snapshot.urn,
-                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
-            ).as_workunit()
-
         extra_tags = self.get_extra_tags(inspector, schema, table)
         pk_constraints: dict = inspector.get_pk_constraint(table, schema)
         partitions: Optional[List[str]] = self.get_partitions(inspector, schema, table)
@@ -795,7 +751,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

         dataset_snapshot.aspects.append(schema_metadata)
         if self._save_schema_to_resolver():
-            self.
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)

@@ -815,6 +771,13 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             ),
         )

+        if self.config.include_table_location_lineage and location_urn:
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=location_urn,
+                downstream_urn=dataset_snapshot.urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
+
         if self.config.domain:
             assert self.domain_registry
             yield from get_domain_wu(
@@ -1089,6 +1052,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             self.config.platform_instance,
             self.config.env,
         )
+
         try:
             columns = inspector.get_columns(view, schema)
         except KeyError:
@@ -1108,7 +1072,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             canonical_schema=schema_fields,
         )
         if self._save_schema_to_resolver():
-            self.
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)

         description, properties, _ = self.get_table_properties(inspector, schema, view)
@@ -1117,7 +1081,18 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             view_definition = self._get_view_definition(inspector, schema, view)
             properties["view_definition"] = view_definition
             if view_definition and self.config.include_view_lineage:
-
+                default_db = None
+                default_schema = None
+                try:
+                    default_db, default_schema = self.get_db_schema(dataset_name)
+                except ValueError:
+                    logger.warning(f"Invalid view identifier: {dataset_name}")
+                self.aggregator.add_view_definition(
+                    view_urn=dataset_urn,
+                    view_definition=view_definition,
+                    default_db=default_db,
+                    default_schema=default_schema,
+                )

         dataset_snapshot = DatasetSnapshot(
             urn=dataset_urn,
@@ -1169,48 +1144,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             hasattr(self.config, "include_lineage") and self.config.include_lineage
         )

-
-
-
-        try:
-            database, schema = self.get_db_schema(view_identifier)
-        except ValueError:
-            logger.warning(f"Invalid view identifier: {view_identifier}")
-            return None
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=database,
-            default_schema=schema,
-        )
-        view_urn = make_dataset_urn_with_platform_instance(
-            self.platform,
-            view_identifier,
-            self.config.platform_instance,
-            self.config.env,
-        )
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_identifier}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-            if raw_lineage.out_tables != [view_urn]:
-                self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
+    @property
+    def include_lineage(self):
+        return self.config.include_view_lineage

     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
         database, schema, _view = dataset_identifier.split(".", 2)
@@ -1411,5 +1347,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             schema=schema, table=table, partition=partition, custom_sql=custom_sql
         )

+    def get_schema_resolver(self) -> SchemaResolver:
+        return self.aggregator._schema_resolver
+
     def get_report(self):
         return self.report
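Taken together, the `sql_common.py` hunks replace the `SqlParsingBuilder` / `_view_definition_cache` flow with a `SqlParsingAggregator` that is fed during scanning and drained at the end of the run, with lineage generation gated by the new `include_lineage` property (a proxy for `config.include_view_lineage`). A condensed sketch of that lifecycle, using only the aggregator calls that appear in these hunks; the urns, example schema, and SQL are illustrative placeholders, not values from the diff:

```python
# Sketch of the new SqlParsingAggregator flow in SQLAlchemySource (assumed
# placeholders throughout; only the calls shown in the hunks above are used).
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    NumberTypeClass,
    OtherSchemaClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemaMetadataClass,
)
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

table_urn = make_dataset_urn("hive", "db.schema.events", "PROD")
view_urn = make_dataset_urn("hive", "db.schema.events_view", "PROD")
location_urn = make_dataset_urn("s3", "my-bucket/events", "PROD")

aggregator = SqlParsingAggregator(
    platform="hive",
    platform_instance=None,
    env="PROD",
    graph=None,  # a DataHubGraph can be passed for graph-backed schema lookups
    generate_lineage=True,
    generate_usage_statistics=False,
    generate_operations=False,
    eager_graph_load=False,
)

# 1. Register schemas as tables and views are scanned.
schema_metadata = SchemaMetadataClass(
    schemaName="db.schema.events",
    platform=make_data_platform_urn("hive"),
    version=0,
    hash="",
    platformSchema=OtherSchemaClass(rawSchema=""),
    fields=[
        SchemaFieldClass(
            fieldPath="id",
            type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
            nativeDataType="bigint",
        )
    ],
)
aggregator.register_schema(table_urn, schema_metadata)

# 2. Hand view definitions to the aggregator; parsing now happens inside
#    gen_metadata() instead of the removed get_view_lineage() method.
aggregator.add_view_definition(
    view_urn=view_urn,
    view_definition="CREATE VIEW events_view AS SELECT id FROM db.schema.events",
    default_db="db",
    default_schema="schema",
)

# 3. Record copy lineage from a storage location instead of emitting an
#    UpstreamLineage aspect directly.
aggregator.add_known_lineage_mapping(
    upstream_urn=location_urn,
    downstream_urn=table_urn,
    lineage_type=DatasetLineageTypeClass.COPY,
)

# 4. Drain the aggregator into workunits at the end of the run.
workunits = [mcp.as_workunit() for mcp in aggregator.gen_metadata()]
```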
datahub/ingestion/source/sql/sql_generic_profiler.py

@@ -7,7 +7,10 @@ from typing import Dict, Iterable, List, Optional, Union, cast
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.reflection import Inspector

-from datahub.emitter.mce_builder import
+from datahub.emitter.mce_builder import (
+    make_dataset_urn_with_platform_instance,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.ge_data_profiler import (
@@ -245,11 +248,7 @@ class GenericProfiler:
                 # If profiling state exists we have to carry over to the new state
                 self.state_handler.add_to_state(dataset_urn, last_profiled)

-        threshold_time: Optional[datetime] = (
-            datetime.fromtimestamp(last_profiled / 1000, timezone.utc)
-            if last_profiled
-            else None
-        )
+        threshold_time: Optional[datetime] = parse_ts_millis(last_profiled)
         if (
             not threshold_time
             and self.config.profiling.profile_if_updated_since_days is not None
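The removed inline expression shows the behavior `parse_ts_millis` is expected to encapsulate: convert an epoch-milliseconds value to an aware UTC `datetime`, passing `None` (or 0) through as `None`. A rough stand-in inferred from that removed code, not the helper's actual implementation:

```python
# Sketch of the behavior inferred from the removed inline expression above;
# this is not the real parse_ts_millis implementation from mce_builder.
from datetime import datetime, timezone
from typing import Optional


def parse_ts_millis_sketch(ts_millis: Optional[float]) -> Optional[datetime]:
    # Epoch milliseconds -> aware UTC datetime; falsy input passes through as None.
    if not ts_millis:
        return None
    return datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc)


assert parse_ts_millis_sketch(None) is None
assert parse_ts_millis_sketch(1735689600000) == datetime(2025, 1, 1, tzinfo=timezone.utc)
```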
datahub/ingestion/source/sql/sql_report.py

@@ -5,6 +5,7 @@ from datahub.ingestion.glossary.classification_mixin import ClassificationReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
@@ -52,6 +53,7 @@ class SQLSourceReport(
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+    sql_aggregator: Optional[SqlAggregatorReport] = None

     def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
         """
datahub/ingestion/source/state/checkpoint.py

@@ -12,6 +12,7 @@ from typing import Callable, Generic, Optional, Type, TypeVar
 import pydantic

 from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.metadata.schema_classes import (
     DatahubIngestionCheckpointClass,
     IngestionCheckpointStateClass,
@@ -144,7 +145,7 @@ class Checkpoint(Generic[StateType]):
             )
             logger.info(
                 f"Successfully constructed last checkpoint state for job {job_name} "
-                f"with timestamp {
+                f"with timestamp {parse_ts_millis(checkpoint_aspect.timestampMillis)}"
             )
             return checkpoint
         return None