acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED
@@ -18,7 +18,6 @@ from pydantic import Field, validator
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import ConnectionError
 from requests.models import HTTPBasicAuth, HTTPError
-from sqllineage.runner import LineageRunner
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
@@ -820,28 +819,6 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return None

-    @lru_cache(maxsize=None)
-    def _get_source_from_query(self, raw_query: str) -> set:
-        query = self._replace_definitions(raw_query)
-        parser = LineageRunner(query)
-        source_paths = set()
-        try:
-            for table in parser.source_tables:
-                sources = str(table).split(".")
-                source_schema, source_table = sources[-2], sources[-1]
-                if source_schema == "<default>":
-                    source_schema = str(self.config.default_schema)
-
-                source_paths.add(f"{source_schema}.{source_table}")
-        except Exception as e:
-            self.report.report_failure(
-                title="Failed to Extract Lineage From Query",
-                message="Unable to retrieve lineage from Mode query.",
-                context=f"Query: {raw_query}, Error: {str(e)}",
-            )
-
-        return source_paths
-
     def _get_datasource_urn(
         self,
         platform: str,
datahub/ingestion/source/neo4j/__init__.py
File without changes

datahub/ingestion/source/neo4j/neo4j_source.py
ADDED
@@ -0,0 +1,331 @@
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import pandas as pd
+from neo4j import GraphDatabase
+from pydantic.fields import Field
+
+from datahub.configuration.source_common import EnvConfigMixin
+from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    BooleanTypeClass,
+    DatasetPropertiesClass,
+    DateTypeClass,
+    NullTypeClass,
+    NumberTypeClass,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+    StringTypeClass,
+    SubTypesClass,
+    UnionTypeClass,
+)
+
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+_type_mapping: Dict[Union[Type, str], Type] = {
+    "list": UnionTypeClass,
+    "boolean": BooleanTypeClass,
+    "integer": NumberTypeClass,
+    "local_date_time": DateTypeClass,
+    "float": NumberTypeClass,
+    "string": StringTypeClass,
+    "date": DateTypeClass,
+    "node": StringTypeClass,
+    "relationship": StringTypeClass,
+}
+
+
+class Neo4jConfig(EnvConfigMixin):
+    username: str = Field(description="Neo4j Username")
+    password: str = Field(description="Neo4j Password")
+    uri: str = Field(description="The URI for the Neo4j server")
+    env: str = Field(description="Neo4j env")
+
+
+@dataclass
+class Neo4jSourceReport(SourceReport):
+    obj_failures: int = 0
+    obj_created: int = 0
+
+
+@platform_name("Neo4j", id="neo4j")
+@config_class(Neo4jConfig)
+@support_status(SupportStatus.CERTIFIED)
+class Neo4jSource(Source):
+    NODE = "node"
+    RELATIONSHIP = "relationship"
+    PLATFORM = "neo4j"
+
+    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = Neo4jSourceReport()
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = Neo4jConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
+        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
+        return SchemaFieldDataType(type=type_class())
+
+    def get_schema_field_class(
+        self, col_name: str, col_type: str, **kwargs: Any
+    ) -> SchemaFieldClass:
+        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
+            col_type = self.NODE
+        else:
+            col_type = col_type
+        return SchemaFieldClass(
+            fieldPath=col_name,
+            type=self.get_field_type(col_type),
+            nativeDataType=col_type,
+            description=col_type.upper()
+            if col_type in (self.NODE, self.RELATIONSHIP)
+            else col_type,
+            lastModified=AuditStampClass(
+                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
+            ),
+        )
+
+    def add_properties(
+        self,
+        dataset: str,
+        description: Optional[str] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> MetadataChangeProposalWrapper:
+        dataset_properties = DatasetPropertiesClass(
+            description=description,
+            customProperties=custom_properties,
+        )
+        return MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(
+                platform=self.PLATFORM, name=dataset, env=self.config.env
+            ),
+            aspect=dataset_properties,
+        )
+
+    def generate_neo4j_object(
+        self, dataset: str, columns: list, obj_type: Optional[str] = None
+    ) -> MetadataChangeProposalWrapper:
+        try:
+            fields = [
+                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+                for d in columns
+                for key, value in d.items()
+            ]
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn(
+                    platform=self.PLATFORM, name=dataset, env=self.config.env
+                ),
+                aspect=SchemaMetadataClass(
+                    schemaName=dataset,
+                    platform=make_data_platform_urn(self.PLATFORM),
+                    version=0,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                    lastModified=AuditStampClass(
+                        time=round(time.time() * 1000),
+                        actor="urn:li:corpuser:ingestion",
+                    ),
+                    fields=fields,
+                ),
+            )
+            self.report.obj_created += 1
+        except Exception as e:
+            log.error(e)
+            self.report.obj_failures += 1
+        return mcp
+
+    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+        driver = GraphDatabase.driver(
+            self.config.uri, auth=(self.config.username, self.config.password)
+        )
+        """
+        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
+        with two columns: key and value. The key represents the Neo4j object, while the value contains the
+        corresponding metadata.
+
+        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
+        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
+        relationships.
+
+        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
+        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+
+        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
+        single dataframe, which will be used to create the DataHub objects.
+
+        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+        """
+        try:
+            log.info(f"{query}")
+            with driver.session() as session:
+                result = session.run(query)
+                data = [record for record in result]
+                log.info("Closing Neo4j driver")
+                driver.close()
+
+                node_df = self.process_nodes(data)
+                rel_df = self.process_relationships(data, node_df)
+
+                union_cols = ["key", "obj_type", "property_data_types", "description"]
+                df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+        except Exception as e:
+            self.report.failure(
+                message="Failed to get neo4j metadata",
+                exc=e,
+            )
+
+        return df
+
+    def process_nodes(self, data: list) -> pd.DataFrame:
+        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        node_df = pd.DataFrame(
+            nodes,
+            columns=["key", "value"],
+        )
+        node_df["obj_type"] = node_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        node_df["relationships"] = node_df["value"].apply(
+            lambda record: self.get_relationships(record)
+        )
+        node_df["properties"] = node_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        node_df["property_data_types"] = node_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        node_df["description"] = node_df.apply(
+            lambda record: self.get_node_description(record, node_df), axis=1
+        )
+        return node_df
+
+    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
+        rels = [
+            record for record in data if record["value"]["type"] == self.RELATIONSHIP
+        ]
+        rel_df = pd.DataFrame(rels, columns=["key", "value"])
+        rel_df["obj_type"] = rel_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        rel_df["properties"] = rel_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        rel_df["property_data_types"] = rel_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        rel_df["description"] = rel_df.apply(
+            lambda record: self.get_rel_descriptions(record, node_df), axis=1
+        )
+        return rel_df
+
+    def get_obj_type(self, record: dict) -> str:
+        return record["type"]
+
+    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            relationships = row.get("relationships", {})
+            for relationship, props in relationships.items():
+                if record["key"] == relationship:
+                    if props["direction"] == "in":
+                        for prop in props["labels"]:
+                            descriptions.append(
+                                f"({row['key']})-[{record['key']}]->({prop})"
+                            )
+        return "\n".join(descriptions)
+
+    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            if record["key"] == row["key"]:
+                for relationship, props in row["relationships"].items():
+                    direction = props["direction"]
+                    for node in set(props["labels"]):
+                        if direction == "in":
+                            descriptions.append(
+                                f"({row['key']})<-[{relationship}]-({node})"
+                            )
+                        elif direction == "out":
+                            descriptions.append(
+                                f"({row['key']})-[{relationship}]->({node})"
+                            )
+
+        return "\n".join(descriptions)
+
+    def get_property_data_types(self, record: dict) -> List[dict]:
+        return [{k: v["type"]} for k, v in record.items()]
+
+    def get_properties(self, record: dict) -> str:
+        return record["properties"]
+
+    def get_relationships(self, record: dict) -> dict:
+        return record.get("relationships", None)
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        df = self.get_neo4j_metadata(
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+        )
+        for index, row in df.iterrows():
+            try:
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.generate_neo4j_object(
+                        columns=row["property_data_types"],
+                        dataset=row["key"],
+                    ),
+                    is_primary_source=True,
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=make_dataset_urn(
+                            platform=self.PLATFORM,
+                            name=row["key"],
+                            env=self.config.env,
+                        ),
+                        aspect=SubTypesClass(
+                            typeNames=[
+                                DatasetSubTypes.NEO4J_NODE
+                                if row["obj_type"] == self.NODE
+                                else DatasetSubTypes.NEO4J_RELATIONSHIP
+                            ]
+                        ),
+                    ),
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.add_properties(
+                        dataset=row["key"],
+                        custom_properties=None,
+                        description=row["description"],
+                    ),
+                )
+
+            except Exception as e:
+                raise e
+
+    def get_report(self):
+        return self.report
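The new source's get_node_description and get_rel_descriptions helpers flatten apoc.meta.schema() relationship metadata into Cypher-style pattern strings that become each dataset's description. A standalone sketch of that flattening, using a hand-built record shaped the way the code above expects (the labels and relationship names are illustrative, not real APOC output):

# Sketch only: mirrors the description-building logic of the new Neo4j source.
sample_node = {
    "key": "Person",
    "relationships": {
        "ACTED_IN": {"direction": "out", "labels": ["Movie"]},
        "FOLLOWS": {"direction": "in", "labels": ["Person"]},
    },
}


def describe(node: dict) -> str:
    descriptions = []
    for relationship, props in node["relationships"].items():
        for other in sorted(set(props["labels"])):
            if props["direction"] == "out":
                descriptions.append(f"({node['key']})-[{relationship}]->({other})")
            elif props["direction"] == "in":
                descriptions.append(f"({node['key']})<-[{relationship}]-({other})")
    return "\n".join(descriptions)


print(describe(sample_node))
# (Person)-[ACTED_IN]->(Movie)
# (Person)<-[FOLLOWS]-(Person)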
datahub/ingestion/source/powerbi/__init__.py
CHANGED
@@ -1 +0,0 @@
-from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -173,7 +173,7 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="redshift",
     )

-
+    DATABRICKS_SQL = DataPlatformPair(
         powerbi_data_platform_name="Databricks", datahub_data_platform_name="databricks"
     )

@@ -313,8 +313,8 @@ class PowerBiDashboardSourceConfig(
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )

-    # Dataset type mapping PowerBI support many type of data-sources. Here user
-    # DataSource
+    # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
+    # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
     dataset_type_mapping: Union[
         Dict[str, str], Dict[str, PlatformDetail]
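The restored comment above describes dataset_type_mapping, which maps PowerBI data-source kinds to DataHub platform names. A minimal sketch of such a mapping, following the examples given in the comment (entries are illustrative):

# Illustrative dataset_type_mapping: PowerBI data-source kind -> DataHub platform name.
dataset_type_mapping = {
    "Snowflake": "snowflake",
    "PostgreSQL": "postgres",
}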
datahub/ingestion/source/powerbi/m_query/data_classes.py
CHANGED
@@ -1,25 +1,18 @@
 import os
-from abc import ABC
 from dataclasses import dataclass
-from
+from enum import Enum
+from typing import Any, Dict, List, Optional

 from lark import Tree

-
-
-
-class AbstractIdentifierAccessor(ABC):  # To pass lint
-    pass
+from datahub.ingestion.source.powerbi.config import DataPlatformPair
+from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo

-
-# @dataclass
-# class ItemSelector:
-#     items: Dict[str, Any]
-#     next: Optional[AbstractIdentifierAccessor]
+TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)


 @dataclass
-class IdentifierAccessor
+class IdentifierAccessor:
     """
     statement
     public_order_date = Source{[Schema="public",Item="order_date"]}[Data]
@@ -30,13 +23,13 @@ class IdentifierAccessor(AbstractIdentifierAccessor):

     "[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource

-    "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e
+    "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e., table

     """

     identifier: str
     items: Dict[str, Any]
-    next: Optional[
+    next: Optional["IdentifierAccessor"]


 @dataclass
@@ -53,3 +46,31 @@ class ReferencedTable:
     database: str
     schema: str
     table: str
+
+
+@dataclass
+class DataPlatformTable:
+    data_platform_pair: DataPlatformPair
+    urn: str
+
+
+@dataclass
+class Lineage:
+    upstreams: List[DataPlatformTable]
+    column_lineage: List[ColumnLineageInfo]
+
+    @staticmethod
+    def empty() -> "Lineage":
+        return Lineage(upstreams=[], column_lineage=[])
+
+
+class FunctionName(Enum):
+    NATIVE_QUERY = "Value.NativeQuery"
+    POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
+    ORACLE_DATA_ACCESS = "Oracle.Database"
+    SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
+    MSSQL_DATA_ACCESS = "Sql.Database"
+    DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
+    GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
+    AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
+    DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
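The newly added Lineage and DataPlatformTable data classes give the M-Query resolver a single return type carrying both upstream tables and column lineage. A minimal usage sketch based on the definitions above (the urn value is a made-up example):

from datahub.ingestion.source.powerbi.config import DataPlatformPair
from datahub.ingestion.source.powerbi.m_query.data_classes import (
    DataPlatformTable,
    Lineage,
)

# No resolvable upstreams: the resolver can return an empty container.
no_lineage = Lineage.empty()

# One upstream table wrapped in the new container; the urn below is illustrative.
lineage = Lineage(
    upstreams=[
        DataPlatformTable(
            data_platform_pair=DataPlatformPair(
                powerbi_data_platform_name="Snowflake",
                datahub_data_platform_name="snowflake",
            ),
            urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
        )
    ],
    column_lineage=[],
)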
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Dict, List
 import lark
 from lark import Lark, Tree

+import datahub.ingestion.source.powerbi.m_query.data_classes
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     PowerBiDashboardSourceConfig,
@@ -65,7 +66,7 @@ def get_upstream_tables(
     ctx: PipelineContext,
     config: PowerBiDashboardSourceConfig,
     parameters: Dict[str, str] = {},
-) -> List[
+) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
     if table.expression is None:
         logger.debug(f"There is no M-Query expression in table {table.full_name}")
         return []
@@ -127,12 +128,14 @@ def get_upstream_tables(
     reporter.m_query_parse_successes += 1

     try:
-        lineage: List[
+        lineage: List[
+            datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+        ] = resolver.MQueryResolver(
             table=table,
             parse_tree=parse_tree,
             reporter=reporter,
             parameters=parameters,
-        ).
+        ).resolve_to_lineage(
             ctx=ctx,
             config=config,
             platform_instance_resolver=platform_instance_resolver,