acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/neo4j/neo4j_source.py (new file)
@@ -0,0 +1,331 @@
+ import logging
+ import time
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+ import pandas as pd
+ from neo4j import GraphDatabase
+ from pydantic.fields import Field
+
+ from datahub.configuration.source_common import EnvConfigMixin
+ from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.decorators import (
+     SupportStatus,
+     config_class,
+     platform_name,
+     support_status,
+ )
+ from datahub.ingestion.api.source import Source, SourceReport
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+ from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
+ from datahub.metadata.schema_classes import (
+     AuditStampClass,
+     BooleanTypeClass,
+     DatasetPropertiesClass,
+     DateTypeClass,
+     NullTypeClass,
+     NumberTypeClass,
+     OtherSchemaClass,
+     SchemaFieldClass,
+     SchemaMetadataClass,
+     StringTypeClass,
+     SubTypesClass,
+     UnionTypeClass,
+ )
+
+ log = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+ _type_mapping: Dict[Union[Type, str], Type] = {
+     "list": UnionTypeClass,
+     "boolean": BooleanTypeClass,
+     "integer": NumberTypeClass,
+     "local_date_time": DateTypeClass,
+     "float": NumberTypeClass,
+     "string": StringTypeClass,
+     "date": DateTypeClass,
+     "node": StringTypeClass,
+     "relationship": StringTypeClass,
+ }
+
+
+ class Neo4jConfig(EnvConfigMixin):
+     username: str = Field(description="Neo4j Username")
+     password: str = Field(description="Neo4j Password")
+     uri: str = Field(description="The URI for the Neo4j server")
+     env: str = Field(description="Neo4j env")
+
+
+ @dataclass
+ class Neo4jSourceReport(SourceReport):
+     obj_failures: int = 0
+     obj_created: int = 0
+
+
+ @platform_name("Neo4j", id="neo4j")
+ @config_class(Neo4jConfig)
+ @support_status(SupportStatus.CERTIFIED)
+ class Neo4jSource(Source):
+     NODE = "node"
+     RELATIONSHIP = "relationship"
+     PLATFORM = "neo4j"
+
+     def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+         self.ctx = ctx
+         self.config = config
+         self.report = Neo4jSourceReport()
+
+     @classmethod
+     def create(cls, config_dict, ctx):
+         config = Neo4jConfig.parse_obj(config_dict)
+         return cls(ctx, config)
+
+     def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
+         type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
+         return SchemaFieldDataType(type=type_class())
+
+     def get_schema_field_class(
+         self, col_name: str, col_type: str, **kwargs: Any
+     ) -> SchemaFieldClass:
+         if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
+             col_type = self.NODE
+         else:
+             col_type = col_type
+         return SchemaFieldClass(
+             fieldPath=col_name,
+             type=self.get_field_type(col_type),
+             nativeDataType=col_type,
+             description=col_type.upper()
+             if col_type in (self.NODE, self.RELATIONSHIP)
+             else col_type,
+             lastModified=AuditStampClass(
+                 time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
+             ),
+         )
+
+     def add_properties(
+         self,
+         dataset: str,
+         description: Optional[str] = None,
+         custom_properties: Optional[Dict[str, str]] = None,
+     ) -> MetadataChangeProposalWrapper:
+         dataset_properties = DatasetPropertiesClass(
+             description=description,
+             customProperties=custom_properties,
+         )
+         return MetadataChangeProposalWrapper(
+             entityUrn=make_dataset_urn(
+                 platform=self.PLATFORM, name=dataset, env=self.config.env
+             ),
+             aspect=dataset_properties,
+         )
+
+     def generate_neo4j_object(
+         self, dataset: str, columns: list, obj_type: Optional[str] = None
+     ) -> MetadataChangeProposalWrapper:
+         try:
+             fields = [
+                 self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+                 for d in columns
+                 for key, value in d.items()
+             ]
+             mcp = MetadataChangeProposalWrapper(
+                 entityUrn=make_dataset_urn(
+                     platform=self.PLATFORM, name=dataset, env=self.config.env
+                 ),
+                 aspect=SchemaMetadataClass(
+                     schemaName=dataset,
+                     platform=make_data_platform_urn(self.PLATFORM),
+                     version=0,
+                     hash="",
+                     platformSchema=OtherSchemaClass(rawSchema=""),
+                     lastModified=AuditStampClass(
+                         time=round(time.time() * 1000),
+                         actor="urn:li:corpuser:ingestion",
+                     ),
+                     fields=fields,
+                 ),
+             )
+             self.report.obj_created += 1
+         except Exception as e:
+             log.error(e)
+             self.report.obj_failures += 1
+         return mcp
+
+     def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+         driver = GraphDatabase.driver(
+             self.config.uri, auth=(self.config.username, self.config.password)
+         )
+         """
+         This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
+         with two columns: key and value. The key represents the Neo4j object, while the value contains the
+         corresponding metadata.
+
+         When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
+         metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
+         relationships.
+
+         In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
+         dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+
+         Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
+         single dataframe, which will be used to create the DataHub objects.
+
+         See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+         """
+         try:
+             log.info(f"{query}")
+             with driver.session() as session:
+                 result = session.run(query)
+                 data = [record for record in result]
+                 log.info("Closing Neo4j driver")
+                 driver.close()
+
+             node_df = self.process_nodes(data)
+             rel_df = self.process_relationships(data, node_df)
+
+             union_cols = ["key", "obj_type", "property_data_types", "description"]
+             df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+         except Exception as e:
+             self.report.failure(
+                 message="Failed to get neo4j metadata",
+                 exc=e,
+             )
+
+         return df
+
+     def process_nodes(self, data: list) -> pd.DataFrame:
+         nodes = [record for record in data if record["value"]["type"] == self.NODE]
+         node_df = pd.DataFrame(
+             nodes,
+             columns=["key", "value"],
+         )
+         node_df["obj_type"] = node_df["value"].apply(
+             lambda record: self.get_obj_type(record)
+         )
+         node_df["relationships"] = node_df["value"].apply(
+             lambda record: self.get_relationships(record)
+         )
+         node_df["properties"] = node_df["value"].apply(
+             lambda record: self.get_properties(record)
+         )
+         node_df["property_data_types"] = node_df["properties"].apply(
+             lambda record: self.get_property_data_types(record)
+         )
+         node_df["description"] = node_df.apply(
+             lambda record: self.get_node_description(record, node_df), axis=1
+         )
+         return node_df
+
+     def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
+         rels = [
+             record for record in data if record["value"]["type"] == self.RELATIONSHIP
+         ]
+         rel_df = pd.DataFrame(rels, columns=["key", "value"])
+         rel_df["obj_type"] = rel_df["value"].apply(
+             lambda record: self.get_obj_type(record)
+         )
+         rel_df["properties"] = rel_df["value"].apply(
+             lambda record: self.get_properties(record)
+         )
+         rel_df["property_data_types"] = rel_df["properties"].apply(
+             lambda record: self.get_property_data_types(record)
+         )
+         rel_df["description"] = rel_df.apply(
+             lambda record: self.get_rel_descriptions(record, node_df), axis=1
+         )
+         return rel_df
+
+     def get_obj_type(self, record: dict) -> str:
+         return record["type"]
+
+     def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
+         descriptions = []
+         for _, row in df.iterrows():
+             relationships = row.get("relationships", {})
+             for relationship, props in relationships.items():
+                 if record["key"] == relationship:
+                     if props["direction"] == "in":
+                         for prop in props["labels"]:
+                             descriptions.append(
+                                 f"({row['key']})-[{record['key']}]->({prop})"
+                             )
+         return "\n".join(descriptions)
+
+     def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
+         descriptions = []
+         for _, row in df.iterrows():
+             if record["key"] == row["key"]:
+                 for relationship, props in row["relationships"].items():
+                     direction = props["direction"]
+                     for node in set(props["labels"]):
+                         if direction == "in":
+                             descriptions.append(
+                                 f"({row['key']})<-[{relationship}]-({node})"
+                             )
+                         elif direction == "out":
+                             descriptions.append(
+                                 f"({row['key']})-[{relationship}]->({node})"
+                             )
+
+         return "\n".join(descriptions)
+
+     def get_property_data_types(self, record: dict) -> List[dict]:
+         return [{k: v["type"]} for k, v in record.items()]
+
+     def get_properties(self, record: dict) -> str:
+         return record["properties"]
+
+     def get_relationships(self, record: dict) -> dict:
+         return record.get("relationships", None)
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         df = self.get_neo4j_metadata(
+             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+         )
+         for index, row in df.iterrows():
+             try:
+                 yield MetadataWorkUnit(
+                     id=row["key"],
+                     mcp=self.generate_neo4j_object(
+                         columns=row["property_data_types"],
+                         dataset=row["key"],
+                     ),
+                     is_primary_source=True,
+                 )
+
+                 yield MetadataWorkUnit(
+                     id=row["key"],
+                     mcp=MetadataChangeProposalWrapper(
+                         entityUrn=make_dataset_urn(
+                             platform=self.PLATFORM,
+                             name=row["key"],
+                             env=self.config.env,
+                         ),
+                         aspect=SubTypesClass(
+                             typeNames=[
+                                 DatasetSubTypes.NEO4J_NODE
+                                 if row["obj_type"] == self.NODE
+                                 else DatasetSubTypes.NEO4J_RELATIONSHIP
+                             ]
+                         ),
+                     ),
+                 )
+
+                 yield MetadataWorkUnit(
+                     id=row["key"],
+                     mcp=self.add_properties(
+                         dataset=row["key"],
+                         custom_properties=None,
+                         description=row["description"],
+                     ),
+                 )
+
+             except Exception as e:
+                 raise e
+
+     def get_report(self):
+         return self.report
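
The new Neo4j source reads the output of apoc.meta.schema() into node and relationship dataframes and emits schema, subtype, and dataset-properties aspects per key. For orientation, here is a minimal, hypothetical sketch (not part of the release) of driving the source directly; it only uses names visible in the diff above (Neo4jConfig, Neo4jSource.create, get_workunits_internal), the connection settings and run_id are placeholders, and a reachable Neo4j instance with APOC installed is assumed.

```python
# Hypothetical sketch: exercise the new Neo4j source outside a full ingestion recipe.
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.neo4j.neo4j_source import Neo4jSource

config_dict = {
    "uri": "bolt://localhost:7687",  # placeholder connection details
    "username": "neo4j",
    "password": "example-password",
    "env": "PROD",
}

ctx = PipelineContext(run_id="neo4j-smoke-test")  # placeholder run_id
source = Neo4jSource.create(config_dict, ctx)

# Each work unit wraps an MCP for the schema, subtype, or dataset-properties aspect.
for wu in source.get_workunits_internal():
    print(wu.id)

print(source.get_report())
```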
datahub/ingestion/source/powerbi/__init__.py
@@ -1 +0,0 @@
- from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource
datahub/ingestion/source/powerbi/config.py
@@ -173,7 +173,7 @@ class SupportedDataPlatform(Enum):
          datahub_data_platform_name="redshift",
      )

-     DATABRICK_SQL = DataPlatformPair(
+     DATABRICKS_SQL = DataPlatformPair(
          powerbi_data_platform_name="Databricks", datahub_data_platform_name="databricks"
      )

@@ -313,8 +313,8 @@ class PowerBiDashboardSourceConfig(
          " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
      )

-     # Dataset type mapping PowerBI support many type of data-sources. Here user need to define what type of PowerBI
-     # DataSource need to be mapped to corresponding DataHub Platform DataSource. For example PowerBI `Snowflake` is
+     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
+     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
      # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
      dataset_type_mapping: Union[
          Dict[str, str], Dict[str, PlatformDetail]
datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -1,25 +1,18 @@
  import os
- from abc import ABC
  from dataclasses import dataclass
- from typing import Any, Dict, Optional
+ from enum import Enum
+ from typing import Any, Dict, List, Optional

  from lark import Tree

- TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)
-
-
- class AbstractIdentifierAccessor(ABC):  # To pass lint
-     pass
+ from datahub.ingestion.source.powerbi.config import DataPlatformPair
+ from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo

-
- # @dataclass
- # class ItemSelector:
- #     items: Dict[str, Any]
- #     next: Optional[AbstractIdentifierAccessor]
+ TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)


  @dataclass
- class IdentifierAccessor(AbstractIdentifierAccessor):
+ class IdentifierAccessor:
      """
      statement
      public_order_date = Source{[Schema="public",Item="order_date"]}[Data]
@@ -30,13 +23,13 @@ class IdentifierAccessor(AbstractIdentifierAccessor):

      "[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource

-     "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e. table
+     "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e., table

      """

      identifier: str
      items: Dict[str, Any]
-     next: Optional[AbstractIdentifierAccessor]
+     next: Optional["IdentifierAccessor"]


  @dataclass
@@ -53,3 +46,31 @@ class ReferencedTable:
      database: str
      schema: str
      table: str
+
+
+ @dataclass
+ class DataPlatformTable:
+     data_platform_pair: DataPlatformPair
+     urn: str
+
+
+ @dataclass
+ class Lineage:
+     upstreams: List[DataPlatformTable]
+     column_lineage: List[ColumnLineageInfo]
+
+     @staticmethod
+     def empty() -> "Lineage":
+         return Lineage(upstreams=[], column_lineage=[])
+
+
+ class FunctionName(Enum):
+     NATIVE_QUERY = "Value.NativeQuery"
+     POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
+     ORACLE_DATA_ACCESS = "Oracle.Database"
+     SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
+     MSSQL_DATA_ACCESS = "Sql.Database"
+     DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
+     GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
+     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
+     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
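
The new DataPlatformTable, Lineage, and FunctionName classes become the shared return types of M-Query resolution (see the parser.py changes below). A short hedged sketch, not part of the release, constructing them by hand using only the fields shown in this diff; the dataset urn is illustrative:

```python
# Hypothetical sketch using the new data classes; the urn below is illustrative only.
from datahub.ingestion.source.powerbi.config import DataPlatformPair
from datahub.ingestion.source.powerbi.m_query.data_classes import (
    DataPlatformTable,
    FunctionName,
    Lineage,
)

snowflake = DataPlatformPair(
    powerbi_data_platform_name="Snowflake",
    datahub_data_platform_name="snowflake",
)

lineage = Lineage(
    upstreams=[
        DataPlatformTable(
            data_platform_pair=snowflake,
            urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
        )
    ],
    column_lineage=[],  # column-level lineage omitted in this sketch
)

assert Lineage.empty().upstreams == []
print(FunctionName.SNOWFLAKE_DATA_ACCESS.value)  # "Snowflake.Databases"
```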
datahub/ingestion/source/powerbi/m_query/parser.py
@@ -7,6 +7,7 @@ from typing import Dict, List
  import lark
  from lark import Lark, Tree

+ import datahub.ingestion.source.powerbi.m_query.data_classes
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.source.powerbi.config import (
      PowerBiDashboardSourceConfig,
@@ -65,7 +66,7 @@ def get_upstream_tables(
      ctx: PipelineContext,
      config: PowerBiDashboardSourceConfig,
      parameters: Dict[str, str] = {},
- ) -> List[resolver.Lineage]:
+ ) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
      if table.expression is None:
          logger.debug(f"There is no M-Query expression in table {table.full_name}")
          return []
@@ -127,12 +128,14 @@ def get_upstream_tables(
      reporter.m_query_parse_successes += 1

      try:
-         lineage: List[resolver.Lineage] = resolver.MQueryResolver(
+         lineage: List[
+             datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+         ] = resolver.MQueryResolver(
              table=table,
              parse_tree=parse_tree,
              reporter=reporter,
              parameters=parameters,
-         ).resolve_to_data_platform_table_list(
+         ).resolve_to_lineage(
              ctx=ctx,
              config=config,
              platform_instance_resolver=platform_instance_resolver,
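
With resolve_to_data_platform_table_list renamed to resolve_to_lineage, get_upstream_tables now returns a list of the Lineage objects defined in data_classes.py. A hedged sketch of a downstream helper that flattens upstream urns out of that result; lineages is a stand-in for whatever a real call to get_upstream_tables returns:

```python
# Hypothetical helper for consuming the List[Lineage] returned by get_upstream_tables();
# `lineages` is a stand-in for a real parser result.
from typing import List

from datahub.ingestion.source.powerbi.m_query.data_classes import Lineage


def upstream_urns(lineages: List[Lineage]) -> List[str]:
    # Collect the urn of every upstream DataPlatformTable across all lineages.
    return [table.urn for lineage in lineages for table in lineage.upstreams]


print(upstream_urns([Lineage.empty()]))  # []
```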