acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 import logging
 from functools import partial
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from lark import Token, Tree

@@ -58,7 +58,7 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
 if isinstance(node, Token):
 return None

-for child in
+for child in node.children:
 child_node: Optional[Tree] = internal(child)
 if child_node is not None:
 return child_node

@@ -99,7 +99,7 @@ def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
 logger.debug(f"Unable to resolve parameter reference to {ref}")
 values.append(ref)
 elif isinstance(node, Token):
-values.append(
+values.append(node.value)
 return
 else:
 for child in node.children:
@@ -1,7 +1,7 @@
 import logging
 from typing import Optional, Tuple

-
+import datahub.ingestion.source.powerbi.m_query.data_classes

 logger = logging.getLogger(__name__)

@@ -14,12 +14,18 @@ def validate_parse_tree(
 :param native_query_enabled: Whether user want to extract lineage from native query
 :return: True or False.
 """
-function_names = [
+function_names = [
+fun.value
+for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName
+]
 if not any(fun in expression for fun in function_names):
 return False, "DataAccess function is not present in M-Query expression."

 if native_query_enabled is False:
-if
+if (
+datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value
+in function_names
+):
 return (
 False,
 "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe",
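The validator change above swaps a hard-coded list of data-access functions for the FunctionName enum that now lives in m_query/data_classes.py. A minimal standalone sketch of that validation pattern follows; the enum members and values here are illustrative placeholders, not the actual DataHub definitions, and the helper name is made up for the sketch.

from enum import Enum
from typing import Tuple


# Illustrative stand-in for datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName;
# member names and values below are assumptions for this sketch.
class FunctionName(Enum):
    NATIVE_QUERY = "Value.NativeQuery"
    POSTGRESQL_DATA_ACCESS = "PostgreSql.Database"


def validate_expression(expression: str, native_query_enabled: bool) -> Tuple[bool, str]:
    # Build the list of known data-access function names from the enum values.
    function_names = [fun.value for fun in FunctionName]
    if not any(fun in expression for fun in function_names):
        return False, "DataAccess function is not present in M-Query expression."
    # Reject native-query expressions when native query parsing is disabled.
    if not native_query_enabled and FunctionName.NATIVE_QUERY.value in expression:
        return False, "Lineage extraction from native query is disabled."
    return True, ""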
@@ -10,6 +10,7 @@ from typing import Iterable, List, Optional, Tuple, Union
 import more_itertools

 import datahub.emitter.mce_builder as builder
+import datahub.ingestion.source.powerbi.m_query.data_classes
 import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers

@@ -42,12 +43,13 @@ from datahub.ingestion.source.powerbi.config import (
 Constant,
 PowerBiDashboardSourceConfig,
 PowerBiDashboardSourceReport,
+SupportedDataPlatform,
 )
 from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
 AbstractDataPlatformInstanceResolver,
 create_dataplatform_instance_resolver,
 )
-from datahub.ingestion.source.powerbi.m_query import parser
+from datahub.ingestion.source.powerbi.m_query import parser
 from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
 StaleEntityRemovalHandler,

@@ -182,7 +184,9 @@ class Mapper:
 return [schema_mcp]

 def make_fine_grained_lineage_class(
-self,
+self,
+lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage,
+dataset_urn: str,
 ) -> List[FineGrainedLineage]:
 fine_grained_lineages: List[FineGrainedLineage] = []

@@ -234,7 +238,9 @@ class Mapper:
 upstream: List[UpstreamClass] = []
 cll_lineage: List[FineGrainedLineage] = []

-upstream_lineage: List[
+upstream_lineage: List[
+datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+] = parser.get_upstream_tables(
 table=table,
 reporter=self.__reporter,
 platform_instance_resolver=self.__dataplatform_instance_resolver,

@@ -1294,7 +1300,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 def validate_dataset_type_mapping(self):
 powerbi_data_platforms: List[str] = [
 data_platform.value.powerbi_data_platform_name
-for data_platform in
+for data_platform in SupportedDataPlatform
 ]

 for key in self.source_config.dataset_type_mapping.keys():

@@ -1481,7 +1487,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

 def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
 # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id
-# This will result in creating checkpoint for each workspace_id
+# This will result in creating a checkpoint for each workspace_id
 if self.source_config.modified_since:
 return [] # Handle these in get_workunits_internal
 else:

@@ -1492,7 +1498,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

 def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
 """
-Datahub Ingestion framework
+Datahub Ingestion framework invokes this method
 """
 logger.info("PowerBi plugin execution is started")
 # Validate dataset type mapping
@@ -78,8 +78,27 @@ class PulsarSchema:
 def __init__(self, schema):
 self.schema_version = schema.get("version")

-
-
+schema_data = schema.get("data")
+if not schema_data:
+logger.warning("Schema data is empty or None. Using default empty schema.")
+schema_data = "{}"
+
+try:
+avro_schema = json.loads(schema_data)
+except json.JSONDecodeError as e:
+logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+avro_schema = {}
+
+self.schema_name = "null"
+if avro_schema.get("namespace") and avro_schema.get("name"):
+self.schema_name = (
+avro_schema.get("namespace") + "." + avro_schema.get("name")
+)
+elif avro_schema.get("namespace"):
+self.schema_name = avro_schema.get("namespace")
+elif avro_schema.get("name"):
+self.schema_name = avro_schema.get("name")
+
 self.schema_description = avro_schema.get("doc")
 self.schema_type = schema.get("type")
 self.schema_str = schema.get("data")
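The PulsarSchema change above hardens schema-name derivation against missing or invalid schema payloads. A small self-contained sketch of the same fallback order, with made-up sample payloads:

import json


def derive_schema_name(schema: dict) -> str:
    # Mirrors the fallback logic added to PulsarSchema.__init__:
    # tolerate missing or invalid schema data and missing namespace/name fields.
    schema_data = schema.get("data") or "{}"
    try:
        avro_schema = json.loads(schema_data)
    except json.JSONDecodeError:
        avro_schema = {}

    if avro_schema.get("namespace") and avro_schema.get("name"):
        return avro_schema["namespace"] + "." + avro_schema["name"]
    if avro_schema.get("namespace"):
        return avro_schema["namespace"]
    if avro_schema.get("name"):
        return avro_schema["name"]
    return "null"


print(derive_schema_name({"data": '{"namespace": "acme.events", "name": "Order"}'}))  # acme.events.Order
print(derive_schema_name({"data": "not-json"}))                                       # null
print(derive_schema_name({}))                                                         # null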
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional, Set

 import dateutil.parser as dp
 from packaging import version

@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import ( # SourceCapability,; capability,
 platform_name,
 support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (

@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
 ChartTypeClass,
 DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)

@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
 parse_table_names_from_sql: bool = Field(
 default=False, description="See note below."
 )
-sql_parser: str = Field(
-default="datahub.utilities.sql_parser.DefaultSQLParser",
-description="custom SQL parser. See note below for details.",
-)

 env: str = Field(
 default=DEFAULT_ENV,

@@ -354,7 +349,6 @@ class RedashSource(Source):
 self.api_page_limit = self.config.api_page_limit or math.inf

 self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-self.sql_parser_path = self.config.sql_parser

 logger.info(
 f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"

@@ -380,31 +374,6 @@ class RedashSource(Source):
 config = RedashConfig.parse_obj(config_dict)
 return cls(ctx, config)

-@classmethod
-def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-assert "." in sql_parser_path, "sql_parser-path must contain a ."
-parser_cls = import_path(sql_parser_path)
-
-if not issubclass(parser_cls, SQLParser):
-raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-return parser_cls
-
-@classmethod
-def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-try:
-sql_table_names: List[str] = parser_cls(sql).get_tables()
-except Exception as e:
-logger.warning(f"Sql parser failed on {sql} with {e}")
-return []
-
-# Remove quotes from table names
-sql_table_names = [t.replace('"', "") for t in sql_table_names]
-sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-return sql_table_names
-
 def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
 url = f"/api/data_sources/{data_source_id}"
 resp = self.client._get(url).json()

@@ -441,14 +410,6 @@ class RedashSource(Source):

 return database_name

-def _construct_datalineage_urn(
-self, platform: str, database_name: str, sql_table_name: str
-) -> str:
-full_dataset_name = get_full_qualified_name(
-platform, database_name, sql_table_name
-)
-return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
 def _get_datasource_urns(
 self, data_source: Dict, sql_query_data: Dict = {}
 ) -> Optional[List[str]]:

@@ -464,34 +425,23 @@
 # Getting table lineage from SQL parsing
 if self.parse_table_names_from_sql and data_source_syntax == "sql":
 dataset_urns = list()
-
-
-
-
-
+sql_parser_in_tables = create_lineage_sql_parsed_result(
+query=query,
+platform=platform,
+env=self.config.env,
+platform_instance=None,
+default_db=database_name,
+)
+# make sure dataset_urns is not empty list
+dataset_urns = sql_parser_in_tables.in_tables
+if sql_parser_in_tables.debug_info.table_error:
 self.report.queries_problem_parsing.add(str(query_id))
 self.error(
 logger,
 "sql-parsing",
-f"exception {
+f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
 )
-sql_table_names = []
-for sql_table_name in sql_table_names:
-try:
-dataset_urns.append(
-self._construct_datalineage_urn(
-platform, database_name, sql_table_name
-)
-)
-except Exception:
-self.report.queries_problem_parsing.add(str(query_id))
-self.warn(
-logger,
-"data-urn-invalid",
-f"Problem making URN for {sql_table_name} parsed from query {query_id}",
-)

-# make sure dataset_urns is not empty list
 return dataset_urns if len(dataset_urns) > 0 else None

 else:
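The removals above drop the pluggable SQLParser machinery from the Redash source; table lineage now goes through DataHub's sqlglot-based parser. A rough usage sketch of the call as it appears in the new code, with made-up query, platform, and database names:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="SELECT o.id, c.name FROM orders o JOIN customers c ON o.customer_id = c.id",
    platform="postgres",      # example platform
    env="PROD",
    platform_instance=None,
    default_db="analytics",   # example default database
)

# Upstream dataset URNs parsed from the query.
print(result.in_tables)
# Parse failures surface on debug_info instead of raising.
if result.debug_info.table_error:
    print(f"table-level parsing failed: {result.debug_info.table_error}")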
@@ -159,6 +159,7 @@ class RedshiftConfig(
 description="Whether to extract column level lineage. This config works with rest-sink only.",
 )

+# TODO - use DatasetPropertiesConfigMixin instead
 patch_custom_properties: bool = Field(
 default=True,
 description="Whether to patch custom properties on existing datasets rather than replace.",

@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 ```
 """

+# TODO: Replace with standardized types in sql_types.py
 REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
 str,
 Type[

@@ -830,6 +831,8 @@
 customProperties=custom_properties,
 )
 if self.config.patch_custom_properties:
+# TODO: use auto_incremental_properties workunit processor instead
+# Deprecate use of patch_custom_properties
 patch_builder = create_dataset_props_patch_builder(
 dataset_urn, dataset_properties
 )
@@ -9,6 +9,7 @@ from datetime import datetime
 from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse

 import smart_open.compression as so_compression
 from more_itertools import peekable

@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
 folders = []
 for dir in dirs_to_process:
 logger.info(f"Getting files from folder: {dir}")
-prefix_to_process = dir.
-self.create_s3_path(bucket_name, "/")
-)
+prefix_to_process = urlparse(dir).path.lstrip("/")

 folders.extend(
 self.get_folder_info(
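The S3 change above derives the folder prefix with the standard library's urlparse instead of stripping the constructed S3 path by hand. For a made-up URI the behaviour is:

from urllib.parse import urlparse

uri = "s3://my-bucket/raw/events/2024/"  # example URI, not taken from the source
# urlparse puts the bucket in netloc and the key in path, so stripping the
# leading "/" yields the key prefix to list.
prefix_to_process = urlparse(uri).path.lstrip("/")
print(prefix_to_process)  # raw/events/2024/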
@@ -4,7 +4,12 @@ from typing import Dict, Iterable, List, Optional
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigurationError
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import
+from datahub.emitter.mcp_builder import (
+add_entity_to_container,
+add_owner_to_entity_wu,
+add_tags_to_entity_wu,
+gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
 SourceCapability,

@@ -59,12 +64,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
 UpstreamLineage,
 )
 from datahub.metadata.schema_classes import (
+AuditStampClass,
 BrowsePathEntryClass,
 BrowsePathsV2Class,
 ChangeAuditStampsClass,
 ChartInfoClass,
 DashboardInfoClass,
 DataPlatformInstanceClass,
+EdgeClass,
 GlobalTagsClass,
 InputFieldClass,
 InputFieldsClass,

@@ -74,6 +81,7 @@ from datahub.metadata.schema_classes import (
 SchemaFieldClass,
 SchemaFieldDataTypeClass,
 StringTypeClass,
+SubTypesClass,
 TagAssociationClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

@@ -257,11 +265,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 entries = [
 BrowsePathEntryClass(id=parent_entity_urn, urn=parent_entity_urn)
 ] + [BrowsePathEntryClass(id=path) for path in paths]
-if self.config.platform_instance:
-urn = builder.make_dataplatform_instance_urn(
-self.platform, self.config.platform_instance
-)
-entries = [BrowsePathEntryClass(id=urn, urn=urn)] + entries
 return MetadataChangeProposalWrapper(
 entityUrn=entity_urn,
 aspect=BrowsePathsV2Class(entries),

@@ -424,11 +427,11 @@
 elements: List[Element],
 workbook: Workbook,
 all_input_fields: List[InputFieldClass],
+paths: List[str],
 ) -> Iterable[MetadataWorkUnit]:
 """
 Map Sigma page element to Datahub Chart
 """
-
 for element in elements:
 chart_urn = builder.make_chart_urn(
 platform=self.platform,

@@ -459,11 +462,14 @@
 ),
 ).as_workunit()

-
-
-
-
-
+if workbook.workspaceId:
+yield self._gen_entity_browsepath_aspect(
+entity_urn=chart_urn,
+parent_entity_urn=builder.make_container_urn(
+self._gen_workspace_key(workbook.workspaceId)
+),
+paths=paths + [workbook.name],
+)

 # Add sigma dataset's upstream dataset urn mapping
 for dataset_urn, upstream_dataset_urns in inputs.items():

@@ -494,7 +500,9 @@

 all_input_fields.extend(element_input_fields)

-def _gen_pages_workunit(
+def _gen_pages_workunit(
+self, workbook: Workbook, paths: List[str]
+) -> Iterable[MetadataWorkUnit]:
 """
 Map Sigma workbook page to Datahub dashboard
 """

@@ -505,20 +513,23 @@

 yield self._gen_dashboard_info_workunit(page)

-yield from add_entity_to_container(
-container_key=self._gen_workbook_key(workbook.workbookId),
-entity_type="dashboard",
-entity_urn=dashboard_urn,
-)
-
 dpi_aspect = self._gen_dataplatform_instance_aspect(dashboard_urn)
 if dpi_aspect:
 yield dpi_aspect

 all_input_fields: List[InputFieldClass] = []

+if workbook.workspaceId:
+yield self._gen_entity_browsepath_aspect(
+entity_urn=dashboard_urn,
+parent_entity_urn=builder.make_container_urn(
+self._gen_workspace_key(workbook.workspaceId)
+),
+paths=paths + [workbook.name],
+)
+
 yield from self._gen_elements_workunit(
-page.elements, workbook, all_input_fields
+page.elements, workbook, all_input_fields, paths
 )

 yield MetadataChangeProposalWrapper(

@@ -531,42 +542,89 @@
 Map Sigma Workbook to Datahub container
 """
 owner_username = self.sigma_api.get_user_name(workbook.createdBy)
-
-
-
-
-
-
-
-
-
+
+dashboard_urn = self._gen_dashboard_urn(workbook.workbookId)
+
+yield self._gen_entity_status_aspect(dashboard_urn)
+
+lastModified = AuditStampClass(
+time=int(workbook.updatedAt.timestamp() * 1000),
+actor="urn:li:corpuser:datahub",
+)
+created = AuditStampClass(
+time=int(workbook.createdAt.timestamp() * 1000),
+actor="urn:li:corpuser:datahub",
+)
+
+dashboard_info_cls = DashboardInfoClass(
+title=workbook.name,
+description=workbook.description if workbook.description else "",
+dashboards=[
+EdgeClass(
+destinationUrn=self._gen_dashboard_urn(page.get_urn_part()),
+sourceUrn=dashboard_urn,
+)
+for page in workbook.pages
+],
+externalUrl=workbook.url,
+lastModified=ChangeAuditStampsClass(
+created=created, lastModified=lastModified
 ),
-
+customProperties={
 "path": workbook.path,
 "latestVersion": str(workbook.latestVersion),
 },
-owner_urn=(
-builder.make_user_urn(owner_username)
-if self.config.ingest_owner and owner_username
-else None
-),
-external_url=workbook.url,
-tags=[workbook.badge] if workbook.badge else None,
-created=int(workbook.createdAt.timestamp() * 1000),
-last_modified=int(workbook.updatedAt.timestamp() * 1000),
 )
+yield MetadataChangeProposalWrapper(
+entityUrn=dashboard_urn, aspect=dashboard_info_cls
+).as_workunit()
+
+# Set subtype
+yield MetadataChangeProposalWrapper(
+entityUrn=dashboard_urn,
+aspect=SubTypesClass(typeNames=[BIContainerSubTypes.SIGMA_WORKBOOK]),
+).as_workunit()
+
+# Ownership
+owner_urn = (
+builder.make_user_urn(owner_username)
+if self.config.ingest_owner and owner_username
+else None
+)
+if owner_urn:
+yield from add_owner_to_entity_wu(
+entity_type="dashboard",
+entity_urn=dashboard_urn,
+owner_urn=owner_urn,
+)
+
+# Tags
+tags = [workbook.badge] if workbook.badge else None
+if tags:
+yield from add_tags_to_entity_wu(
+entity_type="dashboard",
+entity_urn=dashboard_urn,
+tags=sorted(tags),
+)

 paths = workbook.path.split("/")[1:]
-if
+if workbook.workspaceId:
 yield self._gen_entity_browsepath_aspect(
-entity_urn=
+entity_urn=dashboard_urn,
 parent_entity_urn=builder.make_container_urn(
 self._gen_workspace_key(workbook.workspaceId)
 ),
-paths=paths,
+paths=paths + [workbook.name],
 )

-
+if len(paths) == 0:
+yield from add_entity_to_container(
+container_key=self._gen_workspace_key(workbook.workspaceId),
+entity_type="dashboard",
+entity_urn=dashboard_urn,
+)
+
+yield from self._gen_pages_workunit(workbook, paths)

 def _gen_sigma_dataset_upstream_lineage_workunit(
 self,
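Net effect of the sigma.py changes: a workbook is now emitted as its own dashboard entity whose pages hang off it as EdgeClass links, with subtype, ownership, and tags emitted as separate aspects. A condensed sketch of the DashboardInfo aspect shape, assuming made-up URNs and titles (not taken from the source):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeAuditStampsClass,
    DashboardInfoClass,
    EdgeClass,
)

workbook_urn = "urn:li:dashboard:(sigma,workbook-123)"       # made-up URN
page_urns = [
    "urn:li:dashboard:(sigma,workbook-123.page-1)",          # made-up URNs
    "urn:li:dashboard:(sigma,workbook-123.page-2)",
]

workbook_info = DashboardInfoClass(
    title="Sales Workbook",  # made-up title
    description="",
    # Each workbook page becomes a dashboard-to-dashboard edge.
    dashboards=[
        EdgeClass(destinationUrn=page_urn, sourceUrn=workbook_urn)
        for page_urn in page_urns
    ],
    lastModified=ChangeAuditStampsClass(),
)

workunit = MetadataChangeProposalWrapper(
    entityUrn=workbook_urn, aspect=workbook_info
).as_workunit()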
@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set

 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator

@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.incremental_properties_helper import (
+IncrementalPropertiesConfigMixin,
+)
 from datahub.ingestion.glossary.classification_mixin import (
 ClassificationSourceConfigMixin,
 )

@@ -118,9 +121,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
 )

 # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-if schema_pattern
+if schema_pattern:
 logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-
+assert isinstance(schema_pattern, AllowDenyPattern)
+schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")

 return values

@@ -187,6 +191,7 @@ class SnowflakeV2Config(
 StatefulUsageConfigMixin,
 StatefulProfilingConfigMixin,
 ClassificationSourceConfigMixin,
+IncrementalPropertiesConfigMixin,
 ):
 include_usage_stats: bool = Field(
 default=True,
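The filter fix above appends an INFORMATION_SCHEMA deny entry whenever a schema_pattern is configured. Its effect can be checked directly against AllowDenyPattern; the schema names below are examples:

from datahub.configuration.common import AllowDenyPattern

schema_pattern = AllowDenyPattern(allow=[".*"], deny=[r".*INFORMATION_SCHEMA$"])

print(schema_pattern.allowed("PUBLIC"))              # True
print(schema_pattern.allowed("INFORMATION_SCHEMA"))  # False, excluded by the deny entry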