acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/tree_function.py

@@ -1,6 +1,6 @@
 import logging
 from functools import partial
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from lark import Token, Tree

@@ -58,7 +58,7 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
         if isinstance(node, Token):
             return None

-        for child in
+        for child in node.children:
             child_node: Optional[Tree] = internal(child)
             if child_node is not None:
                 return child_node
@@ -99,7 +99,7 @@ def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
                 logger.debug(f"Unable to resolve parameter reference to {ref}")
                 values.append(ref)
         elif isinstance(node, Token):
-            values.append(
+            values.append(node.value)
             return
         else:
             for child in node.children:
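For readers unfamiliar with how these helpers walk the Lark parse tree, here is a minimal standalone sketch of the repaired traversal (`for child in node.children:`). It uses only the public `lark` `Tree`/`Token` API and a hand-built tree; the rule and token names are illustrative, not taken from the DataHub M-Query grammar.

```python
from typing import Optional, Union

from lark import Token, Tree


def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
    """Depth-first search for the first subtree whose rule name matches `rule`."""

    def internal(node: Union[Tree, Token]) -> Optional[Tree]:
        if isinstance(node, Tree) and node.data == rule:
            return node
        if isinstance(node, Token):
            return None  # tokens are leaves; nothing below them
        for child in node.children:
            child_node = internal(child)
            if child_node is not None:
                return child_node
        return None

    return internal(tree)


# Hand-built tree: root -> (token, invocation -> token)
tree = Tree("let_expression", [Token("NAME", "Source"), Tree("invocation", [Token("NAME", "Sql.Database")])])
print(get_first_rule(tree, "invocation"))  # Tree('invocation', [Token('NAME', 'Sql.Database')])
```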
datahub/ingestion/source/powerbi/m_query/validator.py

@@ -1,7 +1,7 @@
 import logging
 from typing import Optional, Tuple

-
+import datahub.ingestion.source.powerbi.m_query.data_classes

 logger = logging.getLogger(__name__)

@@ -14,12 +14,18 @@ def validate_parse_tree(
     :param native_query_enabled: Whether user want to extract lineage from native query
     :return: True or False.
     """
-    function_names = [
+    function_names = [
+        fun.value
+        for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName
+    ]
     if not any(fun in expression for fun in function_names):
         return False, "DataAccess function is not present in M-Query expression."

     if native_query_enabled is False:
-        if
+        if (
+            datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value
+            in function_names
+        ):
             return (
                 False,
                 "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe",
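The validator now derives the list of recognized data-access functions from the `FunctionName` enum in `m_query/data_classes.py` rather than a hard-coded list. A minimal sketch of that pattern follows; the enum members here are illustrative placeholders, not the actual members of the DataHub enum.

```python
from enum import Enum


class FunctionName(Enum):
    # Placeholder members for illustration only; the real enum lives in
    # datahub/ingestion/source/powerbi/m_query/data_classes.py.
    NATIVE_QUERY = "Value.NativeQuery"
    SQL_DATABASE = "Sql.Database"


def has_data_access_function(expression: str) -> bool:
    function_names = [fun.value for fun in FunctionName]
    return any(fun in expression for fun in function_names)


print(has_data_access_function('Source = Sql.Database("server", "db")'))  # True
print(has_data_access_function("Source = Table.FromRows(...)"))  # False
```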
datahub/ingestion/source/powerbi/powerbi.py

@@ -10,6 +10,7 @@ from typing import Iterable, List, Optional, Tuple, Union
 import more_itertools

 import datahub.emitter.mce_builder as builder
+import datahub.ingestion.source.powerbi.m_query.data_classes
 import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
@@ -42,12 +43,13 @@ from datahub.ingestion.source.powerbi.config import (
     Constant,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
+    SupportedDataPlatform,
 )
 from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
     AbstractDataPlatformInstanceResolver,
     create_dataplatform_instance_resolver,
 )
-from datahub.ingestion.source.powerbi.m_query import parser
+from datahub.ingestion.source.powerbi.m_query import parser
 from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -182,7 +184,9 @@ class Mapper:
         return [schema_mcp]

     def make_fine_grained_lineage_class(
-        self,
+        self,
+        lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage,
+        dataset_urn: str,
     ) -> List[FineGrainedLineage]:
         fine_grained_lineages: List[FineGrainedLineage] = []

@@ -234,7 +238,9 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []

-        upstream_lineage: List[
+        upstream_lineage: List[
+            datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+        ] = parser.get_upstream_tables(
             table=table,
             reporter=self.__reporter,
             platform_instance_resolver=self.__dataplatform_instance_resolver,
@@ -1294,7 +1300,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     def validate_dataset_type_mapping(self):
         powerbi_data_platforms: List[str] = [
             data_platform.value.powerbi_data_platform_name
-            for data_platform in
+            for data_platform in SupportedDataPlatform
         ]

         for key in self.source_config.dataset_type_mapping.keys():
@@ -1481,7 +1487,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id
-        # This will result in creating checkpoint for each workspace_id
+        # This will result in creating a checkpoint for each workspace_id
         if self.source_config.modified_since:
             return []  # Handle these in get_workunits_internal
         else:
@@ -1492,7 +1498,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
-        Datahub Ingestion framework
+        Datahub Ingestion framework invokes this method
         """
         logger.info("PowerBi plugin execution is started")
         # Validate dataset type mapping
datahub/ingestion/source/pulsar.py

@@ -78,8 +78,27 @@ class PulsarSchema:
     def __init__(self, schema):
         self.schema_version = schema.get("version")

-
-
+        schema_data = schema.get("data")
+        if not schema_data:
+            logger.warning("Schema data is empty or None. Using default empty schema.")
+            schema_data = "{}"
+
+        try:
+            avro_schema = json.loads(schema_data)
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+            avro_schema = {}
+
+        self.schema_name = "null"
+        if avro_schema.get("namespace") and avro_schema.get("name"):
+            self.schema_name = (
+                avro_schema.get("namespace") + "." + avro_schema.get("name")
+            )
+        elif avro_schema.get("namespace"):
+            self.schema_name = avro_schema.get("namespace")
+        elif avro_schema.get("name"):
+            self.schema_name = avro_schema.get("name")
+
         self.schema_description = avro_schema.get("doc")
         self.schema_type = schema.get("type")
         self.schema_str = schema.get("data")
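The new `PulsarSchema` logic tolerates missing or malformed schema payloads and derives the schema name from the Avro `namespace`/`name` fields. A standalone sketch of the same name-resolution behavior, exercised on made-up schema payloads:

```python
import json
import logging

logger = logging.getLogger(__name__)


def resolve_schema_name(schema: dict) -> str:
    """Mirror of the resolution order: namespace.name, then namespace, then name, else "null"."""
    schema_data = schema.get("data")
    if not schema_data:
        logger.warning("Schema data is empty or None. Using default empty schema.")
        schema_data = "{}"

    try:
        avro_schema = json.loads(schema_data)
    except json.JSONDecodeError:
        avro_schema = {}

    if avro_schema.get("namespace") and avro_schema.get("name"):
        return avro_schema["namespace"] + "." + avro_schema["name"]
    return avro_schema.get("namespace") or avro_schema.get("name") or "null"


# Example payloads shaped like a Pulsar admin schema response (illustrative values).
print(resolve_schema_name({"data": json.dumps({"namespace": "com.example", "name": "Order"})}))  # com.example.Order
print(resolve_schema_name({"data": "not json"}))  # null
```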
datahub/ingestion/source/redash.py

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional, Set

 import dateutil.parser as dp
 from packaging import version
@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import ( # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
     ChartTypeClass,
     DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)
@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
     parse_table_names_from_sql: bool = Field(
         default=False, description="See note below."
     )
-    sql_parser: str = Field(
-        default="datahub.utilities.sql_parser.DefaultSQLParser",
-        description="custom SQL parser. See note below for details.",
-    )

     env: str = Field(
         default=DEFAULT_ENV,
@@ -354,7 +349,6 @@ class RedashSource(Source):
         self.api_page_limit = self.config.api_page_limit or math.inf

         self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-        self.sql_parser_path = self.config.sql_parser

         logger.info(
             f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"
@@ -380,31 +374,6 @@ class RedashSource(Source):
         config = RedashConfig.parse_obj(config_dict)
         return cls(ctx, config)

-    @classmethod
-    def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-        assert "." in sql_parser_path, "sql_parser-path must contain a ."
-        parser_cls = import_path(sql_parser_path)
-
-        if not issubclass(parser_cls, SQLParser):
-            raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-        return parser_cls
-
-    @classmethod
-    def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-        parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-        try:
-            sql_table_names: List[str] = parser_cls(sql).get_tables()
-        except Exception as e:
-            logger.warning(f"Sql parser failed on {sql} with {e}")
-            return []
-
-        # Remove quotes from table names
-        sql_table_names = [t.replace('"', "") for t in sql_table_names]
-        sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-        return sql_table_names
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
@@ -441,14 +410,6 @@ class RedashSource(Source):

         return database_name

-    def _construct_datalineage_urn(
-        self, platform: str, database_name: str, sql_table_name: str
-    ) -> str:
-        full_dataset_name = get_full_qualified_name(
-            platform, database_name, sql_table_name
-        )
-        return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
     def _get_datasource_urns(
         self, data_source: Dict, sql_query_data: Dict = {}
     ) -> Optional[List[str]]:
@@ -464,34 +425,23 @@ class RedashSource(Source):
         # Getting table lineage from SQL parsing
         if self.parse_table_names_from_sql and data_source_syntax == "sql":
             dataset_urns = list()
-
-
-
-
-
+            sql_parser_in_tables = create_lineage_sql_parsed_result(
+                query=query,
+                platform=platform,
+                env=self.config.env,
+                platform_instance=None,
+                default_db=database_name,
+            )
+            # make sure dataset_urns is not empty list
+            dataset_urns = sql_parser_in_tables.in_tables
+            if sql_parser_in_tables.debug_info.table_error:
                 self.report.queries_problem_parsing.add(str(query_id))
                 self.error(
                     logger,
                     "sql-parsing",
-                    f"exception {
+                    f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
                 )
-            sql_table_names = []
-            for sql_table_name in sql_table_names:
-                try:
-                    dataset_urns.append(
-                        self._construct_datalineage_urn(
-                            platform, database_name, sql_table_name
-                        )
-                    )
-                except Exception:
-                    self.report.queries_problem_parsing.add(str(query_id))
-                    self.warn(
-                        logger,
-                        "data-urn-invalid",
-                        f"Problem making URN for {sql_table_name} parsed from query {query_id}",
-                    )

-            # make sure dataset_urns is not empty list
             return dataset_urns if len(dataset_urns) > 0 else None

         else:
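Redash lineage extraction now goes through DataHub's sqlglot-based parser instead of the removed pluggable `sql_parser`. A hedged sketch of how `create_lineage_sql_parsed_result` gets used here, with placeholder query and platform values; the keyword arguments and the `in_tables`/`debug_info` fields are the ones visible in the diff above.

```python
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

# Illustrative inputs; in the source these come from the Redash query and data source.
query = "SELECT o.id, o.amount FROM orders o JOIN customers c ON o.customer_id = c.id"
result = create_lineage_sql_parsed_result(
    query=query,
    platform="postgres",
    env="PROD",
    platform_instance=None,
    default_db="analytics",
)

if result.debug_info.table_error:
    print(f"parse failed: {result.debug_info.table_error}")
else:
    print(result.in_tables)  # dataset URNs for the upstream tables
```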
datahub/ingestion/source/redshift/config.py

@@ -159,6 +159,7 @@ class RedshiftConfig(
         description="Whether to extract column level lineage. This config works with rest-sink only.",
     )

+    # TODO - use DatasetPropertiesConfigMixin instead
     patch_custom_properties: bool = Field(
         default=True,
         description="Whether to patch custom properties on existing datasets rather than replace.",
datahub/ingestion/source/redshift/redshift.py

@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     ```
     """

+    # TODO: Replace with standardized types in sql_types.py
     REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
         str,
         Type[
@@ -830,6 +831,8 @@
             customProperties=custom_properties,
         )
         if self.config.patch_custom_properties:
+            # TODO: use auto_incremental_properties workunit processor instead
+            # Deprecate use of patch_custom_properties
             patch_builder = create_dataset_props_patch_builder(
                 dataset_urn, dataset_properties
             )
datahub/ingestion/source/s3/source.py

@@ -9,6 +9,7 @@ from datetime import datetime
 from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse

 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
         folders = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
-            prefix_to_process = dir.
-                self.create_s3_path(bucket_name, "/")
-            )
+            prefix_to_process = urlparse(dir).path.lstrip("/")

             folders.extend(
                 self.get_folder_info(
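The S3 source now derives the prefix to process with `urlparse` rather than string manipulation on the S3 path. A quick illustration of what `urlparse(dir).path.lstrip("/")` yields for an `s3://` URI (bucket and key names are made up):

```python
from urllib.parse import urlparse

folder_uri = "s3://my-bucket/raw/events/2024/"  # illustrative folder URI
parsed = urlparse(folder_uri)
print(parsed.netloc)            # my-bucket
print(parsed.path.lstrip("/"))  # raw/events/2024/  <- prefix passed to the folder scan
```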
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set

 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator
@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.incremental_properties_helper import (
+    IncrementalPropertiesConfigMixin,
+)
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
@@ -118,9 +121,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         )

         # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-        if schema_pattern
+        if schema_pattern:
             logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-
+            assert isinstance(schema_pattern, AllowDenyPattern)
+            schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")

         return values

@@ -187,6 +191,7 @@ class SnowflakeV2Config(
     StatefulUsageConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
+    IncrementalPropertiesConfigMixin,
 ):
     include_usage_stats: bool = Field(
         default=True,
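The filter fix appends a deny regex so INFORMATION_SCHEMA schemas are always excluded. A small sketch of what that deny pattern matches, using plain `re` rather than DataHub's `AllowDenyPattern` helper (which evaluates deny entries as regexes against the name):

```python
import re

deny = r".*INFORMATION_SCHEMA$"

for schema in ["MYDB.INFORMATION_SCHEMA", "MYDB.PUBLIC", "INFORMATION_SCHEMA"]:
    # A deny regex match excludes the schema from ingestion.
    print(schema, "denied" if re.match(deny, schema) else "allowed")
```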
datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }

 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )

     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@
         logger.info(f"using authenticator type '{v}'")
         return v

+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
             return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":
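The new `OAUTH_AUTHENTICATOR_TOKEN` authentication type lets a recipe pass an externally generated OAuth token straight through to `snowflake.connector.connect(authenticator="oauth", token=...)`. A hedged sketch of the corresponding connection config, expressed as the Python dict a recipe's `snowflake` source section would carry; the field names match those visible in the diff, while the values are placeholders.

```python
# Sketch only: account/user/warehouse/role values are made up.
snowflake_connection = {
    "account_id": "abc12345.us-east-1",
    "username": "DATAHUB_SVC",
    "authentication_type": "OAUTH_AUTHENTICATOR_TOKEN",
    # Token generated externally (e.g. by an identity provider). It is not refreshed,
    # so it must stay valid for the duration of the ingestion run.
    "token": "<externally-generated-oauth-access-token>",
    "warehouse": "COMPUTE_WH",
    "role": "DATAHUB_ROLE",
}
```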
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,14 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
+            downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-
+                # Tricky: sometimes the full row data is too large, and so the context
+                # message gets truncated. By pulling out the upstreams and downstream
+                # list, we can at least get the important fields if truncation does occur.
+                context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                 exc=e,
             )
             return None
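The change above pulls the upstream and downstream identifiers out of the row before logging, so the important fields survive even if the full row is truncated in the warning context. The same pattern in a generic, standalone form (field names as in the diff, row contents invented):

```python
import logging

logger = logging.getLogger(__name__)

db_row = {
    "DOWNSTREAM_TABLE_NAME": "db.schema.target",
    "UPSTREAM_TABLES": ["db.schema.src_a", "db.schema.src_b"],
    "QUERIES": ["..."] * 1000,  # the bulky part that may get truncated
}

# Surface the key fields first so they are never lost to truncation of the full row.
upstream_tables = db_row.get("UPSTREAM_TABLES")
downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
logger.warning(
    "Failed to parse lineage edge. "
    f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}"
)
```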
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,10 +129,12 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""

     @staticmethod
@@ -149,10 +151,12 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""

     @staticmethod
@@ -233,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """

+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+            SELECT
+                TABLE_CATALOG as "TABLE_CATALOG",
+                TABLE_SCHEMA as "TABLE_SCHEMA",
+                TABLE_NAME as "TABLE_NAME",
+                VIEW_DEFINITION as "VIEW_DEFINITION"
+            FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+            WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0

     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"


 @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False


 @dataclass
@@ -259,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
             snowflake_schemas.append(snowflake_schema)
         return snowflake_schemas

+    @serialized_lru_cache(maxsize=1)
+    def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+            lambda: defaultdict(lambda: defaultdict())
+        )
+        cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+        for view in cur:
+            db_name = view["TABLE_CATALOG"]
+            schema_name = view["TABLE_SCHEMA"]
+            view_name = view["TABLE_NAME"]
+            secure_view_definitions[db_name][schema_name][view_name] = view[
+                "VIEW_DEFINITION"
+            ]
+
+        return secure_view_definitions
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
@@ -289,6 +312,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -313,6 +338,8 @@
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -356,6 +383,7 @@
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )

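Two small patterns in this file are worth calling out: the "NO"-defaulted flag parsing for IS_DYNAMIC/IS_ICEBERG, and the three-level nested mapping used to index secure view definitions by database, schema, and view. A standalone sketch with invented row data (the simplified `defaultdict(dict)` here stands in for the nested-lambda variant in the diff):

```python
from collections import defaultdict
from typing import Dict

# Flag parsing: Snowflake returns YES/NO strings; a missing column defaults to "NO".
table_row = {"TABLE_NAME": "ORDERS", "IS_DYNAMIC": "YES"}  # no IS_ICEBERG key
is_dynamic = table_row.get("IS_DYNAMIC", "NO").upper() == "YES"  # True
is_iceberg = table_row.get("IS_ICEBERG", "NO").upper() == "YES"  # False

# db -> schema -> view -> definition, populated without key-existence checks.
secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
    lambda: defaultdict(dict)
)
secure_view_definitions["MYDB"]["PUBLIC"]["SECURE_VIEW"] = "SELECT 1"

print(is_dynamic, is_iceberg)                                    # True False
print(secure_view_definitions["MYDB"]["PUBLIC"]["SECURE_VIEW"])  # SELECT 1
```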