acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of acryl-datahub; see the registry page for details.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py:

@@ -353,6 +353,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
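A minimal sketch (not part of the diff) of how the new option is set on this config class, assuming only the credential fields are required; the tenant/client values and DSN names below are made-up placeholders.

```python
from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceConfig

config = PowerBiDashboardSourceConfig.parse_obj(
    {
        "tenant_id": "dummy-tenant",  # placeholder
        "client_id": "dummy-client",  # placeholder
        "client_secret": "dummy-secret",  # placeholder
        # DSN 'sales_dsn' resolves to database 'prod';
        # DSN 'finance_dsn' resolves to database 'prod', schema 'data'.
        "dsn_to_database_schema": {
            "sales_dsn": "prod",
            "finance_dsn": "prod.data",
        },
    }
)
print(config.dsn_to_database_schema)
```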
@@ -614,3 +627,23 @@ class PowerBiDashboardSourceConfig(
             "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
         )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
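The rule the validator enforces is simply "a string with one or two dot-separated parts". A standalone sketch of that rule as a toy helper (not the DataHub class itself):

```python
def check_mapping_value(value: object) -> None:
    # Mirrors the checks in validate_dsn_to_database_schema above.
    if not isinstance(value, str):
        raise ValueError("dsn_to_database_schema mapping values must be strings")
    if len(value.split(".")) not in (1, 2):
        raise ValueError(f"dsn_to_database_schema invalid mapping value: {value}")

for v in ("prod", "prod.data", "a.b.c"):
    try:
        check_mapping_value(v)
        print(f"{v!r}: valid")
    except ValueError as e:
        print(f"{v!r}: {e}")  # 'a.b.c' fails: three parts
```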
datahub/ingestion/source/powerbi/m_query/pattern_handler.py:

@@ -3,7 +3,9 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Type, cast
 
+import sqlglot
 from lark import Tree
+from sqlglot import ParseError, expressions as exp
 
 from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
@@ -209,15 +211,34 @@ class AbstractLineage(ABC):
 
         return None
 
+    @staticmethod
+    def is_sql_query(query: Optional[str]) -> bool:
+        if not query:
+            return False
+        query = native_sql_parser.remove_special_characters(query)
+        try:
+            expression = sqlglot.parse_one(query)
+            return isinstance(expression, exp.Select)
+        except (ParseError, Exception):
+            logger.debug(f"Failed to parse query as SQL: {query}")
+            return False
+
     def parse_custom_sql(
-        self, query: str, server: str, database: Optional[str], schema: Optional[str]
+        self,
+        query: str,
+        server: str,
+        database: Optional[str],
+        schema: Optional[str],
+        platform_pair: Optional[DataPlatformPair] = None,
     ) -> Lineage:
         dataplatform_tables: List[DataPlatformTable] = []
+        if not platform_pair:
+            platform_pair = self.get_platform_pair()
 
         platform_detail: PlatformDetail = (
             self.platform_instance_resolver.get_platform_instance(
                 PowerBIPlatformDetail(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     data_platform_server=server,
                 )
             )
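The new is_sql_query helper counts text as a query only when sqlglot parses it into a SELECT expression. The same check in isolation (a sketch that omits DataHub's remove_special_characters preprocessing):

```python
import sqlglot
from sqlglot import ParseError, expressions as exp

def looks_like_select(query: str) -> bool:
    # parse_one returns the root expression; only SELECT statements qualify.
    try:
        return isinstance(sqlglot.parse_one(query), exp.Select)
    except ParseError:
        return False

print(looks_like_select("SELECT id, name FROM prod.data.users"))  # True
print(looks_like_select("EXEC dbo.usp_refresh_sales"))            # False: not a SELECT
```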
@@ -231,7 +252,7 @@ class AbstractLineage(ABC):
         native_sql_parser.parse_custom_sql(
             ctx=self.ctx,
             query=query,
-            platform=self.get_platform_pair().datahub_data_platform_name,
+            platform=platform_pair.datahub_data_platform_name,
             platform_instance=platform_detail.platform_instance,
             env=platform_detail.env,
             database=database,
@@ -258,7 +279,7 @@ class AbstractLineage(ABC):
         for urn in parsed_result.in_tables:
             dataplatform_tables.append(
                 DataPlatformTable(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     urn=urn,
                 )
             )
@@ -956,7 +977,7 @@ class OdbcLineage(AbstractLineage):
             f"data-access function detail {data_access_func_detail}"
         )
 
-        connect_string,
+        connect_string, query = self.get_db_detail_from_argument(
             data_access_func_detail.arg_list
         )
 
@@ -972,12 +993,19 @@ class OdbcLineage(AbstractLineage):
         data_platform, powerbi_platform = extract_platform(connect_string)
         server_name = extract_server(connect_string)
 
+        dsn = extract_dsn(connect_string)
+        if not dsn:
+            self.reporter.warning(
+                title="Can not determine ODBC DSN",
+                message="Can not extract DSN from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+        logger.debug(f"Extracted DSN: {dsn}")
+
         if not data_platform:
-
-            if
-                logger.debug(f"Extracted DSN: {dsn}")
-                server_name = dsn
-            if dsn and self.config.dsn_to_platform_name:
+            server_name = dsn
+            if self.config.dsn_to_platform_name:
                 logger.debug(f"Attempting to map DSN {dsn} to platform")
                 name = self.config.dsn_to_platform_name.get(dsn)
                 if name:
@@ -1006,6 +1034,63 @@ class OdbcLineage(AbstractLineage):
         elif not server_name:
             server_name = "unknown"
 
+        if self.is_sql_query(query):
+            return self.query_lineage(query, platform_pair, server_name, dsn)
+        else:
+            return self.expression_lineage(
+                data_access_func_detail, data_platform, platform_pair, server_name
+            )
+
+    def query_lineage(
+        self,
+        query: Optional[str],
+        platform_pair: DataPlatformPair,
+        server_name: str,
+        dsn: str,
+    ) -> Lineage:
+        database = None
+        schema = None
+
+        if not query:
+            # query should never be None as it is checked before calling this function.
+            # however, we need to check just in case.
+            self.reporter.warning(
+                title="ODBC Query is null",
+                message="No SQL to parse. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+
+        if self.config.dsn_to_database_schema:
+            value = self.config.dsn_to_database_schema.get(dsn)
+            if value:
+                parts = value.split(".")
+                if len(parts) == 1:
+                    database = parts[0]
+                elif len(parts) == 2:
+                    database = parts[0]
+                    schema = parts[1]
+
+        logger.debug(
+            f"ODBC query processing: dsn={dsn} mapped to database={database}, schema={schema}"
+        )
+        result = self.parse_custom_sql(
+            query=query,
+            server=server_name,
+            database=database,
+            schema=schema,
+            platform_pair=platform_pair,
+        )
+        logger.debug(f"ODBC query lineage generated {len(result.upstreams)} upstreams")
+        return result
+
+    def expression_lineage(
+        self,
+        data_access_func_detail: DataAccessFunctionDetail,
+        data_platform: str,
+        platform_pair: DataPlatformPair,
+        server_name: str,
+    ) -> Lineage:
         database_name = None
         schema_name = None
         table_name = None
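query_lineage resolves the DSN through dsn_to_database_schema to obtain the default database (and optional schema) handed to the SQL parser. A toy sketch of just that resolution step:

```python
from typing import Dict, Optional, Tuple

def resolve_dsn(
    mapping: Dict[str, str], dsn: str
) -> Tuple[Optional[str], Optional[str]]:
    # Mirrors the parts-splitting in query_lineage above.
    database = schema = None
    value = mapping.get(dsn)
    if value:
        parts = value.split(".")
        database = parts[0]
        if len(parts) == 2:
            schema = parts[1]
    return database, schema

mapping = {"sales_dsn": "prod", "finance_dsn": "prod.data"}
print(resolve_dsn(mapping, "sales_dsn"))    # ('prod', None)
print(resolve_dsn(mapping, "finance_dsn"))  # ('prod', 'data')
print(resolve_dsn(mapping, "unknown_dsn"))  # (None, None) -> parser gets no defaults
```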
@@ -1144,6 +1229,11 @@ class SupportedPattern(Enum):
         FunctionName.ODBC_DATA_ACCESS,
     )
 
+    ODBC_QUERY = (
+        OdbcLineage,
+        FunctionName.ODBC_QUERY,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
datahub/ingestion/source/powerbi/powerbi.py:

@@ -40,6 +40,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIAssetSubTypes,
     BIContainerSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.powerbi.config import (
     Constant,
@@ -1229,6 +1230,10 @@ class Mapper:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.POWERBI_WORKSPACE,
+        SourceCapabilityModifier.POWERBI_DATASET,
+    ],
 )
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/redshift/config.py:

@@ -9,6 +9,7 @@ from datahub.configuration import ConfigModel
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.validate_field_removal import pydantic_removed_field
+from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.incremental_lineage_helper import (
     IncrementalLineageConfigMixin,
 )
@@ -101,6 +102,12 @@ class RedshiftConfig(
     )
 
     _database_alias_removed = pydantic_removed_field("database_alias")
+    _use_lineage_v2_removed = pydantic_removed_field("use_lineage_v2")
+    _rename_lineage_v2_generate_queries_to_lineage_generate_queries = (
+        pydantic_renamed_field(
+            "lineage_v2_generate_queries", "lineage_generate_queries"
+        )
+    )
 
     default_schema: str = Field(
         default="public",
@@ -112,13 +119,9 @@ class RedshiftConfig(
         description="Whether target Redshift instance is serverless (alternative is provisioned cluster)",
     )
 
-    use_lineage_v2: bool = Field(
-        default=True,
-        description="Whether to use the new SQL-based lineage collector.",
-    )
-    lineage_v2_generate_queries: bool = Field(
+    lineage_generate_queries: bool = Field(
         default=True,
-        description="Whether to generate queries entities for the new SQL-based lineage collector.",
+        description="Whether to generate queries entities for the SQL-based lineage collector.",
     )
 
     include_table_lineage: bool = Field(