acryl-datahub 1.2.0.6rc1-py3-none-any.whl → 1.2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (84)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2562 -2476
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +46 -6
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  11. datahub/ingestion/source/common/subtypes.py +3 -0
  12. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  13. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  14. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  15. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  16. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  17. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  18. datahub/ingestion/source/excel/__init__.py +0 -0
  19. datahub/ingestion/source/excel/config.py +92 -0
  20. datahub/ingestion/source/excel/excel_file.py +539 -0
  21. datahub/ingestion/source/excel/profiling.py +308 -0
  22. datahub/ingestion/source/excel/report.py +49 -0
  23. datahub/ingestion/source/excel/source.py +662 -0
  24. datahub/ingestion/source/excel/util.py +18 -0
  25. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  26. datahub/ingestion/source/openapi.py +1 -1
  27. datahub/ingestion/source/powerbi/config.py +33 -0
  28. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  29. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  31. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  32. datahub/ingestion/source/redshift/config.py +9 -6
  33. datahub/ingestion/source/redshift/lineage.py +386 -687
  34. datahub/ingestion/source/redshift/redshift.py +19 -106
  35. datahub/ingestion/source/s3/source.py +65 -59
  36. datahub/ingestion/source/snowflake/constants.py +2 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  39. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  40. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  42. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
  43. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  44. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
  46. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  47. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  48. datahub/ingestion/source/sql/mssql/source.py +62 -3
  49. datahub/ingestion/source/sql_queries.py +24 -2
  50. datahub/ingestion/source/state/checkpoint.py +3 -28
  51. datahub/ingestion/source/unity/config.py +74 -9
  52. datahub/ingestion/source/unity/proxy.py +167 -5
  53. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  54. datahub/ingestion/source/unity/proxy_types.py +24 -0
  55. datahub/ingestion/source/unity/report.py +5 -0
  56. datahub/ingestion/source/unity/source.py +111 -1
  57. datahub/ingestion/source/usage/usage_common.py +1 -0
  58. datahub/metadata/_internal_schema_classes.py +573 -517
  59. datahub/metadata/_urns/urn_defs.py +1748 -1748
  60. datahub/metadata/schema.avsc +18564 -18484
  61. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  62. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  63. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  64. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  65. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  66. datahub/metadata/schemas/Ownership.avsc +69 -0
  67. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  68. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  69. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  70. datahub/metadata/schemas/__init__.py +3 -3
  71. datahub/sdk/chart.py +36 -22
  72. datahub/sdk/dashboard.py +38 -62
  73. datahub/sdk/lineage_client.py +6 -26
  74. datahub/sdk/main_client.py +7 -3
  75. datahub/sdk/search_filters.py +16 -0
  76. datahub/specific/aspect_helpers/siblings.py +73 -0
  77. datahub/specific/dataset.py +2 -0
  78. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  79. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  80. datahub/upgrade/upgrade.py +14 -2
  81. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  82. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
  83. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
  84. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py
@@ -353,6 +353,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such as MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified table names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then the mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -614,3 +627,23 @@ class PowerBiDashboardSourceConfig(
                 "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
             )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
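
The rule this validator enforces reduces to "each mapping value must split on '.' into one or two parts". A standalone sketch of the accept/reject behavior:

# Sketch of the validator's accept/reject rule in isolation.
for value in ("prod", "prod.data", "prod.data.extra"):
    parts = value.split(".")
    print(value, "->", "accepted" if len(parts) in (1, 2) else "rejected")
# prod -> accepted
# prod.data -> accepted
# prod.data.extra -> rejected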
datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -76,3 +76,4 @@ class FunctionName(Enum):
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
     ODBC_DATA_ACCESS = "Odbc.DataSource"
+    ODBC_QUERY = "Odbc.Query"
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -3,7 +3,9 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Type, cast

+import sqlglot
 from lark import Tree
+from sqlglot import ParseError, expressions as exp

 from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
@@ -209,15 +211,34 @@ class AbstractLineage(ABC):

         return None

+    @staticmethod
+    def is_sql_query(query: Optional[str]) -> bool:
+        if not query:
+            return False
+        query = native_sql_parser.remove_special_characters(query)
+        try:
+            expression = sqlglot.parse_one(query)
+            return isinstance(expression, exp.Select)
+        except (ParseError, Exception):
+            logger.debug(f"Failed to parse query as SQL: {query}")
+            return False
+
     def parse_custom_sql(
-        self, query: str, server: str, database: Optional[str], schema: Optional[str]
+        self,
+        query: str,
+        server: str,
+        database: Optional[str],
+        schema: Optional[str],
+        platform_pair: Optional[DataPlatformPair] = None,
     ) -> Lineage:
         dataplatform_tables: List[DataPlatformTable] = []
+        if not platform_pair:
+            platform_pair = self.get_platform_pair()

         platform_detail: PlatformDetail = (
             self.platform_instance_resolver.get_platform_instance(
                 PowerBIPlatformDetail(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     data_platform_server=server,
                 )
             )
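
The new is_sql_query() gate leans on sqlglot: parse_one() builds an expression tree, and only statements whose root node is a SELECT are routed to SQL-based lineage. A quick illustration (the native_sql_parser.remove_special_characters() pre-cleaning step is omitted here):

# Minimal illustration of the sqlglot check behind is_sql_query().
import sqlglot
from sqlglot import expressions as exp

print(isinstance(sqlglot.parse_one("SELECT id FROM prod.data.users"), exp.Select))  # True
print(isinstance(sqlglot.parse_one("CREATE TABLE t (id INT)"), exp.Select))         # False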
@@ -231,7 +252,7 @@ class AbstractLineage(ABC):
             native_sql_parser.parse_custom_sql(
                 ctx=self.ctx,
                 query=query,
-                platform=self.get_platform_pair().datahub_data_platform_name,
+                platform=platform_pair.datahub_data_platform_name,
                 platform_instance=platform_detail.platform_instance,
                 env=platform_detail.env,
                 database=database,
@@ -258,7 +279,7 @@
         for urn in parsed_result.in_tables:
             dataplatform_tables.append(
                 DataPlatformTable(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     urn=urn,
                 )
             )
@@ -956,7 +977,7 @@ class OdbcLineage(AbstractLineage):
             f"data-access function detail {data_access_func_detail}"
         )

-        connect_string, _ = self.get_db_detail_from_argument(
+        connect_string, query = self.get_db_detail_from_argument(
            data_access_func_detail.arg_list
         )

@@ -972,12 +993,19 @@
         data_platform, powerbi_platform = extract_platform(connect_string)
         server_name = extract_server(connect_string)

+        dsn = extract_dsn(connect_string)
+        if not dsn:
+            self.reporter.warning(
+                title="Can not determine ODBC DSN",
+                message="Can not extract DSN from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+        logger.debug(f"Extracted DSN: {dsn}")
+
         if not data_platform:
-            dsn = extract_dsn(connect_string)
-            if dsn:
-                logger.debug(f"Extracted DSN: {dsn}")
-                server_name = dsn
-            if dsn and self.config.dsn_to_platform_name:
+            server_name = dsn
+            if self.config.dsn_to_platform_name:
                 logger.debug(f"Attempting to map DSN {dsn} to platform")
                 name = self.config.dsn_to_platform_name.get(dsn)
                 if name:
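
extract_dsn() itself is not shown in this diff; judging by standard ODBC connection-string syntax it presumably reads the DSN=<name> attribute, roughly along these lines:

# Hypothetical stand-in for extract_dsn(), which this diff does not include.
# ODBC connect strings are ";"-separated KEY=value attribute pairs.
from typing import Optional

def extract_dsn_sketch(connect_string: str) -> Optional[str]:
    for attr in connect_string.split(";"):
        key, _, value = attr.partition("=")
        if key.strip().upper() == "DSN":
            return value.strip() or None
    return None

print(extract_dsn_sketch("DSN=sales_dsn;UID=report_user"))  # sales_dsn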
@@ -1006,6 +1034,63 @@
         elif not server_name:
             server_name = "unknown"

+        if self.is_sql_query(query):
+            return self.query_lineage(query, platform_pair, server_name, dsn)
+        else:
+            return self.expression_lineage(
+                data_access_func_detail, data_platform, platform_pair, server_name
+            )
+
+    def query_lineage(
+        self,
+        query: Optional[str],
+        platform_pair: DataPlatformPair,
+        server_name: str,
+        dsn: str,
+    ) -> Lineage:
+        database = None
+        schema = None
+
+        if not query:
+            # query should never be None as it is checked before calling this function.
+            # However, we check again just in case.
+            self.reporter.warning(
+                title="ODBC Query is null",
+                message="No SQL to parse. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+
+        if self.config.dsn_to_database_schema:
+            value = self.config.dsn_to_database_schema.get(dsn)
+            if value:
+                parts = value.split(".")
+                if len(parts) == 1:
+                    database = parts[0]
+                elif len(parts) == 2:
+                    database = parts[0]
+                    schema = parts[1]
+
+        logger.debug(
+            f"ODBC query processing: dsn={dsn} mapped to database={database}, schema={schema}"
+        )
+        result = self.parse_custom_sql(
+            query=query,
+            server=server_name,
+            database=database,
+            schema=schema,
+            platform_pair=platform_pair,
+        )
+        logger.debug(f"ODBC query lineage generated {len(result.upstreams)} upstreams")
+        return result
+
+    def expression_lineage(
+        self,
+        data_access_func_detail: DataAccessFunctionDetail,
+        data_platform: str,
+        platform_pair: DataPlatformPair,
+        server_name: str,
+    ) -> Lineage:
         database_name = None
         schema_name = None
         table_name = None
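
Inside query_lineage(), the mapped value is resolved back into the (database, schema) pair handed to parse_custom_sql(). The resolution logic in isolation, reusing the invented DSNs from the config example above:

# Sketch of the dsn -> (database, schema) resolution performed above.
from typing import Dict, Optional, Tuple

def resolve(mapping: Dict[str, str], dsn: str) -> Tuple[Optional[str], Optional[str]]:
    parts = mapping.get(dsn, "").split(".")
    if parts == [""]:
        # Unmapped DSN: SQL parsing must rely on fully qualified table names.
        return None, None
    return parts[0], parts[1] if len(parts) == 2 else None

mapping = {"sales_dsn": "prod", "finance_dsn": "prod.data"}
print(resolve(mapping, "sales_dsn"))    # ('prod', None)
print(resolve(mapping, "finance_dsn"))  # ('prod', 'data')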
@@ -1144,6 +1229,11 @@ class SupportedPattern(Enum):
         FunctionName.ODBC_DATA_ACCESS,
     )

+    ODBC_QUERY = (
+        OdbcLineage,
+        FunctionName.ODBC_QUERY,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]

datahub/ingestion/source/powerbi/powerbi.py
@@ -40,6 +40,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIAssetSubTypes,
     BIContainerSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.powerbi.config import (
     Constant,
@@ -1229,6 +1230,10 @@ class Mapper:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.POWERBI_WORKSPACE,
+        SourceCapabilityModifier.POWERBI_DATASET,
+    ],
 )
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/qlik_sense/qlik_sense.py
@@ -101,7 +101,7 @@ logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.LINEAGE_FINE,
-    "Disabled by default. ",
+    "Disabled by default.",
 )
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
datahub/ingestion/source/redshift/config.py
@@ -9,6 +9,7 @@ from datahub.configuration import ConfigModel
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.validate_field_removal import pydantic_removed_field
+from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.incremental_lineage_helper import (
     IncrementalLineageConfigMixin,
 )
@@ -101,6 +102,12 @@ class RedshiftConfig(
     )

     _database_alias_removed = pydantic_removed_field("database_alias")
+    _use_lineage_v2_removed = pydantic_removed_field("use_lineage_v2")
+    _rename_lineage_v2_generate_queries_to_lineage_generate_queries = (
+        pydantic_renamed_field(
+            "lineage_v2_generate_queries", "lineage_generate_queries"
+        )
+    )

     default_schema: str = Field(
         default="public",
@@ -112,13 +119,9 @@ class RedshiftConfig(
         description="Whether target Redshift instance is serverless (alternative is provisioned cluster)",
     )

-    use_lineage_v2: bool = Field(
-        default=True,
-        description="Whether to use the new SQL-based lineage collector.",
-    )
-    lineage_v2_generate_queries: bool = Field(
+    lineage_generate_queries: bool = Field(
         default=True,
-        description="Whether to generate queries entities for the new SQL-based lineage collector.",
+        description="Whether to generate queries entities for the SQL-based lineage collector.",
     )

     include_table_lineage: bool = Field(
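
Assuming pydantic_removed_field and pydantic_renamed_field behave as their names suggest (drop the dead key with a warning, transparently map the old key onto the new one), a recipe written for the previous release should still load:

# Sketch of how old Redshift recipe keys map onto the new config, under the
# assumption stated above: use_lineage_v2 is dropped with a warning, and
# lineage_v2_generate_queries is carried over under its new name.
before = {"use_lineage_v2": True, "lineage_v2_generate_queries": False}
after = {"lineage_generate_queries": False}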