acryl-datahub 1.2.0.7rc2__py3-none-any.whl → 1.2.0.7rc4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (31)
  1. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/METADATA +2754 -2749
  2. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/RECORD +30 -30
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/autogenerated/capability_summary.json +1 -1
  5. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  6. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  7. datahub/ingestion/source/redshift/config.py +9 -6
  8. datahub/ingestion/source/redshift/lineage.py +386 -687
  9. datahub/ingestion/source/redshift/redshift.py +19 -106
  10. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +4 -1
  11. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  12. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  13. datahub/ingestion/source/sql/mssql/source.py +62 -3
  14. datahub/ingestion/source/unity/config.py +74 -9
  15. datahub/ingestion/source/unity/proxy.py +167 -5
  16. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  17. datahub/ingestion/source/unity/proxy_types.py +24 -0
  18. datahub/ingestion/source/unity/report.py +5 -0
  19. datahub/ingestion/source/unity/source.py +111 -1
  20. datahub/ingestion/source/usage/usage_common.py +1 -0
  21. datahub/metadata/_internal_schema_classes.py +5 -5
  22. datahub/metadata/schema.avsc +66 -60
  23. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  24. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  25. datahub/sdk/chart.py +36 -22
  26. datahub/sdk/dashboard.py +38 -62
  27. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  28. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/WHEEL +0 -0
  29. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/licenses/LICENSE +0 -0
  31. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/redshift.py

@@ -1,5 +1,4 @@
 import functools
-import itertools
 import logging
 from collections import defaultdict
 from typing import Dict, Iterable, List, Optional, Type, Union
@@ -52,8 +51,7 @@ from datahub.ingestion.source.common.subtypes import (
 from datahub.ingestion.source.redshift.config import RedshiftConfig
 from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
-from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
-from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
+from datahub.ingestion.source.redshift.lineage import RedshiftSqlLineage
 from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (
@@ -72,7 +70,6 @@ from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
-    gen_lineage,
     gen_schema_container,
     gen_schema_key,
     get_dataplatform_instance_aspect,
@@ -116,7 +113,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 from datahub.utilities import memory_footprint
-from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.mapping import Constants
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -423,40 +419,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 memory_footprint.total_size(self.db_views)
             )
 
-        if self.config.use_lineage_v2:
-            with RedshiftSqlLineageV2(
-                config=self.config,
-                report=self.report,
-                context=self.ctx,
-                database=database,
-                redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
-            ) as lineage_extractor:
-                yield from lineage_extractor.aggregator.register_schemas_from_stream(
-                    self.process_schemas(connection, database)
-                )
-
-                with self.report.new_stage(LINEAGE_EXTRACTION):
-                    yield from self.extract_lineage_v2(
-                        connection=connection,
-                        database=database,
-                        lineage_extractor=lineage_extractor,
-                    )
-
-                all_tables = self.get_all_tables()
-        else:
-            yield from self.process_schemas(connection, database)
+        with RedshiftSqlLineage(
+            config=self.config,
+            report=self.report,
+            context=self.ctx,
+            database=database,
+            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
+        ) as lineage_extractor:
+            yield from lineage_extractor.aggregator.register_schemas_from_stream(
+                self.process_schemas(connection, database)
+            )
 
-            all_tables = self.get_all_tables()
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage_v2(
+                    connection=connection,
+                    database=database,
+                    lineage_extractor=lineage_extractor,
+                )
 
-            if (
-                self.config.include_table_lineage
-                or self.config.include_view_lineage
-                or self.config.include_copy_lineage
-            ):
-                with self.report.new_stage(LINEAGE_EXTRACTION):
-                    yield from self.extract_lineage(
-                        connection=connection, all_tables=all_tables, database=database
-                    )
+            all_tables = self.get_all_tables()
 
         if self.config.include_usage_statistics:
             with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
@@ -968,45 +949,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
 
-    def extract_lineage(
-        self,
-        connection: redshift_connector.Connection,
-        database: str,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-    ) -> Iterable[MetadataWorkUnit]:
-        if not self._should_ingest_lineage():
-            return
-
-        lineage_extractor = RedshiftLineageExtractor(
-            config=self.config,
-            report=self.report,
-            context=self.ctx,
-            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
-        )
-
-        with PerfTimer() as timer:
-            lineage_extractor.populate_lineage(
-                database=database, connection=connection, all_tables=all_tables
-            )
-
-            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
-                digits=2
-            )
-            yield from self.generate_lineage(
-                database, lineage_extractor=lineage_extractor
-            )
-
-            if self.redundant_lineage_run_skip_handler:
-                # Update the checkpoint state for this run.
-                self.redundant_lineage_run_skip_handler.update_state(
-                    self.config.start_time, self.config.end_time
-                )
-
     def extract_lineage_v2(
         self,
         connection: redshift_connector.Connection,
         database: str,
-        lineage_extractor: RedshiftSqlLineageV2,
+        lineage_extractor: RedshiftSqlLineage,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.include_share_lineage:
             outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
@@ -1069,40 +1016,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         return True
 
-    def generate_lineage(
-        self, database: str, lineage_extractor: RedshiftLineageExtractor
-    ) -> Iterable[MetadataWorkUnit]:
-        logger.info(f"Generate lineage for {database}")
-        for schema in deduplicate_list(
-            itertools.chain(self.db_tables[database], self.db_views[database])
-        ):
-            if (
-                database not in self.db_schemas
-                or schema not in self.db_schemas[database]
-            ):
-                logger.warning(
-                    f"Either database {database} or {schema} exists in the lineage but was not discovered earlier. Something went wrong."
-                )
-                continue
-
-            table_or_view: Union[RedshiftTable, RedshiftView]
-            for table_or_view in (
-                []
-                + self.db_tables[database].get(schema, [])
-                + self.db_views[database].get(schema, [])
-            ):
-                datahub_dataset_name = f"{database}.{schema}.{table_or_view.name}"
-                dataset_urn = self.gen_dataset_urn(datahub_dataset_name)
-
-                lineage_info = lineage_extractor.get_lineage(
-                    table_or_view,
-                    dataset_urn,
-                    self.db_schemas[database][schema],
-                )
-                if lineage_info:
-                    # incremental lineage generation is taken care by auto_incremental_lineage
-                    yield from gen_lineage(dataset_urn, lineage_info)
-
     def add_config_to_report(self):
         self.report.stateful_lineage_ingestion_enabled = (
             self.config.enable_stateful_lineage_ingestion
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -441,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             tables = self.fetch_tables_for_schema(
                 snowflake_schema, db_name, schema_name
             )
+        if self.config.include_views:
+            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+
+        if self.config.include_tables:
             db_tables[schema_name] = tables
             yield from self._process_tables(
                 tables, snowflake_schema, db_name, schema_name
             )
 
         if self.config.include_views:
-            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
             yield from self._process_views(
                 views, snowflake_schema, db_name, schema_name
             )
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -199,6 +199,7 @@ class SnowflakeV2Source(
             ),
             generate_usage_statistics=False,
             generate_operations=False,
+            generate_queries=self.config.include_queries,
             format_queries=self.config.format_sql_queries,
             is_temp_table=self._is_temp_table,
             is_allowed_table=self._is_allowed_table,
datahub/ingestion/source/sql/mssql/job_models.py

@@ -134,7 +134,9 @@ class StoredProcedure:
 
     @property
     def escape_full_name(self) -> str:
-        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
+        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]".replace(
+            "'", r"''"
+        )
 
     def to_base_procedure(self) -> BaseProcedure:
         return BaseProcedure(
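The escape_full_name change doubles any single quotes in the bracketed name, which matters once the value is embedded inside a T-SQL string literal. A minimal standalone sketch of the same idea; the ProcedureName class below is a simplified stand-in for illustration, not the StoredProcedure model from job_models.py:

from dataclasses import dataclass

@dataclass
class ProcedureName:
    # Simplified stand-in for the fields used by escape_full_name.
    db: str
    schema: str
    formatted_name: str

    @property
    def escape_full_name(self) -> str:
        # Double single quotes so the value is safe inside a T-SQL '...' literal.
        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]".replace("'", r"''")

proc = ProcedureName(db="sales", schema="dbo", formatted_name="load_o'brien_accounts")
print(proc.escape_full_name)
# [sales].[dbo].[load_o''brien_accounts]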
datahub/ingestion/source/sql/mssql/source.py

@@ -10,6 +10,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError
+from sqlalchemy.sql import quoted_name
 
 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
@@ -130,10 +131,14 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         "match the entire table name in database.schema.table format. Defaults are to set in such a way "
         "to ignore the temporary staging tables created by known ETL tools.",
     )
+    quote_schemas: bool = Field(
+        default=False,
+        description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
-        if values["use_odbc"] and "driver" not in v:
+        if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
             raise ValueError("uri_args must contain a 'driver' option")
         elif not values["use_odbc"] and v:
             raise ValueError("uri_args is not supported when ODBC is disabled")
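The new quote_schemas flag wraps each schema name in SQLAlchemy's quoted_name, a str subclass that carries a quoting preference alongside the value. A tiny sketch of the primitive itself (the schema name is made up):

from sqlalchemy.sql import quoted_name

# quoted_name behaves like a normal string but records that it should be quoted in emitted SQL.
schema = quoted_name("MySchema", True)
print(isinstance(schema, str), schema.quote)
# True True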
@@ -159,7 +164,15 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
             uri_opts=uri_opts,
         )
         if self.use_odbc:
-            uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}"
+            final_uri_args = self.uri_args.copy()
+            if final_uri_args and current_db:
+                final_uri_args.update({"database": current_db})
+
+            uri = (
+                f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
+                if final_uri_args
+                else uri
+            )
         return uri
 
     @property
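For illustration, a small hedged sketch of what the new ODBC branch produces; the uri_args, database name, and base URI below are made-up values, and only urllib.parse.urlencode behaviour is relied on:

import urllib.parse

# Hypothetical inputs standing in for self.uri_args and current_db.
uri_args = {"driver": "ODBC Driver 18 for SQL Server", "TrustServerCertificate": "yes"}
current_db = "AdventureWorks"
uri = "mssql+pyodbc://user@myserver"

final_uri_args = uri_args.copy()
if final_uri_args and current_db:
    # The currently targeted database now rides along in the ODBC query string.
    final_uri_args.update({"database": current_db})

uri = f"{uri}?{urllib.parse.urlencode(final_uri_args)}" if final_uri_args else uri
print(uri)
# mssql+pyodbc://user@myserver?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes&database=AdventureWorks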
@@ -923,7 +936,11 @@ class SQLServerSource(SQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
 
-        if self.config.database and self.config.database != "":
+        if (
+            self.config.database
+            and self.config.database != ""
+            or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
+        ):
             inspector = inspect(engine)
             yield inspector
         else:
@@ -1020,3 +1037,45 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else table_ref_str
         )
+
+    def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
+        for schema in super().get_allowed_schemas(inspector, db_name):
+            if self.config.quote_schemas:
+                yield quoted_name(schema, True)
+            else:
+                yield schema
+
+    def get_db_name(self, inspector: Inspector) -> str:
+        engine = inspector.engine
+
+        try:
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "database")
+                and engine.url.database
+            ):
+                return str(engine.url.database).strip('"')
+
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "query")
+                and "odbc_connect" in engine.url.query
+            ):
+                # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
+                database = re.search(
+                    r"DATABASE=([^;]*);",
+                    urllib.parse.unquote_plus(str(engine.url.query["odbc_connect"])),
+                    flags=re.IGNORECASE,
+                )
+
+                if database and database.group(1):
+                    return database.group(1)
+
+            return ""
+
+        except Exception as e:
+            raise RuntimeError(
+                "Unable to get database name from Sqlalchemy inspector"
+            ) from e
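The new get_db_name falls back to parsing the DATABASE keyword out of an odbc_connect string when the URL itself carries no database. A self-contained approximation of that regex step, using a made-up connection string:

import re
import urllib.parse

# Hypothetical raw odbc_connect value as it would appear, URL-encoded, in the SQLAlchemy URL.
odbc_connect = urllib.parse.quote_plus(
    "DRIVER={ODBC Driver 18 for SQL Server};SERVER=myserver;DATABASE=DemoDB;UID=user;PWD=secret;"
)

match = re.search(
    r"DATABASE=([^;]*);",
    urllib.parse.unquote_plus(odbc_connect),
    flags=re.IGNORECASE,
)
print(match.group(1) if match else "")
# DemoDB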
datahub/ingestion/source/unity/config.py

@@ -35,6 +35,10 @@ from datahub.utilities.global_warning_util import add_global_warning
 
 logger = logging.getLogger(__name__)
 
+# Configuration default constants
+INCLUDE_TAGS_DEFAULT = True
+INCLUDE_HIVE_METASTORE_DEFAULT = True
+
 
 class LineageDataSource(ConfigEnum):
     AUTO = "AUTO"
@@ -137,10 +141,18 @@ class UnityCatalogSourceConfig(
     )
     warehouse_id: Optional[str] = pydantic.Field(
         default=None,
-        description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.",
+        description=(
+            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
+            "Required for the following features that need SQL access: "
+            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
+            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
+            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
+            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
+            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
+        ),
     )
     include_hive_metastore: bool = pydantic.Field(
-        default=True,
+        default=INCLUDE_HIVE_METASTORE_DEFAULT,
         description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
     )
     workspace_name: Optional[str] = pydantic.Field(
@@ -236,8 +248,12 @@ class UnityCatalogSourceConfig(
     )
 
     include_tags: bool = pydantic.Field(
-        default=True,
-        description="Option to enable/disable column/table tag extraction.",
+        default=INCLUDE_TAGS_DEFAULT,
+        description=(
+            "Option to enable/disable column/table tag extraction. "
+            "Requires warehouse_id to be set since tag extraction needs to query system.information_schema.tags. "
+            "If warehouse_id is not provided, this will be automatically disabled to allow ingestion to continue."
+        ),
     )
 
     _rename_table_ownership = pydantic_renamed_field(
@@ -310,8 +326,62 @@ class UnityCatalogSourceConfig(
         description="Details about the delta lake, incase to emit siblings",
     )
 
+    include_ml_model_aliases: bool = pydantic.Field(
+        default=False,
+        description="Whether to include ML model aliases in the ingestion.",
+    )
+
+    ml_model_max_results: int = pydantic.Field(
+        default=1000,
+        ge=0,
+        description="Maximum number of ML models to ingest.",
+    )
+
+    _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
+    _forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
+
     scheme: str = DATABRICKS
 
+    def __init__(self, **data):
+        # First, let the parent handle the root validators and field processing
+        super().__init__(**data)
+
+        # After model creation, check if we need to auto-disable features
+        # based on the final warehouse_id value (which may have been set by root validators)
+        include_tags_original = data.get("include_tags", INCLUDE_TAGS_DEFAULT)
+        include_hive_metastore_original = data.get(
+            "include_hive_metastore", INCLUDE_HIVE_METASTORE_DEFAULT
+        )
+
+        # Track what we're force-disabling
+        forced_disable_tag_extraction = False
+        forced_disable_hive_metastore_extraction = False
+
+        # Check if features should be auto-disabled based on final warehouse_id
+        if include_tags_original and not self.warehouse_id:
+            forced_disable_tag_extraction = True
+            self.include_tags = False  # Modify the model attribute directly
+            logger.warning(
+                "warehouse_id is not set but include_tags=True. "
+                "Automatically disabling tag extraction since it requires SQL queries. "
+                "Set warehouse_id to enable tag extraction."
+            )
+
+        if include_hive_metastore_original and not self.warehouse_id:
+            forced_disable_hive_metastore_extraction = True
+            self.include_hive_metastore = False  # Modify the model attribute directly
+            logger.warning(
+                "warehouse_id is not set but include_hive_metastore=True. "
+                "Automatically disabling hive metastore extraction since it requires SQL queries. "
+                "Set warehouse_id to enable hive metastore extraction."
+            )
+
+        # Set private attributes
+        self._forced_disable_tag_extraction = forced_disable_tag_extraction
+        self._forced_disable_hive_metastore_extraction = (
+            forced_disable_hive_metastore_extraction
+        )
+
     def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
         if database:
@@ -381,11 +451,6 @@ class UnityCatalogSourceConfig(
                 "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
             )
 
-        if values.get("include_hive_metastore") and not values.get("warehouse_id"):
-            raise ValueError(
-                "When `include_hive_metastore` is set, `warehouse_id` must be set."
-            )
-
         if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
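The UnityCatalogSourceConfig changes replace the old hard validation error with a post-init auto-disable of SQL-dependent features. A toy pydantic model sketching the same pattern; this is not the real config class, and only the two field names are borrowed for illustration:

import logging
from typing import Optional

import pydantic

logger = logging.getLogger(__name__)

INCLUDE_TAGS_DEFAULT = True

class ToyUnityConfig(pydantic.BaseModel):
    # Simplified stand-in for UnityCatalogSourceConfig, showing the auto-disable pattern.
    warehouse_id: Optional[str] = None
    include_tags: bool = INCLUDE_TAGS_DEFAULT

    _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)

    def __init__(self, **data):
        super().__init__(**data)
        # If tags were requested (explicitly or by default) but no SQL warehouse is
        # available, downgrade gracefully instead of failing validation.
        if data.get("include_tags", INCLUDE_TAGS_DEFAULT) and not self.warehouse_id:
            self.include_tags = False
            self._forced_disable_tag_extraction = True
            logger.warning("warehouse_id is not set; disabling tag extraction.")

print(ToyUnityConfig().include_tags)                         # False (auto-disabled)
print(ToyUnityConfig(warehouse_id="abc123").include_tags)    # True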