acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.


Files changed (25)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/METADATA +2324 -2324
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/RECORD +25 -25
  3. datahub/__init__.py +1 -1
  4. datahub/cli/cli_utils.py +12 -1
  5. datahub/emitter/rest_emitter.py +140 -92
  6. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  7. datahub/ingestion/api/source.py +4 -0
  8. datahub/ingestion/glossary/classifier.py +2 -3
  9. datahub/ingestion/graph/client.py +14 -11
  10. datahub/ingestion/graph/config.py +1 -1
  11. datahub/ingestion/source/aws/glue.py +52 -35
  12. datahub/ingestion/source/bigquery_v2/bigquery.py +2 -0
  13. datahub/ingestion/source/bigquery_v2/bigquery_config.py +8 -0
  14. datahub/ingestion/source/datahub/config.py +10 -0
  15. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  16. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  17. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +11 -7
  18. datahub/ingestion/source/snowflake/snowflake_config.py +8 -0
  19. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  20. datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
  21. datahub/ingestion/source/unity/source.py +0 -4
  22. datahub/sql_parsing/sql_parsing_aggregator.py +8 -5
  23. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/WHEEL +0 -0
  24. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/entry_points.txt +0 -0
  25. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/glue.py
@@ -1054,49 +1054,66 @@ class GlueSource(StatefulIngestionSourceBase):
             yield from self.gen_database_containers(database)
 
         for table in tables:
-            database_name = table["DatabaseName"]
             table_name = table["Name"]
-            full_table_name = f"{database_name}.{table_name}"
-            self.report.report_table_scanned()
-            if not self.source_config.database_pattern.allowed(
-                database_name
-            ) or not self.source_config.table_pattern.allowed(full_table_name):
-                self.report.report_table_dropped(full_table_name)
-                continue
+            try:
+                yield from self._gen_table_wu(table=table)
+            except KeyError as e:
+                self.report.report_failure(
+                    message="Failed to extract workunit for table",
+                    context=f"Table: {table_name}",
+                    exc=e,
+                )
+        if self.extract_transforms:
+            yield from self._transform_extraction()
 
-            dataset_urn = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=full_table_name,
-                env=self.env,
-                platform_instance=self.source_config.platform_instance,
-            )
+    def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+        database_name = table["DatabaseName"]
+        table_name = table["Name"]
+        full_table_name = f"{database_name}.{table_name}"
+        self.report.report_table_scanned()
+        if not self.source_config.database_pattern.allowed(
+            database_name
+        ) or not self.source_config.table_pattern.allowed(full_table_name):
+            self.report.report_table_dropped(full_table_name)
+            return
+
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=full_table_name,
+            env=self.env,
+            platform_instance=self.source_config.platform_instance,
+        )
 
-            mce = self._extract_record(dataset_urn, table, full_table_name)
-            yield MetadataWorkUnit(full_table_name, mce=mce)
+        mce = self._extract_record(dataset_urn, table, full_table_name)
+        yield MetadataWorkUnit(full_table_name, mce=mce)
 
-            # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
-            # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
-            ).as_workunit()
+        # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+        # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+        ).as_workunit()
 
-            yield from self._get_domain_wu(
-                dataset_name=full_table_name,
-                entity_urn=dataset_urn,
-            )
-            yield from self.add_table_to_database_container(
-                dataset_urn=dataset_urn, db_name=database_name
-            )
+        yield from self._get_domain_wu(
+            dataset_name=full_table_name,
+            entity_urn=dataset_urn,
+        )
+        yield from self.add_table_to_database_container(
+            dataset_urn=dataset_urn, db_name=database_name
+        )
 
-            wu = self.get_lineage_if_enabled(mce)
-            if wu:
-                yield wu
+        wu = self.get_lineage_if_enabled(mce)
+        if wu:
+            yield wu
 
+        try:
             yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
-        if self.extract_transforms:
-            yield from self._transform_extraction()
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
+            )
 
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
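
The net effect of this hunk is that per-table extraction moves into `_gen_table_wu`, and a `KeyError` raised while processing one table is reported and skipped instead of aborting the whole run. A minimal standalone sketch of that pattern (hypothetical names, not the actual GlueSource code):

```python
# Minimal sketch: one malformed table is reported and skipped, the rest still yield.
from typing import Any, Dict, Iterable, List


def gen_table_names(tables: List[Dict[str, Any]]) -> Iterable[str]:
    for table in tables:
        table_name = table.get("Name", "<unknown>")
        try:
            # A missing key (e.g. "DatabaseName") raises KeyError for this table only.
            yield f'{table["DatabaseName"]}.{table["Name"]}'
        except KeyError as e:
            print(f"Failed to extract workunit for table {table_name}: missing {e}")


print(list(gen_table_names([
    {"DatabaseName": "sales", "Name": "orders"},
    {"Name": "broken"},  # no DatabaseName -> reported, not fatal
])))  # ['sales.orders']
```

Catching only `KeyError` keeps genuinely unexpected failures visible while tolerating malformed catalog entries.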
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -281,6 +281,8 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_statistics,
                 include_operations=self.config.usage.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
                 top_n_queries=self.config.usage.top_n_queries,
                 region_qualifiers=self.config.region_qualifiers,
             ),
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )
 
     @property
     def have_table_data_read_permission(self) -> bool:
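
Both new flags default to on but only take effect when `use_queries_v2` is enabled. A standalone pydantic sketch (not the real `BigQueryV2Config`) showing how the fields behave:

```python
# Standalone sketch of the new flags: both default to True and are only
# consulted by the queries-v2 extractor.
from pydantic import BaseModel, Field


class QueriesV2Flags(BaseModel):
    use_queries_v2: bool = Field(default=False)
    include_queries: bool = Field(default=True)
    include_query_usage_statistics: bool = Field(default=True)


cfg = QueriesV2Flags(use_queries_v2=True, include_query_usage_statistics=False)
print(cfg.include_queries)                 # True  (query entities still emitted)
print(cfg.include_query_usage_statistics)  # False (popularity stats disabled)
```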
datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -119,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
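
The validator rejects any MySQL-family scheme that is not exactly `mysql+pymysql`, so the source always goes through the PyMySQL driver. A self-contained sketch of the same check on a hypothetical model, using a pydantic v1-style validator as the diff does:

```python
# Hypothetical ConnectionConfig (not the real SQLAlchemyConnectionConfig)
# demonstrating the scheme validation.
import pydantic


class ConnectionConfig(pydantic.BaseModel):
    scheme: str = "mysql+pymysql"

    @pydantic.validator("scheme")
    def validate_mysql_scheme(cls, v: str) -> str:
        if "mysql" in v and v != "mysql+pymysql":
            raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
        return v


ConnectionConfig(scheme="mysql+pymysql")  # accepted
try:
    ConnectionConfig(scheme="mysql")      # rejected: wrong driver scheme
except pydantic.ValidationError as err:
    print(err)
```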
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
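With the MySQLdb-specific server-side cursor removed, PostgreSQL, MySQL, and MariaDB all share SQLAlchemy's `stream_results` path. A rough sketch of that streaming pattern, assuming SQLAlchemy 1.4.40+ (where `yield_per` is accepted as an execution option) and using a placeholder connection URL and query:

```python
# Rough sketch of server-side streaming with SQLAlchemy, mirroring the
# stream_results/yield_per pattern above. DB_URL and the query are placeholders.
from sqlalchemy import create_engine, text

DB_URL = "postgresql+psycopg2://user:pass@localhost/datahub"  # placeholder

engine = create_engine(DB_URL)
with engine.connect() as conn:
    with conn.begin():  # PostgreSQL needs a transaction for a server-side cursor
        conn = conn.execution_options(stream_results=True, yield_per=2000)
        result = conn.execute(text("SELECT urn, version FROM metadata_aspect_v2"))
        for row in result:
            print(row._mapping["urn"])  # rows arrive in batches of 2000
```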
datahub/ingestion/source/datahub/datahub_source.py
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
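
Dropping the `= []` default avoids Python's shared-mutable-default pitfall; callers must now pass the list explicitly. A two-call demonstration of why that default is risky:

```python
# A mutable default is created once at function definition time and shared
# across calls, so state leaks between invocations.
from typing import List


def collect(urns: List[str] = []) -> List[str]:  # anti-pattern
    urns.append("urn:li:dataset:example")
    return urns


print(collect())  # ['urn:li:dataset:example']
print(collect())  # ['urn:li:dataset:example', 'urn:li:dataset:example'] -- shared list
```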
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
-QUERY_QUERY_ENTITY = """
-query listQueries($input: ScrollAcrossEntitiesInput!) {
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
   scrollAcrossEntities(input: $input) {
     nextScrollId
     count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
         ... on QueryEntity {
           urn
         }
+        ... on DataProcessInstance {
+          urn
+        }
       }
     }
   }
@@ -225,16 +228,16 @@ class SoftDeletedEntitiesCleanup:
                 time.sleep(self.config.delay)
         return futures
 
-    def _get_soft_deleted_queries(self) -> Iterable[str]:
+    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
-                    QUERY_QUERY_ENTITY,
+                    graphql_query,
                     {
                         "input": {
-                            "types": ["QUERY"],
+                            "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
                             "count": self.config.batch_size,
@@ -254,7 +257,7 @@ class SoftDeletedEntitiesCleanup:
                 )
             except Exception as e:
                 self.report.failure(
-                    f"While trying to get queries with {scroll_id}", exc=e
+                    f"While trying to get {entity_type} with {scroll_id}", exc=e
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
@@ -275,7 +278,8 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted_queries()
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
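
Together these hunks generalize the query-only cleanup into a scroll over an arbitrary entity type, which is now run for both `QUERY` and `DATA_PROCESS_INSTANCE`. A simplified sketch of the scroll loop; the `execute_graphql` callable and the `searchResults`/`entity`/`urn` field names are assumptions based on DataHub's `scrollAcrossEntities` response shape, and the real code additionally scopes the search to soft-deleted entities and reports failures:

```python
# Simplified pagination over scrollAcrossEntities: keep requesting pages until
# nextScrollId comes back empty.
from typing import Callable, Dict, Iterable, Optional


def iter_entity_urns(
    execute_graphql: Callable[[str, Dict], Dict],
    graphql_query: str,
    entity_type: str,
    batch_size: int = 1000,
) -> Iterable[str]:
    scroll_id: Optional[str] = None
    while True:
        result = execute_graphql(
            graphql_query,
            {
                "input": {
                    "types": [entity_type],
                    "query": "*",
                    "scrollId": scroll_id,
                    "count": batch_size,
                }
            },
        )
        page = result.get("scrollAcrossEntities") or {}
        for hit in page.get("searchResults", []):
            yield hit["entity"]["urn"]
        scroll_id = page.get("nextScrollId")
        if not scroll_id:  # no more pages
            break


# e.g. iter_entity_urns(graph.execute_graphql, QUERY_ENTITIES, "QUERY")
```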
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -221,6 +221,14 @@ class SnowflakeV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )
 
     lazy_schema_resolver: bool = Field(
         default=True,
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -40,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime
 
@@ -239,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)
 
         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -528,6 +528,8 @@ class SnowflakeV2Source(
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_stats,
                 include_operations=self.config.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
             ),
             structured_report=self.report,
datahub/ingestion/source/unity/source.py
@@ -26,9 +26,6 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
-from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
-    EnsureAspectSizeProcessor,
-)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -263,7 +260,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
            ).workunit_processor,
-            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/sql_parsing/sql_parsing_aggregator.py
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
     timestamp: Optional[datetime] = None
     session_id: Optional[str] = None
     query_type: QueryType = QueryType.UNKNOWN
+    query_id: Optional[str] = None
 
 
 @dataclasses.dataclass
@@ -618,11 +619,13 @@ class SqlParsingAggregator(Closeable):
         self.report.num_known_query_lineage += 1
 
         # Generate a fingerprint for the query.
-        with self.report.sql_fingerprinting_timer:
-            query_fingerprint = get_query_fingerprint(
-                known_query_lineage.query_text,
-                platform=self.platform.platform_name,
-            )
+        query_fingerprint = known_query_lineage.query_id
+        if not query_fingerprint:
+            with self.report.sql_fingerprinting_timer:
+                query_fingerprint = get_query_fingerprint(
+                    known_query_lineage.query_text,
+                    platform=self.platform.platform_name,
+                )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)
 
         # Register the query.
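
The aggregator now prefers a caller-supplied `query_id` (such as the `fast=True` fingerprint computed in the Snowflake lineage hunk above) and only fingerprints the SQL itself as a fallback. A small sketch of that precedence; the helper function is illustrative, while `get_query_fingerprint` is the function imported in the diff:

```python
# Precedence sketch: a precomputed query_id wins; otherwise compute a
# fingerprint from the query text.
from typing import Optional

from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint


def resolve_fingerprint(
    query_text: str, platform: str, query_id: Optional[str] = None
) -> str:
    if query_id:
        return query_id  # e.g. the fast=True fingerprint computed upstream
    return get_query_fingerprint(query_text, platform=platform)


print(resolve_fingerprint("SELECT 1", platform="snowflake"))
print(resolve_fingerprint("SELECT 1", platform="snowflake", query_id="abc123"))
```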