acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (51)
  1. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
  2. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataproduct/dataproduct.py +26 -0
  5. datahub/cli/config_utils.py +18 -10
  6. datahub/cli/docker_check.py +2 -1
  7. datahub/cli/docker_cli.py +4 -2
  8. datahub/cli/graphql_cli.py +1422 -0
  9. datahub/cli/quickstart_versioning.py +2 -2
  10. datahub/cli/specific/dataproduct_cli.py +2 -4
  11. datahub/cli/specific/user_cli.py +172 -1
  12. datahub/configuration/env_vars.py +331 -0
  13. datahub/configuration/kafka.py +6 -4
  14. datahub/emitter/mce_builder.py +2 -4
  15. datahub/emitter/rest_emitter.py +15 -15
  16. datahub/entrypoints.py +2 -0
  17. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  18. datahub/ingestion/api/source.py +5 -0
  19. datahub/ingestion/graph/client.py +197 -0
  20. datahub/ingestion/graph/config.py +2 -2
  21. datahub/ingestion/sink/datahub_rest.py +6 -5
  22. datahub/ingestion/source/aws/aws_common.py +20 -13
  23. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
  24. datahub/ingestion/source/grafana/models.py +5 -0
  25. datahub/ingestion/source/iceberg/iceberg.py +39 -19
  26. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
  27. datahub/ingestion/source/mode.py +13 -0
  28. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  29. datahub/ingestion/source/schema_inference/object.py +22 -6
  30. datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
  31. datahub/ingestion/source/sql/mssql/source.py +7 -1
  32. datahub/ingestion/source/sql/teradata.py +80 -65
  33. datahub/ingestion/source/unity/config.py +31 -0
  34. datahub/ingestion/source/unity/proxy.py +73 -0
  35. datahub/ingestion/source/unity/source.py +27 -70
  36. datahub/ingestion/source/unity/usage.py +46 -4
  37. datahub/metadata/_internal_schema_classes.py +544 -544
  38. datahub/metadata/_urns/urn_defs.py +1728 -1728
  39. datahub/metadata/schema.avsc +15157 -15157
  40. datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
  41. datahub/sql_parsing/sqlglot_lineage.py +7 -0
  42. datahub/telemetry/telemetry.py +8 -3
  43. datahub/utilities/file_backed_collections.py +2 -2
  44. datahub/utilities/is_pytest.py +3 -2
  45. datahub/utilities/logging_manager.py +22 -6
  46. datahub/utilities/sample_data.py +5 -4
  47. datahub/emitter/sql_parsing_builder.py +0 -306
  48. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
  49. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
  50. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
  51. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/unity/source.py
@@ -30,7 +30,6 @@ from datahub.emitter.mcp_builder import (
     add_entity_to_container,
     gen_containers,
 )
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -142,11 +141,7 @@ from datahub.metadata.schema_classes import (
 from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
 from datahub.sdk import MLModel, MLModelGroup
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -199,6 +194,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
         None
     )
+    sql_parsing_aggregator: Optional[SqlParsingAggregator] = None
 
     def get_report(self) -> UnityCatalogReport:
         return self.report
@@ -218,6 +214,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             report=self.report,
             hive_metastore_proxy=self.hive_metastore_proxy,
             lineage_data_source=config.lineage_data_source,
+            usage_data_source=config.usage_data_source,
             databricks_api_page_size=config.databricks_api_page_size,
         )
 
@@ -244,9 +241,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
-            FileBackedDict()
-        )
 
         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -290,6 +284,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
             )
+            self.sql_parsing_aggregator = SqlParsingAggregator(
+                platform=self.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                schema_resolver=self.sql_parser_schema_resolver,
+                generate_lineage=True,
+                generate_queries=False,
+                generate_usage_statistics=False,
+                generate_operations=False,
+            )
+            self.report.sql_aggregator = self.sql_parsing_aggregator.report
         except Exception as e:
             logger.debug("Exception", exc_info=True)
             self.warn(
@@ -629,8 +634,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             self.sql_parser_schema_resolver.add_schema_metadata(
                 dataset_urn, schema_metadata
             )
-        if table.view_definition:
-            self.view_definitions[dataset_urn] = (table.ref, table.view_definition)
+        if table.view_definition and self.sql_parsing_aggregator:
+            self.sql_parsing_aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=table.view_definition,
+                default_db=table.ref.catalog,
+                default_schema=table.ref.schema,
+            )
 
         if (
             table_props.customProperties.get("table_type")
@@ -1334,75 +1344,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         ]
 
-    def _run_sql_parser(
-        self, view_ref: TableReference, query: str, schema_resolver: SchemaResolver
-    ) -> Optional[SqlParsingResult]:
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=view_ref.catalog,
-            default_schema=view_ref.schema,
-        )
-        view_urn = self.gen_dataset_urn(view_ref)
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_ref}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-        if raw_lineage.out_tables != [view_urn]:
-            self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
-
     def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
         if not (
             self.config.include_hive_metastore
             and self.config.include_table_lineage
-            and self.sql_parser_schema_resolver
+            and self.sql_parsing_aggregator
         ):
             return
-        # This is only used for parsing view lineage. Usage, Operations are emitted elsewhere
-        builder = SqlParsingBuilder(
-            generate_lineage=True,
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self.view_definitions:
-            view_ref, view_definition = self.view_definitions[dataset_name]
-            result = self._run_sql_parser(
-                view_ref,
-                view_definition,
-                self.sql_parser_schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-        yield from builder.gen_workunits()
+
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
 
     def close(self):
         if self.hive_metastore_proxy:
             self.hive_metastore_proxy.close()
-        if self.view_definitions:
-            self.view_definitions.close()
+        if self.sql_parsing_aggregator:
+            self.sql_parsing_aggregator.close()
         if self.sql_parser_schema_resolver:
             self.sql_parser_schema_resolver.close()
 
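Taken together, the source.py hunks replace the hand-rolled _run_sql_parser / SqlParsingBuilder path with a single SqlParsingAggregator: view definitions are registered while schemas are extracted, lineage work units are emitted once in get_view_lineage(), and the aggregator is closed in close(). A minimal standalone sketch of that pattern, using only the calls visible in this diff (the URN, view SQL, and catalog/schema names below are placeholders, not values from the package):

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.sql_parsing.schema_resolver import SchemaResolver
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

# Configure the aggregator the same way the Unity Catalog source now does:
# lineage only, no query/usage/operation generation.
aggregator = SqlParsingAggregator(
    platform="databricks",
    env="PROD",
    schema_resolver=SchemaResolver(platform="databricks", env="PROD"),
    generate_lineage=True,
    generate_queries=False,
    generate_usage_statistics=False,
    generate_operations=False,
)

# Register each view definition as it is discovered (placeholder view).
aggregator.add_view_definition(
    view_urn=make_dataset_urn("databricks", "main.default.orders_view"),
    view_definition="CREATE VIEW orders_view AS SELECT id, amount FROM main.default.orders",
    default_db="main",
    default_schema="default",
)

# After extraction, emit the parsed lineage as work units, then clean up.
for mcp in aggregator.gen_metadata():
    workunit = mcp.as_workunit()
aggregator.close()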
datahub/ingestion/source/unity/usage.py
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig
+from datahub.ingestion.source.unity.config import (
+    UnityCatalogSourceConfig,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
 from datahub.ingestion.source.unity.proxy_types import (
     OPERATION_STATEMENT_TYPES,
@@ -164,11 +167,50 @@ class UnityCatalogUsageExtractor:
             aspect=operation_aspect,
         ).as_workunit()
 
+    def _validate_usage_data_source_config(self) -> None:
+        """Validate usage data source configuration before execution."""
+        usage_data_source = self.config.usage_data_source
+
+        if (
+            usage_data_source == UsageDataSource.SYSTEM_TABLES
+            and not self.proxy.warehouse_id
+        ):
+            raise ValueError(
+                "usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
+                "Either set warehouse_id or use AUTO/API mode."
+            )
+
     def _get_queries(self) -> Iterable[Query]:
         try:
-            yield from self.proxy.query_history(
-                self.config.start_time, self.config.end_time
-            )
+            self._validate_usage_data_source_config()
+            usage_data_source = self.config.usage_data_source
+
+            if usage_data_source == UsageDataSource.AUTO:
+                if self.proxy.warehouse_id:
+                    logger.info(
+                        "Using system tables for usage query history (AUTO mode)"
+                    )
+                    yield from self.proxy.get_query_history_via_system_tables(
+                        self.config.start_time, self.config.end_time
+                    )
+                else:
+                    logger.info(
+                        "Using API for usage query history (AUTO mode, no warehouse)"
+                    )
+                    yield from self.proxy.query_history(
+                        self.config.start_time, self.config.end_time
+                    )
+            elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
+                logger.info("Using system tables for usage query history (forced)")
+                yield from self.proxy.get_query_history_via_system_tables(
+                    self.config.start_time, self.config.end_time
+                )
+            elif usage_data_source == UsageDataSource.API:
+                logger.info("Using API for usage query history (forced)")
+                yield from self.proxy.query_history(
+                    self.config.start_time, self.config.end_time
+                )
+
         except Exception as e:
             logger.warning("Error getting queries", exc_info=True)
             self.report.report_warning("get-queries", str(e))
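The usage.py change introduces a usage_data_source switch (AUTO, SYSTEM_TABLES, or API) that decides whether query history is read from Databricks system tables or from the query-history API: AUTO prefers system tables when a warehouse_id is available and otherwise falls back to the API, while forcing SYSTEM_TABLES without a warehouse_id fails validation with a ValueError. A hedged sketch of driving the new option through the connector config; usage_data_source and warehouse_id come from this diff, token and workspace_url are the connector's standard connection settings, and all values are placeholders:

from datahub.ingestion.source.unity.config import (
    UnityCatalogSourceConfig,
    UsageDataSource,
)

# Force system-tables-based usage extraction; warehouse_id must be set in this mode.
config = UnityCatalogSourceConfig(
    token="<databricks-personal-access-token>",
    workspace_url="https://<workspace>.cloud.databricks.com",
    warehouse_id="<sql-warehouse-id>",
    usage_data_source=UsageDataSource.SYSTEM_TABLES,
)

# With usage_data_source=UsageDataSource.AUTO the extractor picks system tables when a
# warehouse_id is configured and the query-history API otherwise.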