acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/metadata/_internal_schema_classes.py +544 -544
- datahub/metadata/_urns/urn_defs.py +1728 -1728
- datahub/metadata/schema.avsc +15157 -15157
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0
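Of note in the list above: datahub/emitter/sql_parsing_builder.py is deleted outright (+0 -306), and the Unity Catalog source hunks below show the migration path. For downstream code the import-level change looks roughly like the following sketch; only the two import paths are taken from this diff, the comments are commentary.

# rc2 and earlier -- module deleted in rc4:
# from datahub.emitter.sql_parsing_builder import SqlParsingBuilder

# rc4 replacement, as adopted by the Unity Catalog source below:
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator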
datahub/ingestion/source/unity/source.py
@@ -30,7 +30,6 @@ from datahub.emitter.mcp_builder import (
     add_entity_to_container,
     gen_containers,
 )
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -142,11 +141,7 @@ from datahub.metadata.schema_classes import (
 from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
 from datahub.sdk import MLModel, MLModelGroup
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -199,6 +194,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
         None
     )
+    sql_parsing_aggregator: Optional[SqlParsingAggregator] = None

     def get_report(self) -> UnityCatalogReport:
         return self.report
@@ -218,6 +214,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             report=self.report,
             hive_metastore_proxy=self.hive_metastore_proxy,
             lineage_data_source=config.lineage_data_source,
+            usage_data_source=config.usage_data_source,
             databricks_api_page_size=config.databricks_api_page_size,
         )

@@ -244,9 +241,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
-            FileBackedDict()
-        )

         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -290,6 +284,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
             )
+            self.sql_parsing_aggregator = SqlParsingAggregator(
+                platform=self.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                schema_resolver=self.sql_parser_schema_resolver,
+                generate_lineage=True,
+                generate_queries=False,
+                generate_usage_statistics=False,
+                generate_operations=False,
+            )
+            self.report.sql_aggregator = self.sql_parsing_aggregator.report
         except Exception as e:
             logger.debug("Exception", exc_info=True)
             self.warn(
@@ -629,8 +634,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             self.sql_parser_schema_resolver.add_schema_metadata(
                 dataset_urn, schema_metadata
             )
-            if table.view_definition:
-                self.view_definitions[dataset_name] = (table.ref, table.view_definition)
+            if table.view_definition and self.sql_parsing_aggregator:
+                self.sql_parsing_aggregator.add_view_definition(
+                    view_urn=dataset_urn,
+                    view_definition=table.view_definition,
+                    default_db=table.ref.catalog,
+                    default_schema=table.ref.schema,
+                )

         if (
             table_props.customProperties.get("table_type")
@@ -1334,75 +1344,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         )
     ]

-    def _run_sql_parser(
-        self, view_ref: TableReference, query: str, schema_resolver: SchemaResolver
-    ) -> Optional[SqlParsingResult]:
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=view_ref.catalog,
-            default_schema=view_ref.schema,
-        )
-        view_urn = self.gen_dataset_urn(view_ref)
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_ref}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-            if raw_lineage.out_tables != [view_urn]:
-                self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
-
     def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
         if not (
             self.config.include_hive_metastore
             and self.config.include_table_lineage
-            and self.sql_parser_schema_resolver
+            and self.sql_parsing_aggregator
         ):
             return
-
-        builder = SqlParsingBuilder(
-            generate_lineage=True,
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self.view_definitions:
-            view_ref, view_definition = self.view_definitions[dataset_name]
-            result = self._run_sql_parser(
-                view_ref,
-                view_definition,
-                self.sql_parser_schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-        yield from builder.gen_workunits()
+
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()

     def close(self):
         if self.hive_metastore_proxy:
             self.hive_metastore_proxy.close()
-        if self.view_definitions:
-            self.view_definitions.close()
+        if self.sql_parsing_aggregator:
+            self.sql_parsing_aggregator.close()
         if self.sql_parser_schema_resolver:
             self.sql_parser_schema_resolver.close()
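Taken together, the source.py hunks swap the hand-rolled per-view sqlglot parsing for a single SqlParsingAggregator that owns parsing, failure accounting (via aggregator.report), and MCP generation. A minimal sketch of the new lifecycle, using only calls that appear in the hunks above; the platform, URN, and SQL literals are illustrative:

from datahub.sql_parsing.schema_resolver import SchemaResolver
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

aggregator = SqlParsingAggregator(
    platform="databricks",
    env="PROD",
    schema_resolver=SchemaResolver(platform="databricks"),
    generate_lineage=True,  # same flags the unity source now passes
    generate_queries=False,
    generate_usage_statistics=False,
    generate_operations=False,
)

# Register each view definition as it is discovered (see the @@ -629,8 hunk);
# parse errors are tracked on aggregator.report instead of ad-hoc counters.
aggregator.add_view_definition(
    view_urn="urn:li:dataset:(urn:li:dataPlatform:databricks,main.web.daily_clicks,PROD)",
    view_definition="CREATE VIEW daily_clicks AS SELECT day, count(*) AS n FROM main.web.events GROUP BY day",
    default_db="main",
    default_schema="web",
)

# Parsing happens lazily: lineage is emitted as MCPs at the end of ingestion,
# which the source wraps one by one via mcp.as_workunit().
workunits = [mcp.as_workunit() for mcp in aggregator.gen_metadata()]
aggregator.close()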
datahub/ingestion/source/unity/usage.py
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig
+from datahub.ingestion.source.unity.config import (
+    UnityCatalogSourceConfig,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
 from datahub.ingestion.source.unity.proxy_types import (
     OPERATION_STATEMENT_TYPES,
@@ -164,11 +167,50 @@
             aspect=operation_aspect,
         ).as_workunit()

+    def _validate_usage_data_source_config(self) -> None:
+        """Validate usage data source configuration before execution."""
+        usage_data_source = self.config.usage_data_source
+
+        if (
+            usage_data_source == UsageDataSource.SYSTEM_TABLES
+            and not self.proxy.warehouse_id
+        ):
+            raise ValueError(
+                "usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
+                "Either set warehouse_id or use AUTO/API mode."
+            )
+
     def _get_queries(self) -> Iterable[Query]:
         try:
-            yield from self.proxy.query_history(
-                self.config.start_time, self.config.end_time
-            )
+            self._validate_usage_data_source_config()
+            usage_data_source = self.config.usage_data_source
+
+            if usage_data_source == UsageDataSource.AUTO:
+                if self.proxy.warehouse_id:
+                    logger.info(
+                        "Using system tables for usage query history (AUTO mode)"
+                    )
+                    yield from self.proxy.get_query_history_via_system_tables(
+                        self.config.start_time, self.config.end_time
+                    )
+                else:
+                    logger.info(
+                        "Using API for usage query history (AUTO mode, no warehouse)"
+                    )
+                    yield from self.proxy.query_history(
+                        self.config.start_time, self.config.end_time
+                    )
+            elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
+                logger.info("Using system tables for usage query history (forced)")
+                yield from self.proxy.get_query_history_via_system_tables(
+                    self.config.start_time, self.config.end_time
+                )
+            elif usage_data_source == UsageDataSource.API:
+                logger.info("Using API for usage query history (forced)")
+                yield from self.proxy.query_history(
+                    self.config.start_time, self.config.end_time
+                )
+
         except Exception as e:
             logger.warning("Error getting queries", exc_info=True)
             self.report.report_warning("get-queries", str(e))