acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/METADATA +2563 -2561
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/RECORD +46 -44
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -836,9 +836,6 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
             "native_data_type": str(timestamp_type),
         }

-    # visit_timestamptz() is required when using pyiceberg >= 0.5.0, which is essentially a duplicate
-    # of visit_timestampz(). The function has been renamed from visit_timestampz().
-    # Once Datahub can upgrade its pyiceberg dependency to >=0.5.0, the visit_timestampz() function can be safely removed.
     def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
         # Avro supports 2 types of timestamp:
         # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
@@ -855,22 +852,6 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
             "native_data_type": str(timestamptz_type),
         }

-    def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
-        # Avro supports 2 types of timestamp:
-        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
-        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
-        # utcAdjustment: bool = True
-        return {
-            "type": "long",
-            "logicalType": "timestamp-micros",
-            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
-            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
-            # "logicalType": "timestamp-micros"
-            # if timestamp_type.adjust_to_utc
-            # else "local-timestamp-micros",
-            "native_data_type": str(timestamptz_type),
-        }
-
     def visit_string(self, string_type: StringType) -> Dict[str, Any]:
         return {
             "type": "string",
@@ -889,3 +870,42 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
             "type": "bytes",
             "native_data_type": str(binary_type),
         }
+
+    def visit_timestamp_ns(self, timestamp_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamp_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamp_ns_type),
+        }
+
+    def visit_timestamptz_ns(self, timestamptz_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps with timezone
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamptz_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamptz_ns_type),
+        }
+
+    def visit_unknown(self, unknown_type: Any) -> Dict[str, Any]:
+        # Handle unknown types
+        return {
+            "type": "string",
+            "native_data_type": str(unknown_type),
+        }
datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED

@@ -412,7 +412,10 @@ class MongoSourceConnector(BaseConnector):

         # Escape topic_prefix to handle cases where it contains dots
         # Some users configure topic.prefix like "my.mongodb" which breaks the regex
-
+
+        # \w is equivalent to [a-zA-Z0-9_]
+        # So [\w-]+ matches alphanumeric characters, underscores, and hyphens
+        topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"

         if not self.connector_manifest.topic_names:
             return lineages
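As a quick illustration of the pattern added above, here is a minimal, self-contained sketch; the topic prefix and topic name are made up, only the regex construction comes from the diff:

```python
import re

# Hypothetical values: "my.mongodb" mirrors the dotted topic.prefix case
# called out in the comments above.
topic_prefix = "my.mongodb"
topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"

match = re.match(topic_naming_pattern, "my.mongodb.sales-db.order_events")
if match:
    database, collection = match.groups()
    print(database, collection)  # -> sales-db order_events
```

Escaping the prefix keeps its dots literal, while the two `[\w-]+` groups capture the database and collection segments (letters, digits, underscores, and hyphens).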
datahub/ingestion/source/mode.py
CHANGED
@@ -214,6 +214,10 @@ class ModeConfig(
         description="Number of items per page for paginated API requests.",
     )

+    exclude_archived: bool = Field(
+        default=False, description="Exclude archived reports"
+    )
+
     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)

@@ -1473,6 +1477,15 @@ class ModeSource(StatefulIngestionSourceBase):
                 logger.debug(
                     f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
                 )
+                if self.config.exclude_archived:
+                    logger.debug(
+                        f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
+                    )
+                    reports_page = [
+                        report
+                        for report in reports_page
+                        if not report.get("archived", False)
+                    ]
                 yield reports_page
             except ModeRequestError as e:
                 if isinstance(e, HTTPError) and e.response.status_code == 404:
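A minimal sketch of the filtering semantics introduced by `exclude_archived`; the report payloads below are invented, real Mode API responses carry many more fields:

```python
# Made-up report pages for illustration only.
reports_page = [
    {"token": "abc123", "name": "Revenue", "archived": False},
    {"token": "def456", "name": "Old dashboard", "archived": True},
    {"token": "ghi789", "name": "No flag at all"},  # missing key counts as not archived
]

exclude_archived = True
if exclude_archived:
    # Same list comprehension as in the diff: keep only non-archived reports.
    reports_page = [r for r in reports_page if not r.get("archived", False)]

print([r["name"] for r in reports_page])  # -> ['Revenue', 'No flag at all']
```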
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED

@@ -1,13 +1,13 @@
 import functools
 import importlib.resources as pkg_resource
 import logging
-import os
 from typing import Dict, List, Optional

 import lark
 from lark import Lark, Tree

 import datahub.ingestion.source.powerbi.m_query.data_classes
+from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     PowerBiDashboardSourceConfig,

@@ -25,7 +25,7 @@ from datahub.utilities.threading_timeout import TimeoutException, threading_time

 logger = logging.getLogger(__name__)

-_M_QUERY_PARSE_TIMEOUT =
+_M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout()


 @functools.lru_cache(maxsize=1)
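The new `datahub.configuration.env_vars` module itself is not shown in this diff, so the following is only a hypothetical sketch of what a centralized getter such as `get_powerbi_m_query_parse_timeout` could look like; the environment variable name and default value are assumptions, not taken from the package:

```python
import os


def get_powerbi_m_query_parse_timeout() -> int:
    # Assumed env var name and default, for illustration only; the real
    # implementation lives in datahub/configuration/env_vars.py.
    return int(os.environ.get("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", "60"))


print(get_powerbi_m_query_parse_timeout())
```

The visible change is the pattern: module-level constants that previously read `os.environ` inline now call a shared getter, which is why the `import os` lines disappear from these files.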
datahub/ingestion/source/snowflake/snowflake_schema.py
CHANGED

@@ -1,10 +1,10 @@
 import logging
-import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple

+from datahub.configuration.env_vars import get_snowflake_schema_parallelism
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain

@@ -22,7 +22,7 @@ from datahub.utilities.serialized_lru_cache import serialized_lru_cache

 logger: logging.Logger = logging.getLogger(__name__)

-SCHEMA_PARALLELISM =
+SCHEMA_PARALLELISM = get_snowflake_schema_parallelism()


 @dataclass
datahub/ingestion/source/sql/mssql/source.py
CHANGED

@@ -149,18 +149,24 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         uri_opts: Optional[Dict[str, Any]] = None,
         current_db: Optional[str] = None,
     ) -> str:
+        current_db = current_db or self.database
+
         if self.use_odbc:
             # Ensure that the import is available.
             import pyodbc  # noqa: F401

             self.scheme = "mssql+pyodbc"

+            # ODBC requires a database name, otherwise it will interpret host_port
+            # as a pre-defined ODBC connection name.
+            current_db = current_db or "master"
+
         uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
             self.scheme,  # type: ignore
             self.username,
             self.password.get_secret_value() if self.password else None,
             self.host_port,  # type: ignore
-            current_db
+            current_db,
             uri_opts=uri_opts,
         )
         if self.use_odbc:
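A small sketch of the database-name fallback added above, written as a standalone function rather than the real `SQLServerConfig` method; the parameter names are stand-ins:

```python
from typing import Optional


def resolve_current_db(
    current_db: Optional[str], configured_database: Optional[str], use_odbc: bool
) -> Optional[str]:
    # Prefer an explicit database, then the configured one.
    current_db = current_db or configured_database
    if use_odbc:
        # ODBC needs an explicit database; without one, host_port would be
        # interpreted as a pre-defined ODBC connection name.
        current_db = current_db or "master"
    return current_db


print(resolve_current_db(None, None, use_odbc=True))      # -> master
print(resolve_current_db(None, "sales", use_odbc=False))  # -> sales
```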
datahub/ingestion/source/sql/teradata.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+import re
 import time
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -67,6 +68,12 @@ from datahub.utilities.stats_collections import TopKDict

 logger: logging.Logger = logging.getLogger(__name__)

+# Precompiled regex pattern for case-insensitive "(not casespecific)" removal
+NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)
+
+# Teradata uses a two-tier database.table naming approach without default database prefixing
+DEFAULT_NO_DATABASE_TERADATA = None
+
 # Common excluded databases used in multiple places
 EXCLUDED_DATABASES = [
     "All",
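For illustration, the precompiled pattern in action on a made-up Teradata-flavored DDL fragment:

```python
import re

NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)

# Invented query text; only the "(NOT CASESPECIFIC)" attribute matters here.
query_text = (
    "CREATE TABLE db1.users (name VARCHAR(100) CHARACTER SET LATIN (NOT CASESPECIFIC))"
)
cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)
print(cleaned_query)
# -> CREATE TABLE db1.users (name VARCHAR(100) CHARACTER SET LATIN )
```

Compiling the pattern once at module scope avoids recompiling it for every audit-log entry the source processes.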
@@ -453,6 +460,13 @@ class TeradataReport(SQLSourceReport, BaseTimeWindowReport):
     # Global metadata extraction timing (single query for all databases)
     metadata_extraction_total_sec: float = 0.0

+    # Lineage extraction query time range (actively used)
+    lineage_start_time: Optional[datetime] = None
+    lineage_end_time: Optional[datetime] = None
+
+    # Audit query processing statistics
+    num_audit_query_entries_processed: int = 0
+

 class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
     scheme: str = Field(default="teradatasql", description="database scheme")
@@ -726,7 +740,8 @@ ORDER by DataBaseName, TableName;
             env=self.config.env,
             schema_resolver=self.schema_resolver,
             graph=self.ctx.graph,
-            generate_lineage=self.
+            generate_lineage=self.config.include_view_lineage
+            or self.config.include_table_lineage,
             generate_queries=self.config.include_queries,
             generate_usage_statistics=self.config.include_usage_statistics,
             generate_query_usage_statistics=self.config.include_usage_statistics,
@@ -1327,6 +1342,9 @@ ORDER by DataBaseName, TableName;
         current_query_metadata = None

         for entry in entries:
+            # Count each audit query entry processed
+            self.report.num_audit_query_entries_processed += 1
+
             query_id = getattr(entry, "query_id", None)
             query_text = str(getattr(entry, "query_text", ""))
@@ -1367,15 +1385,18 @@ ORDER by DataBaseName, TableName;
         default_database = getattr(metadata_entry, "default_database", None)

         # Apply Teradata-specific query transformations
-        cleaned_query =
+        cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", full_query_text)

+        # For Teradata's two-tier architecture (database.table), we should not set default_db
+        # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+        # The SQL parser will treat database.table references correctly without default_db
         return ObservedQuery(
             query=cleaned_query,
             session_id=session_id,
             timestamp=timestamp,
             user=CorpUserUrn(user) if user else None,
-            default_db=
-            default_schema=default_database,
+            default_db=DEFAULT_NO_DATABASE_TERADATA,  # Teradata uses two-tier database.table naming without default database prefixing
+            default_schema=default_database,
         )

     def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
@@ -1393,15 +1414,18 @@ ORDER by DataBaseName, TableName;
         default_database = getattr(entry, "default_database", None)

         # Apply Teradata-specific query transformations
-        cleaned_query =
+        cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)

+        # For Teradata's two-tier architecture (database.table), we should not set default_db
+        # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+        # However, we should set default_schema for unqualified table references
         return ObservedQuery(
             query=cleaned_query,
             session_id=session_id,
             timestamp=timestamp,
             user=CorpUserUrn(user) if user else None,
-            default_db=
-            default_schema=default_database,  #
+            default_db=DEFAULT_NO_DATABASE_TERADATA,  # Teradata uses two-tier database.table naming without default database prefixing
+            default_schema=default_database,  # Set default_schema for unqualified table references
         )

     def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
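A toy illustration (not DataHub's SQL parser) of why leaving `default_db` unset matters for Teradata's two-tier `database.table` names, per the comments above:

```python
from typing import Optional


def qualify(table_ref: str, default_db: Optional[str], default_schema: Optional[str]) -> str:
    # Simplified name-resolution logic for illustration only.
    parts = table_ref.split(".")
    if len(parts) == 1 and default_schema:
        parts = [default_schema] + parts
    if default_db:
        parts = [default_db] + parts
    return ".".join(parts)


# With a default_db set, a two-tier reference picks up a bogus extra level:
print(qualify("sales.orders", default_db="dbc", default_schema=None))    # -> dbc.sales.orders
# Leaving default_db unset keeps the two-tier name intact, while default_schema
# still qualifies bare table names:
print(qualify("sales.orders", default_db=None, default_schema="sales"))  # -> sales.orders
print(qualify("orders", default_db=None, default_schema="sales"))        # -> sales.orders
```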
@@ -1421,7 +1445,7 @@ ORDER by DataBaseName, TableName;

         for query_index, query in enumerate(queries, 1):
             logger.info(
-                f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
+                f"Executing lineage query {query_index}/{len(queries)} for time range {self.config.start_time} to {self.config.end_time} with {cursor_type} cursor..."
             )

             # Use helper method to try server-side cursor with fallback
@@ -1589,79 +1613,70 @@ ORDER by DataBaseName, TableName;
         else:
             return connection.execute(text(query))

+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Override to prevent parent class from generating aggregator work units during schema extraction.
+
+        We handle aggregator generation manually after populating it with audit log data.
+        """
+        # Do nothing - we'll call the parent implementation manually after populating the aggregator
+        return iter([])
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         logger.info("Starting Teradata metadata extraction")

-        #
-        # Sql parser operates on lowercase urns so we need to lowercase the urns
+        # Step 1: Schema extraction first (parent class will skip aggregator generation due to our override)
         with self.report.new_stage("Schema metadata extraction"):
             yield from super().get_workunits_internal()
             logger.info("Completed schema metadata extraction")

-
-
-
-
-
-
-
-        """Override base class to skip aggregator gen_metadata() call.
-
-        Teradata handles aggregator processing after adding audit log queries,
-        so we skip the base class call to prevent duplicate processing.
-        """
-        # Return empty iterator - Teradata will handle aggregator processing
-        # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
-        return iter([])
+        # Step 2: Lineage extraction after schema extraction
+        # This allows lineage processing to have access to all discovered schema information
+        with self.report.new_stage("Audit log extraction and lineage processing"):
+            self._populate_aggregator_from_audit_logs()
+            # Call parent implementation directly to generate aggregator work units
+            yield from super()._generate_aggregator_workunits()
+            logger.info("Completed lineage processing")

-    def
+    def _populate_aggregator_from_audit_logs(self) -> None:
         """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
-
-
-
-
-        if self.config.include_table_lineage or self.config.include_usage_statistics:
-            # Step 1: Stream query entries from database with memory-efficient processing
-            with self.report.new_stage("Fetching lineage entries from Audit Logs"):
-                queries_processed = 0
-                entries_processed = False
-
-                # Use streaming query reconstruction for memory efficiency
-                for observed_query in self._reconstruct_queries_streaming(
-                    self._fetch_lineage_entries_chunked()
-                ):
-                    entries_processed = True
-                    self.aggregator.add(observed_query)
+        with self.report.new_stage("Lineage extraction from Teradata audit logs"):
+            # Record the lineage query time range in the report
+            self.report.lineage_start_time = self.config.start_time
+            self.report.lineage_end_time = self.config.end_time

-
-
-
-                        f"Processed {queries_processed} queries to aggregator"
-                    )
+            logger.info(
+                f"Starting lineage extraction from Teradata audit logs (time range: {self.config.start_time} to {self.config.end_time})"
+            )

-
-
-
+            if (
+                self.config.include_table_lineage
+                or self.config.include_usage_statistics
+            ):
+                # Step 1: Stream query entries from database with memory-efficient processing
+                with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+                    queries_processed = 0
+
+                    # Use streaming query reconstruction for memory efficiency
+                    for observed_query in self._reconstruct_queries_streaming(
+                        self._fetch_lineage_entries_chunked()
+                    ):
+                        self.aggregator.add(observed_query)
+
+                        queries_processed += 1
+                        if queries_processed % 10000 == 0:
+                            logger.info(
+                                f"Processed {queries_processed} queries to aggregator"
+                            )

-
-
-
+                    if queries_processed == 0:
+                        logger.info("No lineage entries found")
+                        return

-            # Step 2: Generate work units from aggregator
-            with self.report.new_stage("SqlParsingAggregator metadata generation"):
-                logger.info("Generating metadata work units from SqlParsingAggregator")
-                work_unit_count = 0
-                for mcp in self.aggregator.gen_metadata():
-                    work_unit_count += 1
-                    if work_unit_count % 10000 == 0:
                     logger.info(
-                        f"
+                        f"Completed adding {queries_processed} queries to SqlParsingAggregator"
                     )
-                yield mcp.as_workunit()

-            logger.info(
-                f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
-            )
+            logger.info("Completed lineage extraction from Teradata audit logs")

     def close(self) -> None:
         """Clean up resources when source is closed."""
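The restructuring above follows a template-method style: the subclass no-ops the aggregator hook while the base class runs schema extraction, then invokes the parent implementation explicitly once the aggregator has been populated from audit logs. A stripped-down sketch with stand-in classes, not the real DataHub source classes:

```python
from typing import Iterable, List


class BaseSource:
    def __init__(self) -> None:
        self.aggregator: List[str] = []

    def _generate_aggregator_workunits(self) -> Iterable[str]:
        yield from self.aggregator

    def get_workunits_internal(self) -> Iterable[str]:
        yield "schema-metadata"
        # The base class would normally emit aggregator output here.
        yield from self._generate_aggregator_workunits()


class TeradataLikeSource(BaseSource):
    def _generate_aggregator_workunits(self) -> Iterable[str]:
        # No-op during schema extraction; emitted manually later.
        return iter([])

    def get_workunits_internal(self) -> Iterable[str]:
        yield from super().get_workunits_internal()       # schema only
        self.aggregator.append("lineage-from-audit-log")   # populate afterwards
        yield from super()._generate_aggregator_workunits()


print(list(TeradataLikeSource().get_workunits_internal()))
# -> ['schema-metadata', 'lineage-from-audit-log']
```

Because the hook is dispatched dynamically, the base class's call sees the empty override, while the explicit `super()._generate_aggregator_workunits()` call emits the populated aggregator exactly once.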
datahub/ingestion/source/unity/config.py
CHANGED

@@ -49,6 +49,12 @@ class LineageDataSource(ConfigEnum):
     API = "API"


+class UsageDataSource(ConfigEnum):
+    AUTO = "AUTO"
+    SYSTEM_TABLES = "SYSTEM_TABLES"
+    API = "API"
+
+
 class UnityCatalogProfilerConfig(ConfigModel):
     method: str = Field(
         description=(

@@ -285,6 +291,17 @@ class UnityCatalogSourceConfig(
         description="Generate usage statistics.",
     )

+    usage_data_source: UsageDataSource = pydantic.Field(
+        default=UsageDataSource.AUTO,
+        description=(
+            "Source for usage/query history data extraction. Options: "
+            f"'{UsageDataSource.AUTO.value}' (default) - Automatically use system.query.history table when SQL warehouse is configured, otherwise fall back to REST API. "
+            "This provides better performance for multi-workspace setups and large query volumes when warehouse_id is set. "
+            f"'{UsageDataSource.SYSTEM_TABLES.value}' - Force use of system.query.history table (requires SQL warehouse and SELECT permission on system.query.history). "
+            f"'{UsageDataSource.API.value}' - Force use of REST API endpoints for query history (legacy method, may have limitations with multiple workspaces)."
+        ),
+    )
+
     # TODO: Remove `type:ignore` by refactoring config
     profiling: Union[
         UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig

@@ -446,6 +463,20 @@ class UnityCatalogSourceConfig(

         return values

+    @pydantic.root_validator(skip_on_failure=True)
+    def validate_usage_data_source_with_warehouse(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        usage_data_source = values.get("usage_data_source", UsageDataSource.AUTO)
+        warehouse_id = values.get("warehouse_id")
+
+        if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
+            raise ValueError(
+                f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
+            )
+
+        return values
+
     @pydantic.validator("schema_pattern", always=True)
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern
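A sketch of the selection behavior described in the `usage_data_source` docstring and validator above; the helper below is illustrative only, not the code path used by the Unity Catalog source:

```python
from enum import Enum
from typing import Optional


class UsageDataSource(Enum):
    AUTO = "AUTO"
    SYSTEM_TABLES = "SYSTEM_TABLES"
    API = "API"


def resolve_usage_source(
    usage_data_source: UsageDataSource, warehouse_id: Optional[str]
) -> UsageDataSource:
    # Mirrors the root validator: SYSTEM_TABLES needs a SQL warehouse.
    if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
        raise ValueError("usage_data_source='SYSTEM_TABLES' requires warehouse_id to be set")
    # Mirrors the documented AUTO behavior: system tables when a warehouse is
    # configured, REST API otherwise.
    if usage_data_source == UsageDataSource.AUTO:
        return UsageDataSource.SYSTEM_TABLES if warehouse_id else UsageDataSource.API
    return usage_data_source


print(resolve_usage_source(UsageDataSource.AUTO, "abc123").value)  # -> SYSTEM_TABLES
print(resolve_usage_source(UsageDataSource.AUTO, None).value)      # -> API
```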
datahub/ingestion/source/unity/proxy.py
CHANGED

@@ -40,6 +40,7 @@ from datahub.api.entities.external.unity_catalog_external_entites import UnityCa
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.config import (
     LineageDataSource,
+    UsageDataSource,
 )
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (

@@ -163,6 +164,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
         lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+        usage_data_source: UsageDataSource = UsageDataSource.AUTO,
         databricks_api_page_size: int = 0,
     ):
         self._workspace_client = WorkspaceClient(

@@ -175,6 +177,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
         self.lineage_data_source = lineage_data_source
+        self.usage_data_source = usage_data_source
         self.databricks_api_page_size = databricks_api_page_size
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(

@@ -182,6 +185,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             ),
             "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
             "access_token": self._workspace_client.config.token,
+            "user_agent_entry": "datahub",
         }

     def check_basic_connectivity(self) -> bool:

@@ -401,6 +405,75 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 method, path, body={**body, "page_token": response["next_page_token"]}
             )

+    def get_query_history_via_system_tables(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+    ) -> Iterable[Query]:
+        """Get query history using system.query.history table.
+
+        This method provides an alternative to the REST API for fetching query history,
+        offering better performance and richer data for large query volumes.
+        """
+        logger.info(
+            f"Fetching query history from system.query.history for period: {start_time} to {end_time}"
+        )
+
+        allowed_types = [typ.value for typ in ALLOWED_STATEMENT_TYPES]
+        statement_type_filter = ", ".join(f"'{typ}'" for typ in allowed_types)
+
+        query = f"""
+            SELECT
+                statement_id,
+                statement_text,
+                statement_type,
+                start_time,
+                end_time,
+                executed_by,
+                executed_as,
+                executed_by_user_id,
+                executed_as_user_id
+            FROM system.query.history
+            WHERE
+                start_time >= %s
+                AND end_time <= %s
+                AND execution_status = 'FINISHED'
+                AND statement_type IN ({statement_type_filter})
+            ORDER BY start_time
+        """
+
+        try:
+            rows = self._execute_sql_query(query, (start_time, end_time))
+            for row in rows:
+                try:
+                    yield Query(
+                        query_id=row.statement_id,
+                        query_text=row.statement_text,
+                        statement_type=(
+                            QueryStatementType(row.statement_type)
+                            if row.statement_type
+                            else None
+                        ),
+                        start_time=row.start_time,
+                        end_time=row.end_time,
+                        user_id=row.executed_by_user_id,
+                        user_name=row.executed_by,
+                        executed_as_user_id=row.executed_as_user_id,
+                        executed_as_user_name=row.executed_as,
+                    )
+                except Exception as e:
+                    logger.warning(f"Error parsing query from system table: {e}")
+                    self.report.report_warning("query-parse-system-table", str(e))
+        except Exception as e:
+            logger.error(
+                f"Error fetching query history from system tables: {e}", exc_info=True
+            )
+            self.report.report_failure(
+                title="Failed to fetch query history from system tables",
+                message="Error querying system.query.history table",
+                context=f"Query period: {start_time} to {end_time}",
+            )
+
     def _build_datetime_where_conditions(
         self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
     ) -> str: