acryl-datahub 1.3.0.1rc1__py3-none-any.whl → 1.3.0.1rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as potentially problematic.
Files changed (48)
  1. {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/METADATA +2539 -2537
  2. {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/RECORD +47 -45
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataproduct/dataproduct.py +26 -0
  5. datahub/cli/config_utils.py +18 -10
  6. datahub/cli/docker_check.py +2 -1
  7. datahub/cli/docker_cli.py +4 -2
  8. datahub/cli/graphql_cli.py +1422 -0
  9. datahub/cli/quickstart_versioning.py +2 -2
  10. datahub/cli/specific/dataproduct_cli.py +2 -4
  11. datahub/cli/specific/user_cli.py +172 -1
  12. datahub/configuration/env_vars.py +331 -0
  13. datahub/configuration/kafka.py +6 -4
  14. datahub/emitter/mce_builder.py +2 -4
  15. datahub/emitter/rest_emitter.py +15 -15
  16. datahub/entrypoints.py +2 -0
  17. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  18. datahub/ingestion/api/source.py +5 -0
  19. datahub/ingestion/graph/client.py +197 -0
  20. datahub/ingestion/graph/config.py +2 -2
  21. datahub/ingestion/sink/datahub_rest.py +6 -5
  22. datahub/ingestion/source/aws/aws_common.py +20 -13
  23. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
  24. datahub/ingestion/source/delta_lake/config.py +8 -4
  25. datahub/ingestion/source/grafana/models.py +5 -0
  26. datahub/ingestion/source/iceberg/iceberg.py +39 -19
  27. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
  28. datahub/ingestion/source/mode.py +13 -0
  29. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  30. datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
  31. datahub/ingestion/source/sql/mssql/source.py +7 -1
  32. datahub/ingestion/source/sql/teradata.py +80 -65
  33. datahub/ingestion/source/unity/config.py +31 -0
  34. datahub/ingestion/source/unity/proxy.py +73 -0
  35. datahub/ingestion/source/unity/source.py +27 -70
  36. datahub/ingestion/source/unity/usage.py +46 -4
  37. datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
  38. datahub/sql_parsing/sqlglot_lineage.py +7 -0
  39. datahub/telemetry/telemetry.py +8 -3
  40. datahub/utilities/file_backed_collections.py +2 -2
  41. datahub/utilities/is_pytest.py +3 -2
  42. datahub/utilities/logging_manager.py +22 -6
  43. datahub/utilities/sample_data.py +5 -4
  44. datahub/emitter/sql_parsing_builder.py +0 -306
  45. {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/WHEEL +0 -0
  46. {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/entry_points.txt +0 -0
  47. {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/licenses/LICENSE +0 -0
  48. {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/iceberg/iceberg.py
@@ -836,9 +836,6 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
  "native_data_type": str(timestamp_type),
  }

- # visit_timestamptz() is required when using pyiceberg >= 0.5.0, which is essentially a duplicate
- # of visit_timestampz(). The function has been renamed from visit_timestampz().
- # Once Datahub can upgrade its pyiceberg dependency to >=0.5.0, the visit_timestampz() function can be safely removed.
  def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
  # Avro supports 2 types of timestamp:
  # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
@@ -855,22 +852,6 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
  "native_data_type": str(timestamptz_type),
  }

- def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
- # Avro supports 2 types of timestamp:
- # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
- # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
- # utcAdjustment: bool = True
- return {
- "type": "long",
- "logicalType": "timestamp-micros",
- # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
- # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
- # "logicalType": "timestamp-micros"
- # if timestamp_type.adjust_to_utc
- # else "local-timestamp-micros",
- "native_data_type": str(timestamptz_type),
- }
-
  def visit_string(self, string_type: StringType) -> Dict[str, Any]:
  return {
  "type": "string",
@@ -889,3 +870,42 @@ class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
  "type": "bytes",
  "native_data_type": str(binary_type),
  }
+
+ def visit_timestamp_ns(self, timestamp_ns_type: Any) -> Dict[str, Any]:
+ # Handle nanosecond precision timestamps
+ # Avro supports 2 types of timestamp:
+ # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+ # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+ return {
+ "type": "long",
+ "logicalType": "timestamp-micros",
+ # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+ # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+ # "logicalType": "timestamp-micros"
+ # if timestamp_ns_type.adjust_to_utc
+ # else "local-timestamp-micros",
+ "native_data_type": str(timestamp_ns_type),
+ }
+
+ def visit_timestamptz_ns(self, timestamptz_ns_type: Any) -> Dict[str, Any]:
+ # Handle nanosecond precision timestamps with timezone
+ # Avro supports 2 types of timestamp:
+ # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+ # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+ return {
+ "type": "long",
+ "logicalType": "timestamp-micros",
+ # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+ # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+ # "logicalType": "timestamp-micros"
+ # if timestamptz_ns_type.adjust_to_utc
+ # else "local-timestamp-micros",
+ "native_data_type": str(timestamptz_ns_type),
+ }
+
+ def visit_unknown(self, unknown_type: Any) -> Dict[str, Any]:
+ # Handle unknown types
+ return {
+ "type": "string",
+ "native_data_type": str(unknown_type),
+ }
datahub/ingestion/source/kafka_connect/source_connectors.py
@@ -412,7 +412,10 @@ class MongoSourceConnector(BaseConnector):

  # Escape topic_prefix to handle cases where it contains dots
  # Some users configure topic.prefix like "my.mongodb" which breaks the regex
- topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"
+
+ # \w is equivalent to [a-zA-Z0-9_]
+ # So [\w-]+ matches alphanumeric characters, underscores, and hyphens
+ topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"

  if not self.connector_manifest.topic_names:
  return lineages
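
A quick check of what the widened pattern accepts (standalone sketch; the topic name "my.mongodb.sales-db.order-events" is made up for illustration):

import re

topic_prefix = "my.mongodb"  # a prefix that itself contains dots
pattern = re.compile(rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)")

# The old (\w+) groups rejected database/collection names containing hyphens;
# [\w-]+ accepts letters, digits, underscores, and hyphens.
match = pattern.fullmatch("my.mongodb.sales-db.order-events")
assert match is not None
database, collection = match.groups()
print(database, collection)  # sales-db order-events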
datahub/ingestion/source/mode.py
@@ -214,6 +214,10 @@ class ModeConfig(
  description="Number of items per page for paginated API requests.",
  )

+ exclude_archived: bool = Field(
+ default=False, description="Exclude archived reports"
+ )
+
  @validator("connect_uri")
  def remove_trailing_slash(cls, v):
  return config_clean.remove_trailing_slashes(v)
@@ -1473,6 +1477,15 @@ class ModeSource(StatefulIngestionSourceBase):
  logger.debug(
  f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
  )
+ if self.config.exclude_archived:
+ logger.debug(
+ f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
+ )
+ reports_page = [
+ report
+ for report in reports_page
+ if not report.get("archived", False)
+ ]
  yield reports_page
  except ModeRequestError as e:
  if isinstance(e, HTTPError) and e.response.status_code == 404:
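
The filtering the new flag enables, reduced to standalone Python (the reports_page list below is made up; reports without an "archived" key are kept):

reports_page = [
    {"token": "r1", "name": "Weekly KPIs", "archived": False},
    {"token": "r2", "name": "Legacy dashboard", "archived": True},
    {"token": "r3", "name": "Ad-hoc pull"},  # no "archived" key -> treated as not archived
]

exclude_archived = True
if exclude_archived:
    reports_page = [r for r in reports_page if not r.get("archived", False)]

print([r["token"] for r in reports_page])  # ['r1', 'r3']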
datahub/ingestion/source/powerbi/m_query/parser.py
@@ -1,13 +1,13 @@
  import functools
  import importlib.resources as pkg_resource
  import logging
- import os
  from typing import Dict, List, Optional

  import lark
  from lark import Lark, Tree

  import datahub.ingestion.source.powerbi.m_query.data_classes
+ from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.source.powerbi.config import (
  PowerBiDashboardSourceConfig,
@@ -25,7 +25,7 @@ from datahub.utilities.threading_timeout import TimeoutException, threading_time

  logger = logging.getLogger(__name__)

- _M_QUERY_PARSE_TIMEOUT = int(os.getenv("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", 60))
+ _M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout()


  @functools.lru_cache(maxsize=1)
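
The new datahub/configuration/env_vars.py module (+331 lines in this diff) centralizes these reads; its contents are not shown here. A plausible shape for this accessor, assuming it keeps the same environment variable and 60-second default that the removed line used (hypothetical sketch, not the actual module):

import os

def get_powerbi_m_query_parse_timeout() -> int:
    # Hypothetical: same env var and default as the inline os.getenv() call it replaces.
    return int(os.getenv("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", "60"))

The Snowflake change below follows the same pattern, swapping the inline DATAHUB_SNOWFLAKE_SCHEMA_PARALLELISM read for get_snowflake_schema_parallelism().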
datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -1,10 +1,10 @@
  import logging
- import os
  from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple

+ from datahub.configuration.env_vars import get_snowflake_schema_parallelism
  from datahub.ingestion.api.report import SupportsAsObj
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
  from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
@@ -22,7 +22,7 @@ from datahub.utilities.serialized_lru_cache import serialized_lru_cache

  logger: logging.Logger = logging.getLogger(__name__)

- SCHEMA_PARALLELISM = int(os.getenv("DATAHUB_SNOWFLAKE_SCHEMA_PARALLELISM", 20))
+ SCHEMA_PARALLELISM = get_snowflake_schema_parallelism()


  @dataclass
datahub/ingestion/source/sql/mssql/source.py
@@ -149,18 +149,24 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
  uri_opts: Optional[Dict[str, Any]] = None,
  current_db: Optional[str] = None,
  ) -> str:
+ current_db = current_db or self.database
+
  if self.use_odbc:
  # Ensure that the import is available.
  import pyodbc # noqa: F401

  self.scheme = "mssql+pyodbc"

+ # ODBC requires a database name, otherwise it will interpret host_port
+ # as a pre-defined ODBC connection name.
+ current_db = current_db or "master"
+
  uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
  self.scheme, # type: ignore
  self.username,
  self.password.get_secret_value() if self.password else None,
  self.host_port, # type: ignore
- current_db if current_db else self.database,
+ current_db,
  uri_opts=uri_opts,
  )
  if self.use_odbc:
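
The resulting fallback chain, as a standalone sketch (resolve_mssql_db is an illustrative helper, not DataHub code):

from typing import Optional

def resolve_mssql_db(current_db: Optional[str], configured_db: Optional[str], use_odbc: bool) -> Optional[str]:
    # New order: explicit current_db, then the configured database...
    db = current_db or configured_db
    if use_odbc:
        # ...and for ODBC only, fall back to "master" so host_port is not
        # misread as a pre-defined ODBC connection (DSN) name.
        db = db or "master"
    return db

print(resolve_mssql_db(None, None, use_odbc=True))     # master
print(resolve_mssql_db(None, "sales", use_odbc=True))  # sales
print(resolve_mssql_db(None, None, use_odbc=False))    # None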
datahub/ingestion/source/sql/teradata.py
@@ -1,4 +1,5 @@
  import logging
+ import re
  import time
  from collections import defaultdict
  from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -67,6 +68,12 @@ from datahub.utilities.stats_collections import TopKDict

  logger: logging.Logger = logging.getLogger(__name__)

+ # Precompiled regex pattern for case-insensitive "(not casespecific)" removal
+ NOT_CASESPECIFIC_PATTERN = re.compile(r"\(not casespecific\)", re.IGNORECASE)
+
+ # Teradata uses a two-tier database.table naming approach without default database prefixing
+ DEFAULT_NO_DATABASE_TERADATA = None
+
  # Common excluded databases used in multiple places
  EXCLUDED_DATABASES = [
  "All",
@@ -453,6 +460,13 @@ class TeradataReport(SQLSourceReport, BaseTimeWindowReport):
  # Global metadata extraction timing (single query for all databases)
  metadata_extraction_total_sec: float = 0.0

+ # Lineage extraction query time range (actively used)
+ lineage_start_time: Optional[datetime] = None
+ lineage_end_time: Optional[datetime] = None
+
+ # Audit query processing statistics
+ num_audit_query_entries_processed: int = 0
+

  class BaseTeradataConfig(TwoTierSQLAlchemyConfig):
  scheme: str = Field(default="teradatasql", description="database scheme")
@@ -726,7 +740,8 @@ ORDER by DataBaseName, TableName;
  env=self.config.env,
  schema_resolver=self.schema_resolver,
  graph=self.ctx.graph,
- generate_lineage=self.include_lineage,
+ generate_lineage=self.config.include_view_lineage
+ or self.config.include_table_lineage,
  generate_queries=self.config.include_queries,
  generate_usage_statistics=self.config.include_usage_statistics,
  generate_query_usage_statistics=self.config.include_usage_statistics,
@@ -1327,6 +1342,9 @@ ORDER by DataBaseName, TableName;
  current_query_metadata = None

  for entry in entries:
+ # Count each audit query entry processed
+ self.report.num_audit_query_entries_processed += 1
+
  query_id = getattr(entry, "query_id", None)
  query_text = str(getattr(entry, "query_text", ""))

@@ -1367,15 +1385,18 @@ ORDER by DataBaseName, TableName;
  default_database = getattr(metadata_entry, "default_database", None)

  # Apply Teradata-specific query transformations
- cleaned_query = full_query_text.replace("(NOT CASESPECIFIC)", "")
+ cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", full_query_text)

+ # For Teradata's two-tier architecture (database.table), we should not set default_db
+ # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+ # The SQL parser will treat database.table references correctly without default_db
  return ObservedQuery(
  query=cleaned_query,
  session_id=session_id,
  timestamp=timestamp,
  user=CorpUserUrn(user) if user else None,
- default_db=default_database,
- default_schema=default_database, # Teradata uses database as schema
+ default_db=DEFAULT_NO_DATABASE_TERADATA, # Teradata uses two-tier database.table naming without default database prefixing
+ default_schema=default_database,
  )

  def _convert_entry_to_observed_query(self, entry: Any) -> ObservedQuery:
@@ -1393,15 +1414,18 @@ ORDER by DataBaseName, TableName;
  default_database = getattr(entry, "default_database", None)

  # Apply Teradata-specific query transformations
- cleaned_query = query_text.replace("(NOT CASESPECIFIC)", "")
+ cleaned_query = NOT_CASESPECIFIC_PATTERN.sub("", query_text)

+ # For Teradata's two-tier architecture (database.table), we should not set default_db
+ # to avoid incorrect URN generation like "dbc.database.table" instead of "database.table"
+ # However, we should set default_schema for unqualified table references
  return ObservedQuery(
  query=cleaned_query,
  session_id=session_id,
  timestamp=timestamp,
  user=CorpUserUrn(user) if user else None,
- default_db=default_database,
- default_schema=default_database, # Teradata uses database as schema
+ default_db=DEFAULT_NO_DATABASE_TERADATA, # Teradata uses two-tier database.table naming without default database prefixing
+ default_schema=default_database, # Set default_schema for unqualified table references
  )

  def _fetch_lineage_entries_chunked(self) -> Iterable[Any]:
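
Why leaving default_db unset matters for name resolution, as an illustrative sketch of the qualification rule the comments describe (this is not DataHub's SQL parser):

from typing import Optional

def qualify(table_ref: str, default_db: Optional[str], default_schema: Optional[str]) -> str:
    parts = table_ref.split(".")
    if len(parts) == 1 and default_schema:
        parts = [default_schema, *parts]   # unqualified name gets the schema
    if len(parts) == 2 and default_db:
        parts = [default_db, *parts]       # two-part name gets the database prefix
    return ".".join(parts)

# Old behaviour: default_db = default_schema = "dbc" turned a two-tier reference
# into a three-tier one.
print(qualify("sales.orders", "dbc", "dbc"))   # dbc.sales.orders (wrong for Teradata)
# New behaviour: only default_schema is set, so qualified references stay two-tier
# and unqualified ones still resolve.
print(qualify("sales.orders", None, "dbc"))    # sales.orders
print(qualify("orders", None, "dbc"))          # dbc.orders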
@@ -1421,7 +1445,7 @@ ORDER by DataBaseName, TableName;

  for query_index, query in enumerate(queries, 1):
  logger.info(
- f"Executing lineage query {query_index}/{len(queries)} with {cursor_type} cursor..."
+ f"Executing lineage query {query_index}/{len(queries)} for time range {self.config.start_time} to {self.config.end_time} with {cursor_type} cursor..."
  )

  # Use helper method to try server-side cursor with fallback
@@ -1589,79 +1613,70 @@ ORDER by DataBaseName, TableName;
  else:
  return connection.execute(text(query))

+ def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+ """Override to prevent parent class from generating aggregator work units during schema extraction.
+
+ We handle aggregator generation manually after populating it with audit log data.
+ """
+ # Do nothing - we'll call the parent implementation manually after populating the aggregator
+ return iter([])
+
  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
  logger.info("Starting Teradata metadata extraction")

- # Add all schemas to the schema resolver
- # Sql parser operates on lowercase urns so we need to lowercase the urns
+ # Step 1: Schema extraction first (parent class will skip aggregator generation due to our override)
  with self.report.new_stage("Schema metadata extraction"):
  yield from super().get_workunits_internal()
  logger.info("Completed schema metadata extraction")

- with self.report.new_stage("Audit log extraction"):
- yield from self._get_audit_log_mcps_with_aggregator()
-
- # SqlParsingAggregator handles its own work unit generation internally
- logger.info("Lineage processing completed by SqlParsingAggregator")
-
- def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
- """Override base class to skip aggregator gen_metadata() call.
-
- Teradata handles aggregator processing after adding audit log queries,
- so we skip the base class call to prevent duplicate processing.
- """
- # Return empty iterator - Teradata will handle aggregator processing
- # after adding audit log queries in _get_audit_log_mcps_with_aggregator()
- return iter([])
+ # Step 2: Lineage extraction after schema extraction
+ # This allows lineage processing to have access to all discovered schema information
+ with self.report.new_stage("Audit log extraction and lineage processing"):
+ self._populate_aggregator_from_audit_logs()
+ # Call parent implementation directly to generate aggregator work units
+ yield from super()._generate_aggregator_workunits()
+ logger.info("Completed lineage processing")

- def _get_audit_log_mcps_with_aggregator(self) -> Iterable[MetadataWorkUnit]:
+ def _populate_aggregator_from_audit_logs(self) -> None:
  """SqlParsingAggregator-based lineage extraction with enhanced capabilities."""
- logger.info(
- "Fetching queries from Teradata audit logs for SqlParsingAggregator"
- )
-
- if self.config.include_table_lineage or self.config.include_usage_statistics:
- # Step 1: Stream query entries from database with memory-efficient processing
- with self.report.new_stage("Fetching lineage entries from Audit Logs"):
- queries_processed = 0
- entries_processed = False
-
- # Use streaming query reconstruction for memory efficiency
- for observed_query in self._reconstruct_queries_streaming(
- self._fetch_lineage_entries_chunked()
- ):
- entries_processed = True
- self.aggregator.add(observed_query)
+ with self.report.new_stage("Lineage extraction from Teradata audit logs"):
+ # Record the lineage query time range in the report
+ self.report.lineage_start_time = self.config.start_time
+ self.report.lineage_end_time = self.config.end_time

- queries_processed += 1
- if queries_processed % 10000 == 0:
- logger.info(
- f"Processed {queries_processed} queries to aggregator"
- )
+ logger.info(
+ f"Starting lineage extraction from Teradata audit logs (time range: {self.config.start_time} to {self.config.end_time})"
+ )

- if not entries_processed:
- logger.info("No lineage entries found")
- return
+ if (
+ self.config.include_table_lineage
+ or self.config.include_usage_statistics
+ ):
+ # Step 1: Stream query entries from database with memory-efficient processing
+ with self.report.new_stage("Fetching lineage entries from Audit Logs"):
+ queries_processed = 0
+
+ # Use streaming query reconstruction for memory efficiency
+ for observed_query in self._reconstruct_queries_streaming(
+ self._fetch_lineage_entries_chunked()
+ ):
+ self.aggregator.add(observed_query)
+
+ queries_processed += 1
+ if queries_processed % 10000 == 0:
+ logger.info(
+ f"Processed {queries_processed} queries to aggregator"
+ )

- logger.info(
- f"Completed adding {queries_processed} queries to SqlParsingAggregator"
- )
+ if queries_processed == 0:
+ logger.info("No lineage entries found")
+ return

- # Step 2: Generate work units from aggregator
- with self.report.new_stage("SqlParsingAggregator metadata generation"):
- logger.info("Generating metadata work units from SqlParsingAggregator")
- work_unit_count = 0
- for mcp in self.aggregator.gen_metadata():
- work_unit_count += 1
- if work_unit_count % 10000 == 0:
  logger.info(
- f"Generated {work_unit_count} work units from aggregator"
+ f"Completed adding {queries_processed} queries to SqlParsingAggregator"
  )
- yield mcp.as_workunit()

- logger.info(
- f"Completed SqlParsingAggregator processing: {work_unit_count} work units generated"
- )
+ logger.info("Completed lineage extraction from Teradata audit logs")

  def close(self) -> None:
  """Clean up resources when source is closed."""
datahub/ingestion/source/unity/config.py
@@ -49,6 +49,12 @@ class LineageDataSource(ConfigEnum):
  API = "API"


+ class UsageDataSource(ConfigEnum):
+ AUTO = "AUTO"
+ SYSTEM_TABLES = "SYSTEM_TABLES"
+ API = "API"
+
+
  class UnityCatalogProfilerConfig(ConfigModel):
  method: str = Field(
  description=(
@@ -285,6 +291,17 @@ class UnityCatalogSourceConfig(
  description="Generate usage statistics.",
  )

+ usage_data_source: UsageDataSource = pydantic.Field(
+ default=UsageDataSource.AUTO,
+ description=(
+ "Source for usage/query history data extraction. Options: "
+ f"'{UsageDataSource.AUTO.value}' (default) - Automatically use system.query.history table when SQL warehouse is configured, otherwise fall back to REST API. "
+ "This provides better performance for multi-workspace setups and large query volumes when warehouse_id is set. "
+ f"'{UsageDataSource.SYSTEM_TABLES.value}' - Force use of system.query.history table (requires SQL warehouse and SELECT permission on system.query.history). "
+ f"'{UsageDataSource.API.value}' - Force use of REST API endpoints for query history (legacy method, may have limitations with multiple workspaces)."
+ ),
+ )
+
  # TODO: Remove `type:ignore` by refactoring config
  profiling: Union[
  UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
@@ -446,6 +463,20 @@ class UnityCatalogSourceConfig(

  return values

+ @pydantic.root_validator(skip_on_failure=True)
+ def validate_usage_data_source_with_warehouse(
+ cls, values: Dict[str, Any]
+ ) -> Dict[str, Any]:
+ usage_data_source = values.get("usage_data_source", UsageDataSource.AUTO)
+ warehouse_id = values.get("warehouse_id")
+
+ if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
+ raise ValueError(
+ f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
+ )
+
+ return values
+
  @pydantic.validator("schema_pattern", always=True)
  def schema_pattern_should__always_deny_information_schema(
  cls, v: AllowDenyPattern
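
A reduced pydantic (v1-style) model reproducing the new cross-field check (illustrative only; the real UnityCatalogSourceConfig has many more fields and uses DataHub's ConfigEnum):

from enum import Enum
from typing import Any, Dict, Optional

import pydantic

class UsageDataSource(str, Enum):
    AUTO = "AUTO"
    SYSTEM_TABLES = "SYSTEM_TABLES"
    API = "API"

class MiniUnityConfig(pydantic.BaseModel):
    warehouse_id: Optional[str] = None
    usage_data_source: UsageDataSource = UsageDataSource.AUTO

    @pydantic.root_validator(skip_on_failure=True)
    def validate_usage_data_source_with_warehouse(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # SYSTEM_TABLES needs a SQL warehouse; AUTO and API do not.
        if values.get("usage_data_source") == UsageDataSource.SYSTEM_TABLES and not values.get("warehouse_id"):
            raise ValueError("usage_data_source='SYSTEM_TABLES' requires warehouse_id to be set")
        return values

MiniUnityConfig(usage_data_source="AUTO")                                   # ok
MiniUnityConfig(usage_data_source="SYSTEM_TABLES", warehouse_id="abc123")   # ok
# MiniUnityConfig(usage_data_source="SYSTEM_TABLES")                        # raises ValidationError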
datahub/ingestion/source/unity/proxy.py
@@ -40,6 +40,7 @@ from datahub.api.entities.external.unity_catalog_external_entites import UnityCa
  from datahub.emitter.mce_builder import parse_ts_millis
  from datahub.ingestion.source.unity.config import (
  LineageDataSource,
+ UsageDataSource,
  )
  from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
  from datahub.ingestion.source.unity.proxy_profiling import (
@@ -163,6 +164,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
  report: UnityCatalogReport,
  hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
  lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+ usage_data_source: UsageDataSource = UsageDataSource.AUTO,
  databricks_api_page_size: int = 0,
  ):
  self._workspace_client = WorkspaceClient(
@@ -175,6 +177,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
  self.report = report
  self.hive_metastore_proxy = hive_metastore_proxy
  self.lineage_data_source = lineage_data_source
+ self.usage_data_source = usage_data_source
  self.databricks_api_page_size = databricks_api_page_size
  self._sql_connection_params = {
  "server_hostname": self._workspace_client.config.host.replace(
@@ -182,6 +185,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
  ),
  "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
  "access_token": self._workspace_client.config.token,
+ "user_agent_entry": "datahub",
  }

  def check_basic_connectivity(self) -> bool:
@@ -401,6 +405,75 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
  method, path, body={**body, "page_token": response["next_page_token"]}
  )

+ def get_query_history_via_system_tables(
+ self,
+ start_time: datetime,
+ end_time: datetime,
+ ) -> Iterable[Query]:
+ """Get query history using system.query.history table.
+
+ This method provides an alternative to the REST API for fetching query history,
+ offering better performance and richer data for large query volumes.
+ """
+ logger.info(
+ f"Fetching query history from system.query.history for period: {start_time} to {end_time}"
+ )
+
+ allowed_types = [typ.value for typ in ALLOWED_STATEMENT_TYPES]
+ statement_type_filter = ", ".join(f"'{typ}'" for typ in allowed_types)
+
+ query = f"""
+ SELECT
+ statement_id,
+ statement_text,
+ statement_type,
+ start_time,
+ end_time,
+ executed_by,
+ executed_as,
+ executed_by_user_id,
+ executed_as_user_id
+ FROM system.query.history
+ WHERE
+ start_time >= %s
+ AND end_time <= %s
+ AND execution_status = 'FINISHED'
+ AND statement_type IN ({statement_type_filter})
+ ORDER BY start_time
+ """
+
+ try:
+ rows = self._execute_sql_query(query, (start_time, end_time))
+ for row in rows:
+ try:
+ yield Query(
+ query_id=row.statement_id,
+ query_text=row.statement_text,
+ statement_type=(
+ QueryStatementType(row.statement_type)
+ if row.statement_type
+ else None
+ ),
+ start_time=row.start_time,
+ end_time=row.end_time,
+ user_id=row.executed_by_user_id,
+ user_name=row.executed_by,
+ executed_as_user_id=row.executed_as_user_id,
+ executed_as_user_name=row.executed_as,
+ )
+ except Exception as e:
+ logger.warning(f"Error parsing query from system table: {e}")
+ self.report.report_warning("query-parse-system-table", str(e))
+ except Exception as e:
+ logger.error(
+ f"Error fetching query history from system tables: {e}", exc_info=True
+ )
+ self.report.report_failure(
+ title="Failed to fetch query history from system tables",
+ message="Error querying system.query.history table",
+ context=f"Query period: {start_time} to {end_time}",
+ )
+
  def _build_datetime_where_conditions(
  self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
  ) -> str:
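
How the statement-type filter is assembled around the bound time-range parameters, as a standalone sketch (the statement-type list here is a shortened stand-in for the values of ALLOWED_STATEMENT_TYPES):

from datetime import datetime

allowed_types = ["SELECT", "INSERT", "MERGE"]  # stand-in for ALLOWED_STATEMENT_TYPES values
statement_type_filter = ", ".join(f"'{typ}'" for typ in allowed_types)

query = f"""
SELECT statement_id, statement_text
FROM system.query.history
WHERE start_time >= %s
  AND end_time <= %s
  AND execution_status = 'FINISHED'
  AND statement_type IN ({statement_type_filter})
ORDER BY start_time
"""

# The %s placeholders stay in the SQL text; the driver binds the datetimes,
# as in self._execute_sql_query(query, (start_time, end_time)) in the method above.
params = (datetime(2024, 1, 1), datetime(2024, 1, 2))
print(query)   # IN ('SELECT', 'INSERT', 'MERGE') is inlined; the %s time bounds are not
print(params)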