acryl-datahub 1.3.0.1rc4__py3-none-any.whl → 1.3.0.1rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)
  1. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/METADATA +2637 -2633
  2. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/RECORD +31 -28
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/source/aws/aws_common.py +161 -0
  5. datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
  6. datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +5 -3
  8. datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
  9. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  10. datahub/ingestion/source/redshift/usage.py +2 -2
  11. datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
  12. datahub/ingestion/source/snowflake/snowflake_queries.py +46 -6
  13. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
  14. datahub/ingestion/source/sql/mysql.py +101 -4
  15. datahub/ingestion/source/sql/postgres.py +81 -4
  16. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  17. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  18. datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
  19. datahub/metadata/_internal_schema_classes.py +772 -546
  20. datahub/metadata/_urns/urn_defs.py +1751 -1695
  21. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  22. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  23. datahub/metadata/schema.avsc +18450 -18242
  24. datahub/metadata/schemas/DataHubFileInfo.avsc +228 -0
  25. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  26. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +3 -1
  27. datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
  28. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/WHEEL +0 -0
  29. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/licenses/LICENSE +0 -0
  31. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/usage.py
@@ -25,6 +25,7 @@ from datahub.ingestion.source.redshift.query import (
     RedshiftServerlessQuery,
 )
 from datahub.ingestion.source.redshift.redshift_schema import (
+    RedshiftDataDictionary,
     RedshiftTable,
     RedshiftView,
 )
@@ -263,8 +264,7 @@ class RedshiftUsageExtractor:
         connection: redshift_connector.Connection,
         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
     ) -> Iterable[RedshiftAccessEvent]:
-        cursor = connection.cursor()
-        cursor.execute(query)
+        cursor = RedshiftDataDictionary.get_query_result(conn=connection, query=query)
         results = cursor.fetchmany()
         field_names = [i[0] for i in cursor.description]
         while results:
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -31,6 +31,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulLineageConfigMixin,
     StatefulProfilingConfigMixin,
+    StatefulTimeWindowConfigMixin,
     StatefulUsageConfigMixin,
 )
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
@@ -199,6 +200,7 @@ class SnowflakeV2Config(
     SnowflakeUsageConfig,
     StatefulLineageConfigMixin,
     StatefulUsageConfigMixin,
+    StatefulTimeWindowConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
     IncrementalPropertiesConfigMixin,
@@ -477,6 +479,20 @@ class SnowflakeV2Config(
 
         return shares
 
+    @root_validator(pre=False, skip_on_failure=True)
+    def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
+        if values.get("use_queries_v2"):
+            if values.get("enable_stateful_lineage_ingestion") or values.get(
+                "enable_stateful_usage_ingestion"
+            ):
+                logger.warning(
+                    "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
+                    "when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
+                    "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
+                    "for the unified time window extraction (lineage + usage + operations + queries)."
+                )
+        return values
+
     def outbounds(self) -> Dict[str, Set[DatabaseId]]:
         """
         Returns mapping of
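The practical effect for recipes: with use_queries_v2 enabled, stateful ingestion of the unified time window is controlled by the new enable_stateful_time_window flag rather than the legacy per-aspect flags. A minimal sketch follows; only use_queries_v2, stateful_ingestion, and enable_stateful_time_window come from this diff, and the connection values are placeholders.

```python
# Illustrative recipe fragment in Python dict form (not a complete recipe).
snowflake_source_config = {
    "account_id": "my_account",  # placeholder connection details
    "username": "datahub_user",  # placeholder
    "use_queries_v2": True,
    "stateful_ingestion": {"enabled": True},
    # Replaces enable_stateful_lineage_ingestion / enable_stateful_usage_ingestion,
    # which only apply to the legacy (non-queries v2) extraction path.
    "enable_stateful_time_window": True,
}
```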
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -17,6 +17,7 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFrom
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
+    get_time_bucket,
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
@@ -50,6 +51,9 @@ from datahub.ingestion.source.snowflake.stored_proc_lineage import (
     StoredProcLineageReport,
     StoredProcLineageTracker,
 )
+from datahub.ingestion.source.state.redundant_run_skip_handler import (
+    RedundantQueriesRunSkipHandler,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -180,6 +184,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         structured_report: SourceReport,
         filters: SnowflakeFilter,
         identifiers: SnowflakeIdentifierBuilder,
+        redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
         graph: Optional[DataHubGraph] = None,
         schema_resolver: Optional[SchemaResolver] = None,
         discovered_tables: Optional[List[str]] = None,
@@ -191,9 +196,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         self.filters = filters
         self.identifiers = identifiers
         self.discovered_tables = set(discovered_tables) if discovered_tables else None
+        self.redundant_run_skip_handler = redundant_run_skip_handler
 
         self._structured_report = structured_report
 
+        # Adjust time window based on stateful ingestion state
+        self.start_time, self.end_time = self._get_time_window()
+
         # The exit stack helps ensure that we close all the resources we open.
         self._exit_stack = contextlib.ExitStack()
 
@@ -211,8 +220,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 generate_query_usage_statistics=self.config.include_query_usage_statistics,
                 usage_config=BaseUsageConfig(
                     bucket_duration=self.config.window.bucket_duration,
-                    start_time=self.config.window.start_time,
-                    end_time=self.config.window.end_time,
+                    start_time=self.start_time,
+                    end_time=self.end_time,
                     user_email_pattern=self.config.user_email_pattern,
                     # TODO make the rest of the fields configurable
                 ),
@@ -228,6 +237,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
     def structured_reporter(self) -> SourceReport:
         return self._structured_report
 
+    def _get_time_window(self) -> tuple[datetime, datetime]:
+        if self.redundant_run_skip_handler:
+            start_time, end_time = (
+                self.redundant_run_skip_handler.suggest_run_time_window(
+                    self.config.window.start_time,
+                    self.config.window.end_time,
+                )
+            )
+        else:
+            start_time = self.config.window.start_time
+            end_time = self.config.window.end_time
+
+        # Usage statistics are aggregated per bucket (typically per day).
+        # To ensure accurate aggregated metrics, we need to align the start_time
+        # to the beginning of a bucket so that we include complete bucket periods.
+        if self.config.include_usage_statistics:
+            start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
+
+        return start_time, end_time
+
+    def _update_state(self) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.update_state(
+                self.config.window.start_time,
+                self.config.window.end_time,
+                self.config.window.bucket_duration,
+            )
+
     @functools.cached_property
     def local_temp_path(self) -> pathlib.Path:
         if self.config.local_temp_path:
@@ -355,6 +392,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         with self.report.aggregator_generate_timer:
             yield from auto_workunit(self.aggregator.gen_metadata())
 
+        # Update the stateful ingestion state after successful extraction
+        self._update_state()
+
     def fetch_users(self) -> UsersMapping:
         users: UsersMapping = dict()
         with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
@@ -378,8 +418,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         # Derived from _populate_external_lineage_from_copy_history.
 
         query: str = SnowflakeQuery.copy_lineage_history(
-            start_time_millis=int(self.config.window.start_time.timestamp() * 1000),
-            end_time_millis=int(self.config.window.end_time.timestamp() * 1000),
+            start_time_millis=int(self.start_time.timestamp() * 1000),
+            end_time_millis=int(self.end_time.timestamp() * 1000),
             downstreams_deny_pattern=self.config.temporary_tables_pattern,
         )
 
@@ -414,8 +454,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
     ]:
         query_log_query = QueryLogQueryBuilder(
-            start_time=self.config.window.start_time,
-            end_time=self.config.window.end_time,
+            start_time=self.start_time,
+            end_time=self.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
             allow_usernames=self.config.pushdown_allow_usernames,
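The bucket alignment in _get_time_window above matters because usage statistics are aggregated per bucket. A small sketch of the intended behavior, assuming get_time_bucket truncates a timestamp to the start of its containing bucket (which is what the comment in the hunk relies on):

```python
# Illustrative only: assumes get_time_bucket truncates to the bucket start.
from datetime import datetime, timezone

from datahub.configuration.time_window_config import BucketDuration, get_time_bucket

suggested_start = datetime(2024, 6, 3, 14, 25, tzinfo=timezone.utc)
aligned_start = get_time_bucket(suggested_start, BucketDuration.DAY)
# Expected: 2024-06-03 00:00:00+00:00, so the first (partial) day is widened
# to a complete bucket before usage statistics are aggregated.
```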
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -73,6 +73,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
+    RedundantQueriesRunSkipHandler,
     RedundantUsageRunSkipHandler,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -207,7 +208,7 @@ class SnowflakeV2Source(
         )
         self.report.sql_aggregator = self.aggregator.report
 
-        if self.config.include_table_lineage:
+        if self.config.include_table_lineage and not self.config.use_queries_v2:
             redundant_lineage_run_skip_handler: Optional[
                 RedundantLineageRunSkipHandler
             ] = None
@@ -589,6 +590,17 @@ class SnowflakeV2Source(
         with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
             schema_resolver = self.aggregator._schema_resolver
 
+            redundant_queries_run_skip_handler: Optional[
+                RedundantQueriesRunSkipHandler
+            ] = None
+            if self.config.enable_stateful_time_window:
+                redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
+                    source=self,
+                    config=self.config,
+                    pipeline_name=self.ctx.pipeline_name,
+                    run_id=self.ctx.run_id,
+                )
+
             queries_extractor = SnowflakeQueriesExtractor(
                 connection=self.connection,
                 # TODO: this should be its own section in main recipe
@@ -614,6 +626,7 @@ class SnowflakeV2Source(
                 structured_report=self.report,
                 filters=self.filters,
                 identifiers=self.identifiers,
+                redundant_run_skip_handler=redundant_queries_run_skip_handler,
                 schema_resolver=schema_resolver,
                 discovered_tables=self.discovered_datasets,
                 graph=self.ctx.graph,
datahub/ingestion/source/sql/mysql.py
@@ -1,14 +1,17 @@
 # This import verifies that the dependencies are available.
-
-from typing import List
+import logging
+from typing import TYPE_CHECKING, Any, List, Optional
 
 import pymysql  # noqa: F401
 from pydantic.fields import Field
-from sqlalchemy import util
+from sqlalchemy import create_engine, event, inspect, util
 from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
+if TYPE_CHECKING:
+    from sqlalchemy.engine import Engine
+
 from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -18,11 +21,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.source.aws.aws_common import (
+    AwsConnectionConfig,
+    RDSIAMTokenManager,
+)
 from datahub.ingestion.source.sql.sql_common import (
     make_sqlalchemy_type,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
 from datahub.ingestion.source.sql.stored_procedures.base import (
     BaseProcedure,
 )
@@ -31,6 +39,9 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemySource,
 )
 from datahub.metadata.schema_classes import BytesTypeClass
+from datahub.utilities.str_enum import StrEnum
+
+logger = logging.getLogger(__name__)
 
 SET.__repr__ = util.generic_repr  # type:ignore
 
@@ -54,11 +65,33 @@ base.ischema_names["polygon"] = POLYGON
 base.ischema_names["decimal128"] = DECIMAL128
 
 
+class MySQLAuthMode(StrEnum):
+    """Authentication mode for MySQL connection."""
+
+    PASSWORD = "PASSWORD"
+    AWS_IAM = "AWS_IAM"
+
+
 class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
     # defaults
     host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
     scheme: HiddenFromDocs[str] = "mysql+pymysql"
 
+    # Authentication configuration
+    auth_mode: MySQLAuthMode = Field(
+        default=MySQLAuthMode.PASSWORD,
+        description="Authentication mode to use for the MySQL connection. "
+        "Options are 'PASSWORD' (default) for standard username/password authentication, "
+        "or 'AWS_IAM' for AWS RDS IAM authentication.",
+    )
+    aws_config: AwsConnectionConfig = Field(
+        default_factory=AwsConnectionConfig,
+        description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
+        "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
+        "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
+        "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
+    )
+
 
 class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
     def get_identifier(self, *, schema: str, table: str) -> str:
@@ -91,9 +124,27 @@ class MySQLSource(TwoTierSQLAlchemySource):
     Table, row, and column statistics via optional SQL profiling
     """
 
-    def __init__(self, config, ctx):
+    config: MySQLConfig
+
+    def __init__(self, config: MySQLConfig, ctx: Any):
         super().__init__(config, ctx, self.get_platform())
 
+        self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
+        if config.auth_mode == MySQLAuthMode.AWS_IAM:
+            hostname, port = parse_host_port(config.host_port, default_port=3306)
+            if port is None:
+                raise ValueError("Port must be specified for RDS IAM authentication")
+
+            if not config.username:
+                raise ValueError("username is required for RDS IAM authentication")
+
+            self._rds_iam_token_manager = RDSIAMTokenManager(
+                endpoint=hostname,
+                username=config.username,
+                port=port,
+                aws_config=config.aws_config,
+            )
+
     def get_platform(self):
         return "mysql"
 
@@ -102,6 +153,52 @@ class MySQLSource(TwoTierSQLAlchemySource):
         config = MySQLConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def _setup_rds_iam_event_listener(
+        self, engine: "Engine", database_name: Optional[str] = None
+    ) -> None:
+        """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
+        if not (
+            self.config.auth_mode == MySQLAuthMode.AWS_IAM
+            and self._rds_iam_token_manager
+        ):
+            return
+
+        def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
+            if not self._rds_iam_token_manager:
+                raise RuntimeError("RDS IAM Token Manager is not initialized")
+            cparams["password"] = self._rds_iam_token_manager.get_token()
+            # PyMySQL requires SSL to be enabled for RDS IAM authentication.
+            # Preserve any existing SSL configuration, otherwise enable with default settings.
+            # The {"ssl": True} dict is a workaround to make PyMySQL recognize that SSL
+            # should be enabled, since the library requires a truthy value in the ssl parameter.
+            # See https://pymysql.readthedocs.io/en/latest/modules/connections.html#pymysql.connections.Connection
+            cparams["ssl"] = cparams.get("ssl") or {"ssl": True}
+
+        event.listen(engine, "do_connect", do_connect_listener)  # type: ignore[misc]
+
+    def get_inspectors(self):
+        url = self.config.get_sql_alchemy_url()
+        logger.debug(f"sql_alchemy_url={url}")
+
+        engine = create_engine(url, **self.config.options)
+        self._setup_rds_iam_event_listener(engine)
+
+        with engine.connect() as conn:
+            inspector = inspect(conn)
+            if self.config.database and self.config.database != "":
+                databases = [self.config.database]
+            else:
+                databases = inspector.get_schema_names()
+            for db in databases:
+                if self.config.database_pattern.allowed(db):
+                    url = self.config.get_sql_alchemy_url(current_db=db)
+                    db_engine = create_engine(url, **self.config.options)
+                    self._setup_rds_iam_event_listener(db_engine, database_name=db)
+
+                    with db_engine.connect() as conn:
+                        inspector = inspect(conn)
+                        yield inspector
+
     def add_profile_metadata(self, inspector: Inspector) -> None:
         if not self.config.is_profiling_enabled():
             return
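For reference, turning on the new IAM mode might look like the sketch below. The field names (host_port, username, auth_mode, aws_config) and MySQLConfig.parse_obj appear in this diff; the endpoint, user, and region values are placeholders, and aws_region is assumed to be the relevant AwsConnectionConfig field.

```python
# Hypothetical usage sketch for RDS IAM authentication with the MySQL source.
from datahub.ingestion.source.sql.mysql import MySQLConfig

config = MySQLConfig.parse_obj(
    {
        "host_port": "mydb.abc123.us-east-1.rds.amazonaws.com:3306",  # placeholder
        "username": "datahub_reader",  # placeholder
        "auth_mode": "AWS_IAM",
        "aws_config": {"aws_region": "us-east-1"},  # assumed field name
        # No password is set: the do_connect listener above injects a
        # short-lived IAM token per connection and forces SSL.
    }
)
```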
datahub/ingestion/source/sql/postgres.py
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
 
 # This import verifies that the dependencies are available.
 import psycopg2  # noqa: F401
@@ -14,9 +14,12 @@ import sqlalchemy.dialects.postgresql as custom_types
 from geoalchemy2 import Geometry  # noqa: F401
 from pydantic import BaseModel
 from pydantic.fields import Field
-from sqlalchemy import create_engine, inspect
+from sqlalchemy import create_engine, event, inspect
 from sqlalchemy.engine.reflection import Inspector
 
+if TYPE_CHECKING:
+    from sqlalchemy.engine import Engine
+
 from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp_builder import mcps_from_mce
@@ -30,12 +33,17 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.aws.aws_common import (
+    AwsConnectionConfig,
+    RDSIAMTokenManager,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
 from datahub.ingestion.source.sql.stored_procedures.base import (
     BaseProcedure,
 )
@@ -44,6 +52,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     BytesTypeClass,
     MapTypeClass,
 )
+from datahub.utilities.str_enum import StrEnum
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -100,12 +109,34 @@ class ViewLineageEntry(BaseModel):
     dependent_schema: str
 
 
+class PostgresAuthMode(StrEnum):
+    """Authentication mode for PostgreSQL connection."""
+
+    PASSWORD = "PASSWORD"
+    AWS_IAM = "AWS_IAM"
+
+
 class BasePostgresConfig(BasicSQLAlchemyConfig):
     scheme: str = Field(default="postgresql+psycopg2", description="database scheme")
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["information_schema"])
     )
 
+    # Authentication configuration
+    auth_mode: PostgresAuthMode = Field(
+        default=PostgresAuthMode.PASSWORD,
+        description="Authentication mode to use for the PostgreSQL connection. "
+        "Options are 'PASSWORD' (default) for standard username/password authentication, "
+        "or 'AWS_IAM' for AWS RDS IAM authentication.",
+    )
+    aws_config: AwsConnectionConfig = Field(
+        default_factory=AwsConnectionConfig,
+        description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
+        "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
+        "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
+        "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
+    )
+
 
 class PostgresConfig(BasePostgresConfig):
     database_pattern: AllowDenyPattern = Field(
@@ -160,6 +191,22 @@ class PostgresSource(SQLAlchemySource):
     def __init__(self, config: PostgresConfig, ctx: PipelineContext):
         super().__init__(config, ctx, self.get_platform())
 
+        self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
+        if config.auth_mode == PostgresAuthMode.AWS_IAM:
+            hostname, port = parse_host_port(config.host_port, default_port=5432)
+            if port is None:
+                raise ValueError("Port must be specified for RDS IAM authentication")
+
+            if not config.username:
+                raise ValueError("username is required for RDS IAM authentication")
+
+            self._rds_iam_token_manager = RDSIAMTokenManager(
+                endpoint=hostname,
+                username=config.username,
+                port=port,
+                aws_config=config.aws_config,
+            )
+
     def get_platform(self):
         return "postgres"
 
@@ -168,13 +215,36 @@ class PostgresSource(SQLAlchemySource):
         config = PostgresConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def _setup_rds_iam_event_listener(
+        self, engine: "Engine", database_name: Optional[str] = None
+    ) -> None:
+        """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
+        if not (
+            self.config.auth_mode == PostgresAuthMode.AWS_IAM
+            and self._rds_iam_token_manager
+        ):
+            return
+
+        def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
+            if not self._rds_iam_token_manager:
+                raise RuntimeError("RDS IAM Token Manager is not initialized")
+            cparams["password"] = self._rds_iam_token_manager.get_token()
+            if cparams.get("sslmode") not in ("require", "verify-ca", "verify-full"):
+                cparams["sslmode"] = "require"
+
+        event.listen(engine, "do_connect", do_connect_listener)  # type: ignore[misc]
+
     def get_inspectors(self) -> Iterable[Inspector]:
         # Note: get_sql_alchemy_url will choose `sqlalchemy_uri` over the passed in database
         url = self.config.get_sql_alchemy_url(
             database=self.config.database or self.config.initial_database
         )
+
         logger.debug(f"sql_alchemy_url={url}")
+
         engine = create_engine(url, **self.config.options)
+        self._setup_rds_iam_event_listener(engine)
+
         with engine.connect() as conn:
             if self.config.database or self.config.sqlalchemy_uri:
                 inspector = inspect(conn)
@@ -182,14 +252,21 @@ class PostgresSource(SQLAlchemySource):
             else:
                 # pg_database catalog - https://www.postgresql.org/docs/current/catalog-pg-database.html
                 # exclude template databases - https://www.postgresql.org/docs/current/manage-ag-templatedbs.html
+                # exclude rdsadmin - AWS RDS administrative database
                 databases = conn.execute(
-                    "SELECT datname from pg_database where datname not in ('template0', 'template1')"
+                    "SELECT datname from pg_database where datname not in ('template0', 'template1', 'rdsadmin')"
                 )
                 for db in databases:
                     if not self.config.database_pattern.allowed(db["datname"]):
                        continue
+
                     url = self.config.get_sql_alchemy_url(database=db["datname"])
-                    with create_engine(url, **self.config.options).connect() as conn:
+                    db_engine = create_engine(url, **self.config.options)
+                    self._setup_rds_iam_event_listener(
+                        db_engine, database_name=db["datname"]
+                    )
+
+                    with db_engine.connect() as conn:
                         inspector = inspect(conn)
                         yield inspector
 
datahub/ingestion/source/sql/sqlalchemy_uri.py
@@ -1,8 +1,45 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple
 
 from sqlalchemy.engine import URL
 
 
+def parse_host_port(
+    host_port: str, default_port: Optional[int] = None
+) -> Tuple[str, Optional[int]]:
+    """
+    Parse a host:port string into separate host and port components.
+
+    Args:
+        host_port: String in format "host:port" or just "host"
+        default_port: Optional default port to use if not specified in host_port
+
+    Returns:
+        Tuple of (hostname, port) where port may be None if not specified
+
+    Examples:
+        >>> parse_host_port("localhost:3306")
+        ('localhost', 3306)
+        >>> parse_host_port("localhost")
+        ('localhost', None)
+        >>> parse_host_port("localhost", 5432)
+        ('localhost', 5432)
+        >>> parse_host_port("db.example.com:invalid", 3306)
+        ('db.example.com', 3306)
+    """
+    try:
+        host, port_str = host_port.rsplit(":", 1)
+        port: Optional[int]
+        try:
+            port = int(port_str)
+        except ValueError:
+            # Port is not a valid integer
+            port = default_port
+        return host, port
+    except ValueError:
+        # No colon found, entire string is the hostname
+        return host_port, default_port
+
+
 def make_sqlalchemy_uri(
     scheme: str,
     username: Optional[str],
@@ -14,12 +51,7 @@ def make_sqlalchemy_uri(
     host: Optional[str] = None
     port: Optional[int] = None
     if at:
-        try:
-            host, port_str = at.rsplit(":", 1)
-            port = int(port_str)
-        except ValueError:
-            host = at
-            port = None
+        host, port = parse_host_port(at)
     if uri_opts:
         uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
 
datahub/ingestion/source/state/redundant_run_skip_handler.py
@@ -244,3 +244,24 @@ class RedundantUsageRunSkipHandler(RedundantRunSkipHandler):
             cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
             cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
             cur_state.bucket_duration = bucket_duration
+
+
+class RedundantQueriesRunSkipHandler(RedundantRunSkipHandler):
+    """
+    Handler for stateful ingestion of queries v2 extraction.
+    Manages the time window for audit log extraction that combines
+    lineage, usage, operations, and queries.
+    """
+
+    def get_job_name_suffix(self):
+        return "_audit_window"
+
+    def update_state(
+        self, start_time: datetime, end_time: datetime, bucket_duration: BucketDuration
+    ) -> None:
+        cur_checkpoint = self.get_current_checkpoint()
+        if cur_checkpoint:
+            cur_state = cast(BaseTimeWindowCheckpointState, cur_checkpoint.state)
+            cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
+            cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
+            cur_state.bucket_duration = bucket_duration
datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -101,7 +101,9 @@ class StatefulLineageConfigMixin(ConfigModel):
         default=True,
         description="Enable stateful lineage ingestion."
         " This will store lineage window timestamps after successful lineage ingestion. "
-        "and will not run lineage ingestion for same timestamps in subsequent run. ",
+        "and will not run lineage ingestion for same timestamps in subsequent run. "
+        "NOTE: This only works with use_queries_v2=False (legacy extraction path). "
+        "For queries v2, use enable_stateful_time_window instead.",
     )
 
     _store_last_lineage_extraction_timestamp = pydantic_renamed_field(
@@ -150,7 +152,9 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         default=True,
         description="Enable stateful lineage ingestion."
         " This will store usage window timestamps after successful usage ingestion. "
-        "and will not run usage ingestion for same timestamps in subsequent run. ",
+        "and will not run usage ingestion for same timestamps in subsequent run. "
+        "NOTE: This only works with use_queries_v2=False (legacy extraction path). "
+        "For queries v2, use enable_stateful_time_window instead.",
     )
 
     _store_last_usage_extraction_timestamp = pydantic_renamed_field(
@@ -169,6 +173,30 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         return values
 
 
+class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
+    enable_stateful_time_window: bool = Field(
+        default=False,
+        description="Enable stateful time window tracking."
+        " This will store the time window after successful extraction "
+        "and adjust the time window in subsequent runs to avoid reprocessing. "
+        "NOTE: This is ONLY applicable when using queries v2 (use_queries_v2=True). "
+        "This replaces enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion "
+        "for the queries v2 extraction path, since queries v2 extracts lineage, usage, operations, "
+        "and queries together from a single audit log and uses a unified time window.",
+    )
+
+    @root_validator(skip_on_failure=True)
+    def time_window_stateful_option_validator(cls, values: Dict) -> Dict:
+        sti = values.get("stateful_ingestion")
+        if not sti or not sti.enabled:
+            if values.get("enable_stateful_time_window"):
+                logger.warning(
+                    "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
+                )
+            values["enable_stateful_time_window"] = False
+        return values
+
+
 @dataclass
 class StatefulIngestionReport(SourceReport):
     pass