PyPI - data-factory-utils - Versions diffs - 0.3.0__tar.gz - Mend

data-factory-utils 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data_factory_utils-0.3.0/PKG-INFO +54 -0
data_factory_utils-0.3.0/README.md +38 -0
data_factory_utils-0.3.0/pyproject.toml +87 -0
data_factory_utils-0.3.0/src/data_factory_utils/__init__.py +1 -0
data_factory_utils-0.3.0/src/data_factory_utils/athena.py +208 -0
data_factory_utils-0.3.0/src/data_factory_utils/environment.py +334 -0
data_factory_utils-0.3.0/src/data_factory_utils/query.py +232 -0

data_factory_utils-0.3.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,54 @@
+Metadata-Version: 2.3
+Name: data-factory-utils
+Version: 0.3.0
+Summary: Utility functions for interacting with data factories.
+Requires-Dist: boto3>=1.42.8
+Requires-Dist: boto3-stubs>=1.42.89
+Requires-Dist: botocore>=1.42.7
+Requires-Dist: cloudpathlib>=0.23.0
+Requires-Dist: mypy-boto3>=1.42.3
+Requires-Dist: mypy-boto3-athena>=1.42.43
+Requires-Dist: mypy-boto3-s3>=1.42.85
+Requires-Dist: mypy-boto3-sts>=1.42.3
+Requires-Dist: polars>=1.40.0
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+# data-factory-utils
+A package for random utils for data factories.
+## Installation
+This is a published package. Install using your favourite installation method.
+```bash
+uv add data-factory-utils
+pip install data-factory-utils
+```
+## Usage
+### Environment functions
+This set of functions reads from your data factory dynamically. It should infer the environment you are in as well.
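+The class is a singleton: no matter how many times you instantiate it, it re-uses the values fetched the first time. To create it (`use_web_identity` is a required keyword argument):
+```python
+from data_factory_utils.environment import Environment
+env = Environment(use_web_identity=False)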
+```
+To return information about the environment (if we are in development with account number 0101010101):
+```python
+env.account_no
+# 0101010101
+env.environment_name
+# dev
+env.is_prod
+# False
+```
+To get an S3 bucket name (outputted as `cloudpathlib`'s `S3Path`) (let us imagine here that the name is `emds-dev-random-name-202512161154001309058001`):
+```python
+s3_random_name_bucket = env.get_full_bucket_url("random-name", full_prefix=True)
+print(str(s3_random_name_bucket.bucket))
+# emds-dev-random-name-202512161154001309058001
+```

data_factory_utils-0.3.0/README.md ADDED Viewed

@@ -0,0 +1,38 @@
+# data-factory-utils
+A package for random utils for data factories.
+## Installation
+This is a published package. Install using your favourite installation method.
+```bash
+uv add data-factory-utils
+pip install data-factory-utils
+```
+## Usage
+### Environment functions
+This set of functions reads from your data factory dynamically. It should infer the environment you are in as well.
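+The class is a singleton: no matter how many times you instantiate it, it re-uses the values fetched the first time. To create it (`use_web_identity` is a required keyword argument):
+```python
+from data_factory_utils.environment import Environment
+env = Environment(use_web_identity=False)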
+```
+To return information about the environment (if we are in development with account number 0101010101):
+```python
+env.account_no
+# 0101010101
+env.environment_name
+# dev
+env.is_prod
+# False
+```
+To get an S3 bucket name (outputted as `cloudpathlib`'s `S3Path`) (let us imagine here that the name is `emds-dev-random-name-202512161154001309058001`):
+```python
+s3_random_name_bucket = env.get_full_bucket_url("random-name", full_prefix=True)
+print(str(s3_random_name_bucket.bucket))
+# emds-dev-random-name-202512161154001309058001
+```

data_factory_utils-0.3.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,87 @@
+[project]
+name = "data-factory-utils"
+version = "0.3.0"
+description = "Utility functions for interacting with data factories."
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "boto3>=1.42.8",
+    "boto3-stubs>=1.42.89",
+    "botocore>=1.42.7",
+    "cloudpathlib>=0.23.0",
+    "mypy-boto3>=1.42.3",
+    "mypy-boto3-athena>=1.42.43",
+    "mypy-boto3-s3>=1.42.85",
+    "mypy-boto3-sts>=1.42.3",
+    "polars>=1.40.0",
+]
+[build-system]
+requires = ["uv_build>=0.9.17,<0.10.0"]
+build-backend = "uv_build"
+[dependency-groups]
+dev = [
+    "mypy>=1.20.1",
+    "prek>=0.2.21",
+    "ruff>=0.14.8",
+    "toml-cli>=0.8.2",
+    "ty>=0.0.1a33",
+]
+test = [
+    "moto>=5.1.18",
+    "pytest>=9.0.2",
+]
+[tool.ruff]
+line-length = 120
+[tool.bandit]
+exclude_dirs = ["/tests", "/.venv"]
+[tool.mypy]
+strict = true
+namespace_packages = false
+disallow_untyped_defs = true
+follow_untyped_imports = true
+exclude = ["tests"]
+[tool.ruff.lint]
+select = ["ALL"]
+# Remove warnings
+ignore = ["D203", "D213", "COM812"]
+[tool.ruff.lint.per-file-ignores]
+"tests/**.py" = ["S101"]
+[tool.semantic_release]
+commit_message = "{version}\n\nAutomatically generated by python-semantic-release"
+commit_parser = "conventional"
+logging_use_named_masks = false
+major_on_zero = false
+allow_zero_version = true
+no_git_verify = false
+tag_format = "{version}"
+[tool.semantic_release.branches.main]
+match = "main"
+prerelease_token = "rc"
+prerelease = false
+[tool.semantic_release.branches.other]
+match = ".*"
+prerelease_token = "rc"
+prerelease = true
+[tool.semantic_release.commit_parser_options]
+minor_tags = ["feat"]
+patch_tags = ["fix", "perf"]
+other_allowed_tags = ["build", "chore", "ci", "docs", "style", "refactor", "test"]
+allowed_tags = ["feat", "fix", "perf", "build", "chore", "ci", "docs", "style", "refactor", "test"]
+default_bump_level = 0
+parse_squash_commits = true
+ignore_merge_commits = true
+[tool.bandit.assert_used]
+skips = ['*_test.py', '*/test_*.py']

data_factory_utils-0.3.0/src/data_factory_utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Init file."""

data_factory_utils-0.3.0/src/data_factory_utils/athena.py ADDED Viewed

@@ -0,0 +1,208 @@
+"""Athena helper functions."""
+import logging
+import time
+from collections.abc import Generator
+import boto3
+import polars as pl
+from mypy_boto3_athena.client import AthenaClient
+from mypy_boto3_athena.type_defs import GetQueryResultsOutputTypeDef, QueryExecutionContextTypeDef
+from data_factory_utils.environment import Environment
+from data_factory_utils.query import Query
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class NoQueryIdError(Exception):
+    """Raised when starting an Athena query does not yield a query execution id."""
+class AthenaQueryError(Exception):
+    """Raised when an Athena query cannot be run to a successful completion.
+    boto3 has no generic 'query failed' exception, so this package defines its own for failed queries.
+    """
+class AthenaConfig:
+    """Container for Athena configuration.
+    It's slow to create so we do it once and share it between queries.
+    """
+    # boto3 Athena client used for every call made by AthenaQuery.
+    athena_client: AthenaClient
+    # String form of the S3 location that Athena writes query results to.
+    result_bucket: str
+    # Database used as the query execution context.
+    database: str
+    # Optional data catalog added to the query execution context.
+    catalog: str | None
+    # Athena workgroup, derived from the environment's account number.
+    workgroup: str
+    def __init__(
+        self,
+        database: str,
+        catalog: str | None = None,
+        result_bucket_name: str = "athena-query-results",
+        environment: Environment | None = None,
+    ) -> None:
+        """Resolve the result bucket and workgroup and create the Athena client.
+        Parameters
+        ----------
+        database : str
+            Database to run queries against.
+        catalog : str, optional
+            Data catalog to run queries against; omitted from the query context when None.
+        result_bucket_name : str, optional
+            Name fragment of the bucket that stores query results, by default "athena-query-results".
+        environment : Environment, optional
+            Environment to resolve the bucket and account number from. When None, an Environment
+            without web identity is created.
+        Raises
+        ------
+        ValueError
+            If the environment has no bucket matching ``result_bucket_name``.
+        """
+        if environment is None:
+            environment = Environment(use_web_identity=False)
+        result_bucket = environment.get_full_bucket_url(result_bucket_name, full_prefix=True)
+        if result_bucket is None:
+            # Without this check str(None) would configure Athena to write results to the literal location "None".
+            msg = f"No bucket found for result bucket name: {result_bucket_name!r}"
+            raise ValueError(msg)
+        self.result_bucket = str(result_bucket)
+        self.athena_client = boto3.client("athena")
+        self.database = database
+        self.catalog = catalog
+        self.workgroup = f"{environment.account_number}-ears-sars"
+class AthenaQuery:
+    """Abstraction of an Athena query.
+    boto3's API triggers the query, gets the query status and retrieves the response as separate calls.
+    AthenaQuery implements methods to simplify this flow.
+    """
+    # The query to run; it is rendered with str() when the execution starts.
+    query: Query
+    # Athena execution id; None until start_execution has succeeded.
+    execution_id: str | None
+    # Shared client, result location, database, catalog and workgroup.
+    config: AthenaConfig
+    # True once Athena has reported the SUCCEEDED state for this execution.
+    has_succeeded: bool
+    def __init__(self, query: Query, config: AthenaConfig) -> None:
+        """Store the query and configuration; nothing is sent to Athena yet."""
+        self.query = query
+        self.execution_id = None
+        self.config = config
+        self.has_succeeded = False
+    def run_and_await(self) -> None:
+        """Start the query and block until it reaches a terminal state.
+        Raises
+        ------
+        AthenaQueryError
+            If the query fails, is cancelled, or does not finish within the polling budget.
+        """
+        self.execution_id = self.start_execution()
+        self.await_query_completion()
+    def start_execution(self) -> str:
+        """Begin an Athena query with standard settings.
+        Returns
+        -------
+        str
+            The query execution id, which is also stored on ``self.execution_id``.
+        Raises
+        ------
+        ValueError
+            If the rendered query is empty or whitespace only.
+        NoQueryIdError
+            If Athena's response carries no query execution id.
+        """
+        query_context: QueryExecutionContextTypeDef = {"Database": self.config.database}
+        if self.config.catalog is not None:
+            query_context["Catalog"] = self.config.catalog
+        query_string = str(self.query).strip()
+        if not query_string:
+            msg = "Query string cannot be empty"
+            raise ValueError(msg)
+        logger.debug(query_string)
+        # Send exactly the string that was validated and logged above.
+        response = self.config.athena_client.start_query_execution(
+            QueryString=query_string,
+            ResultConfiguration={"OutputLocation": self.config.result_bucket},
+            QueryExecutionContext=query_context,
+            WorkGroup=self.config.workgroup,
+        )
+        # Use .get so that a missing id reaches the explicit check instead of raising KeyError,
+        # and test before str() so a missing value cannot become the truthy string "None".
+        execution_id = response.get("QueryExecutionId")
+        if not execution_id:
+            msg = f"No Query Execution Id. Response: {response}"
+            raise NoQueryIdError(msg)
+        self.execution_id = str(execution_id)
+        logger.info("Query Execution ID: %s", self.execution_id)
+        return self.execution_id
+    def stop_execution(self) -> None:
+        """Stop an Athena query's execution.
+        Raises
+        ------
+        AthenaQueryError
+            If the query has not been started, so there is no execution id to stop.
+        """
+        if self.execution_id is None:
+            msg = "No query id available. Make sure that the query has been started before attempting to stop it."
+            raise AthenaQueryError(msg)
+        self.config.athena_client.stop_query_execution(QueryExecutionId=self.execution_id)
+    def await_query_completion(
+        self,
+        max_iterations: int = 20,
+        max_interval_seconds: int = 120,
+    ) -> None:
+        """Poll the started query until it finishes.
+        Before the i-th poll (counting from zero) this sleeps ``min(max_interval_seconds, 2**i)`` seconds.
+        On success ``self.has_succeeded`` is set to True; nothing is returned.
+        Parameters
+        ----------
+        max_iterations : int, optional
+            Maximum number of status polls, by default 20.
+        max_interval_seconds : int, optional
+            Upper bound on the sleep between polls, by default 120.
+        Raises
+        ------
+        ValueError
+            If the query has not been started.
+        AthenaQueryError
+            If the query fails, is cancelled, reports an unknown state, or is still unfinished
+            after ``max_iterations`` polls (in which case it is stopped first).
+        """
+        if self.execution_id is None:
+            msg = "No query id available. Make sure that the query has been started before querying the result."
+            raise ValueError(msg)
+        for i in range(max_iterations):
+            # Exponential backoff on checking query result
+            wait_time = min(max_interval_seconds, 2**i)
+            time.sleep(wait_time)
+            # Check whether the query has terminated
+            response = self.config.athena_client.get_query_execution(
+                QueryExecutionId=self.execution_id,
+            )
+            execution_status = response["QueryExecution"]["Status"]
+            state = execution_status["State"]
+            logger.debug("Query state: %s", state)
+            match state:
+                case "QUEUED" | "RUNNING":
+                    continue
+                case "SUCCEEDED":
+                    self.has_succeeded = True
+                    logger.info("Query succeeded")
+                    return
+                case "FAILED" | "CANCELLED":
+                    msg = f"Query execution failed: {execution_status}"
+                    raise AthenaQueryError(msg)
+                case _:
+                    msg = f"Unknown state: {state}. Full status: {execution_status}"
+                    raise AthenaQueryError(msg)
+        # If we've polled as many times as we allow and haven't had a result, cancel
+        # the query and throw an error.
+        self.stop_execution()
+        msg = f"Maximum configured query duration reached. Stopped query: {self.execution_id}"
+        raise AthenaQueryError(msg)
+    def _get_responses(self) -> Generator[GetQueryResultsOutputTypeDef]:
+        """Yield the raw result pages of the query, following ``NextToken`` until exhausted.
+        Waits for the query to complete first when it is not already known to have succeeded.
+        Raises
+        ------
+        ValueError
+            If the query has not been started.
+        """
+        # If we don't already know it to have finished, wait for the query to finish
+        if not self.has_succeeded:
+            self.await_query_completion()
+        if self.execution_id is None:
+            msg = "No query id available."
+            raise ValueError(msg)
+        response = self.config.athena_client.get_query_results(
+            QueryExecutionId=self.execution_id,
+        )
+        yield response
+        while "NextToken" in response:
+            response = self.config.athena_client.get_query_results(
+                QueryExecutionId=self.execution_id,
+                NextToken=response["NextToken"],
+            )
+            yield response
+    def parse_response_as_dataframe(self) -> pl.DataFrame:
+        """Return the table of results from the query as a polars DataFrame.
+        The first returned row is used as the column names. Cell values are taken from
+        ``VarCharValue``; cells without one become None.
+        Returns
+        -------
+        pl.DataFrame
+            An empty frame when no rows are returned, a frame with empty columns when only the
+            header row is returned, otherwise one column per header entry.
+        """
+        # Flatten every page of raw results into a list of rows of cell values.
+        rows = [
+            [column.get("VarCharValue") for column in row["Data"]]
+            for response in self._get_responses()
+            for row in response["ResultSet"]["Rows"]
+        ]
+        if not rows:
+            return pl.DataFrame({})
+        # The first row returned by the response is the names of the columns.
+        column_names, values_in_rows = rows[0], rows[1:]
+        if not values_in_rows:
+            return pl.DataFrame(data={col: [] for col in column_names})
+        # Convert a list of rows into a list of columns; strict=True rejects ragged rows.
+        values_in_columns = list(zip(*values_in_rows, strict=True))
+        # Package it up into a dictionary of columns
+        return pl.DataFrame(dict(zip(column_names, values_in_columns, strict=True)))

data_factory_utils-0.3.0/src/data_factory_utils/environment.py ADDED Viewed

@@ -0,0 +1,334 @@
+"""Class to help with AWS environment inference and configuration."""
+import logging
+import os
+import secrets
+import string
+from pathlib import Path
+from typing import Self
+import boto3
+import botocore.exceptions
+from cloudpathlib import S3Path
+from mypy_boto3_s3.type_defs import BucketTypeDef
+from mypy_boto3_sts.type_defs import CredentialsTypeDef
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+alphabet = string.ascii_lowercase
+def _raise_missing_token() -> None:
+    """Log and raise the error for an unset AWS_WEB_IDENTITY_TOKEN_FILE variable.
+    Raises
+    ------
+    RuntimeError
+        Always.
+    """
+    msg = "AWS_WEB_IDENTITY_TOKEN_FILE environment variable not set."
+    # logger.error rather than logger.exception: no exception is being handled here, so
+    # logger.exception would append a meaningless "NoneType: None" traceback to the record.
+    logger.error(msg)
+    raise RuntimeError(msg)
+def _raise_missing_role_arn() -> None:
+    """Raise the error used when the AWS_ROLE_ARN environment variable is missing."""
+    raise MissingEnvVarRoleArnError()
+class MissingEnvVarRoleArnError(Exception):
+    """Exception for a missing role arn env var."""
+    def __init__(self) -> None:
+        """Build the error with a fixed message that is also exposed as ``self.message``."""
+        self.message = "Add AWS_ROLE_ARN to environment variables."
+        # Pass the message to Exception so that str(error), logs and tracebacks show it.
+        super().__init__(self.message)
+class Environment:
+    """AWS environment inference and configuration.
+    This class helps determine the environment (prod, preprod, test, dev),
+    manage AWS credentials (via web identity or default),
+    and construct environment-specific S3 bucket URLs.
+    The class is a singleton: the first construction performs the AWS lookups and later
+    constructions return the same, already initialised instance (see ``clear`` to reset).
+    """
+    _instance: Self | None = None
+    def __new__(cls, *_args: object, **_kwargs: object) -> Self:
+        """Singleton instantiation."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    def __init__(
+        self,
+        job_name: str | None = "",
+        bucket_prefix: str | None = "emds",
+        *,
+        use_web_identity: bool,
+    ) -> None:
+        """Initialize the environment context.
+        Arguments are ignored when the singleton instance has already been initialised.
+        Parameters
+        ----------
+        job_name : str, optional
+            Name used when assuming a role session, by default empty string.
+        bucket_prefix : str, optional
+            Base prefix used to identify datahub buckets, by default "emds".
+        use_web_identity : bool
+            Required keyword argument. Whether to build the session from web identity credentials.
+        """
+        if hasattr(self, "_initialized") and self._initialized:
+            return
+        self.job_name = job_name
+        self.bucket_prefix = bucket_prefix
+        self.use_web_identity = use_web_identity
+        self.session = self._init_session()
+        self.alias = self._fetch_account_alias()
+        self.account_no = self._get_account_number()
+        self.bucket_list = self._list_buckets()
+        self._initialized: bool = True
+    def _init_session(self) -> boto3.Session:
+        """Initialise a boto3 session, optionally using web identity credentials.
+        Returns
+        -------
+        boto3.Session
+            A session built from assumed-role credentials when ``use_web_identity`` is set,
+            otherwise a default session.
+        Raises
+        ------
+        Exception
+            Any error raised while assuming the web identity role is logged and re-raised.
+        """
+        if self.use_web_identity:
+            try:
+                token_path = os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
+                if token_path is None:
+                    _raise_missing_token()
+                else:
+                    path = Path(token_path)
+                with path.open() as f:
+                    web_identity_token = f.read()
+                role_arn = os.environ.get("AWS_ROLE_ARN")
+                if role_arn is None:
+                    _raise_missing_role_arn()
+                else:
+                    role_arn_arg = role_arn
+                sts_client = boto3.client("sts")
+                response = sts_client.assume_role_with_web_identity(
+                    RoleArn=role_arn_arg,
+                    RoleSessionName=f"session-{self.job_name}",
+                    WebIdentityToken=web_identity_token,
+                    DurationSeconds=900,
+                )
+                return boto3.session.Session(
+                    aws_access_key_id=response["Credentials"]["AccessKeyId"],
+                    aws_secret_access_key=response["Credentials"]["SecretAccessKey"],
+                    aws_session_token=response["Credentials"]["SessionToken"],
+                )
+            except Exception as e:
+                # The error is re-raised, so the log must not claim that a fallback session is used.
+                logger.warning("Web identity session setup failed: %s", e)
+                raise
+        return boto3.session.Session()
+    # NOTE(review): the lookups below use module-level boto3 clients rather than self.session, so
+    # credentials assumed in _init_session do not appear to be used by them - confirm this is intended.
+    def _fetch_account_alias(self) -> str:
+        """Fetch the AWS account alias.
+        Returns
+        -------
+        str
+            Account alias or default.
+        Notes
+        -----
+        Falls back to the 'electronic-monitoring-data-preproduction' alias if the account has no
+        alias or the IAM call fails with a ClientError.
+        """
+        try:
+            aliases = boto3.client("iam").list_account_aliases().get("AccountAliases", [])
+            return aliases[0] if aliases else "electronic-monitoring-data-preproduction"
+        except botocore.exceptions.ClientError:
+            logger.warning("Failed to fetch account alias, assuming preproduction.")
+            return "electronic-monitoring-data-preproduction"
+    def _get_account_number(self) -> str:
+        """Return the AWS account number.
+        Raises
+        ------
+        RuntimeError
+            If no AWS credentials are available.
+        """
+        try:
+            return boto3.client("sts").get_caller_identity()["Account"]
+        except botocore.exceptions.NoCredentialsError:
+            msg = "AWS credentials not found."
+            logger.exception(msg)
+            raise RuntimeError(msg) from None
+    def _list_buckets(self) -> list[BucketTypeDef]:
+        """List all available S3 buckets; any failure is logged and re-raised."""
+        try:
+            return boto3.client("s3").list_buckets()["Buckets"]
+        except Exception as e:
+            logger.warning("Could not list buckets: %s", e)
+            raise
+    @property
+    def account_number(self) -> str:
+        """Return the AWS account number."""
+        return self.account_no
+    @property
+    def environment_name(self) -> str:
+        """Infer environment name from account alias.
+        Returns
+        -------
+        str
+            One of: prod, dev, preprod, test, or fallback to raw alias suffix.
+        """
+        full_env_name = self.alias.split("-")[-1]
+        return {
+            "production": "prod",
+            "development": "dev",
+            "preproduction": "preprod",
+            "test": "test",
+        }.get(full_env_name, full_env_name)
+    @property
+    def is_prod(self) -> bool:
+        """Check if the environment is production."""
+        return self.environment_name == "prod"
+    def get_full_bucket_url(
+        self,
+        bucket_prefix: str | None = None,
+        *,
+        full_prefix: bool,
+    ) -> S3Path | None:
+        """Get S3Path to bucket matching environment and prefix.
+        The expected name is ``<self.bucket_prefix>-<environment_name>-<bucket_prefix>``.
+        Parameters
+        ----------
+        bucket_prefix : str, optional
+            Name fragment to search for; ``self.bucket_prefix`` is used when it is empty or None.
+        full_prefix : bool
+            Required keyword argument. When True the expected name must equal the bucket name, or
+            the bucket name with its last dash-separated segment removed. When False the expected
+            name only has to occur somewhere in the bucket name.
+        Returns
+        -------
+        Optional[S3Path]
+            S3Path to the first matching bucket or None if not found.
+        """
+        search_prefix = bucket_prefix or self.bucket_prefix
+        expected_name = f"{self.bucket_prefix}-{self.environment_name}-{search_prefix}"
+        for bucket in self.bucket_list:
+            bucket_name = bucket["Name"]
+            if full_prefix:
+                # Accept an exact match, or a match once the trailing dash-separated segment is dropped.
+                if expected_name in (bucket_name.rpartition("-")[0], bucket_name):
+                    return S3Path(f"s3://{bucket_name}")
+            elif expected_name in bucket_name:
+                return S3Path(f"s3://{bucket_name}")
+        return None
+    def get_api_invoke_url(self, api_name: str, region: str) -> str:
+        """Get the invoke URL of the REST API called ``api_name`` in ``region``.
+        Raises
+        ------
+        ValueError
+            If the number of REST APIs with that name is not exactly one.
+        """
+        client = boto3.client("apigateway", region)
+        # NOTE(review): a single get_rest_apis call is made; if results are paginated, APIs on
+        # later pages are not seen here - confirm against the number of APIs in the account.
+        rest_api_response = client.get_rest_apis()
+        matches = [it for it in rest_api_response["items"] if it["name"] == api_name]
+        if len(matches) != 1:
+            # Zero matches used to surface as a bare IndexError and several as an empty ValueError.
+            msg = f"Expected exactly one REST API named {api_name!r} in {region}, found {len(matches)}"
+            raise ValueError(msg)
+        api_details = matches[0]
+        return f"https://{api_details['id']}.execute-api.{region}.amazonaws.com/"
+    def refresh_credentials(self) -> CredentialsTypeDef:
+        """Refresh credentials via STS.
+        Web identity is tried first; if anything in that attempt fails the error is logged and
+        ``get_session_token`` is used instead.
+        Returns
+        -------
+        dict
+            New credentials.
+        """
+        try:
+            token_path = os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
+            if token_path is None:
+                _raise_missing_token()
+            else:
+                path = Path(token_path)
+            with path.open() as f:
+                web_identity_token = f.read()
+            role_arn = os.environ.get("AWS_ROLE_ARN")
+            if role_arn is None:
+                _raise_missing_role_arn()
+            else:
+                role_arn_arg = role_arn
+            sts_client = boto3.client("sts")
+            # Random suffix on the role session name of each refresh.
+            rand_suffix = "".join(secrets.choice(alphabet) for _ in range(10))
+            response_assume_role = sts_client.assume_role_with_web_identity(
+                RoleArn=role_arn_arg,
+                RoleSessionName=f"session-{self.job_name}-{rand_suffix}",
+                WebIdentityToken=web_identity_token,
+                DurationSeconds=900,
+            )
+            return response_assume_role["Credentials"]
+        except Exception:
+            logger.exception("Web identity failed. Falling back to get_session_token.")
+        sts_client = boto3.client("sts")
+        response_session_token = sts_client.get_session_token(DurationSeconds=900)
+        return response_session_token["Credentials"]
+    def export_dbt_variables(self, *, actions: bool = False, airflow: bool = False) -> None:
+        """Write the dbt variables for the environment to ``set_env.sh`` in the working directory.
+        Parameters
+        ----------
+        actions : bool, optional
+            When True, write ``echo "NAME=value" >> $GITHUB_ENV`` lines instead of shell ``export`` lines.
+        airflow : bool, optional
+            When True, leave out ``DBT_PROFILES_DIR`` (which is only ever written in shell mode).
+        """
+        # NOTE(review): the value is the formatted S3Path (or "None" when no "cadt" bucket is
+        # found), not a bare bucket name - confirm this is what consumers of the variable expect.
+        s3_data_bucket_name = self.get_full_bucket_url("cadt", full_prefix=True)
+        dbt_suffix = "" if self.is_prod else f"_{self.environment_name}_dbt"
+        variables = {
+            "DBT_SUFFIX": dbt_suffix,
+            "S3_DATA_BUCKET_NAME": f"{s3_data_bucket_name}",
+            "DBT_TEST_PROFILE_WORKGROUP": f"{self.account_number}-default",
+            "H3_LAMBDA_ARN": f"arn:aws:lambda:eu-west-2:{self.account_no}:function:h3-udf",
+        }
+        # Every statement ends with a newline: without one, the next statement is fused onto
+        # the same line and corrupts the preceding value.
+        if actions:
+            lines = [f'echo "{name}={value}" >> $GITHUB_ENV\n' for name, value in variables.items()]
+        else:
+            lines = [f"export {name}='{value}'\n" for name, value in variables.items()]
+            if not airflow:
+                lines.append('export DBT_PROFILES_DIR="../.dbt/"\n')
+        with Path("set_env.sh").open("w") as f:
+            f.writelines(lines)
+    @classmethod
+    def clear(cls) -> None:
+        """Reset the singleton instance.
+        Use this to force restart of the class. Mainly for testing.
+        """
+        cls._instance = None
+    @classmethod
+    def instance(cls) -> "Environment | None":
+        """Return the current singleton instance, if any. Mainly for testing."""
+        return cls._instance

data_factory_utils-0.3.0/src/data_factory_utils/query.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""Build safe trino query."""
+import re
+import textwrap
+from datetime import UTC, date, datetime
+from typing import ClassVar, Literal, Self
+IDENTIFIER = re.compile(r"^[a-z_][a-z0-9_\$]*$")
+DISALLOWED_CHARACTERS = re.compile(r"[;\n\r]")
+MAX_LIST_LENGTH = 1000
+Column = str
+OrderByClause = tuple[Column, Literal["ASC", "DESC"]]
+DatesConditions = tuple[Literal["<", ">", "=", "<=", ">="], datetime | str]
+def _ensure_safe_fragment(fragment: str) -> None:
+    """Reject a fragment that contains any character matched by DISALLOWED_CHARACTERS.
+    Raises
+    ------
+    ValueError
+        If a disallowed character is found.
+    """
+    if DISALLOWED_CHARACTERS.search(fragment) is None:
+        return
+    msg = "Disallowed characters in fragment"
+    raise ValueError(msg)
+def validate_identifier(name: str) -> str:
+    """Ensure a valid SQL/Athena identifier.
+    Returns
+    -------
+    str
+        The unchanged name, when it matches the IDENTIFIER pattern in full.
+    Raises
+    ------
+    ValueError
+        If the name does not match.
+    """
+    # fullmatch, not match: "$" also matches just before a trailing newline, so match() would
+    # accept "name\n" and let a newline through into the generated SQL.
+    if not IDENTIFIER.fullmatch(name):
+        msg = f"Invalid identifier: {name!r}"
+        raise ValueError(msg)
+    return name
+def validate_order_by_direction(direction: str) -> None:
+    """Check, case-insensitively, that the direction is ASC or DESC.
+    Raises
+    ------
+    ValueError
+        If the direction is anything else.
+    """
+    normalised = direction.upper()
+    if normalised in {"ASC", "DESC"}:
+        return
+    msg = f"Invalid direction in order by clause: {direction}"
+    raise ValueError(msg)
+def quote_literal(value: None | str | float | datetime | date) -> str:
+    """Safely quote literal values for SQL.
+    Returns
+    -------
+    str
+        ``TRUE``/``FALSE`` for bools, ``NULL`` for None, the plain number for ints and floats, a
+        single-quoted string with embedded quotes doubled for strings, and a
+        ``cast('...' as timestamp(6))`` expression for dates and datetimes.
+    Raises
+    ------
+    ValueError
+        If a string contains disallowed characters.
+    TypeError
+        If the value has an unsupported type.
+    """
+    # bool is checked before the numeric branch because bool is a subclass of int.
+    if isinstance(value, bool):
+        return "TRUE" if value else "FALSE"
+    if value is None:
+        return "NULL"
+    if isinstance(value, (int, float)):
+        return str(value)
+    if isinstance(value, str):
+        _ensure_safe_fragment(value)
+        escaped = value.replace("'", "''")
+        return f"'{escaped}'"
+    if isinstance(value, (date, datetime)):
+        return f"cast('{value.strftime('%Y-%m-%d %H:%M:%S')}' as timestamp(6))"
+    # Falling through used to return None, which callers then interpolated into the SQL text.
+    msg = f"Unsupported literal type: {type(value).__name__}"
+    raise TypeError(msg)
+def quote_list_literal(values: list[str | int | float] | tuple[str | int | float, ...]) -> str:
+    """Quote a list or tuple of values as a parenthesised, comma-separated SQL list.
+    Raises
+    ------
+    ValueError
+        If there are more than MAX_LIST_LENGTH items.
+    """
+    vals = list(values)
+    if len(vals) > MAX_LIST_LENGTH:
+        msg = "Too many items in list"
+        raise ValueError(msg)
+    # NOTE(review): an empty input yields "()" - confirm whether callers can pass one.
+    return "(" + ", ".join(quote_literal(v) for v in vals) + ")"
+class Query:
+    """A safe Athena SQL query builder.
+    Builder methods validate identifiers, quote values and return ``self`` so calls can be chained.
+    ``str(query)`` renders the clauses in the order given by ``keywords``.
+    """
+    keywords: ClassVar[list[str]] = [
+        "WITH",
+        "SELECT",
+        "FROM",
+        "WHERE",
+        "GROUP BY",
+        "HAVING",
+        "ORDER BY",
+        "LIMIT",
+    ]
+    def __init__(self) -> None:
+        """Create an empty query with no fragments for any keyword."""
+        self.parts: dict[str, list[str]] = {kw: [] for kw in self.keywords}
+    def SELECT(self, *columns: str, distinct: bool = False) -> Self:  # noqa: N802
+        """Add columns to the SELECT clause.
+        Each argument is ``"*"``, a column name, or a comma-separated list of column names.
+        Every name is validated and quoted on its own.
+        Parameters
+        ----------
+        *columns : str
+            Columns to select.
+        distinct : bool, optional
+            Prefix the SELECT list with DISTINCT.
+        """
+        col_list: list[str] = []
+        for c in columns:
+            if c == "*":
+                col_list.append("*")
+                continue
+            # Quote each comma-separated name separately; quoting the whole string would
+            # produce a single identifier that contains commas.
+            for part in c.split(","):
+                col_name = part.strip()
+                validate_identifier(col_name)
+                col_list.append(f'"{col_name}"')
+        selected = self.parts["SELECT"]
+        selected.extend(col_list)
+        # DISTINCT belongs at the head of the whole SELECT list; skip it when nothing is selected.
+        if distinct and selected and not selected[0].startswith("DISTINCT "):
+            selected[0] = "DISTINCT " + selected[0]
+        return self
+    def FROM(self, database: str, table: str) -> Self:  # noqa: N802
+        """Add a validated, quoted ``"database"."table"`` source to the FROM clause."""
+        validate_identifier(database)
+        validate_identifier(table)
+        self.parts["FROM"].append(f'"{database}"."{table}"')
+        return self
+    def WHERE(  # noqa: N802
+        self, *, unquote: bool = False, **conditions: str | list[str | int | float] | tuple[str | int | float, ...]
+    ) -> Self:
+        """Add ``col = value`` conditions, or ``col IN (...)`` for list and tuple values.
+        Parameters
+        ----------
+        unquote : bool, optional
+            Interpolate the values as given instead of quoting them.
+        **conditions
+            Column name to value, list of values or tuple of values.
+        """
+        for col, val in conditions.items():
+            validate_identifier(col)
+            # NOTE(review): with unquote=True the value is interpolated without escaping (and a
+            # list is rendered with Python's repr) - only safe for trusted input.
+            if isinstance(val, (list, tuple)):
+                expr = f'"{col}" IN {val}' if unquote else f'"{col}" IN {quote_list_literal(val)}'
+            else:
+                expr = f'"{col}" = {val}' if unquote else f'"{col}" = {quote_literal(val)}'
+            self.parts["WHERE"].append(expr)
+        return self
+    def WHERE_LIKE(  # noqa: N802
+        self,
+        field_wrapper: Literal["", "upper", "lower"] = "",
+        connector: Literal["", "OR", "AND"] = "",
+        **conditions: str | list[str],
+    ) -> Self:
+        """Add ``col LIKE '%value%'`` conditions.
+        Parameters
+        ----------
+        field_wrapper : {"", "upper", "lower"}, optional
+            Function applied to both the column and the pattern.
+        connector : {"", "OR", "AND"}, optional
+            Operator joining the options when a condition value is a list.
+        **conditions
+            Column name to a value, or to a list of alternative values.
+        Raises
+        ------
+        ValueError
+            If ``field_wrapper`` or ``connector`` is not one of the allowed values, if a list is
+            empty, or if a list has several options but no connector.
+        """
+        # Both strings are interpolated into the SQL text, so check them at runtime too.
+        if field_wrapper not in ("", "upper", "lower"):
+            msg = f"Invalid field wrapper: {field_wrapper!r}"
+            raise ValueError(msg)
+        if connector not in ("", "OR", "AND"):
+            msg = f"Invalid connector: {connector!r}"
+            raise ValueError(msg)
+        for col, val in conditions.items():
+            validate_identifier(col)
+            if isinstance(val, list):
+                if not val:
+                    msg = f"No LIKE options given for column: {col!r}"
+                    raise ValueError(msg)
+                if len(val) > 1 and not connector:
+                    # Joining several options with an empty connector is not valid SQL.
+                    msg = "A connector (OR/AND) is required when several LIKE options are given"
+                    raise ValueError(msg)
+                range_exprs = [
+                    f'{field_wrapper}("{col}") LIKE {field_wrapper}({quote_literal("%" + str(option) + "%")})'
+                    for option in val
+                ]
+                expr = "(" + f" {connector} ".join(range_exprs) + ")"
+            else:
+                expr = f'{field_wrapper}("{col}") LIKE {field_wrapper}({quote_literal("%" + str(val) + "%")})'
+            self.parts["WHERE"].append(expr)
+        return self
+    def DATES(self, **conditions: DatesConditions) -> Self:  # noqa: N802
+        """Add specific date filtering.
+        Accepts conditions where the key is the column name and value is a tuple: (operator, date).
+        The date is a datetime or a string in ``%Y-%m-%d %H:%M:%S`` format. All conditions from one
+        call are joined with AND inside one parenthesised group.
+        Intended for constructing a bounding range of dates rather than dates between two.
+        For dates between two values, see DATE_RANGES.
+        Example:
+        -------
+          - Query().DATES(field_1=("<", "2023-01-01 00:00:00"))
+        Raises
+        ------
+        ValueError
+            If an operator is not one of ``<``, ``>``, ``=``, ``<=``, ``>=``.
+        TypeError
+            If a date is neither a string nor a datetime.
+        """  # noqa: D205
+        valid_operators = {"<", ">", "=", "<=", ">="}
+        expressions = []
+        for col, (operator, date_lit) in conditions.items():
+            # Validation
+            if operator not in valid_operators:
+                err_txt = f"Invalid operator: {operator}"
+                raise ValueError(err_txt)
+            validate_identifier(col)
+            if isinstance(date_lit, str):
+                # Keep the wall-clock time as written: astimezone() on the naive value would shift
+                # it by the local UTC offset of whichever machine builds the query.
+                moment = datetime.strptime(date_lit, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
+            elif isinstance(date_lit, datetime):
+                moment = date_lit
+            else:
+                # Previously this fell through and reused the prior expression or hit an unbound name.
+                err_txt = f"Unsupported date value for {col!r}: {type(date_lit).__name__}"
+                raise TypeError(err_txt)
+            expressions.append(f'"{col}" {operator} {quote_literal(moment)}')
+        # An empty fragment would render as a dangling "WHERE" or a stray "AND".
+        if expressions:
+            self.parts["WHERE"].append("(" + " AND ".join(expressions) + ")")
+        return self
+    def DATE_RANGES(self, cols: list[str], start: datetime, end: datetime) -> Self:  # noqa: N802
+        """Add a grouped OR condition for date ranges: (col1 BETWEEN ... OR col2 BETWEEN ...)."""
+        for c in cols:
+            validate_identifier(c)
+        start_lit = quote_literal(start)
+        end_lit = quote_literal(end)
+        range_exprs = [f'"{c}" BETWEEN {start_lit} AND {end_lit}' for c in cols]
+        group_expr = "(" + " OR ".join(range_exprs) + ")"
+        self.parts["WHERE"].append(group_expr)
+        return self
+    def ORDER_BY(self, *order_by_clauses: str | OrderByClause) -> Self:  # noqa: N802
+        """Add ORDER BY entries.
+        Each clause is a column name or a ``(column, direction)`` tuple. A column may be
+        dot-qualified; every dot-separated part is validated and quoted on its own.
+        """
+        for clause in order_by_clauses:
+            if isinstance(clause, tuple):
+                col, direction = clause
+            else:
+                # Allow ordering by columns without direction
+                col, direction = clause, None
+            name_parts = col.split(".")
+            for part in name_parts:
+                validate_identifier(part)
+            # "a"."b" is a qualified reference; "a.b" would be one identifier containing a dot.
+            quoted = ".".join(f'"{part}"' for part in name_parts)
+            if direction is None:
+                self.parts["ORDER BY"].append(quoted)
+            else:
+                validate_order_by_direction(direction)
+                self.parts["ORDER BY"].append(f"{quoted} {direction}")
+        return self
+    def LIMIT(self, n: int) -> Self:  # noqa: N802
+        """Limit query.
+        Raises
+        ------
+        ValueError
+            If ``n`` is not a non-negative integer.
+        """
+        # bool is a subclass of int, so reject it explicitly: True would render as "LIMIT True".
+        if isinstance(n, bool) or not isinstance(n, int) or n < 0:
+            msg = "LIMIT must be a non-negative integer"
+            raise ValueError(msg)
+        self.parts["LIMIT"].append(str(n))
+        return self
+    def __str__(self) -> str:
+        """Render the query, one line per keyword that has fragments.
+        SELECT, FROM, GROUP BY and ORDER BY fragments are joined with commas, WHERE and HAVING
+        fragments with AND, other keywords with spaces. Only the first LIMIT is used.
+        """
+        sql_lines = []
+        for kw in self.keywords:
+            vals = self.parts[kw]
+            if not vals:
+                continue
+            match kw:
+                case "SELECT" | "FROM" | "ORDER BY" | "GROUP BY":
+                    sql_lines.append(f"{kw} " + ", ".join(vals))
+                case "WHERE" | "HAVING":
+                    sql_lines.append(f"{kw} " + " AND ".join(vals))
+                case "LIMIT":
+                    sql_lines.append(f"{kw} {vals[0]}")
+                case _:
+                    sql_lines.append(f"{kw} {' '.join(vals)}")
+        return "\n".join(sql_lines)
+    def pretty(self) -> str:
+        """Return the rendered query with every line indented by four spaces."""
+        return textwrap.indent(str(self), "    ")