rdxz2-utill 0.0.2__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
utill/my_bq.py CHANGED
@@ -1,358 +1,726 @@
1
- import humanize
1
+ import datetime
2
2
  import math
3
3
  import os
4
4
  import shutil
5
-
5
+ import textwrap
6
+ import time
6
7
  from enum import Enum
7
- from google.cloud import bigquery, storage
8
+ from enum import StrEnum
9
+ from enum import auto
10
+
11
+ from google.cloud import bigquery
12
+ from google.cloud.exceptions import NotFound
13
+ from humanize import naturalsize
14
+ from humanize import precisedelta
8
15
  from loguru import logger
9
- from textwrap import dedent
10
-
11
- from .my_const import ByteSize
12
- from .my_csv import read_header, combine as csv_combine, compress
13
- from .my_datetime import current_datetime_str
14
- from .my_env import envs
15
- from .my_file import make_sure_path_is_directory
16
- from .my_gcs import GCS
17
- from .my_queue import ThreadingQ
18
- from .my_string import replace_nonnumeric
19
- from .my_xlsx import csv_to_xlsx
20
-
21
- MAP__PYTHON_DTYPE__BQ_DTYPE = {
22
- int: 'INTEGER',
23
- str: 'STRING',
24
- float: 'STRING',
16
+
17
+ from . import my_csv
18
+ from . import my_datetime
19
+ from . import my_env
20
+ from . import my_gcs
21
+ from . import my_string
22
+ from . import my_xlsx
23
+
24
+
25
+ PY_DATA_TYPE__BQ_DATA_TYPE = {
26
+ int: "INTEGER",
27
+ str: "STRING",
28
+ float: "STRING",
25
29
  }
26
30
 
27
31
 
28
- class LoadStrategy(Enum):
29
- OVERWRITE = 1
30
- APPEND = 2
32
+ class DataFileFormat(StrEnum):
33
+ CSV = "CSV"
34
+ JSON = "JSON"
35
+ AVRO = "AVRO"
36
+ PARQUET = "PARQUET"
37
+ ORC = "ORC"
31
38
 
32
39
 
33
- class Dtype:
34
- INT64 = 'INT64'
35
- INTEGER = 'INTEGER'
36
- FLOAT64 = 'FLOAT64'
40
+ class DataFileCompression(StrEnum):
41
+ GZIP = "GZIP"
42
+ SNAPPY = "SNAPPY"
43
+
37
44
 
38
- DECIMAL = 'DECIMAL'
45
+ class LoadStrategy(Enum):
46
+ OVERWRITE = auto()
47
+ APPEND = auto()
39
48
 
40
- STRING = 'STRING'
41
- JSON = 'JSON'
42
49
 
43
- DATE = 'DATE'
44
- TIME = 'TIME'
45
- DATETIME = 'DATETIME'
46
- TIMESTAMP = 'TIMESTAMP'
50
+ class Dtype:
51
+ INT64 = "INT64"
52
+ INTEGER = "INTEGER"
53
+ FLOAT64 = "FLOAT64"
47
54
 
48
- BOOL = 'BOOL'
55
+ DECIMAL = "DECIMAL"
49
56
 
50
- ARRAY_INT64 = 'ARRAY<INT64>'
51
- ARRAY_INTEGER = 'ARRAY<INTEGER>'
52
- ARRAY_FLOAT64 = 'ARRAY<FLOAT64>'
53
- ARRAY_STRING = 'ARRAY<STRING>'
54
- ARRAY_JSON = 'ARRAY<JSON>'
55
- ARRAY_DATE = 'ARRAY<DATE>'
56
- ARRAY_DATETIME = 'ARRAY<DATETIME>'
57
- ARRAY_TIMESTAMP = 'ARRAY<TIMESTAMP>'
58
- ARRAY_BOOL = 'ARRAY<BOOL>'
57
+ STRING = "STRING"
58
+ JSON = "JSON"
59
59
 
60
+ DATE = "DATE"
61
+ TIME = "TIME"
62
+ DATETIME = "DATETIME"
63
+ TIMESTAMP = "TIMESTAMP"
60
64
 
61
- class BQ():
62
- def __init__(self, project: str = None):
63
- self.project = project or envs.GCP_PROJECT_ID
65
+ BOOL = "BOOL"
64
66
 
65
- self.client = bigquery.Client(project=self.project)
66
- logger.debug(f'BQ client open, project: {self.project or "<application-default>"}')
67
+ ARRAY_INT64 = "ARRAY<INT64>"
68
+ ARRAY_INTEGER = "ARRAY<INTEGER>"
69
+ ARRAY_FLOAT64 = "ARRAY<FLOAT64>"
70
+ ARRAY_STRING = "ARRAY<STRING>"
71
+ ARRAY_JSON = "ARRAY<JSON>"
72
+ ARRAY_DATE = "ARRAY<DATE>"
73
+ ARRAY_DATETIME = "ARRAY<DATETIME>"
74
+ ARRAY_TIMESTAMP = "ARRAY<TIMESTAMP>"
75
+ ARRAY_BOOL = "ARRAY<BOOL>"
67
76
 
68
- def __enter__(self):
69
- return self
70
77
 
71
- def __exit__(self, exc_type, exc_value, exc_tb):
72
- self.close_client()
78
+ class BQ:
79
+ def __init__(self, location: str | None = None, project_id: str = None):
80
+ if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
81
+ logger.warning("Using ADC for BigQuery authentication")
73
82
 
74
- def execute_query(self, query: str | list[str], dry_run: bool = False, parameters: dict = {}) -> bigquery.QueryJob:
75
- multi = type(query) == list
76
- if multi:
77
- query = '\n'.join([x if str(x).strip().endswith(';') else x + ';' for x in query if x])
78
- else:
79
- query = query.strip()
83
+ # if location is None and my_env.envs.GCP_REGION is None:
84
+ # raise ValueError('GCP region must be set in environment variables.')
80
85
 
81
- # Build paramters
86
+ self.client = bigquery.Client(
87
+ project=project_id or my_env.envs.GCP_PROJECT_ID,
88
+ location=location or my_env.envs.GCP_REGION,
89
+ )
90
+ logger.debug(f"BQ client open, project: {self.client.project}")
91
+
92
+ # MARK: Query execution
93
+
94
+ def execute_query(
95
+ self,
96
+ query: str | list[str],
97
+ parameters: dict = {},
98
+ dry_run: bool = False,
99
+ temporary_table: bool = False,
100
+ ) -> bigquery.QueryJob:
101
+ # Reconstruct query, handle multiple queries in a single job
102
+ is_multi = isinstance(query, list)
103
+ queries = query if is_multi else [query]
104
+ queries = [textwrap.dedent(q).strip() for q in queries]
105
+ queries = [
106
+ q if q.endswith(";") else q + ";" for q in queries
107
+ ] # Append ';' character for each query
108
+ query = "\n".join(queries)
109
+
110
+ # Evaluate parameter
82
111
  query_parameters = []
83
112
  for parameter, value in parameters.items():
84
- if type(value) == list:
85
- query_parameters.append(bigquery.ArrayQueryParameter(parameter, MAP__PYTHON_DTYPE__BQ_DTYPE[type(value[0])], value))
113
+ is_array = isinstance(value, list)
114
+ value_type_py = type(value[0]) if is_array else type(value)
115
+ if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
116
+ raise ValueError(
117
+ f"Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}"
118
+ )
119
+
120
+ value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
121
+
122
+ # Handle data type conversions
123
+ if value_type_py == datetime.date:
124
+ value = (
125
+ [v.strftime("%Y-%m-%d") for v in value]
126
+ if is_array
127
+ else value.strftime("%Y-%m-%d")
128
+ )
129
+
130
+ if is_array:
131
+ query_parameters.append(
132
+ bigquery.ArrayQueryParameter(parameter, value_type_bq, value)
133
+ )
86
134
  else:
87
- query_parameters.append(bigquery.ScalarQueryParameter(parameter, MAP__PYTHON_DTYPE__BQ_DTYPE[type(value)], value))
135
+ query_parameters.append(
136
+ bigquery.ScalarQueryParameter(parameter, value_type_bq, value)
137
+ )
88
138
 
89
- logger.debug(f'🔎 Query:\n{query}')
90
- query_job_config = bigquery.QueryJobConfig(dry_run=dry_run, query_parameters=query_parameters)
139
+ logger.debug(f"🔎 Query:\n{query}")
140
+ query_job_config = bigquery.QueryJobConfig(
141
+ dry_run=dry_run, query_parameters=query_parameters
142
+ )
143
+ if temporary_table:
144
+ query_job_config.destination = None
145
+ t = time.time()
91
146
  query_job = self.client.query(query, job_config=query_job_config)
92
- query_job.result() # Wait query execution
147
+ (
148
+ logger.info(
149
+ f"Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults"
150
+ )
151
+ if not dry_run
152
+ else None
153
+ )
154
+ query_job.result() # Wait for the job to complete
155
+ elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
93
156
 
94
- if not multi:
95
- logger.debug(f'[Job ID] {query_job.job_id}, [Processed] {humanize.naturalsize(query_job.total_bytes_processed)}, [Billed] {humanize.naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s)',)
157
+ if not is_multi:
158
+ logger.info(
159
+ f"[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}",
160
+ )
96
161
  else:
97
- logger.debug(f'[Job ID] {query_job.job_id}')
162
+ logger.info(f"[Job ID] {query_job.job_id} [Elapsed] {elapsed}")
98
163
 
99
- jobs: list[bigquery.QueryJob] = self.client.list_jobs(parent_job=query_job.job_id)
100
- [logger.debug(f'[Script ID] {job.job_id}, [Processed] {humanize.naturalsize(job.total_bytes_processed)}, [Billed] {humanize.naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)',) for job in jobs]
164
+ jobs: list[bigquery.QueryJob] = list(
165
+ self.client.list_jobs(parent_job=query_job.job_id)
166
+ )
167
+ [
168
+ logger.info(
169
+ f"[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)",
170
+ )
171
+ for job in jobs
172
+ ]
101
173
 
102
174
  return query_job
103
175
 
104
- def create_table(self, bq_table_fqn: str, schema: list[bigquery.SchemaField], partition_col: str, cluster_cols: list[str]):
105
- table = bigquery.Table(bq_table_fqn, schema=schema)
106
-
107
- if partition_col:
108
- table.time_partitioning = bigquery.TimePartitioning(field=partition_col)
109
- table.partitioning_type = 'DAY'
110
-
111
- if cluster_cols:
112
- table.clustering_fields = cluster_cols
113
-
114
- bq_table = self.client.create_table(table)
115
- logger.info(f'✅ Table created: {bq_table_fqn}')
116
- return bq_table
117
-
118
- def drop_table(self, bq_table_fqn: str):
119
- self.client.delete_table(bq_table_fqn)
120
- logger.info(f'✅ Table dropped: {bq_table_fqn}')
121
-
122
- def load_data_into(self, bq_table_fqn: str, gcs_path: list[str] | str, cols: dict[str, Dtype], partition_col: str = None, cluster_cols: list[str] = None, overwrite: bool = False):
123
- if type(gcs_path) == str:
124
- gcs_path = [gcs_path]
125
- gcs_path_str = ',\n'.join([f' \'{x}\'' for x in gcs_path])
126
-
127
- load_data_keyword = 'OVERWRITE' if overwrite else 'INTO'
128
- cols_str = ',\n'.join([f' `{x}` {y}' for x, y in cols.items()])
129
- cluster_cols_str = ','.join([f'`{x}`' for x in cluster_cols]) if cluster_cols else None
130
- query = dedent(
131
- f'''
132
- LOAD DATA {load_data_keyword} `{bq_table_fqn}` (
133
- {cols_str}
176
+ # MARK: Table operations
177
+
178
+ def create_table(
179
+ self,
180
+ dst_table_fqn: str,
181
+ query: str,
182
+ query_parameters: dict = {},
183
+ *,
184
+ description: str | None = None,
185
+ schema: list[dict] | None = None,
186
+ partition_by: str | None = None,
187
+ clustering_fields: list[str] | None = None,
188
+ expiration_timestamp_utc: datetime.datetime | None = None,
189
+ require_partition_filter: bool = False,
190
+ replace: bool = False,
191
+ ):
192
+ self.raise_for_invalid_table_fqn(dst_table_fqn)
193
+
194
+ # Construct table options
195
+ logger.debug("Constructing table options ...")
196
+ table_options = []
197
+ if expiration_timestamp_utc:
198
+ table_options.append(
199
+ f" expiration_timestamp='{expiration_timestamp_utc.isoformat()}'"
134
200
  )
135
- {f"PARTITION BY `{partition_col}`" if partition_col is not None else "-- No partition column provided"}
136
- {f"CLUSTER BY {cluster_cols_str}" if cluster_cols_str is not None else "-- No cluster column provided"}
137
- FROM FILES(
138
- skip_leading_rows=1,
139
- allow_quoted_newlines=true,
140
- format='csv',
141
- compression='gzip',
142
- uris = [
143
- {gcs_path_str}
144
- ]
145
- );
146
- '''
201
+ if partition_by and require_partition_filter:
202
+ table_options.append(" require_partition_filter=TRUE")
203
+ if description:
204
+ table_options.append(f" description='{description}'")
205
+
206
+ # Check if table exists
207
+ logger.debug("Checking if destination table exists ...")
208
+ dst_table_project_id, dst_table_dataset_id, dst_table_id = (
209
+ self.get_table_fqn_parts(dst_table_fqn)
147
210
  )
148
-
149
- logger.debug(f'⌛ Load data into: {bq_table_fqn}')
150
- query_job = self.execute_query(query)
151
- logger.info(f'✅ Load data into: {bq_table_fqn}')
152
- return query_job
153
-
154
- def export_data(self, query: str, gcs_path: str, pre_query: str = None):
155
- if '*' not in gcs_path:
156
- raise ValueError('GCS path need to have a single \'*\' wildcard character')
157
-
158
- query = dedent(
159
- f'''
160
- EXPORT DATA OPTIONS (
161
- uri='{gcs_path}',
162
- format='csv',
163
- compression='gzip',
164
- overwrite=true,
165
- header=true,
166
- field_delimiter=',')
167
- AS (
168
- {query}
169
- );
170
- '''
211
+ table_exist = self.is_table_exists(
212
+ project_id=dst_table_project_id,
213
+ dataset_id=dst_table_dataset_id,
214
+ table_id=dst_table_id,
171
215
  )
172
216
 
173
- if pre_query:
174
- query = [pre_query, query]
175
-
176
- logger.debug(f'⌛ Export data into: {gcs_path}')
177
- query_job = self.execute_query(query)
178
- logger.info(f'✅ Exported data into: {gcs_path}')
179
- return query_job
180
-
181
- def upload_csv(self, src_filename: str, bq_table_fqn: str, cols: dict[str, Dtype], partition_col: str = None, cluster_cols: list[str] = None, load_strategy: LoadStrategy = LoadStrategy.APPEND):
182
- # <<----- START: Validation
183
-
184
- if load_strategy not in LoadStrategy:
185
- raise ValueError('Invalid load strategy')
217
+ # Construct beautiful query string
218
+ if table_exist and not replace:
219
+ logger.debug("Table exists, constructing INSERT query ...")
220
+ query_parts = [f"INSERT INTO `{dst_table_fqn}`"]
221
+ if schema:
222
+ schema_str = ",\n".join([column["name"] for column in schema])
223
+ query_parts.append(f"(\n{schema_str}\n)")
224
+ if table_options:
225
+ table_options_str = ",\n".join(table_options)
226
+ query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
227
+ else:
228
+ logger.debug("Table not exist, constructing CREATE TABLE query ...")
229
+ query_parts = [
230
+ f"CREATE OR REPLACE TABLE `{dst_table_fqn}`",
231
+ ]
232
+ if schema:
233
+ schema_str = ",\n".join(
234
+ [f' {column["name"]} {column["data_type"]}' for column in schema]
235
+ )
236
+ query_parts.append(f"(\n{schema_str}\n)")
237
+ if partition_by:
238
+ query_parts.append(f"PARTITION BY {partition_by}")
239
+ if clustering_fields:
240
+ clustering_fields_str = ", ".join(
241
+ [f"`{field}`" for field in clustering_fields]
242
+ )
243
+ query_parts.append(f"CLUSTER BY {clustering_fields_str}")
244
+ if table_options:
245
+ table_options_str = ",\n".join(table_options)
246
+ query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
247
+ query_parts.append("AS")
248
+ query_parts.append(textwrap.dedent(query).strip())
249
+
250
+ # Execute
251
+ logger.debug("Executing query ...")
252
+ query = "\n".join(query_parts)
253
+ self.execute_query(query, parameters=query_parameters)
186
254
 
187
- if not src_filename.endswith('.csv'):
188
- raise ValueError('Please provide file path with .csv extension!')
255
+ def drop_table(self, bq_table_fqn: str):
256
+ logger.info(f"Dropping table: {bq_table_fqn} ...")
257
+ self.raise_for_invalid_table_fqn(bq_table_fqn)
258
+ self.client.delete_table(bq_table_fqn, not_found_ok=True)
259
+
260
+ # MARK: Table data
261
+
262
+ def load_data(
263
+ self,
264
+ src_gcs_uri: str,
265
+ dst_table_fqn: str,
266
+ *,
267
+ schema: list[dict] | None = None,
268
+ partition_by: str | None = None,
269
+ clustering_fields: list[str] | None = None,
270
+ field_delimiter: str = ",",
271
+ load_strategy: LoadStrategy = LoadStrategy.APPEND,
272
+ format: DataFileFormat = DataFileFormat.CSV,
273
+ compression=None,
274
+ ):
275
+
276
+ self.raise_for_invalid_table_fqn(dst_table_fqn)
277
+
278
+ logger.debug(f"Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...")
279
+
280
+ # Construct LOAD options
281
+ logger.debug("Constructing LOAD options ...")
282
+ load_options = [ # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
283
+ f" format='{format}'",
284
+ f" uris=['{src_gcs_uri}']",
285
+ ]
286
+ if format == DataFileFormat.CSV:
287
+ load_options.append(" skip_leading_rows=1")
288
+ load_options.append(f" field_delimiter='{field_delimiter}'")
289
+ load_options.append(" allow_quoted_newlines=true")
290
+ if compression:
291
+ load_options.append(f" compression='{compression}'")
292
+ load_options_str = ",\n".join(load_options)
293
+
294
+ # Construct beautiful query string
295
+ logger.debug("Constructing LOAD query ...")
296
+ schema_str = ",\n".join(
297
+ [f' {column["name"]} {column["data_type"]}' for column in schema]
298
+ )
299
+ query_parts = [
300
+ f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)'
301
+ ]
302
+ if partition_by:
303
+ query_parts.append(f"PARTITION BY {partition_by}")
304
+ if clustering_fields:
305
+ clustering_fields_str = ", ".join(
306
+ [f"`{field}`" for field in clustering_fields]
307
+ )
308
+ query_parts.append(f"CLUSTER BY {clustering_fields_str}")
309
+ query_parts.append(f"FROM FILES (\n{load_options_str}\n)")
310
+ query = "\n".join(query_parts)
311
+
312
+ # Execute
313
+ logger.debug("Executing query ...")
314
+ self.execute_query(query)
315
+
316
+ def export_data(
317
+ self,
318
+ query: str,
319
+ dst_gcs_uri: str,
320
+ *,
321
+ parameters: dict = {},
322
+ format: DataFileFormat = DataFileFormat.CSV,
323
+ compression: DataFileCompression | None = None,
324
+ header: bool = True,
325
+ delimiter: str = ",",
326
+ ):
327
+ logger.debug(f"Exporting query into {dst_gcs_uri} ...")
328
+
329
+ # GCS uri validation
330
+ if (
331
+ format == DataFileFormat.CSV
332
+ and compression == DataFileCompression.GZIP
333
+ and not dst_gcs_uri.endswith(".gz")
334
+ ):
335
+ raise ValueError(
336
+ "GCS path need to ends with .gz if using compression = GCSCompression.GZIP"
337
+ )
338
+ elif (
339
+ format == DataFileFormat.CSV
340
+ and compression != DataFileCompression.GZIP
341
+ and not dst_gcs_uri.endswith(".csv")
342
+ ):
343
+ raise ValueError(
344
+ "GCS path need to ends with .csv if using format = GCSExportFormat.CSV"
345
+ )
346
+ elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith(".parquet"):
347
+ raise ValueError(
348
+ "GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET"
349
+ )
189
350
 
190
- if partition_col is not None:
191
- if partition_col not in cols.keys():
192
- raise ValueError(f'Partition \'{partition_col}\' not exists in columns!')
193
- if cluster_cols is not None:
194
- if cluster_cols not in cols.keys():
195
- raise ValueError(f'Cluster \'{cluster_cols}\' not exists in columns!')
351
+ # Construct options
352
+ logger.debug("Constructing EXPORT options ...")
353
+ options = [
354
+ f" uri='{dst_gcs_uri}'",
355
+ f" format='{format}'",
356
+ " overwrite=TRUE",
357
+ ]
358
+ if format == DataFileFormat.CSV:
359
+ options.append(
360
+ f" field_delimiter='{delimiter}'",
361
+ )
362
+ if header:
363
+ options.append(
364
+ f' header={"true" if header else "false"}',
365
+ )
366
+ if compression:
367
+ options.append(f" compression='{compression}'")
368
+ options_str = ",\n".join(options)
369
+
370
+ # Construct beautiful query string
371
+ logger.debug("Constructing EXPORT query ...")
372
+ query = (
373
+ f"EXPORT DATA OPTIONS (\n"
374
+ f"{options_str}\n"
375
+ f")\n"
376
+ f"AS (\n"
377
+ f"{textwrap.dedent(query).strip()}\n"
378
+ f");"
379
+ )
196
380
 
197
- # Build list of columns with its datatypes
198
- csv_cols = set(read_header(src_filename))
199
- excessive_cols = set(cols.keys()) - set(csv_cols)
200
- if excessive_cols:
201
- raise ValueError(f'{len(excessive_cols)} columns not exists in CSV file: {", ".join(excessive_cols)}')
202
- nonexistent_cols = set(csv_cols) - set(cols.keys())
203
- if nonexistent_cols:
204
- raise ValueError(f'{len(nonexistent_cols)} columns from CSV are missing: {", ".join(nonexistent_cols)}')
381
+ # Execute
382
+ logger.debug("Executing query ...")
383
+ self.execute_query(query=query, parameters=parameters)
384
+
385
+ def upload_csv(
386
+ self,
387
+ src_filepath: str,
388
+ dst_table_fqn: str,
389
+ schema: list[dict] | None = None,
390
+ gcs_bucket: str | None = None,
391
+ partition_by: str = None,
392
+ clustering_fields: list[str] = None,
393
+ compression: DataFileCompression | None = None,
394
+ load_strategy: LoadStrategy = LoadStrategy.APPEND,
395
+ ):
396
+ self.raise_for_invalid_table_fqn(dst_table_fqn)
397
+
398
+ if compression == DataFileCompression.GZIP and not src_filepath.endswith(".gz"):
399
+ raise ValueError(
400
+ "Please provide file path with .gz extension if using compression = GZIP"
401
+ )
402
+ elif not src_filepath.endswith(".csv"):
403
+ raise ValueError("Please provide file path with .csv extension")
205
404
 
206
- # END: Validation ----->>
405
+ src_filename, src_fileextension = os.path.splitext(src_filepath)
406
+ src_filename = os.path.basename(src_filename) # Only get filename
207
407
 
208
- # <<----- START: Upload to GCS
408
+ # # <<----- START: Upload to GCS
209
409
 
210
- gcs = GCS(self.project)
211
- tmp_dir = f'tmp/upload__{current_datetime_str()}'
410
+ # gcs = GCS(self.project_id)
411
+ # tmp_dir = f'tmp/upload__{current_datetime_str()}'
212
412
 
213
- # This will compress while splitting the compressed file to a certain bytes size because of GCS 4GB file limitation
214
- # A single file can produce more than one compressed file in GCS
215
- def producer(src_file: str):
216
- for dst_file in compress(src_file, keep=True, max_size_bytes=ByteSize.GB * 3):
217
- yield (dst_file, )
413
+ # # This will compress while splitting the compressed file to a certain bytes size because of GCS 4GB file limitation
414
+ # # A single file can produce more than one compressed file in GCS
415
+ # def producer(src_file: str):
416
+ # for dst_file in compress(src_file,
417
+ # keep=True, max_size_bytes=ByteSize.GB * 3):
418
+ # yield (dst_file, )
218
419
 
219
- def consumer(dst_file: str):
220
- remote_file_name = f'{tmp_dir}/{replace_nonnumeric(os.path.basename(dst_file), "_").lower()}.csv.gz'
221
- logger.debug(f'Uploading {dst_file} to {remote_file_name}...')
222
- blob = gcs.upload(dst_file, remote_file_name, mv=True)
223
- return blob
420
+ # def consumer(dst_file: str):
421
+ # remote_file_name = f'{tmp_dir}/{replace_nonnumeric(os.path.basename(dst_file), "_").lower()}.csv.gz'
422
+ # logger.debug(f'Uploading {dst_file} to {remote_file_name}...')
423
+ # blob = gcs.upload(dst_file, remote_file_name, move=True)
424
+ # return blob
224
425
 
225
- blobs: list[storage.Blob]
226
- _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
426
+ # blobs: list[storage.Blob]
427
+ # _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
227
428
 
228
- # END: Upload to GCS ----->>
429
+ # # END: Upload to GCS ----->>
229
430
 
230
- # <<----- START: Load to BQ
431
+ # Upload to GCS
432
+ # TODO: Re-implement the producer-consumer model to upload multiple files
433
+ gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
434
+ dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(src_filename, "_").lower()}{src_fileextension}'
435
+ gcs.upload(src_filepath, dst_blobpath)
231
436
 
437
+ # Load to BQ
232
438
  try:
233
- gcs_filename_fqns = [f'gs://{blob.bucket.name}/{blob.name}' for blob in blobs]
234
- match load_strategy:
235
- case LoadStrategy.OVERWRITE:
236
- self.load_data_into(bq_table_fqn, gcs_filename_fqns, cols, partition_col=partition_col, cluster_cols=cluster_cols, overwrite=True)
237
- case LoadStrategy.APPEND:
238
- self.load_data_into(bq_table_fqn, gcs_filename_fqns, cols, partition_col=partition_col, cluster_cols=cluster_cols)
239
- case _:
240
- return ValueError(f'Load strategy not recognized: {load_strategy}')
241
- except Exception as e:
242
- raise e
439
+ self.load_data(
440
+ f"gs://{gcs.bucket.name}/{dst_blobpath}",
441
+ dst_table_fqn,
442
+ schema=schema,
443
+ partition_by=partition_by,
444
+ clustering_fields=clustering_fields,
445
+ format=DataFileFormat.CSV,
446
+ compression=compression,
447
+ load_strategy=load_strategy,
448
+ )
449
+ except:
450
+ raise
243
451
  finally:
244
- [GCS.remove_blob(blob) for blob in blobs]
245
-
246
- # END: Load to BQ ----->>
247
-
248
- def download_csv(self, query: str, dst_filename: str, combine: bool = True, pre_query: str = None):
249
- if not dst_filename.endswith('.csv'):
250
- raise ValueError('Destination filename must ends with .csv!')
251
-
252
- dirname = os.path.dirname(dst_filename)
253
- make_sure_path_is_directory(dirname)
254
-
255
- # Remove & recreate existing folder
256
- if os.path.exists(dirname):
257
- shutil.rmtree(dirname)
258
- os.makedirs(dirname, exist_ok=True)
259
-
260
- # Export data into GCS
261
- current_time = current_datetime_str()
262
- gcs_path = f'gs://{envs.GCS_BUCKET}/tmp/unload__{current_time}/*.csv.gz'
263
- self.export_data(query, gcs_path, pre_query)
264
-
265
- # Download into local machine
266
- gcs = GCS(self.project)
267
- logger.info('Downloads from GCS...')
268
- downloaded_filenames = []
269
- for blob in gcs.list(f'tmp/unload__{current_time}/'):
270
- file_path_part = os.path.join(dirname, blob.name.split('/')[-1])
271
- gcs.download(blob, file_path_part)
272
- downloaded_filenames.append(file_path_part)
273
-
274
- # Combine the file and clean up the file chunks
275
- if combine:
276
- logger.info('Combine downloaded csv...')
277
- csv_combine(downloaded_filenames, dst_filename)
278
- shutil.rmtree(dirname)
279
-
280
- return dst_filename
281
-
282
- def download_xlsx(self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000):
283
- if not dst_filename.endswith('.xlsx'):
284
- raise ValueError('Destination filename must ends with .xlsx!')
452
+ gcs.delete_blob(dst_blobpath)
453
+
454
+ def download_csv(
455
+ self,
456
+ query: str,
457
+ dst_filepath: str,
458
+ *,
459
+ gcs_bucket: str | None = None,
460
+ query_parameters: dict = {},
461
+ csv_row_limit: int | None = None,
462
+ ) -> str | list[str]:
463
+ if not dst_filepath.endswith(".csv"):
464
+ raise ValueError("Destination filename must ends with .csv")
465
+
466
+ # Init
467
+ gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
468
+
469
+ # Generic function to export-download-combine csv file from BQ->GCS->local
470
+ def _export_download_combine(
471
+ query: str,
472
+ dst_gcs_prefix: str,
473
+ dst_filepath: str,
474
+ query_parameters: dict = {},
475
+ ):
476
+ # Init tmp directory
477
+ tmp_dirname = f"/tmp/my_bq_{my_datetime.get_current_datetime_str()}"
478
+ if os.path.exists(tmp_dirname):
479
+ shutil.rmtree(tmp_dirname, ignore_errors=True)
480
+ os.makedirs(tmp_dirname, exist_ok=True)
481
+ logger.debug(f"Temporary directory created: {tmp_dirname}")
482
+
483
+ try:
484
+ # Export to GCS
485
+ dst_gcs_uri = f"gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz"
486
+ self.export_data(
487
+ query,
488
+ dst_gcs_uri,
489
+ parameters=query_parameters,
490
+ format=DataFileFormat.CSV,
491
+ compression=DataFileCompression.GZIP,
492
+ )
493
+
494
+ # Download from GCS
495
+ local_tmp_filepaths = []
496
+ for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
497
+ local_tmp_filepath = os.path.join(
498
+ tmp_dirname, tmp_blobs.name.split("/")[-1]
499
+ )
500
+ gcs.download(tmp_blobs, local_tmp_filepath, move=True)
501
+ # logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
502
+ local_tmp_filepaths.append(local_tmp_filepath)
503
+
504
+ # Combine downloaded files
505
+ my_csv.combine(
506
+ local_tmp_filepaths, dst_filepath, gzip=True, delete=True
507
+ )
508
+ except:
509
+ raise
510
+ finally:
511
+ shutil.rmtree(tmp_dirname, ignore_errors=True) # Remove local folder
512
+ [
513
+ gcs.delete_blob(blob_filepath)
514
+ for blob_filepath in gcs.list_blobs(dst_gcs_prefix)
515
+ ] # Remove temporary GCS files
516
+
517
+ logger.info(f"Export-download-combine done: {dst_filepath}")
518
+
519
+ # Limited csv rows
520
+ if csv_row_limit:
521
+ tmp_table_fqn: str | None = None
522
+ tmp_table_fqn_rn: str | None = None
523
+ try:
524
+ # Create temporary table
525
+ query_job = self.execute_query(query, temporary_table=True)
526
+ tmp_table_fqn = str(query_job.destination)
527
+ logger.debug(f"Create temp table: {tmp_table_fqn}")
528
+
529
+ # Create another temporary table for row numbering
530
+ query_job = self.execute_query(
531
+ f"SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`",
532
+ temporary_table=True,
533
+ )
534
+ tmp_table_fqn_rn = str(query_job.destination)
535
+ logger.debug(f"Create temp table (rn): {tmp_table_fqn_rn}")
536
+
537
+ # Process parts
538
+ count = list(
539
+ self.execute_query(
540
+ f"SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`"
541
+ ).result()
542
+ )[0][0]
543
+ parts = math.ceil(count / csv_row_limit)
544
+ logger.info(f"Total part: {count} / {csv_row_limit} = {parts}")
545
+ dst_filepaths = []
546
+ for part in range(parts):
547
+ dst_filepath_part = (
548
+ f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
549
+ )
550
+ _export_download_combine(
551
+ f"SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit} ORDER BY _rn",
552
+ dst_gcs_prefix=gcs.build_tmp_dirpath(),
553
+ dst_filepath=dst_filepath_part,
554
+ )
555
+ dst_filepaths.append(dst_filepath_part)
556
+ return dst_filepaths
557
+ except:
558
+ raise
559
+ finally:
560
+ # Drop temporary tables
561
+ if tmp_table_fqn_rn:
562
+ self.drop_table(tmp_table_fqn_rn)
563
+ if tmp_table_fqn:
564
+ self.drop_table(tmp_table_fqn)
565
+
566
+ # Unlimited csv rows
567
+ else:
568
+ _export_download_combine(
569
+ query,
570
+ gcs.build_tmp_dirpath(),
571
+ dst_filepath,
572
+ query_parameters=query_parameters,
573
+ )
574
+ return dst_filepath
575
+
576
+ # query_job_result = query_job.result()
577
+ # row_count = 0
578
+ # file_index = 1
579
+
580
+ # # Stream-download-split result
581
+ # def open_file(f):
582
+ # if f:
583
+ # f.close()
584
+ # dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{file_index:06}.csv' if row_limit else dst_filepath
585
+ # logger.info(f'Writing into file: {dst_filepath_part} ...')
586
+ # f = open(dst_filepath_part, 'w', newline='', encoding='utf-8')
587
+ # writer = csv.writer(f)
588
+ # writer.writerow([field.name for field in query_job_result.schema]) # Write header
589
+
590
+ # return f, writer
591
+
592
+ # f, writer = open_file(None)
593
+ # for row in query_job_result:
594
+ # writer.writerow(row)
595
+
596
+ # if row_limit:
597
+ # row_count += 1
598
+ # if row_count >= row_limit:
599
+ # row_count = 0
600
+ # file_index += 1
601
+ # f, writer = open_file(f)
602
+ # if f:
603
+ # f.close()
604
+
605
def download_xlsx(
    self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000
):
    """Export a BigQuery table into one or more ``.xlsx`` files.

    Rows are split into parts of at most ``xlsx_row_limit`` rows each
    (Excel caps a worksheet at 1,048,576 rows). Each part is downloaded
    as a CSV via ``download_csv`` and then converted to
    ``<dst_filename>_part<i>.xlsx``; the intermediate CSV is removed.

    Args:
        src_table_fqn: Fully qualified source table name
            (``<projectid>.<datasetid>.<tableid>``).
        dst_filename: Destination file name; must end with ``.xlsx``.
        xlsx_row_limit: Maximum number of rows per generated xlsx file.

    Raises:
        ValueError: If ``dst_filename`` does not end with ``.xlsx``.
    """
    if not dst_filename.endswith(".xlsx"):
        raise ValueError("Destination filename must end with .xlsx!")

    # Create a temporary table with a ROW_NUMBER() column so each xlsx part
    # can be sliced out with a BETWEEN range.
    # NOTE(review): the temp name is just the source FQN + "_"; two concurrent
    # exports of the same table would collide — confirm this is acceptable.
    table_name_tmp = f"{src_table_fqn}_"
    self.execute_query(
        f"CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`"
    )

    try:
        # Calculate the number of excel file parts based on row limit
        cnt = list(
            self.execute_query(
                f"SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`"
            ).result()
        )[0][0]
        parts = math.ceil(cnt / xlsx_row_limit)
        logger.debug(f"Total part: {cnt} / {xlsx_row_limit} = {parts}")

        # Download per parts: CSV download -> xlsx conversion -> CSV cleanup.
        for part in range(parts):
            logger.debug(f"Downloading part {part + 1}...")
            file_path_tmp = f"{dst_filename}_part{part + 1}"
            file_path_tmp_csv = f"{file_path_tmp}.csv"
            self.download_csv(
                f"SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}",
                f"{file_path_tmp}{os.sep}",
            )
            my_xlsx.csv_to_xlsx(file_path_tmp_csv, f"{file_path_tmp}.xlsx")
            os.remove(file_path_tmp_csv)
    finally:
        # Always drop the temporary table, even when the export fails.
        # (The previous `except Exception as e: raise e` was a no-op and has
        # been removed; try/finally alone preserves the original behavior.)
        self.execute_query(f"DROP TABLE IF EXISTS `{table_name_tmp}`")
642
+
643
+ # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
644
+ # src_project_id, src_dataset_id, _ = src_view_id.split('.')
645
+ # dst_project_id, dst_dataset_id, _ = dst_view_id.split('.')
646
+
647
+ # # Create or replace
648
+ # src_view = self.client.get_table(src_view_id)
649
+ # dst_view = bigquery.Table(dst_view_id)
650
+ # dst_view.view_query = src_view.view_query.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
651
+ # self.client.delete_table(dst_view, not_found_ok=True)
652
+ # self.client.create_table(dst_view)
653
+ # logger.debug(f'View {src_view_id} copied to {dst_view}')
654
+
655
+ # if drop:
656
+ # self.client.delete_table(src_view_id)
657
+ # logger.debug(f'View {src_view_id} dropped')
658
+
659
+ # def copy_routine(self, src_routine_id: str, dst_routine_id: str, drop: bool = False):
660
+ # src_project_id, src_dataset_id, _ = src_routine_id.split('.')
661
+ # dst_project_id, dst_dataset_id, _ = dst_routine_id.split('.')
662
+
663
+ # # Create or replace
664
+ # src_routine = self.client.get_routine(src_routine_id)
665
+ # dst_routine = bigquery.Routine(dst_routine_id)
666
+ # dst_routine.body = src_routine.body.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
667
+ # dst_routine.type_ = src_routine.type_
668
+ # dst_routine.description = src_routine.description
669
+ # dst_routine.language = src_routine.language
670
+ # dst_routine.arguments = src_routine.arguments
671
+ # dst_routine.return_type = src_routine.return_type
672
+ # self.client.delete_routine(dst_routine, not_found_ok=True)
673
+ # self.client.create_routine(dst_routine)
674
+ # logger.debug(f'Routine {src_routine_id} copied to {dst_routine_id}')
675
+
676
+ # if drop:
677
+ # self.client.delete_routine(src_routine_id)
678
+ # logger.debug(f'Routine {src_routine_id} dropped')
679
+
680
+ # MARK: Utilities
681
+
682
+ @staticmethod
683
+ def get_table_fqn_parts(name: str | list[str]) -> list[str] | list[list[str]]:
684
+ """Get fully qualified table name, following this format `<projectid>.<datasetid>.<tableid>`
685
+
686
+ Args:
687
+ name (str | list[str]): Input name (can be multiple)
688
+
689
+ Returns:
690
+ list[str] | list[list[str]]: The FQN parts. If the input is list then returns list of FQN parts instead.
691
+ """
692
+
693
+ if isinstance(name, list):
694
+ return [BQ.get_table_fqn_parts(x) for x in name]
695
+
696
+ split = name.split(".")
697
+ if len(split) == 3:
698
+ return split
699
+ else:
700
+ raise ValueError(f"{name} is not a valid table FQN")
701
+
702
@staticmethod
def raise_for_invalid_table_fqn(name: str | list[str]):
    """Raise an error if the provided name is NOT a fully qualified table name.

    Validation is delegated to ``get_table_fqn_parts``, which itself raises
    ``ValueError`` for a malformed name; the ``if not`` guard below is a
    defensive fallback, since ``get_table_fqn_parts`` never returns a falsy
    value on success.

    Args:
        name (str | list[str]): Input name (can be multiple)

    Raises:
        ValueError: If name is not a fully qualified table name
    """

    if not BQ.get_table_fqn_parts(name):
        raise ValueError(f"{name} is not a valid table FQN")
715
+
716
def is_table_exists(self, table_fqn: str) -> bool:
    """Return True when ``table_fqn`` refers to an existing BigQuery table.

    Args:
        table_fqn (str): Fully qualified table name
            (``<projectid>.<datasetid>.<tableid>``).

    Returns:
        bool: True if the table exists, False if the lookup raises NotFound.

    Raises:
        ValueError: If ``table_fqn`` is not a valid FQN.
    """
    self.raise_for_invalid_table_fqn(table_fqn)
    try:
        self.client.get_table(table_fqn)
    except NotFound:
        return False
    return True
723
+
724
def close(self):
    """Close the underlying BigQuery client and release its resources."""
    self.client.close()
    logger.debug("BQ client close")