rdxz2-utill 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of rdxz2-utill might be problematic.
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/METADATA +2 -1
- rdxz2_utill-0.1.4.dist-info/RECORD +37 -0
- utill/cmd/_bq.py +16 -3
- utill/cmd/_conf.py +15 -15
- utill/cmd/_enc.py +8 -4
- utill/cmd/_mb.py +140 -0
- utill/cmd/_pg.py +4 -2
- utill/cmd/utill.py +203 -61
- utill/my_bq.py +287 -162
- utill/my_compare.py +1 -1
- utill/my_const.py +11 -8
- utill/my_csv.py +31 -15
- utill/my_datetime.py +21 -10
- utill/my_encryption.py +31 -13
- utill/my_env.py +22 -13
- utill/my_file.py +15 -13
- utill/my_gcs.py +40 -16
- utill/my_gdrive.py +195 -0
- utill/my_input.py +8 -4
- utill/my_json.py +6 -6
- utill/my_mb.py +351 -357
- utill/my_pg.py +76 -46
- utill/my_queue.py +37 -24
- utill/my_string.py +23 -5
- utill/my_style.py +18 -16
- utill/my_tunnel.py +29 -9
- utill/my_xlsx.py +11 -8
- rdxz2_utill-0.1.2.dist-info/RECORD +0 -35
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/WHEEL +0 -0
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/entry_points.txt +0 -0
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/top_level.txt +0 -0
utill/my_bq.py
CHANGED
@@ -19,23 +19,23 @@ import textwrap
 import time
 
 PY_DATA_TYPE__BQ_DATA_TYPE = {
-    int:
-    str:
-    float:
+    int: "INTEGER",
+    str: "STRING",
+    float: "STRING",
 }
 
 
 class DataFileFormat(StrEnum):
-    CSV =
-    JSON =
-    AVRO =
-    PARQUET =
-    ORC =
+    CSV = "CSV"
+    JSON = "JSON"
+    AVRO = "AVRO"
+    PARQUET = "PARQUET"
+    ORC = "ORC"
 
 
 class DataFileCompression(StrEnum):
-    GZIP =
-    SNAPPY =
+    GZIP = "GZIP"
+    SNAPPY = "SNAPPY"
 
 
 class LoadStrategy(Enum):
@@ -44,43 +44,46 @@ class LoadStrategy(Enum):
 
 
 class Dtype:
-    INT64 =
-    INTEGER =
-    FLOAT64 =
+    INT64 = "INT64"
+    INTEGER = "INTEGER"
+    FLOAT64 = "FLOAT64"
 
-    DECIMAL =
+    DECIMAL = "DECIMAL"
 
-    STRING =
-    JSON =
+    STRING = "STRING"
+    JSON = "JSON"
 
-    DATE =
-    TIME =
-    DATETIME =
-    TIMESTAMP =
+    DATE = "DATE"
+    TIME = "TIME"
+    DATETIME = "DATETIME"
+    TIMESTAMP = "TIMESTAMP"
 
-    BOOL =
+    BOOL = "BOOL"
 
-    ARRAY_INT64 =
-    ARRAY_INTEGER =
-    ARRAY_FLOAT64 =
-    ARRAY_STRING =
-    ARRAY_JSON =
-    ARRAY_DATE =
-    ARRAY_DATETIME =
-    ARRAY_TIMESTAMP =
-    ARRAY_BOOL =
+    ARRAY_INT64 = "ARRAY<INT64>"
+    ARRAY_INTEGER = "ARRAY<INTEGER>"
+    ARRAY_FLOAT64 = "ARRAY<FLOAT64>"
+    ARRAY_STRING = "ARRAY<STRING>"
+    ARRAY_JSON = "ARRAY<JSON>"
+    ARRAY_DATE = "ARRAY<DATE>"
+    ARRAY_DATETIME = "ARRAY<DATETIME>"
+    ARRAY_TIMESTAMP = "ARRAY<TIMESTAMP>"
+    ARRAY_BOOL = "ARRAY<BOOL>"
 
 
-class BQ
+class BQ:
     def __init__(self, location: str | None = None, project_id: str = None):
         if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
-            logger.warning(
+            logger.warning("Using ADC for BigQuery authentication")
 
         # if location is None and my_env.envs.GCP_REGION is None:
         #     raise ValueError('GCP region must be set in environment variables.')
 
-        self.client = bigquery.Client(
-
+        self.client = bigquery.Client(
+            project=project_id or my_env.envs.GCP_PROJECT_ID,
+            location=location or my_env.envs.GCP_REGION,
+        )
+        logger.debug(f"BQ client open, project: {self.client.project}")
 
     # MARK: Query execution
 
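The constructor rewritten above now builds the BigQuery client from explicit arguments with environment fallbacks. A minimal usage sketch under that assumption; the project and location values here are hypothetical, and the environment fallbacks follow my_env.envs as shown in the hunk:

from utill.my_bq import BQ

# Explicit arguments take precedence; otherwise the client falls back to
# my_env.envs.GCP_PROJECT_ID / my_env.envs.GCP_REGION, or ultimately to ADC defaults.
bq = BQ(project_id="example-project", location="US")  # hypothetical values
bq.close()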
@@ -95,8 +98,10 @@ class BQ():
         is_multi = isinstance(query, list)
         queries = query if is_multi else [query]
         queries = [textwrap.dedent(q).strip() for q in queries]
-        queries = [
-
+        queries = [
+            q if q.endswith(";") else q + ";" for q in queries
+        ] # Append ';' character for each query
+        query = "\n".join(queries)
 
         # Evaluate parameter
         query_parameters = []
@@ -104,36 +109,63 @@ class BQ():
             is_array = isinstance(value, list)
             value_type_py = type(value[0]) if is_array else type(value)
             if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
-                raise ValueError(
+                raise ValueError(
+                    f"Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}"
+                )
 
             value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
 
             # Handle data type conversions
             if value_type_py == datetime.date:
-                value =
+                value = (
+                    [v.strftime("%Y-%m-%d") for v in value]
+                    if is_array
+                    else value.strftime("%Y-%m-%d")
+                )
 
             if is_array:
-                query_parameters.append(
+                query_parameters.append(
+                    bigquery.ArrayQueryParameter(parameter, value_type_bq, value)
+                )
             else:
-                query_parameters.append(
+                query_parameters.append(
+                    bigquery.ScalarQueryParameter(parameter, value_type_bq, value)
+                )
 
-        logger.debug(f
-        query_job_config = bigquery.QueryJobConfig(
+        logger.debug(f"🔎 Query:\n{query}")
+        query_job_config = bigquery.QueryJobConfig(
+            dry_run=dry_run, query_parameters=query_parameters
+        )
         if temporary_table:
             query_job_config.destination = None
         t = time.time()
         query_job = self.client.query(query, job_config=query_job_config)
-
+        (
+            logger.info(
+                f"Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults"
+            )
+            if not dry_run
+            else None
+        )
         query_job.result() # Wait for the job to complete
         elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
 
         if not is_multi:
-            logger.info(
+            logger.info(
+                f"[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}",
+            )
         else:
-            logger.info(f
-
-            jobs: list[bigquery.QueryJob] = list(
-
+            logger.info(f"[Job ID] {query_job.job_id} [Elapsed] {elapsed}")
+
+            jobs: list[bigquery.QueryJob] = list(
+                self.client.list_jobs(parent_job=query_job.job_id)
+            )
+            [
+                logger.info(
+                    f"[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)",
+                )
+                for job in jobs
+            ]
 
         return query_job
 
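The parameter handling above turns a plain dict into BigQuery query parameters: the Python type of each value (or of the first element, for lists) is looked up in PY_DATA_TYPE__BQ_DATA_TYPE, and the result becomes an ArrayQueryParameter or ScalarQueryParameter. A standalone sketch of the same idea, written against the public google-cloud-bigquery API rather than copied from the package:

from google.cloud import bigquery

# Mirrors the mapping introduced at the top of my_bq.py in this release.
PY_TO_BQ = {int: "INTEGER", str: "STRING", float: "STRING"}

def to_query_parameters(parameters: dict) -> list:
    out = []
    for name, value in parameters.items():
        is_array = isinstance(value, list)
        py_type = type(value[0]) if is_array else type(value)
        if py_type not in PY_TO_BQ:
            raise ValueError(f"Unsupported type for parameter {name}: {py_type}")
        bq_type = PY_TO_BQ[py_type]
        if is_array:
            out.append(bigquery.ArrayQueryParameter(name, bq_type, value))
        else:
            out.append(bigquery.ScalarQueryParameter(name, bq_type, value))
    return out

# e.g. to_query_parameters({"ids": [1, 2, 3], "status": "active"})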
@@ -156,56 +188,68 @@ class BQ():
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
         # Construct table options
-        logger.debug(
+        logger.debug("Constructing table options ...")
         table_options = []
         if expiration_timestamp_utc:
-            table_options.append(
+            table_options.append(
+                f" expiration_timestamp='{expiration_timestamp_utc.isoformat()}'"
+            )
         if partition_by and require_partition_filter:
-            table_options.append(f
+            table_options.append(f" require_partition_filter=TRUE")
         if description:
-            table_options.append(f
+            table_options.append(f" description='{description}'")
 
         # Check if table exists
-        logger.debug(
-        dst_table_project_id, dst_table_dataset_id, dst_table_id =
-
+        logger.debug("Checking if destination table exists ...")
+        dst_table_project_id, dst_table_dataset_id, dst_table_id = (
+            self.get_table_fqn_parts(dst_table_fqn)
+        )
+        table_exist = self.is_table_exists(
+            project_id=dst_table_project_id,
+            dataset_id=dst_table_dataset_id,
+            table_id=dst_table_id,
+        )
 
         # Construct beautiful query string
         if table_exist and not replace:
-            logger.debug(
-            query_parts = [f
+            logger.debug("Table exists, constructing INSERT query ...")
+            query_parts = [f"INSERT INTO `{dst_table_fqn}`"]
             if schema:
-                schema_str =
-                query_parts.append(f
+                schema_str = ",\n".join([column["name"] for column in schema])
+                query_parts.append(f"(\n{schema_str}\n)")
             if table_options:
-                table_options_str =
-                query_parts.append(f
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
         else:
-            logger.debug(
+            logger.debug("Table not exist, constructing CREATE TABLE query ...")
             query_parts = [
-                f
+                f"CREATE OR REPLACE TABLE `{dst_table_fqn}`",
             ]
             if schema:
-                schema_str =
-
+                schema_str = ",\n".join(
+                    [f' {column["name"]} {column["data_type"]}' for column in schema]
+                )
+                query_parts.append(f"(\n{schema_str}\n)")
             if partition_by:
-                query_parts.append(f
+                query_parts.append(f"PARTITION BY {partition_by}")
             if clustering_fields:
-                clustering_fields_str =
-
+                clustering_fields_str = ", ".join(
+                    [f"`{field}`" for field in clustering_fields]
+                )
+                query_parts.append(f"CLUSTER BY {clustering_fields_str}")
             if table_options:
-                table_options_str =
-                query_parts.append(f
-            query_parts.append(
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
+            query_parts.append("AS")
             query_parts.append(textwrap.dedent(query).strip())
 
         # Execute
-        logger.debug(
-        query =
+        logger.debug("Executing query ...")
+        query = "\n".join(query_parts)
         self.execute_query(query, parameters=query_parameters)
 
     def drop_table(self, bq_table_fqn: str):
-        logger.info(f
+        logger.info(f"Dropping table: {bq_table_fqn} ...")
         self.raise_for_invalid_table_fqn(bq_table_fqn)
         self.client.delete_table(bq_table_fqn, not_found_ok=True)
 
@@ -219,7 +263,7 @@ class BQ():
         schema: list[dict] | None = None,
         partition_by: str | None = None,
         clustering_fields: list[str] | None = None,
-        field_delimiter: str =
+        field_delimiter: str = ",",
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
         format: DataFileFormat = DataFileFormat.CSV,
         compression=None,
@@ -227,36 +271,42 @@ class BQ():
 
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        logger.debug(f
+        logger.debug(f"Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...")
 
         # Construct LOAD options
-        logger.debug(
+        logger.debug("Constructing LOAD options ...")
         load_options = [ # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
-            f
-            f
+            f" format='{format}'",
+            f" uris=['{src_gcs_uri}']",
         ]
         if format == DataFileFormat.CSV:
-            load_options.append(f
-            load_options.append(f
-            load_options.append(f
+            load_options.append(f" skip_leading_rows=1")
+            load_options.append(f" field_delimiter='{field_delimiter}'")
+            load_options.append(f" allow_quoted_newlines=true")
         if compression:
-            load_options.append(f
-        load_options_str =
+            load_options.append(f" compression='{compression}'")
+        load_options_str = ",\n".join(load_options)
 
         # Construct beautiful query string
-        logger.debug(
-        schema_str =
-
+        logger.debug("Constructing LOAD query ...")
+        schema_str = ",\n".join(
+            [f' {column["name"]} {column["data_type"]}' for column in schema]
+        )
+        query_parts = [
+            f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)'
+        ]
         if partition_by:
-            query_parts.append(f
+            query_parts.append(f"PARTITION BY {partition_by}")
         if clustering_fields:
-            clustering_fields_str =
-
-
-
+            clustering_fields_str = ", ".join(
+                [f"`{field}`" for field in clustering_fields]
+            )
+            query_parts.append(f"CLUSTER BY {clustering_fields_str}")
+        query_parts.append(f"FROM FILES (\n{load_options_str}\n)")
+        query = "\n".join(query_parts)
 
         # Execute
-        logger.debug(
+        logger.debug("Executing query ...")
         self.execute_query(query)
 
     def export_data(
@@ -268,65 +318,88 @@ class BQ():
         format: DataFileFormat = DataFileFormat.CSV,
         compression: DataFileCompression | None = None,
         header: bool = True,
-        delimiter: str =
+        delimiter: str = ",",
     ):
-        logger.debug(f
+        logger.debug(f"Exporting query into {dst_gcs_uri} ...")
 
         # GCS uri validation
-        if
-
-
-
-
-            raise ValueError(
+        if (
+            format == DataFileFormat.CSV
+            and compression == DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".gz")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .gz if using compression = GCSCompression.GZIP"
+            )
+        elif (
+            format == DataFileFormat.CSV
+            and compression != DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".csv")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .csv if using format = GCSExportFormat.CSV"
+            )
+        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith(".parquet"):
+            raise ValueError(
+                "GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET"
+            )
 
         # Construct options
-        logger.debug(
+        logger.debug("Constructing EXPORT options ...")
         options = [
-            f
-            f
-            f
+            f" uri='{dst_gcs_uri}'",
+            f" format='{format}'",
+            f" overwrite=TRUE",
         ]
         if format == DataFileFormat.CSV:
-            options.append(
+            options.append(
+                f" field_delimiter='{delimiter}'",
+            )
         if header:
-            options.append(
+            options.append(
+                f' header={"true" if header else "false"}',
+            )
         if compression:
-            options.append(f
-        options_str =
+            options.append(f" compression='{compression}'")
+        options_str = ",\n".join(options)
 
         # Construct beautiful query string
-        logger.debug(
+        logger.debug("Constructing EXPORT query ...")
         query = (
-            f
-            f
-            f
-            f
-            f
-            f
+            f"EXPORT DATA OPTIONS (\n"
+            f"{options_str}\n"
+            f")\n"
+            f"AS (\n"
+            f"{textwrap.dedent(query).strip()}\n"
+            f");"
         )
 
         # Execute
-        logger.debug(
+        logger.debug("Executing query ...")
         self.execute_query(query=query, parameters=parameters)
 
     def upload_csv(
         self,
-
+        src_filepath: str,
         dst_table_fqn: str,
         schema: list[dict] | None = None,
         gcs_bucket: str | None = None,
         partition_by: str = None,
-
+        clustering_fields: list[str] = None,
         compression: DataFileCompression | None = None,
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
     ):
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        if compression == DataFileCompression.GZIP and not
-            raise ValueError(
-
-
+        if compression == DataFileCompression.GZIP and not src_filepath.endswith(".gz"):
+            raise ValueError(
+                "Please provide file path with .gz extension if using compression = GZIP"
+            )
+        elif not src_filepath.endswith(".csv"):
+            raise ValueError("Please provide file path with .csv extension")
+
+        src_filename, src_fileextension = os.path.splitext(src_filepath)
+        src_filename = os.path.basename(src_filename) # Only get filename
 
 
         # # <<----- START: Upload to GCS
@@ -354,12 +427,21 @@ class BQ():
         # Upload to GCS
         # TODO: Re-implement the producer-consumer model to upload multiple files
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
-        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(
-        gcs.upload(
+        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(src_filename, "_").lower()}{src_fileextension}'
+        gcs.upload(src_filepath, dst_blobpath)
 
         # Load to BQ
         try:
-            self.load_data(
+            self.load_data(
+                f"gs://{gcs.bucket.name}/{dst_blobpath}",
+                dst_table_fqn,
+                schema=schema,
+                partition_by=partition_by,
+                clustering_fields=clustering_fields,
+                format=DataFileFormat.CSV,
+                compression=compression,
+                load_strategy=load_strategy,
+            )
         except:
             raise
         finally:
@@ -374,43 +456,61 @@ class BQ():
         query_parameters: dict = {},
         csv_row_limit: int | None = None,
     ) -> str | list[str]:
-        if not dst_filepath.endswith(
-            raise ValueError(
+        if not dst_filepath.endswith(".csv"):
+            raise ValueError("Destination filename must ends with .csv")
 
         # Init
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
 
         # Generic function to export-download-combine csv file from BQ->GCS->local
-        def _export_download_combine(
+        def _export_download_combine(
+            query: str,
+            dst_gcs_prefix: str,
+            dst_filepath: str,
+            query_parameters: dict = {},
+        ):
             # Init tmp directory
-            tmp_dirname = f
+            tmp_dirname = f"/tmp/my_bq_{my_datetime.get_current_datetime_str()}"
             if os.path.exists(tmp_dirname):
                 shutil.rmtree(tmp_dirname, ignore_errors=True)
             os.makedirs(tmp_dirname, exist_ok=True)
-            logger.debug(f
+            logger.debug(f"Temporary directory created: {tmp_dirname}")
 
             try:
                 # Export to GCS
-                dst_gcs_uri = f
-                self.export_data(
+                dst_gcs_uri = f"gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz"
+                self.export_data(
+                    query,
+                    dst_gcs_uri,
+                    parameters=query_parameters,
+                    format=DataFileFormat.CSV,
+                    compression=DataFileCompression.GZIP,
+                )
 
                 # Download from GCS
                 local_tmp_filepaths = []
                 for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
-                    local_tmp_filepath = os.path.join(
+                    local_tmp_filepath = os.path.join(
+                        tmp_dirname, tmp_blobs.name.split("/")[-1]
+                    )
                     gcs.download(tmp_blobs, local_tmp_filepath, move=True)
                     # logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
                     local_tmp_filepaths.append(local_tmp_filepath)
 
                 # Combine downloaded files
-                my_csv.combine(
+                my_csv.combine(
+                    local_tmp_filepaths, dst_filepath, gzip=True, delete=True
+                )
             except:
                 raise
             finally:
                 shutil.rmtree(tmp_dirname, ignore_errors=True) # Remove local folder
-                [
+                [
+                    gcs.delete_blob(blob_filepath)
+                    for blob_filepath in gcs.list_blobs(dst_gcs_prefix)
+                ] # Remove temporary GCS files
 
-            logger.info(f
+            logger.info(f"Export-download-combine done: {dst_filepath}")
 
         # Limited csv rows
         if csv_row_limit:
@@ -420,22 +520,31 @@ class BQ():
             # Create temporary table
             query_job = self.execute_query(query, temporary_table=True)
             tmp_table_fqn = str(query_job.destination)
-            logger.debug(f
+            logger.debug(f"Create temp table: {tmp_table_fqn}")
 
             # Create another temporary table for row numbering
-            query_job = self.execute_query(
+            query_job = self.execute_query(
+                f"SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`",
+                temporary_table=True,
+            )
             tmp_table_fqn_rn = str(query_job.destination)
-            logger.debug(f
+            logger.debug(f"Create temp table (rn): {tmp_table_fqn_rn}")
 
             # Process parts
-            count = list(
+            count = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(count / csv_row_limit)
-            logger.info(f
+            logger.info(f"Total part: {count} / {csv_row_limit} = {parts}")
             dst_filepaths = []
             for part in range(parts):
-                dst_filepath_part =
+                dst_filepath_part = (
+                    f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                )
                 _export_download_combine(
-                    f
+                    f"SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit} ORDER BY _rn",
                     dst_gcs_prefix=gcs.build_tmp_dirpath(),
                     dst_filepath=dst_filepath_part,
                 )
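The csv_row_limit branch above numbers every row with ROW_NUMBER() and then exports ceil(count / csv_row_limit) parts, each selected with a BETWEEN window over the _rn column. A small illustration of that window arithmetic, with made-up row counts:

import math

count, csv_row_limit = 2_500_000, 1_000_000  # hypothetical sizes
parts = math.ceil(count / csv_row_limit)     # 3 parts
windows = [
    ((part * csv_row_limit) + 1, (part + 1) * csv_row_limit)
    for part in range(parts)
]
# windows == [(1, 1000000), (1000001, 2000000), (2000001, 3000000)]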
@@ -452,7 +561,12 @@ class BQ():
 
         # Unlimited csv rows
         else:
-            _export_download_combine(
+            _export_download_combine(
+                query,
+                gcs.build_tmp_dirpath(),
+                dst_filepath,
+                query_parameters=query_parameters,
+            )
             return dst_filepath
 
         # query_job_result = query_job.result()
@@ -484,32 +598,43 @@ class BQ():
         # if f:
         #     f.close()
 
-    def download_xlsx(
-
-
+    def download_xlsx(
+        self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000
+    ):
+        if not dst_filename.endswith(".xlsx"):
+            raise ValueError("Destination filename must ends with .xlsx!")
 
         # Create a temporary table acting as excel file splitting
-        table_name_tmp = f
-        self.execute_query(
+        table_name_tmp = f"{src_table_fqn}_"
+        self.execute_query(
+            f"CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`"
+        )
 
         try:
             # Calculate the number of excel file parts based on row limit
-            cnt = list(
+            cnt = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(cnt / xlsx_row_limit)
-            logger.debug(f
+            logger.debug(f"Total part: {cnt} / {xlsx_row_limit} = {parts}")
 
             # Download per parts
             for part in range(parts):
-                logger.debug(f
-                file_path_tmp = f
-                file_path_tmp_csv = f
-                self.download_csv(
-
+                logger.debug(f"Downloading part {part + 1}...")
+                file_path_tmp = f"{dst_filename}_part{part + 1}"
+                file_path_tmp_csv = f"{file_path_tmp}.csv"
+                self.download_csv(
+                    f"SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}",
+                    f"{file_path_tmp}{os.sep}",
+                )
+                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f"{file_path_tmp}.xlsx")
                 os.remove(file_path_tmp_csv)
         except Exception as e:
             raise e
         finally:
-            self.execute_query(f
+            self.execute_query(f"DROP TABLE IF EXISTS `{table_name_tmp}`")
 
     # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
     #     src_project_id, src_dataset_id, _ = src_view_id.split('.')
@@ -564,11 +689,11 @@ class BQ():
         if isinstance(name, list):
             return [BQ.get_table_fqn_parts(x) for x in name]
 
-        split = name.split(
+        split = name.split(".")
         if len(split) == 3:
             return split
         else:
-            raise ValueError(f
+            raise ValueError(f"{name} is not a valid table FQN")
 
     @staticmethod
     def raise_for_invalid_table_fqn(name: str | list[str]):
@@ -582,7 +707,7 @@ class BQ():
         """
 
         if not BQ.get_table_fqn_parts(name):
-            raise ValueError(f
+            raise ValueError(f"{name} is not a valid table FQN")
 
     def is_table_exists(self, table_fqn: str) -> bool:
         self.raise_for_invalid_table_fqn(table_fqn)
@@ -594,4 +719,4 @@ class BQ():
 
     def close(self):
         self.client.close()
-        logger.debug(
+        logger.debug("BQ client close")