rdxz2-utill 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rdxz2-utill might be problematic.

utill/my_bq.py CHANGED
@@ -19,23 +19,23 @@ import textwrap
 import time
 
 PY_DATA_TYPE__BQ_DATA_TYPE = {
-    int: 'INTEGER',
-    str: 'STRING',
-    float: 'STRING',
+    int: "INTEGER",
+    str: "STRING",
+    float: "STRING",
 }
 
 
 class DataFileFormat(StrEnum):
-    CSV = 'CSV'
-    JSON = 'JSON'
-    AVRO = 'AVRO'
-    PARQUET = 'PARQUET'
-    ORC = 'ORC'
+    CSV = "CSV"
+    JSON = "JSON"
+    AVRO = "AVRO"
+    PARQUET = "PARQUET"
+    ORC = "ORC"
 
 
 class DataFileCompression(StrEnum):
-    GZIP = 'GZIP'
-    SNAPPY = 'SNAPPY'
+    GZIP = "GZIP"
+    SNAPPY = "SNAPPY"
 
 
 class LoadStrategy(Enum):
@@ -44,43 +44,46 @@ class LoadStrategy(Enum):
 
 
 class Dtype:
-    INT64 = 'INT64'
-    INTEGER = 'INTEGER'
-    FLOAT64 = 'FLOAT64'
+    INT64 = "INT64"
+    INTEGER = "INTEGER"
+    FLOAT64 = "FLOAT64"
 
-    DECIMAL = 'DECIMAL'
+    DECIMAL = "DECIMAL"
 
-    STRING = 'STRING'
-    JSON = 'JSON'
+    STRING = "STRING"
+    JSON = "JSON"
 
-    DATE = 'DATE'
-    TIME = 'TIME'
-    DATETIME = 'DATETIME'
-    TIMESTAMP = 'TIMESTAMP'
+    DATE = "DATE"
+    TIME = "TIME"
+    DATETIME = "DATETIME"
+    TIMESTAMP = "TIMESTAMP"
 
-    BOOL = 'BOOL'
+    BOOL = "BOOL"
 
-    ARRAY_INT64 = 'ARRAY<INT64>'
-    ARRAY_INTEGER = 'ARRAY<INTEGER>'
-    ARRAY_FLOAT64 = 'ARRAY<FLOAT64>'
-    ARRAY_STRING = 'ARRAY<STRING>'
-    ARRAY_JSON = 'ARRAY<JSON>'
-    ARRAY_DATE = 'ARRAY<DATE>'
-    ARRAY_DATETIME = 'ARRAY<DATETIME>'
-    ARRAY_TIMESTAMP = 'ARRAY<TIMESTAMP>'
-    ARRAY_BOOL = 'ARRAY<BOOL>'
+    ARRAY_INT64 = "ARRAY<INT64>"
+    ARRAY_INTEGER = "ARRAY<INTEGER>"
+    ARRAY_FLOAT64 = "ARRAY<FLOAT64>"
+    ARRAY_STRING = "ARRAY<STRING>"
+    ARRAY_JSON = "ARRAY<JSON>"
+    ARRAY_DATE = "ARRAY<DATE>"
+    ARRAY_DATETIME = "ARRAY<DATETIME>"
+    ARRAY_TIMESTAMP = "ARRAY<TIMESTAMP>"
+    ARRAY_BOOL = "ARRAY<BOOL>"
 
 
-class BQ():
+class BQ:
     def __init__(self, location: str | None = None, project_id: str = None):
         if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
-            logger.warning('Using ADC for BigQuery authentication')
+            logger.warning("Using ADC for BigQuery authentication")
 
         # if location is None and my_env.envs.GCP_REGION is None:
         # raise ValueError('GCP region must be set in environment variables.')
 
-        self.client = bigquery.Client(project=project_id or my_env.envs.GCP_PROJECT_ID, location=location or my_env.envs.GCP_REGION)
-        logger.debug(f'BQ client open, project: {self.client.project}')
+        self.client = bigquery.Client(
+            project=project_id or my_env.envs.GCP_PROJECT_ID,
+            location=location or my_env.envs.GCP_REGION,
+        )
+        logger.debug(f"BQ client open, project: {self.client.project}")
 
     # MARK: Query execution
 
@@ -95,8 +98,10 @@ class BQ():
         is_multi = isinstance(query, list)
         queries = query if is_multi else [query]
         queries = [textwrap.dedent(q).strip() for q in queries]
-        queries = [q if q.endswith(';') else q + ';' for q in queries] # Append ';' character for each query
-        query = '\n'.join(queries)
+        queries = [
+            q if q.endswith(";") else q + ";" for q in queries
+        ] # Append ';' character for each query
+        query = "\n".join(queries)
 
         # Evaluate parameter
         query_parameters = []
@@ -104,36 +109,63 @@ class BQ():
             is_array = isinstance(value, list)
             value_type_py = type(value[0]) if is_array else type(value)
             if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
-                raise ValueError(f'Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}')
+                raise ValueError(
+                    f"Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}"
+                )
 
             value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
 
             # Handle data type conversions
             if value_type_py == datetime.date:
-                value = [v.strftime('%Y-%m-%d') for v in value] if is_array else value.strftime('%Y-%m-%d')
+                value = (
+                    [v.strftime("%Y-%m-%d") for v in value]
+                    if is_array
+                    else value.strftime("%Y-%m-%d")
+                )
 
             if is_array:
-                query_parameters.append(bigquery.ArrayQueryParameter(parameter, value_type_bq, value))
+                query_parameters.append(
+                    bigquery.ArrayQueryParameter(parameter, value_type_bq, value)
+                )
             else:
-                query_parameters.append(bigquery.ScalarQueryParameter(parameter, value_type_bq, value))
+                query_parameters.append(
+                    bigquery.ScalarQueryParameter(parameter, value_type_bq, value)
+                )
 
-        logger.debug(f'🔎 Query:\n{query}')
-        query_job_config = bigquery.QueryJobConfig(dry_run=dry_run, query_parameters=query_parameters)
+        logger.debug(f"🔎 Query:\n{query}")
+        query_job_config = bigquery.QueryJobConfig(
+            dry_run=dry_run, query_parameters=query_parameters
+        )
         if temporary_table:
             query_job_config.destination = None
         t = time.time()
         query_job = self.client.query(query, job_config=query_job_config)
-        logger.info(f'Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults') if not dry_run else None
+        (
+            logger.info(
+                f"Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults"
+            )
+            if not dry_run
+            else None
+        )
         query_job.result() # Wait for the job to complete
         elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
 
         if not is_multi:
-            logger.info(f'[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}',)
+            logger.info(
+                f"[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}",
+            )
         else:
-            logger.info(f'[Job ID] {query_job.job_id} [Elapsed] {elapsed}')
-
-            jobs: list[bigquery.QueryJob] = list(self.client.list_jobs(parent_job=query_job.job_id))
-            [logger.info(f'[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)',) for job in jobs]
+            logger.info(f"[Job ID] {query_job.job_id} [Elapsed] {elapsed}")
+
+            jobs: list[bigquery.QueryJob] = list(
+                self.client.list_jobs(parent_job=query_job.job_id)
+            )
+            [
+                logger.info(
+                    f"[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)",
+                )
+                for job in jobs
+            ]
 
         return query_job
 
@@ -156,56 +188,68 @@ class BQ():
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
         # Construct table options
-        logger.debug('Constructing table options ...')
+        logger.debug("Constructing table options ...")
         table_options = []
         if expiration_timestamp_utc:
-            table_options.append(f' expiration_timestamp=\'{expiration_timestamp_utc.isoformat()}\'')
+            table_options.append(
+                f" expiration_timestamp='{expiration_timestamp_utc.isoformat()}'"
+            )
         if partition_by and require_partition_filter:
-            table_options.append(f' require_partition_filter=TRUE')
+            table_options.append(f" require_partition_filter=TRUE")
         if description:
-            table_options.append(f' description=\'{description}\'')
+            table_options.append(f" description='{description}'")
 
         # Check if table exists
-        logger.debug('Checking if destination table exists ...')
-        dst_table_project_id, dst_table_dataset_id, dst_table_id = self.get_table_fqn_parts(dst_table_fqn)
-        table_exist = self.is_table_exists(project_id=dst_table_project_id, dataset_id=dst_table_dataset_id, table_id=dst_table_id)
+        logger.debug("Checking if destination table exists ...")
+        dst_table_project_id, dst_table_dataset_id, dst_table_id = (
+            self.get_table_fqn_parts(dst_table_fqn)
+        )
+        table_exist = self.is_table_exists(
+            project_id=dst_table_project_id,
+            dataset_id=dst_table_dataset_id,
+            table_id=dst_table_id,
+        )
 
         # Construct beautiful query string
         if table_exist and not replace:
-            logger.debug('Table exists, constructing INSERT query ...')
-            query_parts = [f'INSERT INTO `{dst_table_fqn}`']
+            logger.debug("Table exists, constructing INSERT query ...")
+            query_parts = [f"INSERT INTO `{dst_table_fqn}`"]
             if schema:
-                schema_str = ',\n'.join([column['name'] for column in schema])
-                query_parts.append(f'(\n{schema_str}\n)')
+                schema_str = ",\n".join([column["name"] for column in schema])
+                query_parts.append(f"(\n{schema_str}\n)")
             if table_options:
-                table_options_str = ',\n'.join(table_options)
-                query_parts.append(f'OPTIONS (\n{table_options_str}\n)')
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
         else:
-            logger.debug('Table not exist, constructing CREATE TABLE query ...')
+            logger.debug("Table not exist, constructing CREATE TABLE query ...")
             query_parts = [
-                f'CREATE OR REPLACE TABLE `{dst_table_fqn}`',
+                f"CREATE OR REPLACE TABLE `{dst_table_fqn}`",
             ]
             if schema:
-                schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
-                query_parts.append(f'(\n{schema_str}\n)')
+                schema_str = ",\n".join(
+                    [f' {column["name"]} {column["data_type"]}' for column in schema]
+                )
+                query_parts.append(f"(\n{schema_str}\n)")
             if partition_by:
-                query_parts.append(f'PARTITION BY {partition_by}')
+                query_parts.append(f"PARTITION BY {partition_by}")
             if clustering_fields:
-                clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
-                query_parts.append(f'CLUSTER BY {clustering_fields_str}')
+                clustering_fields_str = ", ".join(
+                    [f"`{field}`" for field in clustering_fields]
+                )
+                query_parts.append(f"CLUSTER BY {clustering_fields_str}")
             if table_options:
-                table_options_str = ',\n'.join(table_options)
-                query_parts.append(f'OPTIONS (\n{table_options_str}\n)')
-            query_parts.append('AS')
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
+            query_parts.append("AS")
         query_parts.append(textwrap.dedent(query).strip())
 
         # Execute
-        logger.debug('Executing query ...')
-        query = '\n'.join(query_parts)
+        logger.debug("Executing query ...")
+        query = "\n".join(query_parts)
         self.execute_query(query, parameters=query_parameters)
 
     def drop_table(self, bq_table_fqn: str):
-        logger.info(f'Dropping table: {bq_table_fqn} ...')
+        logger.info(f"Dropping table: {bq_table_fqn} ...")
         self.raise_for_invalid_table_fqn(bq_table_fqn)
         self.client.delete_table(bq_table_fqn, not_found_ok=True)
 
@@ -219,7 +263,7 @@ class BQ():
         schema: list[dict] | None = None,
         partition_by: str | None = None,
         clustering_fields: list[str] | None = None,
-        field_delimiter: str = ',',
+        field_delimiter: str = ",",
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
         format: DataFileFormat = DataFileFormat.CSV,
         compression=None,
@@ -227,36 +271,42 @@ class BQ():
 
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        logger.debug(f'Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...')
+        logger.debug(f"Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...")
 
         # Construct LOAD options
-        logger.debug('Constructing LOAD options ...')
+        logger.debug("Constructing LOAD options ...")
         load_options = [ # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
-            f' format=\'{format}\'',
-            f' uris=[\'{src_gcs_uri}\']',
+            f" format='{format}'",
+            f" uris=['{src_gcs_uri}']",
         ]
         if format == DataFileFormat.CSV:
-            load_options.append(f' skip_leading_rows=1')
-            load_options.append(f' field_delimiter=\'{field_delimiter}\'')
-            load_options.append(f' allow_quoted_newlines=true')
+            load_options.append(f" skip_leading_rows=1")
+            load_options.append(f" field_delimiter='{field_delimiter}'")
+            load_options.append(f" allow_quoted_newlines=true")
         if compression:
-            load_options.append(f' compression=\'{compression}\'')
-        load_options_str = ',\n'.join(load_options)
+            load_options.append(f" compression='{compression}'")
+        load_options_str = ",\n".join(load_options)
 
         # Construct beautiful query string
-        logger.debug('Constructing LOAD query ...')
-        schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
-        query_parts = [f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)']
+        logger.debug("Constructing LOAD query ...")
+        schema_str = ",\n".join(
+            [f' {column["name"]} {column["data_type"]}' for column in schema]
+        )
+        query_parts = [
+            f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)'
+        ]
         if partition_by:
-            query_parts.append(f'PARTITION BY {partition_by}')
+            query_parts.append(f"PARTITION BY {partition_by}")
         if clustering_fields:
-            clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
-            query_parts.append(f'CLUSTER BY {clustering_fields_str}')
-        query_parts.append(f'FROM FILES (\n{load_options_str}\n)')
-        query = '\n'.join(query_parts)
+            clustering_fields_str = ", ".join(
+                [f"`{field}`" for field in clustering_fields]
+            )
+            query_parts.append(f"CLUSTER BY {clustering_fields_str}")
+        query_parts.append(f"FROM FILES (\n{load_options_str}\n)")
+        query = "\n".join(query_parts)
 
         # Execute
-        logger.debug('Executing query ...')
+        logger.debug("Executing query ...")
         self.execute_query(query)
 
     def export_data(
@@ -268,65 +318,88 @@ class BQ():
         format: DataFileFormat = DataFileFormat.CSV,
         compression: DataFileCompression | None = None,
         header: bool = True,
-        delimiter: str = ',',
+        delimiter: str = ",",
     ):
-        logger.debug(f'Exporting query into {dst_gcs_uri} ...')
+        logger.debug(f"Exporting query into {dst_gcs_uri} ...")
 
         # GCS uri validation
-        if format == DataFileFormat.CSV and compression == DataFileCompression.GZIP and not dst_gcs_uri.endswith('.gz'):
-            raise ValueError('GCS path need to ends with .gz if using compression = GCSCompression.GZIP')
-        elif format == DataFileFormat.CSV and compression != DataFileCompression.GZIP and not dst_gcs_uri.endswith('.csv'):
-            raise ValueError('GCS path need to ends with .csv if using format = GCSExportFormat.CSV')
-        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith('.parquet'):
-            raise ValueError('GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET')
+        if (
+            format == DataFileFormat.CSV
+            and compression == DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".gz")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .gz if using compression = GCSCompression.GZIP"
+            )
+        elif (
+            format == DataFileFormat.CSV
+            and compression != DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".csv")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .csv if using format = GCSExportFormat.CSV"
+            )
+        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith(".parquet"):
+            raise ValueError(
+                "GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET"
+            )
 
         # Construct options
-        logger.debug('Constructing EXPORT options ...')
+        logger.debug("Constructing EXPORT options ...")
         options = [
-            f' uri=\'{dst_gcs_uri}\'',
-            f' format=\'{format}\'',
-            f' overwrite=TRUE',
+            f" uri='{dst_gcs_uri}'",
+            f" format='{format}'",
+            f" overwrite=TRUE",
         ]
         if format == DataFileFormat.CSV:
-            options.append(f' field_delimiter=\'{delimiter}\'',)
+            options.append(
+                f" field_delimiter='{delimiter}'",
+            )
             if header:
-                options.append(f' header={"true" if header else "false"}',)
+                options.append(
+                    f' header={"true" if header else "false"}',
+                )
         if compression:
-            options.append(f' compression=\'{compression}\'')
-        options_str = ',\n'.join(options)
+            options.append(f" compression='{compression}'")
+        options_str = ",\n".join(options)
 
         # Construct beautiful query string
-        logger.debug('Constructing EXPORT query ...')
+        logger.debug("Constructing EXPORT query ...")
         query = (
-            f'EXPORT DATA OPTIONS (\n'
-            f'{options_str}\n'
-            f')\n'
-            f'AS (\n'
-            f'{textwrap.dedent(query).strip()}\n'
-            f');'
+            f"EXPORT DATA OPTIONS (\n"
+            f"{options_str}\n"
+            f")\n"
+            f"AS (\n"
+            f"{textwrap.dedent(query).strip()}\n"
+            f");"
         )
 
         # Execute
-        logger.debug('Executing query ...')
+        logger.debug("Executing query ...")
         self.execute_query(query=query, parameters=parameters)
 
     def upload_csv(
         self,
-        src_filename: str,
+        src_filepath: str,
         dst_table_fqn: str,
         schema: list[dict] | None = None,
         gcs_bucket: str | None = None,
         partition_by: str = None,
-        cluster_cols: list[str] = None,
+        clustering_fields: list[str] = None,
         compression: DataFileCompression | None = None,
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
     ):
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        if compression == DataFileCompression.GZIP and not src_filename.endswith('.gz'):
-            raise ValueError('Please provide file path with .gz extension if using compression = GZIP')
-        elif not src_filename.endswith('.csv'):
-            raise ValueError('Please provide file path with .csv extension')
+        if compression == DataFileCompression.GZIP and not src_filepath.endswith(".gz"):
+            raise ValueError(
+                "Please provide file path with .gz extension if using compression = GZIP"
+            )
+        elif not src_filepath.endswith(".csv"):
+            raise ValueError("Please provide file path with .csv extension")
+
+        src_filename, src_fileextension = os.path.splitext(src_filepath)
+        src_filename = os.path.basename(src_filename) # Only get filename
 
         # # <<----- START: Upload to GCS
 
@@ -354,12 +427,21 @@ class BQ():
         # Upload to GCS
         # TODO: Re-implement the producer-consumer model to upload multiple files
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
-        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(os.path.basename(src_filename), "_").lower()}'
-        gcs.upload(src_filename, dst_blobpath)
+        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(src_filename, "_").lower()}{src_fileextension}'
+        gcs.upload(src_filepath, dst_blobpath)
 
         # Load to BQ
         try:
-            self.load_data(dst_blobpath, dst_table_fqn, schema=schema, partition_by=partition_by, cluster_cols=cluster_cols, format=DataFileFormat.CSV, compression=compression, load_strategy=load_strategy)
+            self.load_data(
+                f"gs://{gcs.bucket.name}/{dst_blobpath}",
+                dst_table_fqn,
+                schema=schema,
+                partition_by=partition_by,
+                clustering_fields=clustering_fields,
+                format=DataFileFormat.CSV,
+                compression=compression,
+                load_strategy=load_strategy,
+            )
         except:
             raise
         finally:
@@ -374,43 +456,61 @@ class BQ():
         query_parameters: dict = {},
         csv_row_limit: int | None = None,
     ) -> str | list[str]:
-        if not dst_filepath.endswith('.csv'):
-            raise ValueError('Destination filename must ends with .csv')
+        if not dst_filepath.endswith(".csv"):
+            raise ValueError("Destination filename must ends with .csv")
 
         # Init
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
 
         # Generic function to export-download-combine csv file from BQ->GCS->local
-        def _export_download_combine(query: str, dst_gcs_prefix: str, dst_filepath: str, query_parameters: dict = {}):
+        def _export_download_combine(
+            query: str,
+            dst_gcs_prefix: str,
+            dst_filepath: str,
+            query_parameters: dict = {},
+        ):
             # Init tmp directory
-            tmp_dirname = f'/tmp/my_bq_{my_datetime.get_current_datetime_str()}'
+            tmp_dirname = f"/tmp/my_bq_{my_datetime.get_current_datetime_str()}"
             if os.path.exists(tmp_dirname):
                 shutil.rmtree(tmp_dirname, ignore_errors=True)
             os.makedirs(tmp_dirname, exist_ok=True)
-            logger.debug(f'Temporary directory created: {tmp_dirname}')
+            logger.debug(f"Temporary directory created: {tmp_dirname}")
 
             try:
                 # Export to GCS
-                dst_gcs_uri = f'gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz'
-                self.export_data(query, dst_gcs_uri, parameters=query_parameters, format=DataFileFormat.CSV, compression=DataFileCompression.GZIP)
+                dst_gcs_uri = f"gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz"
+                self.export_data(
+                    query,
+                    dst_gcs_uri,
+                    parameters=query_parameters,
+                    format=DataFileFormat.CSV,
+                    compression=DataFileCompression.GZIP,
+                )
 
                 # Download from GCS
                 local_tmp_filepaths = []
                 for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
-                    local_tmp_filepath = os.path.join(tmp_dirname, tmp_blobs.name.split('/')[-1])
+                    local_tmp_filepath = os.path.join(
+                        tmp_dirname, tmp_blobs.name.split("/")[-1]
+                    )
                     gcs.download(tmp_blobs, local_tmp_filepath, move=True)
                     # logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
                     local_tmp_filepaths.append(local_tmp_filepath)
 
                 # Combine downloaded files
-                my_csv.combine(local_tmp_filepaths, dst_filepath, gzip=True, delete=True)
+                my_csv.combine(
+                    local_tmp_filepaths, dst_filepath, gzip=True, delete=True
+                )
             except:
                 raise
             finally:
                 shutil.rmtree(tmp_dirname, ignore_errors=True) # Remove local folder
-                [gcs.delete_blob(blob_filepath) for blob_filepath in gcs.list_blobs(dst_gcs_prefix)] # Remove temporary GCS files
+                [
+                    gcs.delete_blob(blob_filepath)
+                    for blob_filepath in gcs.list_blobs(dst_gcs_prefix)
+                ] # Remove temporary GCS files
 
-            logger.info(f'Export-download-combine done: {dst_filepath}')
+            logger.info(f"Export-download-combine done: {dst_filepath}")
 
         # Limited csv rows
         if csv_row_limit:
@@ -420,22 +520,31 @@ class BQ():
             # Create temporary table
             query_job = self.execute_query(query, temporary_table=True)
             tmp_table_fqn = str(query_job.destination)
-            logger.debug(f'Create temp table: {tmp_table_fqn}')
+            logger.debug(f"Create temp table: {tmp_table_fqn}")
 
             # Create another temporary table for row numbering
-            query_job = self.execute_query(f'SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`', temporary_table=True)
+            query_job = self.execute_query(
+                f"SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`",
+                temporary_table=True,
+            )
             tmp_table_fqn_rn = str(query_job.destination)
-            logger.debug(f'Create temp table (rn): {tmp_table_fqn_rn}')
+            logger.debug(f"Create temp table (rn): {tmp_table_fqn_rn}")
 
             # Process parts
-            count = list(self.execute_query(f'SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`').result())[0][0]
+            count = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(count / csv_row_limit)
-            logger.info(f'Total part: {count} / {csv_row_limit} = {parts}')
+            logger.info(f"Total part: {count} / {csv_row_limit} = {parts}")
             dst_filepaths = []
             for part in range(parts):
-                dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                dst_filepath_part = (
+                    f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                )
                 _export_download_combine(
-                    f'SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit} ORDER BY _rn',
+                    f"SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit} ORDER BY _rn",
                     dst_gcs_prefix=gcs.build_tmp_dirpath(),
                     dst_filepath=dst_filepath_part,
                 )
@@ -452,7 +561,12 @@ class BQ():
 
         # Unlimited csv rows
         else:
-            _export_download_combine(query, gcs.build_tmp_dirpath(), dst_filepath, query_parameters=query_parameters)
+            _export_download_combine(
+                query,
+                gcs.build_tmp_dirpath(),
+                dst_filepath,
+                query_parameters=query_parameters,
+            )
             return dst_filepath
 
         # query_job_result = query_job.result()
@@ -484,32 +598,43 @@ class BQ():
         # if f:
         # f.close()
 
-    def download_xlsx(self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000):
-        if not dst_filename.endswith('.xlsx'):
-            raise ValueError('Destination filename must ends with .xlsx!')
+    def download_xlsx(
+        self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000
+    ):
+        if not dst_filename.endswith(".xlsx"):
+            raise ValueError("Destination filename must ends with .xlsx!")
 
         # Create a temporary table acting as excel file splitting
-        table_name_tmp = f'{src_table_fqn}_'
-        self.execute_query(f'CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`')
+        table_name_tmp = f"{src_table_fqn}_"
+        self.execute_query(
+            f"CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`"
+        )
 
         try:
             # Calculate the number of excel file parts based on row limit
-            cnt = list(self.execute_query(f'SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`').result())[0][0]
+            cnt = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(cnt / xlsx_row_limit)
-            logger.debug(f'Total part: {cnt} / {xlsx_row_limit} = {parts}')
+            logger.debug(f"Total part: {cnt} / {xlsx_row_limit} = {parts}")
 
             # Download per parts
             for part in range(parts):
-                logger.debug(f'Downloading part {part + 1}...')
-                file_path_tmp = f'{dst_filename}_part{part + 1}'
-                file_path_tmp_csv = f'{file_path_tmp}.csv'
-                self.download_csv(f'SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}', f'{file_path_tmp}{os.sep}')
-                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f'{file_path_tmp}.xlsx')
+                logger.debug(f"Downloading part {part + 1}...")
+                file_path_tmp = f"{dst_filename}_part{part + 1}"
+                file_path_tmp_csv = f"{file_path_tmp}.csv"
+                self.download_csv(
+                    f"SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}",
+                    f"{file_path_tmp}{os.sep}",
+                )
+                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f"{file_path_tmp}.xlsx")
                 os.remove(file_path_tmp_csv)
         except Exception as e:
             raise e
         finally:
-            self.execute_query(f'DROP TABLE IF EXISTS `{table_name_tmp}`')
+            self.execute_query(f"DROP TABLE IF EXISTS `{table_name_tmp}`")
 
     # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
     # src_project_id, src_dataset_id, _ = src_view_id.split('.')
@@ -564,11 +689,11 @@ class BQ():
         if isinstance(name, list):
             return [BQ.get_table_fqn_parts(x) for x in name]
 
-        split = name.split('.')
+        split = name.split(".")
         if len(split) == 3:
             return split
         else:
-            raise ValueError(f'{name} is not a valid table FQN')
+            raise ValueError(f"{name} is not a valid table FQN")
 
     @staticmethod
     def raise_for_invalid_table_fqn(name: str | list[str]):
@@ -582,7 +707,7 @@ class BQ():
         """
 
         if not BQ.get_table_fqn_parts(name):
-            raise ValueError(f'{name} is not a valid table FQN')
+            raise ValueError(f"{name} is not a valid table FQN")
 
     def is_table_exists(self, table_fqn: str) -> bool:
         self.raise_for_invalid_table_fqn(table_fqn)
@@ -594,4 +719,4 @@ class BQ():
 
     def close(self):
         self.client.close()
-        logger.debug('BQ client close')
+        logger.debug("BQ client close")