rdxz2-utill 0.0.12__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of rdxz2-utill might be problematic.

Files changed (43)
  1. {rdxz2_utill-0.0.12/src/rdxz2_utill.egg-info → rdxz2_utill-0.1.1}/PKG-INFO +1 -1
  2. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/pyproject.toml +1 -1
  3. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1/src/rdxz2_utill.egg-info}/PKG-INFO +1 -1
  4. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/_conf.py +1 -0
  5. rdxz2_utill-0.1.1/src/utill/my_bq.py +597 -0
  6. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_csv.py +13 -9
  7. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_datetime.py +1 -1
  8. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_env.py +1 -0
  9. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_file.py +1 -1
  10. rdxz2_utill-0.1.1/src/utill/my_gcs.py +79 -0
  11. rdxz2_utill-0.1.1/src/utill/my_queue.py +141 -0
  12. rdxz2_utill-0.0.12/src/utill/my_bq.py +0 -362
  13. rdxz2_utill-0.0.12/src/utill/my_gcs.py +0 -123
  14. rdxz2_utill-0.0.12/src/utill/my_queue.py +0 -66
  15. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/LICENSE +0 -0
  16. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/README.md +0 -0
  17. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/setup.cfg +0 -0
  18. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/rdxz2_utill.egg-info/SOURCES.txt +0 -0
  19. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/rdxz2_utill.egg-info/dependency_links.txt +0 -0
  20. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/rdxz2_utill.egg-info/entry_points.txt +0 -0
  21. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/rdxz2_utill.egg-info/requires.txt +0 -0
  22. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/rdxz2_utill.egg-info/top_level.txt +0 -0
  23. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/__init__.py +0 -0
  24. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/__init__.py +0 -0
  25. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/_bq.py +0 -0
  26. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/_enc.py +0 -0
  27. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/_main.py +0 -0
  28. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/_pg.py +0 -0
  29. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/cmd/utill.py +0 -0
  30. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_compare.py +0 -0
  31. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_const.py +0 -0
  32. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_dict.py +0 -0
  33. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_encryption.py +0 -0
  34. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_input.py +0 -0
  35. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_json.py +0 -0
  36. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_mb.py +0 -0
  37. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_pg.py +0 -0
  38. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_string.py +0 -0
  39. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_style.py +0 -0
  40. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_tunnel.py +0 -0
  41. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/my_xlsx.py +0 -0
  42. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/templates/mb.json +0 -0
  43. {rdxz2_utill-0.0.12 → rdxz2_utill-0.1.1}/src/utill/templates/pg.json +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rdxz2-utill
-Version: 0.0.12
+Version: 0.1.1
 Summary: Your daily Python utility
 Author-email: Richard Dharmawan <richard.dharmawan@gmail.com>
 License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rdxz2-utill"
-version = "0.0.12"
+version = "0.1.1"
 authors = [
   { name="Richard Dharmawan", email="richard.dharmawan@gmail.com" },
 ]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rdxz2-utill
-Version: 0.0.12
+Version: 0.1.1
 Summary: Your daily Python utility
 Author-email: Richard Dharmawan <richard.dharmawan@gmail.com>
 License: MIT License
@@ -6,6 +6,7 @@ def _init(mode: str):
     match mode:
         case 'google-cloud':
             setattr(envs, 'GCP_PROJECT_ID', input('GCP_PROJECT_ID: '))
+            setattr(envs, 'GCP_REGION', input('GCP_REGION: '))
             setattr(envs, 'GCS_BUCKET', input('GCS_BUCKET: '))
             envs.write()
             logger.info('Google cloud configuration initialized')
@@ -0,0 +1,597 @@
+from . import my_csv
+from . import my_datetime
+from . import my_env
+from . import my_gcs
+from . import my_queue
+from . import my_string
+from . import my_xlsx
+from enum import StrEnum, Enum, auto
+from google.cloud import bigquery
+from google.cloud.exceptions import NotFound
+from humanize import precisedelta, naturalsize
+from loguru import logger
+import csv
+import datetime
+import math
+import os
+import shutil
+import textwrap
+import time
+
+PY_DATA_TYPE__BQ_DATA_TYPE = {
+    int: 'INTEGER',
+    str: 'STRING',
+    float: 'STRING',
+}
+
+
+class DataFileFormat(StrEnum):
+    CSV = 'CSV'
+    JSON = 'JSON'
+    AVRO = 'AVRO'
+    PARQUET = 'PARQUET'
+    ORC = 'ORC'
+
+
+class DataFileCompression(StrEnum):
+    GZIP = 'GZIP'
+    SNAPPY = 'SNAPPY'
+
+
+class LoadStrategy(Enum):
+    OVERWRITE = auto()
+    APPEND = auto()
+
+
+class Dtype:
+    INT64 = 'INT64'
+    INTEGER = 'INTEGER'
+    FLOAT64 = 'FLOAT64'
+
+    DECIMAL = 'DECIMAL'
+
+    STRING = 'STRING'
+    JSON = 'JSON'
+
+    DATE = 'DATE'
+    TIME = 'TIME'
+    DATETIME = 'DATETIME'
+    TIMESTAMP = 'TIMESTAMP'
+
+    BOOL = 'BOOL'
+
+    ARRAY_INT64 = 'ARRAY<INT64>'
+    ARRAY_INTEGER = 'ARRAY<INTEGER>'
+    ARRAY_FLOAT64 = 'ARRAY<FLOAT64>'
+    ARRAY_STRING = 'ARRAY<STRING>'
+    ARRAY_JSON = 'ARRAY<JSON>'
+    ARRAY_DATE = 'ARRAY<DATE>'
+    ARRAY_DATETIME = 'ARRAY<DATETIME>'
+    ARRAY_TIMESTAMP = 'ARRAY<TIMESTAMP>'
+    ARRAY_BOOL = 'ARRAY<BOOL>'
+
+
+class BQ():
+    def __init__(self, location: str | None = None, project_id: str = None):
+        if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
+            logger.warning('Using ADC for BigQuery authentication')
+
+        # if location is None and my_env.envs.GCP_REGION is None:
+        #     raise ValueError('GCP region must be set in environment variables.')
+
+        self.client = bigquery.Client(project=project_id or my_env.envs.GCP_PROJECT_ID, location=location or my_env.envs.GCP_REGION)
+        logger.debug(f'BQ client open, project: {self.client.project}')
+
+    # MARK: Query execution
+
+    def execute_query(
+        self,
+        query: str | list[str],
+        parameters: dict = {},
+        dry_run: bool = False,
+        temporary_table: bool = False,
+    ) -> bigquery.QueryJob:
+        # Reconstruct query, handle multiple queries in a single job
+        is_multi = isinstance(query, list)
+        queries = query if is_multi else [query]
+        queries = [textwrap.dedent(q).strip() for q in queries]
+        queries = [q if q.endswith(';') else q + ';' for q in queries]  # Append ';' character for each query
+        query = '\n'.join(queries)
+
+        # Evaluate parameter
+        query_parameters = []
+        for parameter, value in parameters.items():
+            is_array = isinstance(value, list)
+            value_type_py = type(value[0]) if is_array else type(value)
+            if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
+                raise ValueError(f'Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}')
+
+            value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
+
+            # Handle data type conversions
+            if value_type_py == datetime.date:
+                value = [v.strftime('%Y-%m-%d') for v in value] if is_array else value.strftime('%Y-%m-%d')
+
+            if is_array:
+                query_parameters.append(bigquery.ArrayQueryParameter(parameter, value_type_bq, value))
+            else:
+                query_parameters.append(bigquery.ScalarQueryParameter(parameter, value_type_bq, value))
+
+        logger.debug(f'🔎 Query:\n{query}')
+        query_job_config = bigquery.QueryJobConfig(dry_run=dry_run, query_parameters=query_parameters)
+        if temporary_table:
+            query_job_config.destination = None
+        t = time.time()
+        query_job = self.client.query(query, job_config=query_job_config)
+        logger.info(f'Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults') if not dry_run else None
+        query_job.result()  # Wait for the job to complete
+        elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
+
+        if not is_multi:
+            logger.info(f'[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}',)
+        else:
+            logger.info(f'[Job ID] {query_job.job_id} [Elapsed] {elapsed}')
+
+            jobs: list[bigquery.QueryJob] = list(self.client.list_jobs(parent_job=query_job.job_id))
+            [logger.info(f'[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)',) for job in jobs]
+
+        return query_job
+
+    # MARK: Table operations
+
+    def create_table(
+        self,
+        dst_table_fqn: str,
+        query: str,
+        query_parameters: dict = {},
+        *,
+        description: str | None = None,
+        schema: list[dict] | None = None,
+        partition_by: str | None = None,
+        clustering_fields: list[str] | None = None,
+        expiration_timestamp_utc: datetime.datetime | None = None,
+        require_partition_filter: bool = False,
+        replace: bool = False,
+    ):
+        self.raise_for_invalid_table_fqn(dst_table_fqn)
+
+        # Construct table options
+        logger.debug('Constructing table options ...')
+        table_options = []
+        if expiration_timestamp_utc:
+            table_options.append(f' expiration_timestamp=\'{expiration_timestamp_utc.isoformat()}\'')
+        if partition_by and require_partition_filter:
+            table_options.append(f' require_partition_filter=TRUE')
+        if description:
+            table_options.append(f' description=\'{description}\'')
+
+        # Check if table exists
+        logger.debug('Checking if destination table exists ...')
+        dst_table_project_id, dst_table_dataset_id, dst_table_id = self.get_table_fqn_parts(dst_table_fqn)
+        table_exist = self.is_table_exists(project_id=dst_table_project_id, dataset_id=dst_table_dataset_id, table_id=dst_table_id)
+
+        # Construct beautiful query string
+        if table_exist and not replace:
+            logger.debug('Table exists, constructing INSERT query ...')
+            query_parts = [f'INSERT INTO `{dst_table_fqn}`']
+            if schema:
+                schema_str = ',\n'.join([column['name'] for column in schema])
+                query_parts.append(f'(\n{schema_str}\n)')
+            if table_options:
+                table_options_str = ',\n'.join(table_options)
+                query_parts.append(f'OPTIONS (\n{table_options_str}\n)')
+        else:
+            logger.debug('Table not exist, constructing CREATE TABLE query ...')
+            query_parts = [
+                f'CREATE OR REPLACE TABLE `{dst_table_fqn}`',
+            ]
+            if schema:
+                schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
+                query_parts.append(f'(\n{schema_str}\n)')
+            if partition_by:
+                query_parts.append(f'PARTITION BY {partition_by}')
+            if clustering_fields:
+                clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
+                query_parts.append(f'CLUSTER BY {clustering_fields_str}')
+            if table_options:
+                table_options_str = ',\n'.join(table_options)
+                query_parts.append(f'OPTIONS (\n{table_options_str}\n)')
+            query_parts.append('AS')
+        query_parts.append(textwrap.dedent(query).strip())
+
+        # Execute
+        logger.debug('Executing query ...')
+        query = '\n'.join(query_parts)
+        self.execute_query(query, parameters=query_parameters)
+
+    def drop_table(self, bq_table_fqn: str):
+        logger.info(f'Dropping table: {bq_table_fqn} ...')
+        self.raise_for_invalid_table_fqn(bq_table_fqn)
+        self.client.delete_table(bq_table_fqn, not_found_ok=True)
+
+    # MARK: Table data
+
+    def load_data(
+        self,
+        src_gcs_uri: str,
+        dst_table_fqn: str,
+        *,
+        schema: list[dict] | None = None,
+        partition_by: str | None = None,
+        clustering_fields: list[str] | None = None,
+        field_delimiter: str = ',',
+        load_strategy: LoadStrategy = LoadStrategy.APPEND,
+        format: DataFileFormat = DataFileFormat.CSV,
+        compression=None,
+    ):
+
+        self.raise_for_invalid_table_fqn(dst_table_fqn)
+
+        logger.debug(f'Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...')
+
+        # Construct LOAD options
+        logger.debug('Constructing LOAD options ...')
+        load_options = [  # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
+            f' format=\'{format}\'',
+            f' uris=[\'{src_gcs_uri}\']',
+        ]
+        if format == DataFileFormat.CSV:
+            load_options.append(f' skip_leading_rows=1')
+            load_options.append(f' field_delimiter=\'{field_delimiter}\'')
+            load_options.append(f' allow_quoted_newlines=true')
+        if compression:
+            load_options.append(f' compression=\'{compression}\'')
+        load_options_str = ',\n'.join(load_options)
+
+        # Construct beautiful query string
+        logger.debug('Constructing LOAD query ...')
+        schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
+        query_parts = [f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)']
+        if partition_by:
+            query_parts.append(f'PARTITION BY {partition_by}')
+        if clustering_fields:
+            clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
+            query_parts.append(f'CLUSTER BY {clustering_fields_str}')
+        query_parts.append(f'FROM FILES (\n{load_options_str}\n)')
+        query = '\n'.join(query_parts)
+
+        # Execute
+        logger.debug('Executing query ...')
+        self.execute_query(query)
+
+    def export_data(
+        self,
+        query: str,
+        dst_gcs_uri: str,
+        *,
+        parameters: dict = {},
+        format: DataFileFormat = DataFileFormat.CSV,
+        compression: DataFileCompression | None = None,
+        header: bool = True,
+        delimiter: str = ',',
+    ):
+        logger.debug(f'Exporting query into {dst_gcs_uri} ...')
+
+        # GCS uri validation
+        if format == DataFileFormat.CSV and compression == DataFileCompression.GZIP and not dst_gcs_uri.endswith('.gz'):
+            raise ValueError('GCS path need to ends with .gz if using compression = GCSCompression.GZIP')
+        elif format == DataFileFormat.CSV and compression != DataFileCompression.GZIP and not dst_gcs_uri.endswith('.csv'):
+            raise ValueError('GCS path need to ends with .csv if using format = GCSExportFormat.CSV')
+        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith('.parquet'):
+            raise ValueError('GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET')
+
+        # Construct options
+        logger.debug('Constructing EXPORT options ...')
+        options = [
+            f' uri=\'{dst_gcs_uri}\'',
+            f' format=\'{format}\'',
+            f' overwrite=TRUE',
+        ]
+        if format == DataFileFormat.CSV:
+            options.append(f' field_delimiter=\'{delimiter}\'',)
+            if header:
+                options.append(f' header={"true" if header else "false"}',)
+        if compression:
+            options.append(f' compression=\'{compression}\'')
+        options_str = ',\n'.join(options)
+
+        # Construct beautiful query string
+        logger.debug('Constructing EXPORT query ...')
+        query = (
+            f'EXPORT DATA OPTIONS (\n'
+            f'{options_str}\n'
+            f')\n'
+            f'AS (\n'
+            f'{textwrap.dedent(query).strip()}\n'
+            f');'
+        )
+
+        # Execute
+        logger.debug('Executing query ...')
+        self.execute_query(query=query, parameters=parameters)
+
+    def upload_csv(
+        self,
+        src_filename: str,
+        dst_table_fqn: str,
+        schema: list[dict] | None = None,
+        gcs_bucket: str | None = None,
+        partition_by: str = None,
+        cluster_cols: list[str] = None,
+        compression: DataFileCompression | None = None,
+        load_strategy: LoadStrategy = LoadStrategy.APPEND,
+    ):
+        self.raise_for_invalid_table_fqn(dst_table_fqn)
+
+        if compression == DataFileCompression.GZIP and not src_filename.endswith('.gz'):
+            raise ValueError('Please provide file path with .gz extension if using compression = GZIP')
+        elif not src_filename.endswith('.csv'):
+            raise ValueError('Please provide file path with .csv extension')
+
+        # # <<----- START: Upload to GCS
+
+        # gcs = GCS(self.project_id)
+        # tmp_dir = f'tmp/upload__{current_datetime_str()}'
+
+        # # This will compress while splitting the compressed file to a certain bytes size because of GCS 4GB file limitation
+        # # A single file can produce more than one compressed file in GCS
+        # def producer(src_file: str):
+        #     for dst_file in compress(src_file,
+        #                              keep=True, max_size_bytes=ByteSize.GB * 3):
+        #         yield (dst_file, )
+
+        # def consumer(dst_file: str):
+        #     remote_file_name = f'{tmp_dir}/{replace_nonnumeric(os.path.basename(dst_file), "_").lower()}.csv.gz'
+        #     logger.debug(f'Uploading {dst_file} to {remote_file_name}...')
+        #     blob = gcs.upload(dst_file, remote_file_name, move=True)
+        #     return blob
+
+        # blobs: list[storage.Blob]
+        # _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
+
+        # # END: Upload to GCS ----->>
+
+        # Upload to GCS
+        # TODO: Re-implement the producer-consumer model to upload multiple files
+        gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
+        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(os.path.basename(src_filename), "_").lower()}'
+        gcs.upload(src_filename, dst_blobpath)
+
+        # Load to BQ
+        try:
+            self.load_data(dst_blobpath, dst_table_fqn, schema=schema, partition_by=partition_by, cluster_cols=cluster_cols, format=DataFileFormat.CSV, compression=compression, load_strategy=load_strategy)
+        except:
+            raise
+        finally:
+            gcs.delete_blob(dst_blobpath)
+
+    def download_csv(
+        self,
+        query: str,
+        dst_filepath: str,
+        *,
+        gcs_bucket: str | None = None,
+        query_parameters: dict = {},
+        csv_row_limit: int | None = None,
+    ) -> str | list[str]:
+        if not dst_filepath.endswith('.csv'):
+            raise ValueError('Destination filename must ends with .csv')
+
+        # Init
+        gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
+
+        # Generic function to export-download-combine csv file from BQ->GCS->local
+        def _export_download_combine(query: str, dst_gcs_prefix: str, dst_filepath: str, query_parameters: dict = {}):
+            # Init tmp directory
+            tmp_dirname = f'/tmp/my_bq_{my_datetime.get_current_datetime_str()}'
+            if os.path.exists(tmp_dirname):
+                shutil.rmtree(tmp_dirname, ignore_errors=True)
+            os.makedirs(tmp_dirname, exist_ok=True)
+            logger.debug(f'Temporary directory created: {tmp_dirname}')
+
+            try:
+                # Export to GCS
+                dst_gcs_uri = f'gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz'
+                self.export_data(query, dst_gcs_uri, parameters=query_parameters, format=DataFileFormat.CSV, compression=DataFileCompression.GZIP)
+
+                # Download from GCS
+                local_tmp_filepaths = []
+                for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
+                    local_tmp_filepath = os.path.join(tmp_dirname, tmp_blobs.name.split('/')[-1])
+                    gcs.download(tmp_blobs, local_tmp_filepath, move=True)
+                    logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
+                    local_tmp_filepaths.append(local_tmp_filepath)
+
+                # Combine downloaded files
+                my_csv.combine(local_tmp_filepaths, dst_filepath, gzip=True, delete=True)
+            except:
+                raise
+            finally:
+                shutil.rmtree(tmp_dirname, ignore_errors=True)  # Remove local folder
+                [gcs.delete_blob(blob_filepath) for blob_filepath in gcs.list_blobs(dst_gcs_prefix)]  # Remove temporary GCS files
+
+            logger.info(f'Export-download-combine done: {dst_filepath}')
+
+        # Limited csv rows
+        if csv_row_limit:
+            tmp_table_fqn: str | None = None
+            tmp_table_fqn_rn: str | None = None
+            try:
+                # Create temporary table
+                query_job = self.execute_query(query, temporary_table=True)
+                tmp_table_fqn = str(query_job.destination)
+                logger.debug(f'Create temp table: {tmp_table_fqn}')
+
+                # Create another temporary table for row numbering
+                query_job = self.execute_query(f'SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`', temporary_table=True)
+                tmp_table_fqn_rn = str(query_job.destination)
+                logger.debug(f'Create temp table (rn): {tmp_table_fqn_rn}')
+
+                # Process parts
+                count = list(self.execute_query(f'SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`').result())[0][0]
+                parts = math.ceil(count / csv_row_limit)
+                logger.info(f'Total part: {count} / {csv_row_limit} = {parts}')
+                dst_filepaths = []
+                for part in range(parts):
+                    dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                    _export_download_combine(
+                        f'SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit}',
+                        dst_gcs_prefix=gcs.build_tmp_dirpath(),
+                        dst_filepath=dst_filepath_part,
+                    )
+                    dst_filepaths.append(dst_filepath_part)
+                return dst_filepaths
+            except:
+                raise
+            finally:
+                # Drop temporary tables
+                if tmp_table_fqn_rn:
+                    self.drop_table(tmp_table_fqn_rn)
+                if tmp_table_fqn:
+                    self.drop_table(tmp_table_fqn)
+
+        # Unlimited csv rows
+        else:
+            _export_download_combine(query, gcs.build_tmp_dirpath(), dst_filepath, query_parameters=query_parameters)
+            return dst_filepath
+
+        # query_job_result = query_job.result()
+        # row_count = 0
+        # file_index = 1
+
+        # # Stream-download-split result
+        # def open_file(f):
+        #     if f:
+        #         f.close()
+        #     dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{file_index:06}.csv' if row_limit else dst_filepath
+        #     logger.info(f'Writing into file: {dst_filepath_part} ...')
+        #     f = open(dst_filepath_part, 'w', newline='', encoding='utf-8')
+        #     writer = csv.writer(f)
+        #     writer.writerow([field.name for field in query_job_result.schema])  # Write header
+
+        #     return f, writer
+
+        # f, writer = open_file(None)
+        # for row in query_job_result:
+        #     writer.writerow(row)
+
+        #     if row_limit:
+        #         row_count += 1
+        #         if row_count >= row_limit:
+        #             row_count = 0
+        #             file_index += 1
+        #             f, writer = open_file(f)
+        # if f:
+        #     f.close()
+
+    def download_xlsx(self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000):
+        if not dst_filename.endswith('.xlsx'):
+            raise ValueError('Destination filename must ends with .xlsx!')
+
+        # Create a temporary table acting as excel file splitting
+        table_name_tmp = f'{src_table_fqn}_'
+        self.execute_query(f'CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`')
+
+        try:
+            # Calculate the number of excel file parts based on row limit
+            cnt = list(self.execute_query(f'SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`').result())[0][0]
+            parts = math.ceil(cnt / xlsx_row_limit)
+            logger.debug(f'Total part: {cnt} / {xlsx_row_limit} = {parts}')
+
+            # Download per parts
+            for part in range(parts):
+                logger.debug(f'Downloading part {part + 1}...')
+                file_path_tmp = f'{dst_filename}_part{part + 1}'
+                file_path_tmp_csv = f'{file_path_tmp}.csv'
+                self.download_csv(f'SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}', f'{file_path_tmp}{os.sep}')
+                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f'{file_path_tmp}.xlsx')
+                os.remove(file_path_tmp_csv)
+        except Exception as e:
+            raise e
+        finally:
+            self.execute_query(f'DROP TABLE IF EXISTS `{table_name_tmp}`')
+
+    # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
+    #     src_project_id, src_dataset_id, _ = src_view_id.split('.')
+    #     dst_project_id, dst_dataset_id, _ = dst_view_id.split('.')
+
+    #     # Create or replace
+    #     src_view = self.client.get_table(src_view_id)
+    #     dst_view = bigquery.Table(dst_view_id)
+    #     dst_view.view_query = src_view.view_query.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
+    #     self.client.delete_table(dst_view, not_found_ok=True)
+    #     self.client.create_table(dst_view)
+    #     logger.debug(f'View {src_view_id} copied to {dst_view}')
+
+    #     if drop:
+    #         self.client.delete_table(src_view_id)
+    #         logger.debug(f'View {src_view_id} dropped')
+
+    # def copy_routine(self, src_routine_id: str, dst_routine_id: str, drop: bool = False):
+    #     src_project_id, src_dataset_id, _ = src_routine_id.split('.')
+    #     dst_project_id, dst_dataset_id, _ = dst_routine_id.split('.')
+
+    #     # Create or replace
+    #     src_routine = self.client.get_routine(src_routine_id)
+    #     dst_routine = bigquery.Routine(dst_routine_id)
+    #     dst_routine.body = src_routine.body.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
+    #     dst_routine.type_ = src_routine.type_
+    #     dst_routine.description = src_routine.description
+    #     dst_routine.language = src_routine.language
+    #     dst_routine.arguments = src_routine.arguments
+    #     dst_routine.return_type = src_routine.return_type
+    #     self.client.delete_routine(dst_routine, not_found_ok=True)
+    #     self.client.create_routine(dst_routine)
+    #     logger.debug(f'Routine {src_routine_id} copied to {dst_routine_id}')
+
+    #     if drop:
+    #         self.client.delete_routine(src_routine_id)
+    #         logger.debug(f'Routine {src_routine_id} dropped')
+
+    # MARK: Utilities
+
+    @staticmethod
+    def get_table_fqn_parts(name: str | list[str]) -> list[str] | list[list[str]]:
+        """Get fully qualified table name, following this format `<projectid>.<datasetid>.<tableid>`
+
+        Args:
+            name (str | list[str]): Input name (can be multiple)
+
+        Returns:
+            list[str] | list[list[str]]: The FQN parts. If the input is list then returns list of FQN parts instead.
+        """
+
+        if isinstance(name, list):
+            return [BQ.get_table_fqn_parts(x) for x in name]
+
+        split = name.split('.')
+        if len(split) == 3:
+            return split
+        else:
+            raise ValueError(f'{name} is not a valid table FQN')
+
+    @staticmethod
+    def raise_for_invalid_table_fqn(name: str | list[str]):
+        """Raise an error if the provied name is a fully qualified table name
+
+        Args:
+            name (str | list[str]): Input name (can be multiple)
+
+        Raises:
+            ValueError: If name is not a fully qualified table name
+        """
+
+        if not BQ.get_table_fqn_parts(name):
+            raise ValueError(f'{name} is not a valid table FQN')
+
+    def is_table_exists(self, table_fqn: str) -> bool:
+        self.raise_for_invalid_table_fqn(table_fqn)
+        try:
+            self.client.get_table(table_fqn)
+            return True
+        except NotFound:
+            return False
+
+    def close(self):
+        self.client.close()
+        logger.debug('BQ client close')
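
For orientation, the new module above revolves around the BQ wrapper: execute_query turns a plain dict into ScalarQueryParameter/ArrayQueryParameter objects, and download_csv exports through GCS, downloads the compressed parts, and recombines them locally. The following is a minimal usage sketch and is not part of the release; the project ID, table name, and file name are placeholders, the import path assumes the package's utill top-level package, and download_csv additionally assumes a GCS bucket has been configured (via the GCS_BUCKET prompt shown earlier in this diff) or is passed as gcs_bucket=.

from utill.my_bq import BQ

# Placeholder project and table names; adjust to your environment.
bq = BQ(project_id='my-project', location='US')

# List values become ARRAY query parameters, scalars become scalar parameters.
bq.execute_query(
    'SELECT * FROM `my-project.my_dataset.my_table` WHERE id IN UNNEST(@ids)',
    parameters={'ids': [1, 2, 3]},
)

# Export a query result to a local CSV via GCS, split into files of at most 1,000,000 rows each.
parts = bq.download_csv(
    'SELECT * FROM `my-project.my_dataset.my_table`',
    'result.csv',
    csv_row_limit=1_000_000,
)

bq.close()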
@@ -57,13 +57,13 @@ def compress(src_filename: str, keep: bool = False, max_size_bytes=ByteSize.GB,
         yield dst_filename
 
 
-def combine(src_filenames: list[str], dst_filename: str) -> None:
+def combine(src_filenames: list[str], dst_filename: str, gzip: bool = False, delete: bool = False) -> None:
     csv.field_size_limit(min(sys.maxsize, 2147483646))  # FIX: _csv.Error: field larger than field limit (131072)
 
     if not dst_filename.endswith('.csv'):
         raise ValueError('Output filename must ends with \'.csv\'!')
 
-    first_file = True
+    first_src_file = True
     with open(dst_filename, 'w') as fout:
         csvwriter = csv.writer(fout)
 
@@ -71,21 +71,25 @@ def combine(src_filenames: list[str], dst_filename: str) -> None:
             src_filename = os.path.expanduser(src_filename)
 
             # Decompress gzipped csv
-            if src_filename.endswith('.csv.gz'):
+            if gzip:
                 src_filename = decompress(src_filename)
 
-            # Copy
+            # Write content into file
             with open(src_filename, 'r') as fin:
                 csvreader = csv.reader(fin)
 
-                # Copy the header if this is the first file
-                if first_file:
+                # Write header only at first file
+                if first_src_file:
                     csvwriter.writerow(next(csvreader))
-                    first_file = False
-                # Else, skip the header
+                    first_src_file = False
                 else:
                     next(csvreader)
 
+                # Write body
                 [csvwriter.writerow(row) for row in csvreader]
 
-            logger.info(f'Combine {src_filename}')
+            logger.debug(f'Combine {src_filename}')
+
+            if delete:
+                os.remove(src_filename)
+                logger.debug(f'Delete {src_filename}')
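
The widened combine signature above is what the new my_bq.download_csv relies on: gzip=True decompresses each source file before merging, only the first file's header row is kept, and delete=True removes the source files afterwards. A small sketch with hypothetical file names, assuming the installed top-level package is utill:

from utill import my_csv

# Hypothetical part files, e.g. previously downloaded from GCS; decompress each part,
# merge them into one CSV with a single header row, then delete the parts.
my_csv.combine(
    ['part_000001.csv.gz', 'part_000002.csv.gz'],
    'combined.csv',
    gzip=True,
    delete=True,
)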