rdxz2-utill 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rdxz2-utill might be problematic.
- {rdxz2_utill-0.1.3.dist-info → rdxz2_utill-0.1.4.dist-info}/METADATA +2 -1
- rdxz2_utill-0.1.4.dist-info/RECORD +37 -0
- utill/cmd/_bq.py +16 -3
- utill/cmd/_conf.py +15 -15
- utill/cmd/_enc.py +8 -4
- utill/cmd/_mb.py +116 -36
- utill/cmd/_pg.py +4 -2
- utill/cmd/utill.py +193 -72
- utill/my_bq.py +271 -158
- utill/my_compare.py +1 -1
- utill/my_const.py +11 -8
- utill/my_csv.py +31 -15
- utill/my_datetime.py +21 -10
- utill/my_encryption.py +31 -13
- utill/my_env.py +22 -13
- utill/my_file.py +15 -13
- utill/my_gcs.py +40 -16
- utill/my_gdrive.py +195 -0
- utill/my_input.py +8 -4
- utill/my_json.py +6 -6
- utill/my_mb.py +351 -357
- utill/my_pg.py +76 -46
- utill/my_queue.py +37 -24
- utill/my_string.py +23 -5
- utill/my_style.py +18 -16
- utill/my_tunnel.py +29 -9
- utill/my_xlsx.py +11 -8
- rdxz2_utill-0.1.3.dist-info/RECORD +0 -36
- {rdxz2_utill-0.1.3.dist-info → rdxz2_utill-0.1.4.dist-info}/WHEEL +0 -0
- {rdxz2_utill-0.1.3.dist-info → rdxz2_utill-0.1.4.dist-info}/entry_points.txt +0 -0
- {rdxz2_utill-0.1.3.dist-info → rdxz2_utill-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {rdxz2_utill-0.1.3.dist-info → rdxz2_utill-0.1.4.dist-info}/top_level.txt +0 -0
utill/my_bq.py
CHANGED
@@ -19,23 +19,23 @@ import textwrap
 import time
 
 PY_DATA_TYPE__BQ_DATA_TYPE = {
-    int:
-    str:
-    float:
+    int: "INTEGER",
+    str: "STRING",
+    float: "STRING",
 }
 
 
 class DataFileFormat(StrEnum):
-    CSV =
-    JSON =
-    AVRO =
-    PARQUET =
-    ORC =
+    CSV = "CSV"
+    JSON = "JSON"
+    AVRO = "AVRO"
+    PARQUET = "PARQUET"
+    ORC = "ORC"
 
 
 class DataFileCompression(StrEnum):
-    GZIP =
-    SNAPPY =
+    GZIP = "GZIP"
+    SNAPPY = "SNAPPY"
 
 
 class LoadStrategy(Enum):
@@ -44,43 +44,46 @@ class LoadStrategy(Enum):
 
 
 class Dtype:
-    INT64 =
-    INTEGER =
-    FLOAT64 =
+    INT64 = "INT64"
+    INTEGER = "INTEGER"
+    FLOAT64 = "FLOAT64"
 
-    DECIMAL =
+    DECIMAL = "DECIMAL"
 
-    STRING =
-    JSON =
+    STRING = "STRING"
+    JSON = "JSON"
 
-    DATE =
-    TIME =
-    DATETIME =
-    TIMESTAMP =
+    DATE = "DATE"
+    TIME = "TIME"
+    DATETIME = "DATETIME"
+    TIMESTAMP = "TIMESTAMP"
 
-    BOOL =
+    BOOL = "BOOL"
 
-    ARRAY_INT64 =
-    ARRAY_INTEGER =
-    ARRAY_FLOAT64 =
-    ARRAY_STRING =
-    ARRAY_JSON =
-    ARRAY_DATE =
-    ARRAY_DATETIME =
-    ARRAY_TIMESTAMP =
-    ARRAY_BOOL =
+    ARRAY_INT64 = "ARRAY<INT64>"
+    ARRAY_INTEGER = "ARRAY<INTEGER>"
+    ARRAY_FLOAT64 = "ARRAY<FLOAT64>"
+    ARRAY_STRING = "ARRAY<STRING>"
+    ARRAY_JSON = "ARRAY<JSON>"
+    ARRAY_DATE = "ARRAY<DATE>"
+    ARRAY_DATETIME = "ARRAY<DATETIME>"
+    ARRAY_TIMESTAMP = "ARRAY<TIMESTAMP>"
+    ARRAY_BOOL = "ARRAY<BOOL>"
 
 
-class BQ
+class BQ:
     def __init__(self, location: str | None = None, project_id: str = None):
         if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
-            logger.warning(
+            logger.warning("Using ADC for BigQuery authentication")
 
         # if location is None and my_env.envs.GCP_REGION is None:
         #     raise ValueError('GCP region must be set in environment variables.')
 
-        self.client = bigquery.Client(
-
+        self.client = bigquery.Client(
+            project=project_id or my_env.envs.GCP_PROJECT_ID,
+            location=location or my_env.envs.GCP_REGION,
+        )
+        logger.debug(f"BQ client open, project: {self.client.project}")
 
         # MARK: Query execution
 
@@ -95,8 +98,10 @@ class BQ():
         is_multi = isinstance(query, list)
         queries = query if is_multi else [query]
         queries = [textwrap.dedent(q).strip() for q in queries]
-        queries = [
-
+        queries = [
+            q if q.endswith(";") else q + ";" for q in queries
+        ] # Append ';' character for each query
+        query = "\n".join(queries)
 
         # Evaluate parameter
         query_parameters = []
@@ -104,36 +109,63 @@ class BQ():
             is_array = isinstance(value, list)
             value_type_py = type(value[0]) if is_array else type(value)
             if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
-                raise ValueError(
+                raise ValueError(
+                    f"Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}"
+                )
 
             value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
 
             # Handle data type conversions
             if value_type_py == datetime.date:
-                value =
+                value = (
+                    [v.strftime("%Y-%m-%d") for v in value]
+                    if is_array
+                    else value.strftime("%Y-%m-%d")
+                )
 
             if is_array:
-                query_parameters.append(
+                query_parameters.append(
+                    bigquery.ArrayQueryParameter(parameter, value_type_bq, value)
+                )
             else:
-                query_parameters.append(
+                query_parameters.append(
+                    bigquery.ScalarQueryParameter(parameter, value_type_bq, value)
+                )
 
-        logger.debug(f
-        query_job_config = bigquery.QueryJobConfig(
+        logger.debug(f"🔎 Query:\n{query}")
+        query_job_config = bigquery.QueryJobConfig(
+            dry_run=dry_run, query_parameters=query_parameters
+        )
         if temporary_table:
             query_job_config.destination = None
         t = time.time()
         query_job = self.client.query(query, job_config=query_job_config)
-
+        (
+            logger.info(
+                f"Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults"
+            )
+            if not dry_run
+            else None
+        )
         query_job.result() # Wait for the job to complete
         elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
 
         if not is_multi:
-            logger.info(
+            logger.info(
+                f"[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}",
+            )
         else:
-            logger.info(f
+            logger.info(f"[Job ID] {query_job.job_id} [Elapsed] {elapsed}")
 
-            jobs: list[bigquery.QueryJob] = list(
-
+            jobs: list[bigquery.QueryJob] = list(
+                self.client.list_jobs(parent_job=query_job.job_id)
+            )
+            [
+                logger.info(
+                    f"[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s),"
+                )
+                for job in jobs
+            ]
 
         return query_job
 
@@ -156,56 +188,68 @@ class BQ():
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
         # Construct table options
-        logger.debug(
+        logger.debug("Constructing table options ...")
         table_options = []
         if expiration_timestamp_utc:
-            table_options.append(
+            table_options.append(
+                f" expiration_timestamp='{expiration_timestamp_utc.isoformat()}'"
+            )
         if partition_by and require_partition_filter:
-            table_options.append(f
+            table_options.append(f" require_partition_filter=TRUE")
         if description:
-            table_options.append(f
+            table_options.append(f" description='{description}'")
 
         # Check if table exists
-        logger.debug(
-        dst_table_project_id, dst_table_dataset_id, dst_table_id =
-
+        logger.debug("Checking if destination table exists ...")
+        dst_table_project_id, dst_table_dataset_id, dst_table_id = (
+            self.get_table_fqn_parts(dst_table_fqn)
+        )
+        table_exist = self.is_table_exists(
+            project_id=dst_table_project_id,
+            dataset_id=dst_table_dataset_id,
+            table_id=dst_table_id,
+        )
 
         # Construct beautiful query string
         if table_exist and not replace:
-            logger.debug(
-            query_parts = [f
+            logger.debug("Table exists, constructing INSERT query ...")
+            query_parts = [f"INSERT INTO `{dst_table_fqn}`"]
             if schema:
-                schema_str =
-                query_parts.append(f
+                schema_str = ",\n".join([column["name"] for column in schema])
+                query_parts.append(f"(\n{schema_str}\n)")
             if table_options:
-                table_options_str =
-                query_parts.append(f
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
         else:
-            logger.debug(
+            logger.debug("Table not exist, constructing CREATE TABLE query ...")
            query_parts = [
-                f
+                f"CREATE OR REPLACE TABLE `{dst_table_fqn}`",
            ]
            if schema:
-                schema_str =
-
+                schema_str = ",\n".join(
+                    [f' {column["name"]} {column["data_type"]}' for column in schema]
+                )
+                query_parts.append(f"(\n{schema_str}\n)")
            if partition_by:
-                query_parts.append(f
+                query_parts.append(f"PARTITION BY {partition_by}")
            if clustering_fields:
-                clustering_fields_str =
-
+                clustering_fields_str = ", ".join(
+                    [f"`{field}`" for field in clustering_fields]
+                )
+                query_parts.append(f"CLUSTER BY {clustering_fields_str}")
            if table_options:
-                table_options_str =
-                query_parts.append(f
-            query_parts.append(
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
+            query_parts.append("AS")
         query_parts.append(textwrap.dedent(query).strip())
 
         # Execute
-        logger.debug(
-        query =
+        logger.debug("Executing query ...")
+        query = "\n".join(query_parts)
         self.execute_query(query, parameters=query_parameters)
 
     def drop_table(self, bq_table_fqn: str):
-        logger.info(f
+        logger.info(f"Dropping table: {bq_table_fqn} ...")
         self.raise_for_invalid_table_fqn(bq_table_fqn)
         self.client.delete_table(bq_table_fqn, not_found_ok=True)
 
@@ -219,7 +263,7 @@ class BQ():
         schema: list[dict] | None = None,
         partition_by: str | None = None,
         clustering_fields: list[str] | None = None,
-        field_delimiter: str =
+        field_delimiter: str = ",",
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
         format: DataFileFormat = DataFileFormat.CSV,
         compression=None,
@@ -227,36 +271,42 @@ class BQ():
 
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        logger.debug(f
+        logger.debug(f"Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...")
 
         # Construct LOAD options
-        logger.debug(
+        logger.debug("Constructing LOAD options ...")
         load_options = [ # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
-            f
-            f
+            f" format='{format}'",
+            f" uris=['{src_gcs_uri}']",
         ]
         if format == DataFileFormat.CSV:
-            load_options.append(f
-            load_options.append(f
-            load_options.append(f
+            load_options.append(f" skip_leading_rows=1")
+            load_options.append(f" field_delimiter='{field_delimiter}'")
+            load_options.append(f" allow_quoted_newlines=true")
         if compression:
-            load_options.append(f
-        load_options_str =
+            load_options.append(f" compression='{compression}'")
+        load_options_str = ",\n".join(load_options)
 
         # Construct beautiful query string
-        logger.debug(
-        schema_str =
-
+        logger.debug("Constructing LOAD query ...")
+        schema_str = ",\n".join(
+            [f' {column["name"]} {column["data_type"]}' for column in schema]
+        )
+        query_parts = [
+            f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)'
+        ]
         if partition_by:
-            query_parts.append(f
+            query_parts.append(f"PARTITION BY {partition_by}")
         if clustering_fields:
-            clustering_fields_str =
-
-
-
+            clustering_fields_str = ", ".join(
+                [f"`{field}`" for field in clustering_fields]
+            )
+            query_parts.append(f"CLUSTER BY {clustering_fields_str}")
+        query_parts.append(f"FROM FILES (\n{load_options_str}\n)")
+        query = "\n".join(query_parts)
 
         # Execute
-        logger.debug(
+        logger.debug("Executing query ...")
         self.execute_query(query)
 
     def export_data(
@@ -268,46 +318,64 @@ class BQ():
         format: DataFileFormat = DataFileFormat.CSV,
         compression: DataFileCompression | None = None,
         header: bool = True,
-        delimiter: str =
+        delimiter: str = ",",
     ):
-        logger.debug(f
+        logger.debug(f"Exporting query into {dst_gcs_uri} ...")
 
         # GCS uri validation
-        if
-
-
-
-
-            raise ValueError(
+        if (
+            format == DataFileFormat.CSV
+            and compression == DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".gz")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .gz if using compression = GCSCompression.GZIP"
+            )
+        elif (
+            format == DataFileFormat.CSV
+            and compression != DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".csv")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .csv if using format = GCSExportFormat.CSV"
+            )
+        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith(".parquet"):
+            raise ValueError(
+                "GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET"
+            )
 
         # Construct options
-        logger.debug(
+        logger.debug("Constructing EXPORT options ...")
         options = [
-            f
-            f
-            f
+            f" uri='{dst_gcs_uri}'",
+            f" format='{format}'",
+            f" overwrite=TRUE",
         ]
         if format == DataFileFormat.CSV:
-            options.append(
+            options.append(
+                f" field_delimiter='{delimiter}'",
+            )
         if header:
-            options.append(
+            options.append(
+                f' header={"true" if header else "false"}',
+            )
         if compression:
-            options.append(f
-        options_str =
+            options.append(f" compression='{compression}'")
+        options_str = ",\n".join(options)
 
         # Construct beautiful query string
-        logger.debug(
+        logger.debug("Constructing EXPORT query ...")
         query = (
-            f
-            f
-            f
-            f
-            f
-            f
+            f"EXPORT DATA OPTIONS (\n"
+            f"{options_str}\n"
+            f")\n"
+            f"AS (\n"
+            f"{textwrap.dedent(query).strip()}\n"
+            f");"
         )
 
         # Execute
-        logger.debug(
+        logger.debug("Executing query ...")
         self.execute_query(query=query, parameters=parameters)
 
     def upload_csv(
@@ -323,11 +391,13 @@ class BQ():
     ):
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        if compression == DataFileCompression.GZIP and not src_filepath.endswith(
-            raise ValueError(
-
-
-
+        if compression == DataFileCompression.GZIP and not src_filepath.endswith(".gz"):
+            raise ValueError(
+                "Please provide file path with .gz extension if using compression = GZIP"
+            )
+        elif not src_filepath.endswith(".csv"):
+            raise ValueError("Please provide file path with .csv extension")
+
         src_filename, src_fileextension = os.path.splitext(src_filepath)
         src_filename = os.path.basename(src_filename) # Only get filename
 
@@ -363,7 +433,7 @@ class BQ():
         # Load to BQ
         try:
             self.load_data(
-                f
+                f"gs://{gcs.bucket.name}/{dst_blobpath}",
                 dst_table_fqn,
                 schema=schema,
                 partition_by=partition_by,
@@ -386,43 +456,61 @@ class BQ():
         query_parameters: dict = {},
         csv_row_limit: int | None = None,
     ) -> str | list[str]:
-        if not dst_filepath.endswith(
-            raise ValueError(
+        if not dst_filepath.endswith(".csv"):
+            raise ValueError("Destination filename must ends with .csv")
 
         # Init
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
 
         # Generic function to export-download-combine csv file from BQ->GCS->local
-        def _export_download_combine(
+        def _export_download_combine(
+            query: str,
+            dst_gcs_prefix: str,
+            dst_filepath: str,
+            query_parameters: dict = {},
+        ):
             # Init tmp directory
-            tmp_dirname = f
+            tmp_dirname = f"/tmp/my_bq_{my_datetime.get_current_datetime_str()}"
             if os.path.exists(tmp_dirname):
                 shutil.rmtree(tmp_dirname, ignore_errors=True)
             os.makedirs(tmp_dirname, exist_ok=True)
-            logger.debug(f
+            logger.debug(f"Temporary directory created: {tmp_dirname}")
 
             try:
                 # Export to GCS
-                dst_gcs_uri = f
-                self.export_data(
+                dst_gcs_uri = f"gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz"
+                self.export_data(
+                    query,
+                    dst_gcs_uri,
+                    parameters=query_parameters,
+                    format=DataFileFormat.CSV,
+                    compression=DataFileCompression.GZIP,
+                )
 
                 # Download from GCS
                 local_tmp_filepaths = []
                 for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
-                    local_tmp_filepath = os.path.join(
+                    local_tmp_filepath = os.path.join(
+                        tmp_dirname, tmp_blobs.name.split("/")[-1]
+                    )
                     gcs.download(tmp_blobs, local_tmp_filepath, move=True)
                     # logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
                     local_tmp_filepaths.append(local_tmp_filepath)
 
                 # Combine downloaded files
-                my_csv.combine(
+                my_csv.combine(
+                    local_tmp_filepaths, dst_filepath, gzip=True, delete=True
+                )
             except:
                 raise
             finally:
                 shutil.rmtree(tmp_dirname, ignore_errors=True) # Remove local folder
-                [
+                [
+                    gcs.delete_blob(blob_filepath)
+                    for blob_filepath in gcs.list_blobs(dst_gcs_prefix)
+                ] # Remove temporary GCS files
 
-            logger.info(f
+            logger.info(f"Export-download-combine done: {dst_filepath}")
 
         # Limited csv rows
         if csv_row_limit:
@@ -432,22 +520,31 @@ class BQ():
             # Create temporary table
             query_job = self.execute_query(query, temporary_table=True)
             tmp_table_fqn = str(query_job.destination)
-            logger.debug(f
+            logger.debug(f"Create temp table: {tmp_table_fqn}")
 
             # Create another temporary table for row numbering
-            query_job = self.execute_query(
+            query_job = self.execute_query(
+                f"SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`",
+                temporary_table=True,
+            )
             tmp_table_fqn_rn = str(query_job.destination)
-            logger.debug(f
+            logger.debug(f"Create temp table (rn): {tmp_table_fqn_rn}")
 
             # Process parts
-            count = list(
+            count = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(count / csv_row_limit)
-            logger.info(f
+            logger.info(f"Total part: {count} / {csv_row_limit} = {parts}")
             dst_filepaths = []
             for part in range(parts):
-                dst_filepath_part =
+                dst_filepath_part = (
+                    f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                )
                 _export_download_combine(
-                    f
+                    f"SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit} ORDER BY _rn",
                     dst_gcs_prefix=gcs.build_tmp_dirpath(),
                     dst_filepath=dst_filepath_part,
                 )
@@ -464,7 +561,12 @@ class BQ():
 
         # Unlimited csv rows
         else:
-            _export_download_combine(
+            _export_download_combine(
+                query,
+                gcs.build_tmp_dirpath(),
+                dst_filepath,
+                query_parameters=query_parameters,
+            )
         return dst_filepath
 
         # query_job_result = query_job.result()
@@ -496,32 +598,43 @@ class BQ():
         # if f:
         #     f.close()
 
-    def download_xlsx(
-
-
+    def download_xlsx(
+        self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000
+    ):
+        if not dst_filename.endswith(".xlsx"):
+            raise ValueError("Destination filename must ends with .xlsx!")
 
         # Create a temporary table acting as excel file splitting
-        table_name_tmp = f
-        self.execute_query(
+        table_name_tmp = f"{src_table_fqn}_"
+        self.execute_query(
+            f"CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`"
+        )
 
         try:
             # Calculate the number of excel file parts based on row limit
-            cnt = list(
+            cnt = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(cnt / xlsx_row_limit)
-            logger.debug(f
+            logger.debug(f"Total part: {cnt} / {xlsx_row_limit} = {parts}")
 
             # Download per parts
             for part in range(parts):
-                logger.debug(f
-                file_path_tmp = f
-                file_path_tmp_csv = f
-                self.download_csv(
-
+                logger.debug(f"Downloading part {part + 1}...")
+                file_path_tmp = f"{dst_filename}_part{part + 1}"
+                file_path_tmp_csv = f"{file_path_tmp}.csv"
+                self.download_csv(
+                    f"SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}",
+                    f"{file_path_tmp}{os.sep}",
+                )
+                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f"{file_path_tmp}.xlsx")
                 os.remove(file_path_tmp_csv)
         except Exception as e:
             raise e
         finally:
-            self.execute_query(f
+            self.execute_query(f"DROP TABLE IF EXISTS `{table_name_tmp}`")
 
     # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
     #     src_project_id, src_dataset_id, _ = src_view_id.split('.')
@@ -576,11 +689,11 @@ class BQ():
         if isinstance(name, list):
             return [BQ.get_table_fqn_parts(x) for x in name]
 
-        split = name.split(
+        split = name.split(".")
         if len(split) == 3:
             return split
         else:
-            raise ValueError(f
+            raise ValueError(f"{name} is not a valid table FQN")
 
     @staticmethod
     def raise_for_invalid_table_fqn(name: str | list[str]):
@@ -594,7 +707,7 @@ class BQ():
         """
 
         if not BQ.get_table_fqn_parts(name):
-            raise ValueError(f
+            raise ValueError(f"{name} is not a valid table FQN")
 
     def is_table_exists(self, table_fqn: str) -> bool:
         self.raise_for_invalid_table_fqn(table_fqn)
@@ -606,4 +719,4 @@ class BQ():
 
     def close(self):
         self.client.close()
-        logger.debug(
+        logger.debug("BQ client close")