rdxz2-utill 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of rdxz2-utill might be problematic.
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/METADATA +2 -1
- rdxz2_utill-0.1.4.dist-info/RECORD +37 -0
- utill/cmd/_bq.py +16 -3
- utill/cmd/_conf.py +15 -15
- utill/cmd/_enc.py +8 -4
- utill/cmd/_mb.py +140 -0
- utill/cmd/_pg.py +4 -2
- utill/cmd/utill.py +203 -61
- utill/my_bq.py +287 -162
- utill/my_compare.py +1 -1
- utill/my_const.py +11 -8
- utill/my_csv.py +31 -15
- utill/my_datetime.py +21 -10
- utill/my_encryption.py +31 -13
- utill/my_env.py +22 -13
- utill/my_file.py +15 -13
- utill/my_gcs.py +40 -16
- utill/my_gdrive.py +195 -0
- utill/my_input.py +8 -4
- utill/my_json.py +6 -6
- utill/my_mb.py +351 -357
- utill/my_pg.py +76 -46
- utill/my_queue.py +37 -24
- utill/my_string.py +23 -5
- utill/my_style.py +18 -16
- utill/my_tunnel.py +29 -9
- utill/my_xlsx.py +11 -8
- rdxz2_utill-0.1.2.dist-info/RECORD +0 -35
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/WHEEL +0 -0
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/entry_points.txt +0 -0
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {rdxz2_utill-0.1.2.dist-info → rdxz2_utill-0.1.4.dist-info}/top_level.txt +0 -0
utill/my_bq.py
CHANGED
@@ -19,23 +19,23 @@ import textwrap
 import time
 
 PY_DATA_TYPE__BQ_DATA_TYPE = {
-    int:
-    str:
-    float:
+    int: "INTEGER",
+    str: "STRING",
+    float: "STRING",
 }
 
 
 class DataFileFormat(StrEnum):
-    CSV =
-    JSON =
-    AVRO =
-    PARQUET =
-    ORC =
+    CSV = "CSV"
+    JSON = "JSON"
+    AVRO = "AVRO"
+    PARQUET = "PARQUET"
+    ORC = "ORC"
 
 
 class DataFileCompression(StrEnum):
-    GZIP =
-    SNAPPY =
+    GZIP = "GZIP"
+    SNAPPY = "SNAPPY"
 
 
 class LoadStrategy(Enum):
@@ -44,43 +44,46 @@ class LoadStrategy(Enum):
 
 
 class Dtype:
-    INT64 =
-    INTEGER =
-    FLOAT64 =
+    INT64 = "INT64"
+    INTEGER = "INTEGER"
+    FLOAT64 = "FLOAT64"
 
-    DECIMAL =
+    DECIMAL = "DECIMAL"
 
-    STRING =
-    JSON =
+    STRING = "STRING"
+    JSON = "JSON"
 
-    DATE =
-    TIME =
-    DATETIME =
-    TIMESTAMP =
+    DATE = "DATE"
+    TIME = "TIME"
+    DATETIME = "DATETIME"
+    TIMESTAMP = "TIMESTAMP"
 
-    BOOL =
+    BOOL = "BOOL"
 
-    ARRAY_INT64 =
-    ARRAY_INTEGER =
-    ARRAY_FLOAT64 =
-    ARRAY_STRING =
-    ARRAY_JSON =
-    ARRAY_DATE =
-    ARRAY_DATETIME =
-    ARRAY_TIMESTAMP =
-    ARRAY_BOOL =
+    ARRAY_INT64 = "ARRAY<INT64>"
+    ARRAY_INTEGER = "ARRAY<INTEGER>"
+    ARRAY_FLOAT64 = "ARRAY<FLOAT64>"
+    ARRAY_STRING = "ARRAY<STRING>"
+    ARRAY_JSON = "ARRAY<JSON>"
+    ARRAY_DATE = "ARRAY<DATE>"
+    ARRAY_DATETIME = "ARRAY<DATETIME>"
+    ARRAY_TIMESTAMP = "ARRAY<TIMESTAMP>"
+    ARRAY_BOOL = "ARRAY<BOOL>"
 
 
-class BQ
+class BQ:
     def __init__(self, location: str | None = None, project_id: str = None):
         if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
-            logger.warning(
+            logger.warning("Using ADC for BigQuery authentication")
 
         # if location is None and my_env.envs.GCP_REGION is None:
         #     raise ValueError('GCP region must be set in environment variables.')
 
-        self.client = bigquery.Client(
-
+        self.client = bigquery.Client(
+            project=project_id or my_env.envs.GCP_PROJECT_ID,
+            location=location or my_env.envs.GCP_REGION,
+        )
+        logger.debug(f"BQ client open, project: {self.client.project}")
 
     # MARK: Query execution
 
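The constructor rewritten above now builds the BigQuery client from explicit arguments with environment fallbacks. A minimal usage sketch under that assumption; the project and location values here are hypothetical, and the environment fallbacks follow my_env.envs as shown in the hunk:

from utill.my_bq import BQ

# Explicit arguments take precedence; otherwise the client falls back to
# my_env.envs.GCP_PROJECT_ID / my_env.envs.GCP_REGION, or ultimately to ADC defaults.
bq = BQ(project_id="example-project", location="US")  # hypothetical values
bq.close()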
@@ -95,8 +98,10 @@ class BQ():
         is_multi = isinstance(query, list)
         queries = query if is_multi else [query]
         queries = [textwrap.dedent(q).strip() for q in queries]
-        queries = [
-
+        queries = [
+            q if q.endswith(";") else q + ";" for q in queries
+        ] # Append ';' character for each query
+        query = "\n".join(queries)
 
         # Evaluate parameter
         query_parameters = []
@@ -104,36 +109,63 @@ class BQ():
             is_array = isinstance(value, list)
             value_type_py = type(value[0]) if is_array else type(value)
             if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
-                raise ValueError(
+                raise ValueError(
+                    f"Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}"
+                )
 
             value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
 
             # Handle data type conversions
             if value_type_py == datetime.date:
-                value =
+                value = (
+                    [v.strftime("%Y-%m-%d") for v in value]
+                    if is_array
+                    else value.strftime("%Y-%m-%d")
+                )
 
             if is_array:
-                query_parameters.append(
+                query_parameters.append(
+                    bigquery.ArrayQueryParameter(parameter, value_type_bq, value)
+                )
             else:
-                query_parameters.append(
+                query_parameters.append(
+                    bigquery.ScalarQueryParameter(parameter, value_type_bq, value)
+                )
 
-        logger.debug(f
-        query_job_config = bigquery.QueryJobConfig(
+        logger.debug(f"🔎 Query:\n{query}")
+        query_job_config = bigquery.QueryJobConfig(
+            dry_run=dry_run, query_parameters=query_parameters
+        )
         if temporary_table:
             query_job_config.destination = None
         t = time.time()
         query_job = self.client.query(query, job_config=query_job_config)
-
+        (
+            logger.info(
+                f"Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults"
+            )
+            if not dry_run
+            else None
+        )
         query_job.result() # Wait for the job to complete
         elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
 
         if not is_multi:
-            logger.info(
+            logger.info(
+                f"[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}",
+            )
         else:
-            logger.info(f
-
-            jobs: list[bigquery.QueryJob] = list(
-
+            logger.info(f"[Job ID] {query_job.job_id} [Elapsed] {elapsed}")
+
+            jobs: list[bigquery.QueryJob] = list(
+                self.client.list_jobs(parent_job=query_job.job_id)
+            )
+            [
+                logger.info(
+                    f"[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)",
+                )
+                for job in jobs
+            ]
 
         return query_job
 
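The parameter handling above turns a plain dict into BigQuery query parameters: the Python type of each value (or of the first element, for lists) is looked up in PY_DATA_TYPE__BQ_DATA_TYPE, and the result becomes an ArrayQueryParameter or ScalarQueryParameter. A standalone sketch of the same idea, written against the public google-cloud-bigquery API rather than copied from the package:

from google.cloud import bigquery

# Mirrors the mapping introduced at the top of my_bq.py in this release.
PY_TO_BQ = {int: "INTEGER", str: "STRING", float: "STRING"}

def to_query_parameters(parameters: dict) -> list:
    out = []
    for name, value in parameters.items():
        is_array = isinstance(value, list)
        py_type = type(value[0]) if is_array else type(value)
        if py_type not in PY_TO_BQ:
            raise ValueError(f"Unsupported type for parameter {name}: {py_type}")
        bq_type = PY_TO_BQ[py_type]
        if is_array:
            out.append(bigquery.ArrayQueryParameter(name, bq_type, value))
        else:
            out.append(bigquery.ScalarQueryParameter(name, bq_type, value))
    return out

# e.g. to_query_parameters({"ids": [1, 2, 3], "status": "active"})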
@@ -156,56 +188,68 @@ class BQ():
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
         # Construct table options
-        logger.debug(
+        logger.debug("Constructing table options ...")
         table_options = []
         if expiration_timestamp_utc:
-            table_options.append(
+            table_options.append(
+                f" expiration_timestamp='{expiration_timestamp_utc.isoformat()}'"
+            )
         if partition_by and require_partition_filter:
-            table_options.append(f
+            table_options.append(f" require_partition_filter=TRUE")
         if description:
-            table_options.append(f
+            table_options.append(f" description='{description}'")
 
         # Check if table exists
-        logger.debug(
-        dst_table_project_id, dst_table_dataset_id, dst_table_id =
-
+        logger.debug("Checking if destination table exists ...")
+        dst_table_project_id, dst_table_dataset_id, dst_table_id = (
+            self.get_table_fqn_parts(dst_table_fqn)
+        )
+        table_exist = self.is_table_exists(
+            project_id=dst_table_project_id,
+            dataset_id=dst_table_dataset_id,
+            table_id=dst_table_id,
+        )
 
         # Construct beautiful query string
         if table_exist and not replace:
-            logger.debug(
-            query_parts = [f
+            logger.debug("Table exists, constructing INSERT query ...")
+            query_parts = [f"INSERT INTO `{dst_table_fqn}`"]
             if schema:
-                schema_str =
-                query_parts.append(f
+                schema_str = ",\n".join([column["name"] for column in schema])
+                query_parts.append(f"(\n{schema_str}\n)")
             if table_options:
-                table_options_str =
-                query_parts.append(f
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
         else:
-            logger.debug(
+            logger.debug("Table not exist, constructing CREATE TABLE query ...")
             query_parts = [
-                f
+                f"CREATE OR REPLACE TABLE `{dst_table_fqn}`",
             ]
             if schema:
-                schema_str =
-
+                schema_str = ",\n".join(
+                    [f' {column["name"]} {column["data_type"]}' for column in schema]
+                )
+                query_parts.append(f"(\n{schema_str}\n)")
             if partition_by:
-                query_parts.append(f
+                query_parts.append(f"PARTITION BY {partition_by}")
             if clustering_fields:
-                clustering_fields_str =
-
+                clustering_fields_str = ", ".join(
+                    [f"`{field}`" for field in clustering_fields]
+                )
+                query_parts.append(f"CLUSTER BY {clustering_fields_str}")
             if table_options:
-                table_options_str =
-                query_parts.append(f
-            query_parts.append(
+                table_options_str = ",\n".join(table_options)
+                query_parts.append(f"OPTIONS (\n{table_options_str}\n)")
+            query_parts.append("AS")
             query_parts.append(textwrap.dedent(query).strip())
 
         # Execute
-        logger.debug(
-        query =
+        logger.debug("Executing query ...")
+        query = "\n".join(query_parts)
         self.execute_query(query, parameters=query_parameters)
 
     def drop_table(self, bq_table_fqn: str):
-        logger.info(f
+        logger.info(f"Dropping table: {bq_table_fqn} ...")
         self.raise_for_invalid_table_fqn(bq_table_fqn)
         self.client.delete_table(bq_table_fqn, not_found_ok=True)
 
@@ -219,7 +263,7 @@ class BQ():
         schema: list[dict] | None = None,
         partition_by: str | None = None,
         clustering_fields: list[str] | None = None,
-        field_delimiter: str =
+        field_delimiter: str = ",",
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
         format: DataFileFormat = DataFileFormat.CSV,
         compression=None,
@@ -227,36 +271,42 @@ class BQ():
 
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        logger.debug(f
+        logger.debug(f"Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...")
 
         # Construct LOAD options
-        logger.debug(
+        logger.debug("Constructing LOAD options ...")
         load_options = [ # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
-            f
-            f
+            f" format='{format}'",
+            f" uris=['{src_gcs_uri}']",
         ]
         if format == DataFileFormat.CSV:
-            load_options.append(f
-            load_options.append(f
-            load_options.append(f
+            load_options.append(f" skip_leading_rows=1")
+            load_options.append(f" field_delimiter='{field_delimiter}'")
+            load_options.append(f" allow_quoted_newlines=true")
         if compression:
-            load_options.append(f
-        load_options_str =
+            load_options.append(f" compression='{compression}'")
+        load_options_str = ",\n".join(load_options)
 
         # Construct beautiful query string
-        logger.debug(
-        schema_str =
-
+        logger.debug("Constructing LOAD query ...")
+        schema_str = ",\n".join(
+            [f' {column["name"]} {column["data_type"]}' for column in schema]
+        )
+        query_parts = [
+            f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)'
+        ]
         if partition_by:
-            query_parts.append(f
+            query_parts.append(f"PARTITION BY {partition_by}")
         if clustering_fields:
-            clustering_fields_str =
-
-
-
+            clustering_fields_str = ", ".join(
+                [f"`{field}`" for field in clustering_fields]
+            )
+            query_parts.append(f"CLUSTER BY {clustering_fields_str}")
+        query_parts.append(f"FROM FILES (\n{load_options_str}\n)")
+        query = "\n".join(query_parts)
 
         # Execute
-        logger.debug(
+        logger.debug("Executing query ...")
         self.execute_query(query)
 
     def export_data(
@@ -268,65 +318,88 @@ class BQ():
         format: DataFileFormat = DataFileFormat.CSV,
         compression: DataFileCompression | None = None,
         header: bool = True,
-        delimiter: str =
+        delimiter: str = ",",
     ):
-        logger.debug(f
+        logger.debug(f"Exporting query into {dst_gcs_uri} ...")
 
         # GCS uri validation
-        if
-
-
-
-
-            raise ValueError(
+        if (
+            format == DataFileFormat.CSV
+            and compression == DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".gz")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .gz if using compression = GCSCompression.GZIP"
+            )
+        elif (
+            format == DataFileFormat.CSV
+            and compression != DataFileCompression.GZIP
+            and not dst_gcs_uri.endswith(".csv")
+        ):
+            raise ValueError(
+                "GCS path need to ends with .csv if using format = GCSExportFormat.CSV"
+            )
+        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith(".parquet"):
+            raise ValueError(
+                "GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET"
+            )
 
         # Construct options
-        logger.debug(
+        logger.debug("Constructing EXPORT options ...")
         options = [
-            f
-            f
-            f
+            f" uri='{dst_gcs_uri}'",
+            f" format='{format}'",
+            f" overwrite=TRUE",
         ]
         if format == DataFileFormat.CSV:
-            options.append(
+            options.append(
+                f" field_delimiter='{delimiter}'",
+            )
         if header:
-            options.append(
+            options.append(
+                f' header={"true" if header else "false"}',
+            )
         if compression:
-            options.append(f
-        options_str =
+            options.append(f" compression='{compression}'")
+        options_str = ",\n".join(options)
 
         # Construct beautiful query string
-        logger.debug(
+        logger.debug("Constructing EXPORT query ...")
         query = (
-            f
-            f
-            f
-            f
-            f
-            f
+            f"EXPORT DATA OPTIONS (\n"
+            f"{options_str}\n"
+            f")\n"
+            f"AS (\n"
+            f"{textwrap.dedent(query).strip()}\n"
+            f");"
         )
 
         # Execute
-        logger.debug(
+        logger.debug("Executing query ...")
         self.execute_query(query=query, parameters=parameters)
 
     def upload_csv(
         self,
-
+        src_filepath: str,
         dst_table_fqn: str,
         schema: list[dict] | None = None,
         gcs_bucket: str | None = None,
         partition_by: str = None,
-
+        clustering_fields: list[str] = None,
         compression: DataFileCompression | None = None,
         load_strategy: LoadStrategy = LoadStrategy.APPEND,
     ):
         self.raise_for_invalid_table_fqn(dst_table_fqn)
 
-        if compression == DataFileCompression.GZIP and not
-            raise ValueError(
-
-
+        if compression == DataFileCompression.GZIP and not src_filepath.endswith(".gz"):
+            raise ValueError(
+                "Please provide file path with .gz extension if using compression = GZIP"
+            )
+        elif not src_filepath.endswith(".csv"):
+            raise ValueError("Please provide file path with .csv extension")
+
+        src_filename, src_fileextension = os.path.splitext(src_filepath)
+        src_filename = os.path.basename(src_filename) # Only get filename
 
 
         # # <<----- START: Upload to GCS
@@ -354,12 +427,21 @@ class BQ():
         # Upload to GCS
         # TODO: Re-implement the producer-consumer model to upload multiple files
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
-        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(
-        gcs.upload(
+        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(src_filename, "_").lower()}{src_fileextension}'
+        gcs.upload(src_filepath, dst_blobpath)
 
         # Load to BQ
         try:
-            self.load_data(
+            self.load_data(
+                f"gs://{gcs.bucket.name}/{dst_blobpath}",
+                dst_table_fqn,
+                schema=schema,
+                partition_by=partition_by,
+                clustering_fields=clustering_fields,
+                format=DataFileFormat.CSV,
+                compression=compression,
+                load_strategy=load_strategy,
+            )
         except:
             raise
         finally:
@@ -374,43 +456,61 @@ class BQ():
         query_parameters: dict = {},
         csv_row_limit: int | None = None,
     ) -> str | list[str]:
-        if not dst_filepath.endswith(
-            raise ValueError(
+        if not dst_filepath.endswith(".csv"):
+            raise ValueError("Destination filename must ends with .csv")
 
         # Init
         gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
 
         # Generic function to export-download-combine csv file from BQ->GCS->local
-        def _export_download_combine(
+        def _export_download_combine(
+            query: str,
+            dst_gcs_prefix: str,
+            dst_filepath: str,
+            query_parameters: dict = {},
+        ):
             # Init tmp directory
-            tmp_dirname = f
+            tmp_dirname = f"/tmp/my_bq_{my_datetime.get_current_datetime_str()}"
             if os.path.exists(tmp_dirname):
                 shutil.rmtree(tmp_dirname, ignore_errors=True)
             os.makedirs(tmp_dirname, exist_ok=True)
-            logger.debug(f
+            logger.debug(f"Temporary directory created: {tmp_dirname}")
 
             try:
                 # Export to GCS
-                dst_gcs_uri = f
-                self.export_data(
+                dst_gcs_uri = f"gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz"
+                self.export_data(
+                    query,
+                    dst_gcs_uri,
+                    parameters=query_parameters,
+                    format=DataFileFormat.CSV,
+                    compression=DataFileCompression.GZIP,
+                )
 
                 # Download from GCS
                 local_tmp_filepaths = []
                 for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
-                    local_tmp_filepath = os.path.join(
+                    local_tmp_filepath = os.path.join(
+                        tmp_dirname, tmp_blobs.name.split("/")[-1]
+                    )
                     gcs.download(tmp_blobs, local_tmp_filepath, move=True)
                     # logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
                     local_tmp_filepaths.append(local_tmp_filepath)
 
                 # Combine downloaded files
-                my_csv.combine(
+                my_csv.combine(
+                    local_tmp_filepaths, dst_filepath, gzip=True, delete=True
+                )
             except:
                 raise
             finally:
                 shutil.rmtree(tmp_dirname, ignore_errors=True) # Remove local folder
-                [
+                [
+                    gcs.delete_blob(blob_filepath)
+                    for blob_filepath in gcs.list_blobs(dst_gcs_prefix)
+                ] # Remove temporary GCS files
 
-            logger.info(f
+            logger.info(f"Export-download-combine done: {dst_filepath}")
 
         # Limited csv rows
         if csv_row_limit:
@@ -420,22 +520,31 @@ class BQ():
             # Create temporary table
             query_job = self.execute_query(query, temporary_table=True)
             tmp_table_fqn = str(query_job.destination)
-            logger.debug(f
+            logger.debug(f"Create temp table: {tmp_table_fqn}")
 
             # Create another temporary table for row numbering
-            query_job = self.execute_query(
+            query_job = self.execute_query(
+                f"SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`",
+                temporary_table=True,
+            )
             tmp_table_fqn_rn = str(query_job.destination)
-            logger.debug(f
+            logger.debug(f"Create temp table (rn): {tmp_table_fqn_rn}")
 
             # Process parts
-            count = list(
+            count = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(count / csv_row_limit)
-            logger.info(f
+            logger.info(f"Total part: {count} / {csv_row_limit} = {parts}")
             dst_filepaths = []
             for part in range(parts):
-                dst_filepath_part =
+                dst_filepath_part = (
+                    f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                )
                 _export_download_combine(
-                    f
+                    f"SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit} ORDER BY _rn",
                     dst_gcs_prefix=gcs.build_tmp_dirpath(),
                     dst_filepath=dst_filepath_part,
                 )
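The csv_row_limit branch above numbers every row with ROW_NUMBER() and then exports ceil(count / csv_row_limit) parts, each selected with a BETWEEN window over the _rn column. A small illustration of that window arithmetic, with made-up row counts:

import math

count, csv_row_limit = 2_500_000, 1_000_000  # hypothetical sizes
parts = math.ceil(count / csv_row_limit)     # 3 parts
windows = [
    ((part * csv_row_limit) + 1, (part + 1) * csv_row_limit)
    for part in range(parts)
]
# windows == [(1, 1000000), (1000001, 2000000), (2000001, 3000000)]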
@@ -452,7 +561,12 @@ class BQ():
 
         # Unlimited csv rows
         else:
-            _export_download_combine(
+            _export_download_combine(
+                query,
+                gcs.build_tmp_dirpath(),
+                dst_filepath,
+                query_parameters=query_parameters,
+            )
             return dst_filepath
 
         # query_job_result = query_job.result()
@@ -484,32 +598,43 @@ class BQ():
         # if f:
         #     f.close()
 
-    def download_xlsx(
-
-
+    def download_xlsx(
+        self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000
+    ):
+        if not dst_filename.endswith(".xlsx"):
+            raise ValueError("Destination filename must ends with .xlsx!")
 
         # Create a temporary table acting as excel file splitting
-        table_name_tmp = f
-        self.execute_query(
+        table_name_tmp = f"{src_table_fqn}_"
+        self.execute_query(
+            f"CREATE TABLE `{table_name_tmp}` AS SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{src_table_fqn}`"
+        )
 
         try:
             # Calculate the number of excel file parts based on row limit
-            cnt = list(
+            cnt = list(
+                self.execute_query(
+                    f"SELECT COUNT(1) AS cnt FROM `{src_table_fqn}`"
+                ).result()
+            )[0][0]
             parts = math.ceil(cnt / xlsx_row_limit)
-            logger.debug(f
+            logger.debug(f"Total part: {cnt} / {xlsx_row_limit} = {parts}")
 
             # Download per parts
             for part in range(parts):
-                logger.debug(f
-                file_path_tmp = f
-                file_path_tmp_csv = f
-                self.download_csv(
-
+                logger.debug(f"Downloading part {part + 1}...")
+                file_path_tmp = f"{dst_filename}_part{part + 1}"
+                file_path_tmp_csv = f"{file_path_tmp}.csv"
+                self.download_csv(
+                    f"SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}",
+                    f"{file_path_tmp}{os.sep}",
+                )
+                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f"{file_path_tmp}.xlsx")
                 os.remove(file_path_tmp_csv)
         except Exception as e:
             raise e
         finally:
-            self.execute_query(f
+            self.execute_query(f"DROP TABLE IF EXISTS `{table_name_tmp}`")
 
     # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
     #     src_project_id, src_dataset_id, _ = src_view_id.split('.')
@@ -564,11 +689,11 @@ class BQ():
         if isinstance(name, list):
             return [BQ.get_table_fqn_parts(x) for x in name]
 
-        split = name.split(
+        split = name.split(".")
         if len(split) == 3:
             return split
         else:
-            raise ValueError(f
+            raise ValueError(f"{name} is not a valid table FQN")
 
     @staticmethod
     def raise_for_invalid_table_fqn(name: str | list[str]):
@@ -582,7 +707,7 @@ class BQ():
         """
 
         if not BQ.get_table_fqn_parts(name):
-            raise ValueError(f
+            raise ValueError(f"{name} is not a valid table FQN")
 
     def is_table_exists(self, table_fqn: str) -> bool:
         self.raise_for_invalid_table_fqn(table_fqn)
@@ -594,4 +719,4 @@ class BQ():
 
     def close(self):
         self.client.close()
-        logger.debug(
+        logger.debug("BQ client close")