rdxz2-utill 0.0.11__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rdxz2-utill might be problematic.
- {rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/METADATA +1 -1
- {rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/RECORD +14 -14
- utill/cmd/_conf.py +1 -0
- utill/my_bq.py +496 -262
- utill/my_csv.py +13 -9
- utill/my_datetime.py +1 -1
- utill/my_env.py +1 -0
- utill/my_file.py +1 -1
- utill/my_gcs.py +56 -100
- utill/my_queue.py +77 -2
- {rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/WHEEL +0 -0
- {rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/entry_points.txt +0 -0
- {rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/licenses/LICENSE +0 -0
- {rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/top_level.txt +0 -0
{rdxz2_utill-0.0.11.dist-info → rdxz2_utill-0.1.0.dist-info}/RECORD
CHANGED
@@ -1,35 +1,35 @@
-rdxz2_utill-0.0.
+rdxz2_utill-0.1.0.dist-info/licenses/LICENSE,sha256=PF9CUvzP8XFYopEAzrMzSCovF7RdBdscPqJCDC6KjPc,1073
 utill/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utill/my_bq.py,sha256=
+utill/my_bq.py,sha256=1BKjYa05yfOBInm245373lfi4sbOZKybYXdYmON2npM,24985
 utill/my_compare.py,sha256=619QbVk3GihWxen95yVnivKHkah8GgPTLGiSkgHxykw,886
 utill/my_const.py,sha256=88dOqn6NPQ5-hfRqdkew5POoAIyO91XXOGvN76oNsdo,251
-utill/my_csv.py,sha256=
-utill/my_datetime.py,sha256=
+utill/my_csv.py,sha256=AT5sAbAlYqnAmNgQMTSqEueRXM4D42yNPb5C3Hedy6c,2921
+utill/my_datetime.py,sha256=8AUO9l_MSzdthRsgASuyGZpvjgpoQb9Lowt4goHjyqw,2129
 utill/my_dict.py,sha256=jPaPfdn4WYpm0uIBPiYFinpHhx1jXpFVDJ9npmvxGZQ,391
 utill/my_encryption.py,sha256=SCF7PPur39cW4RHidsRhw-9BZP-ymUH-6LZ9nAHJDsY,2105
-utill/my_env.py,sha256=
-utill/my_file.py,sha256
-utill/my_gcs.py,sha256=
+utill/my_env.py,sha256=E7XW3fuhxbDlFqmLPHrziJJZVRogzGh6rfQdyNV49f8,2130
+utill/my_file.py,sha256=-b6_dGDDBdS228kgwTYpmIa3vxW1c1TtWrLdzdlHjKY,1873
+utill/my_gcs.py,sha256=VBJ8lsJ-fHr_BzMoSuT5JUrvxidGyMc2VNtE6Um1T_M,3060
 utill/my_input.py,sha256=OyKLoutXpwISReltuL_Gw2oojv16tYWJqQpqabBOQx4,350
 utill/my_json.py,sha256=WgW6mavGhfs4h1N5XbhsDnRk2dbh_ttJWdJUj4iWDN4,1473
 utill/my_mb.py,sha256=IyrySs92TqtjBUvPMeUN3P2kRK8EttTFRPZsv5Cr-xw,15090
 utill/my_pg.py,sha256=J9USygc-oug4w7AkBacA9x043jHZrDfQPGFEqXavZAY,6799
-utill/my_queue.py,sha256=
+utill/my_queue.py,sha256=Qf3Nm_ZRoVD34oAoym8A9hoH9Y27kUHeWLhylAUj5Q4,4749
 utill/my_string.py,sha256=pINYFR1ligTyVZYzV8P_FolCsZQwYE1jaFNTuQ3XS_8,833
 utill/my_style.py,sha256=Wy6j4WL9RgGeX6cS9hhlOrufc9UC4UPTQ5UJa0ZJ3Yo,900
 utill/my_tunnel.py,sha256=uCpGtiG8AcRYiaN7rLnTulsZI4iFTRM8EHxwyAAfDrE,1292
 utill/my_xlsx.py,sha256=YcQRp6DC9girSS1fkUPVKsHspyQpr8JC8GymSSnRV-w,729
 utill/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utill/cmd/_bq.py,sha256=MQGLIv_WBUBl2tf18bfYrAszx0Koa5kdTW1c8A5HDDg,520
-utill/cmd/_conf.py,sha256=
+utill/cmd/_conf.py,sha256=DKl3IVVLp6-5P43tvh6bYmHR5rOL9XnKVuQ7kQJtzrc,1863
 utill/cmd/_enc.py,sha256=DBy3Iwa5DTtww7lgHPRLEilrYPrWDG1vRv5PO-YzNO8,997
 utill/cmd/_main.py,sha256=UJ_XTIGDO9XPIypgHhS81SJQ_8qy8JOyw98Or0Nb2x8,273
 utill/cmd/_pg.py,sha256=RVxEiSifyIwMDYDM69vt6WSLdVDr1cMzY6r4T2PzNRA,492
 utill/cmd/utill.py,sha256=TlHfiwOUcK1m58PrRCjX9sARiPYZUsoTk-KOTCOz1vM,3558
 utill/templates/mb.json,sha256=M46ZHSaSh4rbD_KGUViGr2B2ZV8_PC-O5Evqi35JK5g,59
 utill/templates/pg.json,sha256=LkJt0VV3zcyt7Tpn6gulsoVQgUc-9uImXOStvzu8cdU,271
-rdxz2_utill-0.0.
-rdxz2_utill-0.0.
-rdxz2_utill-0.0.
-rdxz2_utill-0.0.
-rdxz2_utill-0.0.
+rdxz2_utill-0.1.0.dist-info/METADATA,sha256=iB4cHw4zIQnP_2DHvIkBpwKxR5s32RVXr9xyJiY-GX4,4401
+rdxz2_utill-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rdxz2_utill-0.1.0.dist-info/entry_points.txt,sha256=9n5NWz5Wi9jDvYhB_81_4icgT5xABZ-QivHD8ibcafg,47
+rdxz2_utill-0.1.0.dist-info/top_level.txt,sha256=tuAYZoCsr02JYbpZj7I6fl1IIo53v3GG0uoj-_fINVk,6
+rdxz2_utill-0.1.0.dist-info/RECORD,,
utill/cmd/_conf.py
CHANGED
@@ -6,6 +6,7 @@ def _init(mode: str):
     match mode:
         case 'google-cloud':
            setattr(envs, 'GCP_PROJECT_ID', input('GCP_PROJECT_ID: '))
+           setattr(envs, 'GCP_REGION', input('GCP_REGION: '))
            setattr(envs, 'GCS_BUCKET', input('GCS_BUCKET: '))
            envs.write()
            logger.info('Google cloud configuration initialized')
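For orientation, here is a minimal sketch of what the updated `google-cloud` branch now collects. Calling `_init` directly like this is illustrative only (the CLI wiring that reaches it is not part of this diff), and the answers shown are placeholder values.

```python
# Illustration only: the 'google-cloud' mode now also prompts for GCP_REGION.
from utill.cmd._conf import _init

_init('google-cloud')
# GCP_PROJECT_ID: my-project     <- placeholder answers
# GCP_REGION: us-central1
# GCS_BUCKET: my-bucket
# envs.write() then persists the three values for later use by my_bq / my_gcs
```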
utill/my_bq.py
CHANGED
@@ -1,30 +1,46 @@
-from .
-from .
-from .
-from .
-from .
-from .
-from .
-from
-from
-from google.cloud import
+from . import my_csv
+from . import my_datetime
+from . import my_env
+from . import my_gcs
+from . import my_queue
+from . import my_string
+from . import my_xlsx
+from enum import StrEnum, Enum, auto
+from google.cloud import bigquery
+from google.cloud.exceptions import NotFound
+from humanize import precisedelta, naturalsize
 from loguru import logger
-from textwrap import dedent
 import csv
-import
+import datetime
 import math
 import os
+import shutil
+import textwrap
+import time
 
+PY_DATA_TYPE__BQ_DATA_TYPE = {
     int: 'INTEGER',
     str: 'STRING',
     float: 'STRING',
 }
 
 
+class DataFileFormat(StrEnum):
+    CSV = 'CSV'
+    JSON = 'JSON'
+    AVRO = 'AVRO'
+    PARQUET = 'PARQUET'
+    ORC = 'ORC'
+
+
+class DataFileCompression(StrEnum):
+    GZIP = 'GZIP'
+    SNAPPY = 'SNAPPY'
+
+
 class LoadStrategy(Enum):
-    OVERWRITE =
-    APPEND =
+    OVERWRITE = auto()
+    APPEND = auto()
 
 
 class Dtype:
@@ -56,231 +72,415 @@ class Dtype:
 
 
 class BQ():
-    def __init__(self,
+    def __init__(self, location: str | None = None, project_id: str = None):
+        if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
+            logger.warning('Using ADC for BigQuery authentication')
 
-        if
-        else:
-            self.client = bigquery.Client(project=self.project)
+        # if location is None and my_env.envs.GCP_REGION is None:
+        #     raise ValueError('GCP region must be set in environment variables.')
 
+        self.client = bigquery.Client(project=project_id or my_env.envs.GCP_PROJECT_ID, location=location or my_env.envs.GCP_REGION)
         logger.debug(f'BQ client open, project: {self.client.project}')
 
+    # MARK: Query execution
+
+    def execute_query(
+        self,
+        query: str | list[str],
+        parameters: dict = {},
+        dry_run: bool = False,
+        temporary_table: bool = False,
+    ) -> bigquery.QueryJob:
+        # Reconstruct query, handle multiple queries in a single job
+        is_multi = isinstance(query, list)
+        queries = query if is_multi else [query]
+        queries = [textwrap.dedent(q).strip() for q in queries]
+        queries = [q if q.endswith(';') else q + ';' for q in queries]  # Append ';' character for each query
+        query = '\n'.join(queries)
+
+        # Evaluate parameter
+        query_parameters = []
+        for parameter, value in parameters.items():
+            is_array = isinstance(value, list)
+            value_type_py = type(value[0]) if is_array else type(value)
+            if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
+                raise ValueError(f'Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}')
 
-        self.close_client()
+            value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
 
-            query = '\n'.join([x if str(x).strip().endswith(';') else x + ';' for x in query if x])
-        else:
-            query = query.strip()
+            # Handle data type conversions
+            if value_type_py == datetime.date:
+                value = [v.strftime('%Y-%m-%d') for v in value] if is_array else value.strftime('%Y-%m-%d')
 
-        for parameter, value in parameters.items():
-            if type(value) == list:
-                query_parameters.append(bigquery.ArrayQueryParameter(parameter, MAP__PYTHON_DTYPE__BQ_DTYPE[type(value[0])], value))
+            if is_array:
+                query_parameters.append(bigquery.ArrayQueryParameter(parameter, value_type_bq, value))
             else:
-                query_parameters.append(bigquery.ScalarQueryParameter(parameter,
+                query_parameters.append(bigquery.ScalarQueryParameter(parameter, value_type_bq, value))
 
         logger.debug(f'🔎 Query:\n{query}')
         query_job_config = bigquery.QueryJobConfig(dry_run=dry_run, query_parameters=query_parameters)
+        if temporary_table:
+            query_job_config.destination = None
+        t = time.time()
         query_job = self.client.query(query, job_config=query_job_config)
+        logger.info(f'Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults') if not dry_run else None
+        query_job.result()  # Wait for the job to complete
+        elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
 
-        if not
-            logger.
+        if not is_multi:
+            logger.info(f'[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}',)
         else:
-            logger.
+            logger.info(f'[Job ID] {query_job.job_id} [Elapsed] {elapsed}')
 
-        jobs: list[bigquery.QueryJob] = self.client.list_jobs(parent_job=query_job.job_id)
-        [logger.
+            jobs: list[bigquery.QueryJob] = list(self.client.list_jobs(parent_job=query_job.job_id))
+            [logger.info(f'[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)',) for job in jobs]
 
         return query_job
 
+    # MARK: Table operations
+
+    def create_table(
+        self,
+        dst_table_fqn: str,
+        query: str,
+        query_parameters: dict = {},
+        *,
+        description: str | None = None,
+        schema: list[dict] | None = None,
+        partition_by: str | None = None,
+        clustering_fields: list[str] | None = None,
+        expiration_timestamp_utc: datetime.datetime | None = None,
+        require_partition_filter: bool = False,
+        replace: bool = False,
+    ):
+        self.raise_for_invalid_table_fqn(dst_table_fqn)
+
+        # Construct table options
+        logger.debug('Constructing table options ...')
+        table_options = []
+        if expiration_timestamp_utc:
+            table_options.append(f' expiration_timestamp=\'{expiration_timestamp_utc.isoformat()}\'')
+        if partition_by and require_partition_filter:
+            table_options.append(f' require_partition_filter=TRUE')
+        if description:
+            table_options.append(f' description=\'{description}\'')
+
+        # Check if table exists
+        logger.debug('Checking if destination table exists ...')
+        dst_table_project_id, dst_table_dataset_id, dst_table_id = self.get_table_fqn_parts(dst_table_fqn)
+        table_exist = self.is_table_exists(project_id=dst_table_project_id, dataset_id=dst_table_dataset_id, table_id=dst_table_id)
+
+        # Construct beautiful query string
+        if table_exist and not replace:
+            logger.debug('Table exists, constructing INSERT query ...')
+            query_parts = [f'INSERT INTO `{dst_table_fqn}`']
+            if schema:
+                schema_str = ',\n'.join([column['name'] for column in schema])
+                query_parts.append(f'(\n{schema_str}\n)')
+            if table_options:
+                query_parts.append(f'OPTIONS (\n{",\n".join(table_options)}\n)')
+        else:
+            logger.debug('Table not exist, constructing CREATE TABLE query ...')
+            query_parts = [
+                f'CREATE OR REPLACE TABLE `{dst_table_fqn}`',
+            ]
+            if schema:
+                schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
+                query_parts.append(f'(\n{schema_str}\n)')
+            if partition_by:
+                query_parts.append(f'PARTITION BY {partition_by}')
+            if clustering_fields:
+                clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
+                query_parts.append(f'CLUSTER BY {clustering_fields_str}')
+            if table_options:
+                query_parts.append(f'OPTIONS (\n{",\n".join(table_options)}\n)')
+            query_parts.append('AS')
+            query_parts.append(textwrap.dedent(query).strip())
+
+        # Execute
+        logger.debug('Executing query ...')
+        query = '\n'.join(query_parts)
+        self.execute_query(query, parameters=query_parameters)
 
     def drop_table(self, bq_table_fqn: str):
-        '''
-        query
+        logger.info(f'Dropping table: {bq_table_fqn} ...')
+        self.raise_for_invalid_table_fqn(bq_table_fqn)
+        self.client.delete_table(bq_table_fqn, not_found_ok=True)
+
+    # MARK: Table data
+
+    def load_data(
+        self,
+        src_gcs_uri: str,
+        dst_table_fqn: str,
+        *,
+        schema: list[dict] | None = None,
+        partition_by: str | None = None,
+        clustering_fields: list[str] | None = None,
+        field_delimiter: str = ',',
+        load_strategy: LoadStrategy = LoadStrategy.APPEND,
+        format: DataFileFormat = DataFileFormat.CSV,
+        compression=None,
+    ):
+
+        self.raise_for_invalid_table_fqn(dst_table_fqn)
+
+        logger.debug(f'Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...')
+
+        # Construct LOAD options
+        logger.debug('Constructing LOAD options ...')
+        load_options = [  # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
+            f' format=\'{format}\'',
+            f' uris=[\'{src_gcs_uri}\']',
+        ]
+        if format == DataFileFormat.CSV:
+            load_options.append(f' skip_leading_rows=1')
+            load_options.append(f' field_delimiter=\'{field_delimiter}\'')
+            load_options.append(f' allow_quoted_newlines=true')
+        if compression:
+            load_options.append(f' compression=\'{compression}\'')
+        load_options_str = ',\n'.join(load_options)
+
+        # Construct beautiful query string
+        logger.debug('Constructing LOAD query ...')
+        schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
+        query_parts = [f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)']
+        if partition_by:
+            query_parts.append(f'PARTITION BY {partition_by}')
+        if clustering_fields:
+            clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
+            query_parts.append(f'CLUSTER BY {clustering_fields_str}')
+        query_parts.append(f'FROM FILES (\n{load_options_str}\n)')
+        query = '\n'.join(query_parts)
+
+        # Execute
+        logger.debug('Executing query ...')
+        self.execute_query(query)
+
+    def export_data(
+        self,
+        query: str,
+        dst_gcs_uri: str,
+        *,
+        parameters: dict = {},
+        format: DataFileFormat = DataFileFormat.CSV,
+        compression: DataFileCompression | None = None,
+        header: bool = True,
+        delimiter: str = ',',
+    ):
+        logger.debug(f'Exporting query into {dst_gcs_uri} ...')
+
+        # GCS uri validation
+        if format == DataFileFormat.CSV and compression == DataFileCompression.GZIP and not dst_gcs_uri.endswith('.gz'):
+            raise ValueError('GCS path need to ends with .gz if using compression = GCSCompression.GZIP')
+        elif format == DataFileFormat.CSV and compression != DataFileCompression.GZIP and not dst_gcs_uri.endswith('.csv'):
+            raise ValueError('GCS path need to ends with .csv if using format = GCSExportFormat.CSV')
+        elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith('.parquet'):
+            raise ValueError('GCS path need to ends with .parquet if using format = GCSExportFormat.PARQUET')
+
+        # Construct options
+        logger.debug('Constructing EXPORT options ...')
+        options = [
+            f' uri=\'{dst_gcs_uri}\'',
+            f' format=\'{format}\'',
+            f' overwrite=TRUE',
+        ]
+        if format == DataFileFormat.CSV:
+            options.append(f' field_delimiter=\'{delimiter}\'',)
+            if header:
+                options.append(f' header={"true" if header else "false"}',)
+        if compression:
+            options.append(f' compression=\'{compression}\'')
+        options_str = ',\n'.join(options)
+
+        # Construct beautiful query string
+        logger.debug('Constructing EXPORT query ...')
+        query = (
+            f'EXPORT DATA OPTIONS (\n'
+            f'{options_str}\n'
+            f')\n'
+            f'AS (\n'
+            f'{textwrap.dedent(query).strip()}\n'
+            f');'
         )
 
-        if
-        if
-        #
-        #
-        #
-        #
-        blobs: list[storage.Blob]
-        _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
-
-        # END: Upload to GCS ----->>
-
-        # <<----- START: Load to BQ
-
+        # Execute
+        logger.debug('Executing query ...')
+        self.execute_query(query=query, parameters=parameters)
+
+    def upload_csv(
+        self,
+        src_filename: str,
+        dst_table_fqn: str,
+        schema: list[dict] | None = None,
+        gcs_bucket: str | None = None,
+        partition_by: str = None,
+        cluster_cols: list[str] = None,
+        compression: DataFileCompression | None = None,
+        load_strategy: LoadStrategy = LoadStrategy.APPEND,
+    ):
+        self.raise_for_invalid_table_fqn(dst_table_fqn)
+
+        if compression == DataFileCompression.GZIP and not src_filename.endswith('.gz'):
+            raise ValueError('Please provide file path with .gz extension if using compression = GZIP')
+        elif not src_filename.endswith('.csv'):
+            raise ValueError('Please provide file path with .csv extension')
+
+        # # <<----- START: Upload to GCS
+
+        # gcs = GCS(self.project_id)
+        # tmp_dir = f'tmp/upload__{current_datetime_str()}'
+
+        # # This will compress while splitting the compressed file to a certain bytes size because of GCS 4GB file limitation
+        # # A single file can produce more than one compressed file in GCS
+        # def producer(src_file: str):
+        #     for dst_file in compress(src_file,
+        #                              keep=True, max_size_bytes=ByteSize.GB * 3):
+        #         yield (dst_file, )
+
+        # def consumer(dst_file: str):
+        #     remote_file_name = f'{tmp_dir}/{replace_nonnumeric(os.path.basename(dst_file), "_").lower()}.csv.gz'
+        #     logger.debug(f'Uploading {dst_file} to {remote_file_name}...')
+        #     blob = gcs.upload(dst_file, remote_file_name, move=True)
+        #     return blob
+
+        # blobs: list[storage.Blob]
+        # _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
+
+        # # END: Upload to GCS ----->>
+
+        # Upload to GCS
+        # TODO: Re-implement the producer-consumer model to upload multiple files
+        gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
+        dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(os.path.basename(src_filename), "_").lower()}'
+        gcs.upload(src_filename, dst_blobpath)
+
+        # Load to BQ
         try:
-                    self.load_data_into(bq_table_fqn, gcs_filename_fqns, cols, partition_col=partition_col, cluster_cols=cluster_cols, overwrite=True)
-                case LoadStrategy.APPEND:
-                    self.load_data_into(bq_table_fqn, gcs_filename_fqns, cols, partition_col=partition_col, cluster_cols=cluster_cols)
-                case _:
-                    return ValueError(f'Load strategy not recognized: {load_strategy}')
-        except Exception as e:
-            raise e
+            self.load_data(dst_blobpath, dst_table_fqn, schema=schema, partition_by=partition_by, cluster_cols=cluster_cols, format=DataFileFormat.CSV, compression=compression, load_strategy=load_strategy)
+        except:
+            raise
         finally:
+            gcs.delete_blob(dst_blobpath)
+
+    def download_csv(
+        self,
+        query: str,
+        dst_filepath: str,
+        *,
+        gcs_bucket: str | None = None,
+        query_parameters: dict = {},
+        csv_row_limit: int | None = None,
+    ) -> str | list[str]:
         if not dst_filepath.endswith('.csv'):
             raise ValueError('Destination filename must ends with .csv')
 
+        # Init
+        gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
+
+        # Generic function to export-download-combine csv file from BQ->GCS->local
+        def _export_download_combine(query: str, dst_gcs_prefix: str, dst_filepath: str, query_parameters: dict = {}):
+            # Init tmp directory
+            tmp_dirname = f'/tmp/my_bq_{my_datetime.get_current_datetime_str()}'
+            if os.path.exists(tmp_dirname):
+                shutil.rmtree(tmp_dirname, ignore_errors=True)
+            os.makedirs(tmp_dirname, exist_ok=True)
+            logger.debug(f'Temporary directory created: {tmp_dirname}')
+
+            try:
+                # Export to GCS
+                dst_gcs_uri = f'gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz'
+                self.export_data(query, dst_gcs_uri, parameters=query_parameters, format=DataFileFormat.CSV, compression=DataFileCompression.GZIP)
+
+                # Download from GCS
+                local_tmp_filepaths = []
+                for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
+                    local_tmp_filepath = os.path.join(tmp_dirname, tmp_blobs.name.split('/')[-1])
+                    gcs.download(tmp_blobs, local_tmp_filepath, move=True)
+                    logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
+                    local_tmp_filepaths.append(local_tmp_filepath)
+
+                # Combine downloaded files
+                my_csv.combine(local_tmp_filepaths, dst_filepath, gzip=True, delete=True)
+            except:
+                raise
+            finally:
+                shutil.rmtree(tmp_dirname, ignore_errors=True)  # Remove local folder
+                [gcs.delete_blob(blob_filepath) for blob_filepath in gcs.list_blobs(dst_gcs_prefix)]  # Remove temporary GCS files
+
+            logger.info(f'Export-download-combine done: {dst_filepath}')
+
+        # Limited csv rows
+        if csv_row_limit:
+            tmp_table_fqn: str | None = None
+            tmp_table_fqn_rn: str | None = None
+            try:
+                # Create temporary table
+                query_job = self.execute_query(query, temporary_table=True)
+                tmp_table_fqn = str(query_job.destination)
+                logger.debug(f'Create temp table: {tmp_table_fqn}')
+
+                # Create another temporary table for row numbering
+                query_job = self.execute_query(f'SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`', temporary_table=True)
+                tmp_table_fqn_rn = str(query_job.destination)
+                logger.debug(f'Create temp table (rn): {tmp_table_fqn_rn}')
+
+                # Process parts
+                count = list(self.execute_query(f'SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`').result())[0][0]
+                parts = math.ceil(count / csv_row_limit)
+                logger.info(f'Total part: {count} / {csv_row_limit} = {parts}')
+                dst_filepaths = []
+                for part in range(parts):
+                    dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
+                    _export_download_combine(
+                        f'SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit}',
+                        dst_gcs_prefix=gcs.build_tmp_dirpath(),
+                        dst_filepath=dst_filepath_part,
+                    )
+                    dst_filepaths.append(dst_filepath_part)
+                return dst_filepaths
+            except:
+                raise
+            finally:
+                # Drop temporary tables
+                if tmp_table_fqn_rn:
+                    self.drop_table(tmp_table_fqn_rn)
+                if tmp_table_fqn:
+                    self.drop_table(tmp_table_fqn)
+
+        # Unlimited csv rows
+        else:
+            _export_download_combine(query, gcs.build_tmp_dirpath(), dst_filepath, query_parameters=query_parameters)
+            return dst_filepath
+
+        # query_job_result = query_job.result()
+        # row_count = 0
+        # file_index = 1
+
+        # # Stream-download-split result
+        # def open_file(f):
+        #     if f:
+        #         f.close()
+        #     dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{file_index:06}.csv' if row_limit else dst_filepath
+        #     logger.info(f'Writing into file: {dst_filepath_part} ...')
+        #     f = open(dst_filepath_part, 'w', newline='', encoding='utf-8')
+        #     writer = csv.writer(f)
+        #     writer.writerow([field.name for field in query_job_result.schema])  # Write header
+
+        #     return f, writer
+
+        # f, writer = open_file(None)
+        # for row in query_job_result:
+        #     writer.writerow(row)
+
+        #     if row_limit:
+        #         row_count += 1
+        #         if row_count >= row_limit:
+        #             row_count = 0
+        #             file_index += 1
+        #             f, writer = open_file(f)
+        # if f:
+        #     f.close()
 
     def download_xlsx(self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000):
         if not dst_filename.endswith('.xlsx'):
@@ -302,60 +502,94 @@ class BQ():
                 file_path_tmp = f'{dst_filename}_part{part + 1}'
                 file_path_tmp_csv = f'{file_path_tmp}.csv'
                 self.download_csv(f'SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}', f'{file_path_tmp}{os.sep}')
-                csv_to_xlsx(file_path_tmp_csv, f'{file_path_tmp}.xlsx')
+                my_xlsx.csv_to_xlsx(file_path_tmp_csv, f'{file_path_tmp}.xlsx')
                 os.remove(file_path_tmp_csv)
         except Exception as e:
             raise e
         finally:
             self.execute_query(f'DROP TABLE IF EXISTS `{table_name_tmp}`')
 
-    def
+    # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
+    #     src_project_id, src_dataset_id, _ = src_view_id.split('.')
+    #     dst_project_id, dst_dataset_id, _ = dst_view_id.split('.')

+    #     # Create or replace
+    #     src_view = self.client.get_table(src_view_id)
+    #     dst_view = bigquery.Table(dst_view_id)
+    #     dst_view.view_query = src_view.view_query.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
+    #     self.client.delete_table(dst_view, not_found_ok=True)
+    #     self.client.create_table(dst_view)
+    #     logger.debug(f'View {src_view_id} copied to {dst_view}')

+    #     if drop:
+    #         self.client.delete_table(src_view_id)
+    #         logger.debug(f'View {src_view_id} dropped')

+    # def copy_routine(self, src_routine_id: str, dst_routine_id: str, drop: bool = False):
+    #     src_project_id, src_dataset_id, _ = src_routine_id.split('.')
+    #     dst_project_id, dst_dataset_id, _ = dst_routine_id.split('.')

+    #     # Create or replace
+    #     src_routine = self.client.get_routine(src_routine_id)
+    #     dst_routine = bigquery.Routine(dst_routine_id)
+    #     dst_routine.body = src_routine.body.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
+    #     dst_routine.type_ = src_routine.type_
+    #     dst_routine.description = src_routine.description
+    #     dst_routine.language = src_routine.language
+    #     dst_routine.arguments = src_routine.arguments
+    #     dst_routine.return_type = src_routine.return_type
+    #     self.client.delete_routine(dst_routine, not_found_ok=True)
+    #     self.client.create_routine(dst_routine)
+    #     logger.debug(f'Routine {src_routine_id} copied to {dst_routine_id}')

+    #     if drop:
+    #         self.client.delete_routine(src_routine_id)
+    #         logger.debug(f'Routine {src_routine_id} dropped')

+    # MARK: Utilities

+    @staticmethod
+    def get_table_fqn_parts(name: str | list[str]) -> list[str] | list[list[str]]:
+        """Get fully qualified table name, following this format `<projectid>.<datasetid>.<tableid>`

+        Args:
+            name (str | list[str]): Input name (can be multiple)

+        Returns:
+            list[str] | list[list[str]]: The FQN parts. If the input is list then returns list of FQN parts instead.
+        """

+        if isinstance(name, list):
+            return [BQ.get_table_fqn_parts(x) for x in name]

+        split = name.split('.')
+        if len(split) == 3:
+            return split
+        else:
+            raise ValueError(f'{name} is not a valid table FQN')

+    @staticmethod
+    def raise_for_invalid_table_fqn(name: str | list[str]):
+        """Raise an error if the provied name is a fully qualified table name

+        Args:
+            name (str | list[str]): Input name (can be multiple)

+        Raises:
+            ValueError: If name is not a fully qualified table name
+        """

+        if not BQ.get_table_fqn_parts(name):
+            raise ValueError(f'{name} is not a valid table FQN')

+    def is_table_exists(self, table_fqn: str) -> bool:
+        self.raise_for_invalid_table_fqn(table_fqn)
+        try:
+            self.client.get_table(table_fqn)
+            return True
+        except NotFound:
+            return False

+    def close(self):
         self.client.close()
         logger.debug('BQ client close')
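For readers skimming the new my_bq surface, here is a brief usage sketch pieced together from the added code above. It is illustrative only: the project, dataset, table, and bucket names are placeholders, and it assumes GCP_PROJECT_ID / GCP_REGION / GCS_BUCKET are configured (or passed explicitly).

```python
# Illustrative sketch of the 0.1.0 BQ API (all resource names below are placeholders).
from utill.my_bq import BQ, DataFileFormat, DataFileCompression, LoadStrategy

bq = BQ(location='US', project_id='my-project')  # falls back to envs.GCP_PROJECT_ID / GCP_REGION

# Parameterized query; list values are bound as ARRAY query parameters
bq.execute_query(
    'SELECT id FROM `my-project.my_dataset.my_table` WHERE id IN UNNEST(@ids)',
    parameters={'ids': [1, 2, 3]},
)

# EXPORT DATA: query result -> gzipped CSV files in GCS
bq.export_data(
    'SELECT * FROM `my-project.my_dataset.my_table`',
    'gs://my-bucket/tmp/export/*.csv.gz',
    format=DataFileFormat.CSV,
    compression=DataFileCompression.GZIP,
)

# LOAD DATA: CSV staged in GCS -> table, overwriting existing rows
bq.load_data(
    'gs://my-bucket/tmp/data.csv',
    'my-project.my_dataset.my_table',
    schema=[{'name': 'id', 'data_type': 'INT64'}, {'name': 'name', 'data_type': 'STRING'}],
    load_strategy=LoadStrategy.OVERWRITE,
)

# Query -> local CSV via GCS, split into parts of at most 1,000,000 rows
bq.download_csv(
    'SELECT * FROM `my-project.my_dataset.my_table`',
    'export.csv',
    csv_row_limit=1_000_000,
)

bq.close()
```

Note that, per the diff, `upload_csv` stages the local file under `tmp/my_bq/<timestamp>/` in GCS before issuing the LOAD and deletes the staged blob afterwards, and `download_csv` round-trips through temporary GCS objects that are likewise cleaned up.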
utill/my_csv.py
CHANGED
@@ -57,13 +57,13 @@ def compress(src_filename: str, keep: bool = False, max_size_bytes=ByteSize.GB,
         yield dst_filename
 
 
-def combine(src_filenames: list[str], dst_filename: str) -> None:
+def combine(src_filenames: list[str], dst_filename: str, gzip: bool = False, delete: bool = False) -> None:
     csv.field_size_limit(min(sys.maxsize, 2147483646))  # FIX: _csv.Error: field larger than field limit (131072)
 
     if not dst_filename.endswith('.csv'):
         raise ValueError('Output filename must ends with \'.csv\'!')
 
+    first_src_file = True
     with open(dst_filename, 'w') as fout:
         csvwriter = csv.writer(fout)
 
@@ -71,21 +71,25 @@ def combine(src_filenames: list[str], dst_filename: str) -> None:
             src_filename = os.path.expanduser(src_filename)
 
             # Decompress gzipped csv
-            if
+            if gzip:
                 src_filename = decompress(src_filename)
 
-            #
+            # Write content into file
             with open(src_filename, 'r') as fin:
                 csvreader = csv.reader(fin)
 
-                #
-                if
+                # Write header only at first file
+                if first_src_file:
                     csvwriter.writerow(next(csvreader))
-                # Else, skip the header
+                    first_src_file = False
                 else:
                     next(csvreader)
 
+                # Write body
                 [csvwriter.writerow(row) for row in csvreader]
 
-            logger.
+            logger.debug(f'Combine {src_filename}')
+
+            if delete:
+                os.remove(src_filename)
+                logger.debug(f'Delete {src_filename}')
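A hypothetical call of the extended `combine()` signature, which is how `my_bq.download_csv` uses it above; the part filenames are placeholders.

```python
# Hypothetical example: merge gzipped CSV parts into one file, keeping a single header row.
from utill.my_csv import combine

combine(
    ['part_000001.csv.gz', 'part_000002.csv.gz'],  # placeholder part files
    'combined.csv',
    gzip=True,    # decompress each part before reading it
    delete=True,  # remove each decompressed part after it has been appended
)
```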
utill/my_datetime.py
CHANGED
@@ -11,7 +11,7 @@ def get_current_date_str(use_separator: bool = False) -> str:
     return datetime.now().strftime('%Y-%m-%d' if use_separator else '%Y%m%d')
 
 
-def
+def get_current_datetime_str(use_separator: bool = False) -> str:
     return datetime.now().strftime('%Y-%m-%d %H:%M:%S' if use_separator else '%Y%m%d%H%M%S')
 
 
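The old signature line is truncated in this view (the change is likely a rename); for illustration, the two output shapes of `get_current_datetime_str` follow from the strftime formats above. The timestamps shown are examples.

```python
from utill.my_datetime import get_current_datetime_str

get_current_datetime_str()                     # e.g. '20240101123045'
get_current_datetime_str(use_separator=True)   # e.g. '2024-01-01 12:30:45'
```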
utill/my_env.py
CHANGED
utill/my_file.py
CHANGED
@@ -28,7 +28,7 @@ def decompress(src_file: str, keep: bool = False):
     dst_file = src_file.removesuffix('.gz')
 
     os.remove(dst_file) if os.path.exists(dst_file) else None
-    logger.debug(f'
+    logger.debug(f'Decompress {src_file} to {dst_file}')
     with gzip.open(src_file, 'rb') as f_in:
         with open(dst_file, 'wb') as f_out:
             shutil.copyfileobj(f_in, f_out)
utill/my_gcs.py
CHANGED
@@ -1,123 +1,79 @@
-import
-import
+from .my_datetime import get_current_datetime_str
+from .my_env import envs
 from google.cloud import storage
 from loguru import logger
-from .my_env import envs
+import os
 
 
 class GCS:
 
-    def __init__(self,
-        if service_account_filename is not None:
-            self.client = storage.Client.from_service_account_json(service_account_filename)
-        else:
-            self.client = storage.Client(project=self.project)
+    def __init__(self, bucket: str | None = None, project_id: str | None = None):
+        if project_id is None and envs.GCP_PROJECT_ID is None:
+            logger.warning('Using ADC for GCS authentication')
 
-        self.base_path = '/'.join(bucket_name_parts[1:]) if len(bucket_name_parts) > 1 else None
-        not self.base_path or logger.debug(f'Base path: {self.base_path}')
+        if bucket is None and envs.GCS_BUCKET is None:
+            raise ValueError('Bucket name must be provided either as an argument or set in environment variables.')
 
+        self.client = storage.Client(project=project_id or envs.GCP_PROJECT_ID)
+        self.bucket = self.client.bucket(bucket or envs.GCS_BUCKET)
         logger.debug(f'GCS client open, project: {self.client.project}')
 
-    def
-        return self
-
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        self.close_client()
-
-    def _construct_path(self, path: str) -> str:
-        return f'{self.base_path}/{path}' if self.base_path else path
-
-    def change_bucket(self, bucket_name: str):
-        if not bucket_name:
-            raise ValueError('Bucket name needed')
-        self.bucket = self.client.bucket(bucket_name)
-        logger.debug(f'Change bucket to {self.bucket.name}')
-
-    def get(self, path: str) -> storage.Blob:
-        path = self._construct_path(path)
-        return self.bucket.blob(path)
-
-    def list(self, path: str) -> list[storage.Blob]:
-        path = self._construct_path(path)
-        if '*' in path:
-            path_prefix = path.split('*')[0]
-            regex_pattern = '^' + re.escape(path).replace('\\*', '.*') + '$'
-            regex = re.compile(regex_pattern)
-            return [x for x in self.bucket.list_blobs(prefix=path_prefix) if regex.match(x.name)]
-
-        return list(self.bucket.list_blobs(prefix=path))
-
-    def copy(self, src_path: str, dst_path: str, mv: bool = False):
-        src_blob = self.get(src_path)
-        dst_blob = self.get(dst_path)
-
-        dst_blob.rewrite(src_blob)
+    def get_blob(self, blobpath: str) -> storage.Blob:
+        return self.bucket.blob(blobpath)
 
+    def list_blobs(self, prefix: str) -> list[storage.Blob]:
+        return self.bucket.list_blobs(prefix=prefix)
 
+    def delete_blob(self, blobpath: str | storage.Blob) -> storage.Blob:
+        blob = self.get_blob(blobpath) if isinstance(blobpath, str) else blobpath
+        return blob.delete()
 
+    def copy(self, src_blobpath: str, dst_blobpath: str, dst_bucket: str = None, move: bool = False):
+        src_bucket = self.bucket
+        src_blob = self.get_blob(src_blobpath)
+        dst_bucket = dst_bucket or src_bucket.name
 
-        self.bucket.copy_blob(src_blob, dst_gcs.bucket, dst_path)
-        dst_blob = dst_gcs.get(dst_path)
+        self.bucket.copy_blob(src_blob, dst_bucket, dst_blobpath)
 
-        if not os.path.exists(local_path):
-            raise FileNotFoundError(f'File not found: {local_path}')
-
-        blob = self.get(remote_path)
-        blob.upload_from_filename(local_path)
-
-        logger.debug(f'✅ Upload {local_path} to gs://{self.bucket.name}/{blob.name}')
-
-        not mv or os.remove(local_path)
-
-        return blob
-
-    def download(self, obj: str | storage.Blob, local_path: str, mv: bool = False):
-        local_path = os.path.expanduser(local_path)
-        is_blob = type(obj) == storage.Blob
-
-        if os.path.isdir(local_path):
-            local_path = os.path.join(local_path, obj.name.split('/')[-1] if is_blob else os.path.basename(obj))
-        if not os.path.dirname(local_path):
-            raise FileNotFoundError(f'Destination directory not found: {os.path.dirname(local_path)}')
-
-        blob = obj if is_blob else self.get(obj)
-        blob.download_to_filename(local_path)
+        # Move mode
+        if move:
+            self.delete_blob(src_blobpath)
+            logger.debug(f'Moved gs://{src_bucket}/{src_blobpath} to gs://{dst_bucket}/{dst_blobpath}')
+        # Copy mode
+        else:
+            logger.debug(f'Copied gs://{src_bucket}/{src_blobpath} to gs://{dst_bucket}/{dst_blobpath}')
 
+    def upload(self, src_filepath: str, dst_blobpath: str, move: bool = False):
+        blob = self.get_blob(dst_blobpath)
+        blob.upload_from_filename(src_filepath)
 
+        # Move mode
+        if move:
+            os.remove(src_filepath)
+            logger.debug(f'Moved {src_filepath} to gs://{self.bucket.name}/{blob.name}')
+        # Copy mode
+        else:
+            logger.debug(f'Uploaded {src_filepath} to gs://{self.bucket.name}/{blob.name}')
 
+    def download(self, src_blobpath: str | storage.Blob, dst_filepath: str, move: bool = False):
+        blob = self.get_blob(src_blobpath) if isinstance(src_blobpath, str) else src_blobpath
+        blob.download_to_filename(dst_filepath)
 
+        if move:
+            self.delete_blob(blob)
+            logger.debug(f'Moved gs://{self.bucket.name}/{blob.name} to {dst_filepath}')
+        else:
+            logger.debug(f'Copied gs://{self.bucket.name}/{blob.name} to {dst_filepath}')
 
+    # MARK: Utilities
 
+    @staticmethod
+    def build_tmp_dirpath(prefix: str = 'tmp') -> str:
+        """
+        Builds a temporary directory path in the GCS bucket.
+        """
+        return f'{prefix}/{get_current_datetime_str()}'
 
-    def
+    def close(self):
         self.client.close()
-        logger.debug('GCS client
-
-    @staticmethod
-    def remove_blob(blob: storage.Blob):
-        blob.delete()
-        logger.debug(f'🗑️ Remove gs://{blob.bucket.name}/{blob.name}')
+        logger.debug('GCS client closed')
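A short sketch of the reworked GCS wrapper as it appears above; bucket, project, and object names are placeholders, and configuration is assumed to come from the environment when the arguments are omitted.

```python
# Illustrative sketch of the 0.1.0 GCS wrapper (resource names are placeholders).
from utill.my_gcs import GCS

gcs = GCS(bucket='my-bucket', project_id='my-project')  # falls back to envs.GCS_BUCKET / GCP_PROJECT_ID

gcs.upload('report.csv', 'tmp/report.csv')        # copy; move=True would delete the local file afterwards

for blob in gcs.list_blobs('tmp/'):               # iterate blobs under a prefix
    gcs.download(blob, f'/tmp/{blob.name.split("/")[-1]}')

gcs.delete_blob('tmp/report.csv')
gcs.close()
```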
utill/my_queue.py
CHANGED
@@ -1,7 +1,82 @@
-import
+from loguru import logger
+from typing import Callable
 import concurrent.futures
+import queue
+
+class StreamingQ:
+    def __init__(self, producer_func: Callable, producer_args: tuple, consumer_func: Callable, max_queue_size: int = 0):
+        self.producer_func = producer_func
+        self.producer_args = producer_args
+        self.consumer_func = consumer_func
+
+        # Use maxsize for backpressure control (0 = unlimited)
+        self.q = queue.Queue(maxsize=max_queue_size)
+
+    def execute(self):
+        """
+        Execute producer and consumer with true streaming using generators.
+        Yields consumer results as they become available.
+        """
+        def producer():
+            try:
+                for item in self.producer_func(*self.producer_args):
+                    self.q.put(item)
+                    logger.debug(f'🌾 Produced {item}')
+            except Exception as e:
+                logger.error(f'Producer error: {e}')
+                self.q.put(('ERROR', e))
+            finally:
+                # Signal end of production
+                self.q.put(None)
+                logger.debug('🌾 Producer finished')
+
+        def consumer():
+            while True:
+                item = self.q.get()
+
+                if item is None:
+                    # End of stream signal
+                    self.q.task_done()
+                    break
+
+                if isinstance(item, tuple) and item[0] == 'ERROR':
+                    # Propagate producer error
+                    self.q.task_done()
+                    raise item[1]
+
+                try:
+                    # Unpack item if it's a tuple, otherwise pass as single arg
+                    if isinstance(item, tuple):
+                        result = self.consumer_func(*item)
+                    else:
+                        result = self.consumer_func(item)
+
+                    self.q.task_done()
+                    logger.debug(f'🔥 Consumed {item} -> {result}')
+                    yield result
+
+                except Exception as e:
+                    self.q.task_done()
+                    logger.error(f'Consumer error processing {item}: {e}')
+                    raise
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            # Start producer in background
+            future_producer = executor.submit(producer)
+
+            try:
+                # Yield results as they become available
+                for result in consumer():
+                    yield result
+
+                # Wait for producer to complete
+                future_producer.result()
+
+            except Exception as e:
+                # Cancel producer if consumer fails
+                future_producer.cancel()
+                raise
 
-from loguru import logger
 
 
 class ThreadingQ:
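To illustrate the new `StreamingQ`, here is a small made-up producer/consumer pair; only the class itself comes from the diff above.

```python
# Illustrative use of StreamingQ (the producer/consumer functions are invented for this example).
from utill.my_queue import StreamingQ

def produce(n: int):
    for i in range(n):
        yield (i,)              # tuple items are unpacked into the consumer's arguments

def consume(i: int) -> int:
    return i * i

sq = StreamingQ(produce, (5,), consume, max_queue_size=10)
for result in sq.execute():     # execute() is a generator; results stream out as items are consumed
    print(result)               # 0, 1, 4, 9, 16
```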