rdxz2-utill 0.0.11__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.

This version of rdxz2-utill might be problematic.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rdxz2-utill
3
- Version: 0.0.11
3
+ Version: 0.1.0
4
4
  Summary: Your daily Python utility
5
5
  Author-email: Richard Dharmawan <richard.dharmawan@gmail.com>
6
6
  License: MIT License
@@ -1,35 +1,35 @@
1
- rdxz2_utill-0.0.11.dist-info/licenses/LICENSE,sha256=PF9CUvzP8XFYopEAzrMzSCovF7RdBdscPqJCDC6KjPc,1073
1
+ rdxz2_utill-0.1.0.dist-info/licenses/LICENSE,sha256=PF9CUvzP8XFYopEAzrMzSCovF7RdBdscPqJCDC6KjPc,1073
2
2
  utill/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- utill/my_bq.py,sha256=eWHnCz-tPHLtK4Ac9uNRLw_iyoEvOEy-mk9OMxyWZGc,14771
3
+ utill/my_bq.py,sha256=1BKjYa05yfOBInm245373lfi4sbOZKybYXdYmON2npM,24985
4
4
  utill/my_compare.py,sha256=619QbVk3GihWxen95yVnivKHkah8GgPTLGiSkgHxykw,886
5
5
  utill/my_const.py,sha256=88dOqn6NPQ5-hfRqdkew5POoAIyO91XXOGvN76oNsdo,251
6
- utill/my_csv.py,sha256=svgu93R0pP7UW0B58eJMi0vuJnYhqMtafzCsTIk4yUU,2781
7
- utill/my_datetime.py,sha256=KEZTplLk3tgVqqC3wClXFcsF_zo40fma_rtPg4kSJHc,2125
6
+ utill/my_csv.py,sha256=AT5sAbAlYqnAmNgQMTSqEueRXM4D42yNPb5C3Hedy6c,2921
7
+ utill/my_datetime.py,sha256=8AUO9l_MSzdthRsgASuyGZpvjgpoQb9Lowt4goHjyqw,2129
8
8
  utill/my_dict.py,sha256=jPaPfdn4WYpm0uIBPiYFinpHhx1jXpFVDJ9npmvxGZQ,391
9
9
  utill/my_encryption.py,sha256=SCF7PPur39cW4RHidsRhw-9BZP-ymUH-6LZ9nAHJDsY,2105
10
- utill/my_env.py,sha256=mREys72Ybg2p9p2s7ApOt0s_6F5-qxR8FyYEcSJ8pmU,2093
11
- utill/my_file.py,sha256=H2V8qGSCwnztBKiLYA38-4KUaGFQhznJz86cdilLtAE,1879
12
- utill/my_gcs.py,sha256=KUx89rZx2-dq-GV1LbbvbZ79Qr9NznjG1Zipop4hMZE,4216
10
+ utill/my_env.py,sha256=E7XW3fuhxbDlFqmLPHrziJJZVRogzGh6rfQdyNV49f8,2130
11
+ utill/my_file.py,sha256=-b6_dGDDBdS228kgwTYpmIa3vxW1c1TtWrLdzdlHjKY,1873
12
+ utill/my_gcs.py,sha256=VBJ8lsJ-fHr_BzMoSuT5JUrvxidGyMc2VNtE6Um1T_M,3060
13
13
  utill/my_input.py,sha256=OyKLoutXpwISReltuL_Gw2oojv16tYWJqQpqabBOQx4,350
14
14
  utill/my_json.py,sha256=WgW6mavGhfs4h1N5XbhsDnRk2dbh_ttJWdJUj4iWDN4,1473
15
15
  utill/my_mb.py,sha256=IyrySs92TqtjBUvPMeUN3P2kRK8EttTFRPZsv5Cr-xw,15090
16
16
  utill/my_pg.py,sha256=J9USygc-oug4w7AkBacA9x043jHZrDfQPGFEqXavZAY,6799
17
- utill/my_queue.py,sha256=hINP4_yjmboSjHgo1J3CtPm2X9SE3HfczyED3ip7nfk,1930
17
+ utill/my_queue.py,sha256=Qf3Nm_ZRoVD34oAoym8A9hoH9Y27kUHeWLhylAUj5Q4,4749
18
18
  utill/my_string.py,sha256=pINYFR1ligTyVZYzV8P_FolCsZQwYE1jaFNTuQ3XS_8,833
19
19
  utill/my_style.py,sha256=Wy6j4WL9RgGeX6cS9hhlOrufc9UC4UPTQ5UJa0ZJ3Yo,900
20
20
  utill/my_tunnel.py,sha256=uCpGtiG8AcRYiaN7rLnTulsZI4iFTRM8EHxwyAAfDrE,1292
21
21
  utill/my_xlsx.py,sha256=YcQRp6DC9girSS1fkUPVKsHspyQpr8JC8GymSSnRV-w,729
22
22
  utill/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  utill/cmd/_bq.py,sha256=MQGLIv_WBUBl2tf18bfYrAszx0Koa5kdTW1c8A5HDDg,520
24
- utill/cmd/_conf.py,sha256=hNdkApzRhPloRSe4RyxbWuLoyeqkK7Yx9g44kcvKOEM,1800
24
+ utill/cmd/_conf.py,sha256=DKl3IVVLp6-5P43tvh6bYmHR5rOL9XnKVuQ7kQJtzrc,1863
25
25
  utill/cmd/_enc.py,sha256=DBy3Iwa5DTtww7lgHPRLEilrYPrWDG1vRv5PO-YzNO8,997
26
26
  utill/cmd/_main.py,sha256=UJ_XTIGDO9XPIypgHhS81SJQ_8qy8JOyw98Or0Nb2x8,273
27
27
  utill/cmd/_pg.py,sha256=RVxEiSifyIwMDYDM69vt6WSLdVDr1cMzY6r4T2PzNRA,492
28
28
  utill/cmd/utill.py,sha256=TlHfiwOUcK1m58PrRCjX9sARiPYZUsoTk-KOTCOz1vM,3558
29
29
  utill/templates/mb.json,sha256=M46ZHSaSh4rbD_KGUViGr2B2ZV8_PC-O5Evqi35JK5g,59
30
30
  utill/templates/pg.json,sha256=LkJt0VV3zcyt7Tpn6gulsoVQgUc-9uImXOStvzu8cdU,271
31
- rdxz2_utill-0.0.11.dist-info/METADATA,sha256=ZGCUekaj1Zr5C8zjKwLBL6Q7tf1oPFEzuwS29Dsr7h0,4402
32
- rdxz2_utill-0.0.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
- rdxz2_utill-0.0.11.dist-info/entry_points.txt,sha256=9n5NWz5Wi9jDvYhB_81_4icgT5xABZ-QivHD8ibcafg,47
34
- rdxz2_utill-0.0.11.dist-info/top_level.txt,sha256=tuAYZoCsr02JYbpZj7I6fl1IIo53v3GG0uoj-_fINVk,6
35
- rdxz2_utill-0.0.11.dist-info/RECORD,,
31
+ rdxz2_utill-0.1.0.dist-info/METADATA,sha256=iB4cHw4zIQnP_2DHvIkBpwKxR5s32RVXr9xyJiY-GX4,4401
32
+ rdxz2_utill-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
+ rdxz2_utill-0.1.0.dist-info/entry_points.txt,sha256=9n5NWz5Wi9jDvYhB_81_4icgT5xABZ-QivHD8ibcafg,47
34
+ rdxz2_utill-0.1.0.dist-info/top_level.txt,sha256=tuAYZoCsr02JYbpZj7I6fl1IIo53v3GG0uoj-_fINVk,6
35
+ rdxz2_utill-0.1.0.dist-info/RECORD,,
utill/cmd/_conf.py CHANGED
@@ -6,6 +6,7 @@ def _init(mode: str):
6
6
  match mode:
7
7
  case 'google-cloud':
8
8
  setattr(envs, 'GCP_PROJECT_ID', input('GCP_PROJECT_ID: '))
9
+ setattr(envs, 'GCP_REGION', input('GCP_REGION: '))
9
10
  setattr(envs, 'GCS_BUCKET', input('GCS_BUCKET: '))
10
11
  envs.write()
11
12
  logger.info('Google cloud configuration initialized')
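With the added prompt, the 'google-cloud' init mode now collects GCP_REGION alongside GCP_PROJECT_ID and GCS_BUCKET. A minimal sketch of seeding the same configuration non-interactively, assuming the envs object behaves as shown in this diff (plain attributes plus a write() method); the project, region, and bucket values are placeholders.

# Non-interactive equivalent of the updated 'google-cloud' init flow.
# Values are placeholders; envs.write() persists them as in _init().
from utill.my_env import envs

setattr(envs, 'GCP_PROJECT_ID', 'my-project')
setattr(envs, 'GCP_REGION', 'asia-southeast2')  # new in 0.1.0
setattr(envs, 'GCS_BUCKET', 'my-bucket')
envs.write()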
utill/my_bq.py CHANGED
@@ -1,30 +1,46 @@
1
- from .my_const import ByteSize
2
- from .my_csv import read_header, combine as compress
3
- from .my_datetime import current_datetime_str
4
- from .my_env import envs
5
- from .my_gcs import GCS
6
- from .my_queue import ThreadingQ
7
- from .my_string import replace_nonnumeric
8
- from .my_xlsx import csv_to_xlsx
9
- from enum import Enum
10
- from google.cloud import bigquery, storage
1
+ from . import my_csv
2
+ from . import my_datetime
3
+ from . import my_env
4
+ from . import my_gcs
5
+ from . import my_queue
6
+ from . import my_string
7
+ from . import my_xlsx
8
+ from enum import StrEnum, Enum, auto
9
+ from google.cloud import bigquery
10
+ from google.cloud.exceptions import NotFound
11
+ from humanize import precisedelta, naturalsize
11
12
  from loguru import logger
12
- from textwrap import dedent
13
13
  import csv
14
- import humanize
14
+ import datetime
15
15
  import math
16
16
  import os
17
+ import shutil
18
+ import textwrap
19
+ import time
17
20
 
18
- MAP__PYTHON_DTYPE__BQ_DTYPE = {
21
+ PY_DATA_TYPE__BQ_DATA_TYPE = {
19
22
  int: 'INTEGER',
20
23
  str: 'STRING',
21
24
  float: 'STRING',
22
25
  }
23
26
 
24
27
 
28
+ class DataFileFormat(StrEnum):
29
+ CSV = 'CSV'
30
+ JSON = 'JSON'
31
+ AVRO = 'AVRO'
32
+ PARQUET = 'PARQUET'
33
+ ORC = 'ORC'
34
+
35
+
36
+ class DataFileCompression(StrEnum):
37
+ GZIP = 'GZIP'
38
+ SNAPPY = 'SNAPPY'
39
+
40
+
25
41
  class LoadStrategy(Enum):
26
- OVERWRITE = 1
27
- APPEND = 2
42
+ OVERWRITE = auto()
43
+ APPEND = auto()
28
44
 
29
45
 
30
46
  class Dtype:
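The rewrite swaps the bare humanize import for precisedelta/naturalsize, replaces the hard-coded LoadStrategy values with enum.auto(), and introduces two StrEnum types whose members format as plain strings, which is what lets the later code interpolate them directly into generated SQL. An abridged illustration of that behaviour (StrEnum requires Python 3.11+); the members shown are copied from the diff.

# Abridged copy of the new enums to show why StrEnum fits SQL generation.
from enum import StrEnum, Enum, auto

class DataFileFormat(StrEnum):
    CSV = 'CSV'
    PARQUET = 'PARQUET'

class LoadStrategy(Enum):
    OVERWRITE = auto()
    APPEND = auto()

# StrEnum members are str instances, so f-strings embed the bare value:
assert f"format='{DataFileFormat.CSV}'" == "format='CSV'"
assert LoadStrategy.OVERWRITE is not LoadStrategy.APPEND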
@@ -56,231 +72,415 @@ class Dtype:
56
72
 
57
73
 
58
74
  class BQ():
59
- def __init__(self, project: str = None, service_account_filename: str = None):
60
- self.project = project or envs.GCP_PROJECT_ID
75
+ def __init__(self, location: str | None = None, project_id: str = None):
76
+ if project_id is None and my_env.envs.GCP_PROJECT_ID is None:
77
+ logger.warning('Using ADC for BigQuery authentication')
61
78
 
62
- if service_account_filename is not None:
63
- self.client = bigquery.Client.from_service_account_json(service_account_filename)
64
- else:
65
- self.client = bigquery.Client(project=self.project)
79
+ # if location is None and my_env.envs.GCP_REGION is None:
80
+ # raise ValueError('GCP region must be set in environment variables.')
66
81
 
82
+ self.client = bigquery.Client(project=project_id or my_env.envs.GCP_PROJECT_ID, location=location or my_env.envs.GCP_REGION)
67
83
  logger.debug(f'BQ client open, project: {self.client.project}')
68
84
 
69
- def __enter__(self):
70
- return self
85
+ # MARK: Query execution
86
+
87
+ def execute_query(
88
+ self,
89
+ query: str | list[str],
90
+ parameters: dict = {},
91
+ dry_run: bool = False,
92
+ temporary_table: bool = False,
93
+ ) -> bigquery.QueryJob:
94
+ # Reconstruct query, handle multiple queries in a single job
95
+ is_multi = isinstance(query, list)
96
+ queries = query if is_multi else [query]
97
+ queries = [textwrap.dedent(q).strip() for q in queries]
98
+ queries = [q if q.endswith(';') else q + ';' for q in queries] # Append ';' character for each query
99
+ query = '\n'.join(queries)
100
+
101
+ # Evaluate parameter
102
+ query_parameters = []
103
+ for parameter, value in parameters.items():
104
+ is_array = isinstance(value, list)
105
+ value_type_py = type(value[0]) if is_array else type(value)
106
+ if value_type_py not in PY_DATA_TYPE__BQ_DATA_TYPE:
107
+ raise ValueError(f'Unsupported type for parameter {parameter}: {value_type_py}. Supported types are: {list(PY_DATA_TYPE__BQ_DATA_TYPE.keys())}')
71
108
 
72
- def __exit__(self, exc_type, exc_value, exc_tb):
73
- self.close_client()
109
+ value_type_bq = PY_DATA_TYPE__BQ_DATA_TYPE[value_type_py]
74
110
 
75
- def execute_query(self, query: str | list[str], dry_run: bool = False, parameters: dict = {}) -> bigquery.QueryJob:
76
- multi = type(query) == list
77
- if multi:
78
- query = '\n'.join([x if str(x).strip().endswith(';') else x + ';' for x in query if x])
79
- else:
80
- query = query.strip()
111
+ # Handle data type conversions
112
+ if value_type_py == datetime.date:
113
+ value = [v.strftime('%Y-%m-%d') for v in value] if is_array else value.strftime('%Y-%m-%d')
81
114
 
82
- # Build paramters
83
- query_parameters = []
84
- for parameter, value in parameters.items():
85
- if type(value) == list:
86
- query_parameters.append(bigquery.ArrayQueryParameter(parameter, MAP__PYTHON_DTYPE__BQ_DTYPE[type(value[0])], value))
115
+ if is_array:
116
+ query_parameters.append(bigquery.ArrayQueryParameter(parameter, value_type_bq, value))
87
117
  else:
88
- query_parameters.append(bigquery.ScalarQueryParameter(parameter, MAP__PYTHON_DTYPE__BQ_DTYPE[type(value)], value))
118
+ query_parameters.append(bigquery.ScalarQueryParameter(parameter, value_type_bq, value))
89
119
 
90
120
  logger.debug(f'🔎 Query:\n{query}')
91
121
  query_job_config = bigquery.QueryJobConfig(dry_run=dry_run, query_parameters=query_parameters)
122
+ if temporary_table:
123
+ query_job_config.destination = None
124
+ t = time.time()
92
125
  query_job = self.client.query(query, job_config=query_job_config)
126
+ logger.info(f'Job tracking: https://console.cloud.google.com/bigquery?project={self.client.project}&j=bq:{self.client.location}:{query_job.job_id}&page=queryresults') if not dry_run else None
127
+ query_job.result() # Wait for the job to complete
128
+ elapsed = precisedelta(datetime.timedelta(seconds=time.time() - t))
93
129
 
94
- if not multi:
95
- logger.debug(f'[Job ID] {query_job.job_id}, [Processed] {humanize.naturalsize(query_job.total_bytes_processed)}, [Billed] {humanize.naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s)',)
130
+ if not is_multi:
131
+ logger.info(f'[Job ID] {query_job.job_id}, [Processed] {naturalsize(query_job.total_bytes_processed)}, [Billed] {naturalsize(query_job.total_bytes_billed)}, [Affected] {query_job.num_dml_affected_rows or 0} row(s), [Elapsed] {elapsed}',)
96
132
  else:
97
- logger.debug(f'[Job ID] {query_job.job_id}')
133
+ logger.info(f'[Job ID] {query_job.job_id} [Elapsed] {elapsed}')
98
134
 
99
- jobs: list[bigquery.QueryJob] = self.client.list_jobs(parent_job=query_job.job_id)
100
- [logger.debug(f'[Script ID] {job.job_id}, [Processed] {humanize.naturalsize(job.total_bytes_processed)}, [Billed] {humanize.naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)',) for job in jobs]
135
+ jobs: list[bigquery.QueryJob] = list(self.client.list_jobs(parent_job=query_job.job_id))
136
+ [logger.info(f'[Script ID] {job.job_id}, [Processed] {naturalsize(job.total_bytes_processed)}, [Billed] {naturalsize(job.total_bytes_billed)}, [Affected] {job.num_dml_affected_rows or 0} row(s)',) for job in jobs]
101
137
 
102
138
  return query_job
103
139
 
104
- def create_table(self, bq_table_fqn: str, schema: list[bigquery.SchemaField], partition_col: str, cluster_cols: list[str]):
105
- table = bigquery.Table(bq_table_fqn, schema=schema)
106
-
107
- if partition_col:
108
- table.time_partitioning = bigquery.TimePartitioning(field=partition_col)
109
- table.partitioning_type = 'DAY'
110
-
111
- if cluster_cols:
112
- table.clustering_fields = cluster_cols
113
-
114
- bq_table = self.client.create_table(table)
115
- logger.info(f'✅ Table created: {bq_table_fqn}')
116
- return bq_table
140
+ # MARK: Table operations
141
+
142
+ def create_table(
143
+ self,
144
+ dst_table_fqn: str,
145
+ query: str,
146
+ query_parameters: dict = {},
147
+ *,
148
+ description: str | None = None,
149
+ schema: list[dict] | None = None,
150
+ partition_by: str | None = None,
151
+ clustering_fields: list[str] | None = None,
152
+ expiration_timestamp_utc: datetime.datetime | None = None,
153
+ require_partition_filter: bool = False,
154
+ replace: bool = False,
155
+ ):
156
+ self.raise_for_invalid_table_fqn(dst_table_fqn)
157
+
158
+ # Construct table options
159
+ logger.debug('Constructing table options ...')
160
+ table_options = []
161
+ if expiration_timestamp_utc:
162
+ table_options.append(f' expiration_timestamp=\'{expiration_timestamp_utc.isoformat()}\'')
163
+ if partition_by and require_partition_filter:
164
+ table_options.append(f' require_partition_filter=TRUE')
165
+ if description:
166
+ table_options.append(f' description=\'{description}\'')
167
+
168
+ # Check if table exists
169
+ logger.debug('Checking if destination table exists ...')
170
+ dst_table_project_id, dst_table_dataset_id, dst_table_id = self.get_table_fqn_parts(dst_table_fqn)
171
+ table_exist = self.is_table_exists(project_id=dst_table_project_id, dataset_id=dst_table_dataset_id, table_id=dst_table_id)
172
+
173
+ # Construct beautiful query string
174
+ if table_exist and not replace:
175
+ logger.debug('Table exists, constructing INSERT query ...')
176
+ query_parts = [f'INSERT INTO `{dst_table_fqn}`']
177
+ if schema:
178
+ schema_str = ',\n'.join([column['name'] for column in schema])
179
+ query_parts.append(f'(\n{schema_str}\n)')
180
+ if table_options:
181
+ query_parts.append(f'OPTIONS (\n{",\n".join(table_options)}\n)')
182
+ else:
183
+ logger.debug('Table not exist, constructing CREATE TABLE query ...')
184
+ query_parts = [
185
+ f'CREATE OR REPLACE TABLE `{dst_table_fqn}`',
186
+ ]
187
+ if schema:
188
+ schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
189
+ query_parts.append(f'(\n{schema_str}\n)')
190
+ if partition_by:
191
+ query_parts.append(f'PARTITION BY {partition_by}')
192
+ if clustering_fields:
193
+ clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
194
+ query_parts.append(f'CLUSTER BY {clustering_fields_str}')
195
+ if table_options:
196
+ query_parts.append(f'OPTIONS (\n{",\n".join(table_options)}\n)')
197
+ query_parts.append('AS')
198
+ query_parts.append(textwrap.dedent(query).strip())
199
+
200
+ # Execute
201
+ logger.debug('Executing query ...')
202
+ query = '\n'.join(query_parts)
203
+ self.execute_query(query, parameters=query_parameters)
117
204
 
118
205
  def drop_table(self, bq_table_fqn: str):
119
- self.client.delete_table(bq_table_fqn)
120
- logger.info(f'✅ Table dropped: {bq_table_fqn}')
121
-
122
- def load_data_into(self, bq_table_fqn: str, gcs_path: list[str] | str, cols: dict[str, Dtype], partition_col: str = None, cluster_cols: list[str] = None, overwrite: bool = False):
123
- if type(gcs_path) == str:
124
- gcs_path = [gcs_path]
125
- gcs_path_str = ',\n'.join([f' \'{x}\'' for x in gcs_path])
126
-
127
- load_data_keyword = 'OVERWRITE' if overwrite else 'INTO'
128
- cols_str = ',\n'.join([f' `{x}` {y}' for x, y in cols.items()])
129
- cluster_cols_str = ','.join([f'`{x}`' for x in cluster_cols]) if cluster_cols else None
130
- query = dedent(
131
- f'''
132
- LOAD DATA {load_data_keyword} `{bq_table_fqn}` (
133
- {cols_str}
134
- )
135
- {f"PARTITION BY `{partition_col}`" if partition_col is not None else "-- No partition column provided"}
136
- {f"CLUSTER BY {cluster_cols_str}" if cluster_cols_str is not None else "-- No cluster column provided"}
137
- FROM FILES(
138
- skip_leading_rows=1,
139
- allow_quoted_newlines=true,
140
- format='csv',
141
- compression='gzip',
142
- uris = [
143
- {gcs_path_str}
144
- ]
145
- );
146
- '''
147
- )
148
-
149
- logger.debug(f'⌛ Load data into: {bq_table_fqn}')
150
- query_job = self.execute_query(query)
151
- logger.info(f'✅ Load data into: {bq_table_fqn}')
152
- return query_job
153
-
154
- def export_data(self, query: str, gcs_path: str, pre_query: str = None):
155
- if '*' not in gcs_path:
156
- raise ValueError('GCS path need to have a single \'*\' wildcard character')
157
-
158
- query = dedent(
159
- f'''
160
- EXPORT DATA OPTIONS (
161
- uri='{gcs_path}',
162
- format='csv',
163
- compression='gzip',
164
- overwrite=true,
165
- header=true,
166
- field_delimiter=',')
167
- AS (
168
- {query}
169
- ORDER BY 1
170
- );
171
- '''
206
+ logger.info(f'Dropping table: {bq_table_fqn} ...')
207
+ self.raise_for_invalid_table_fqn(bq_table_fqn)
208
+ self.client.delete_table(bq_table_fqn, not_found_ok=True)
209
+
210
+ # MARK: Table data
211
+
212
+ def load_data(
213
+ self,
214
+ src_gcs_uri: str,
215
+ dst_table_fqn: str,
216
+ *,
217
+ schema: list[dict] | None = None,
218
+ partition_by: str | None = None,
219
+ clustering_fields: list[str] | None = None,
220
+ field_delimiter: str = ',',
221
+ load_strategy: LoadStrategy = LoadStrategy.APPEND,
222
+ format: DataFileFormat = DataFileFormat.CSV,
223
+ compression=None,
224
+ ):
225
+
226
+ self.raise_for_invalid_table_fqn(dst_table_fqn)
227
+
228
+ logger.debug(f'Loading CSV from {src_gcs_uri} into {dst_table_fqn} ...')
229
+
230
+ # Construct LOAD options
231
+ logger.debug('Constructing LOAD options ...')
232
+ load_options = [ # https://cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#load_option_list
233
+ f' format=\'{format}\'',
234
+ f' uris=[\'{src_gcs_uri}\']',
235
+ ]
236
+ if format == DataFileFormat.CSV:
237
+ load_options.append(f' skip_leading_rows=1')
238
+ load_options.append(f' field_delimiter=\'{field_delimiter}\'')
239
+ load_options.append(f' allow_quoted_newlines=true')
240
+ if compression:
241
+ load_options.append(f' compression=\'{compression}\'')
242
+ load_options_str = ',\n'.join(load_options)
243
+
244
+ # Construct beautiful query string
245
+ logger.debug('Constructing LOAD query ...')
246
+ schema_str = ',\n'.join([f' {column["name"]} {column["data_type"]}' for column in schema])
247
+ query_parts = [f'LOAD DATA {"OVERWRITE" if load_strategy == LoadStrategy.OVERWRITE else "INTO"} `{dst_table_fqn}` (\n{schema_str}\n)']
248
+ if partition_by:
249
+ query_parts.append(f'PARTITION BY {partition_by}')
250
+ if clustering_fields:
251
+ clustering_fields_str = ', '.join([f'`{field}`' for field in clustering_fields])
252
+ query_parts.append(f'CLUSTER BY {clustering_fields_str}')
253
+ query_parts.append(f'FROM FILES (\n{load_options_str}\n)')
254
+ query = '\n'.join(query_parts)
255
+
256
+ # Execute
257
+ logger.debug('Executing query ...')
258
+ self.execute_query(query)
259
+
260
+ def export_data(
261
+ self,
262
+ query: str,
263
+ dst_gcs_uri: str,
264
+ *,
265
+ parameters: dict = {},
266
+ format: DataFileFormat = DataFileFormat.CSV,
267
+ compression: DataFileCompression | None = None,
268
+ header: bool = True,
269
+ delimiter: str = ',',
270
+ ):
271
+ logger.debug(f'Exporting query into {dst_gcs_uri} ...')
272
+
273
+ # GCS uri validation
274
+ if format == DataFileFormat.CSV and compression == DataFileCompression.GZIP and not dst_gcs_uri.endswith('.gz'):
275
+ raise ValueError('GCS path needs to end with .gz if using compression = DataFileCompression.GZIP')
276
+ elif format == DataFileFormat.CSV and compression != DataFileCompression.GZIP and not dst_gcs_uri.endswith('.csv'):
277
+ raise ValueError('GCS path needs to end with .csv if using format = DataFileFormat.CSV')
278
+ elif format == DataFileFormat.PARQUET and not dst_gcs_uri.endswith('.parquet'):
279
+ raise ValueError('GCS path needs to end with .parquet if using format = DataFileFormat.PARQUET')
280
+
281
+ # Construct options
282
+ logger.debug('Constructing EXPORT options ...')
283
+ options = [
284
+ f' uri=\'{dst_gcs_uri}\'',
285
+ f' format=\'{format}\'',
286
+ f' overwrite=TRUE',
287
+ ]
288
+ if format == DataFileFormat.CSV:
289
+ options.append(f' field_delimiter=\'{delimiter}\'',)
290
+ if header:
291
+ options.append(f' header={"true" if header else "false"}',)
292
+ if compression:
293
+ options.append(f' compression=\'{compression}\'')
294
+ options_str = ',\n'.join(options)
295
+
296
+ # Construct beautiful query string
297
+ logger.debug('Constructing EXPORT query ...')
298
+ query = (
299
+ f'EXPORT DATA OPTIONS (\n'
300
+ f'{options_str}\n'
301
+ f')\n'
302
+ f'AS (\n'
303
+ f'{textwrap.dedent(query).strip()}\n'
304
+ f');'
172
305
  )
173
306
 
174
- if pre_query:
175
- query = [pre_query, query]
176
-
177
- logger.debug(f'⌛ Export data into: {gcs_path}')
178
- query_job = self.execute_query(query)
179
- logger.info(f'✅ Exported data into: {gcs_path}')
180
- return query_job
181
-
182
- def upload_csv(self, src_filename: str, bq_table_fqn: str, cols: dict[str, Dtype], partition_col: str = None, cluster_cols: list[str] = None, load_strategy: LoadStrategy = LoadStrategy.APPEND):
183
- # <<----- START: Validation
184
-
185
- if load_strategy not in LoadStrategy:
186
- raise ValueError('Invalid load strategy')
187
-
188
- if not src_filename.endswith('.csv'):
189
- raise ValueError('Please provide file path with .csv extension!')
190
-
191
- if partition_col is not None:
192
- if partition_col not in cols.keys():
193
- raise ValueError(f'Partition \'{partition_col}\' not exists in columns!')
194
- if cluster_cols is not None:
195
- if cluster_cols not in cols.keys():
196
- raise ValueError(f'Cluster \'{cluster_cols}\' not exists in columns!')
197
-
198
- # Build list of columns with its datatypes
199
- csv_cols = set(read_header(src_filename))
200
- excessive_cols = set(cols.keys()) - set(csv_cols)
201
- if excessive_cols:
202
- raise ValueError(f'{len(excessive_cols)} columns not exists in CSV file: {", ".join(excessive_cols)}')
203
- nonexistent_cols = set(csv_cols) - set(cols.keys())
204
- if nonexistent_cols:
205
- raise ValueError(f'{len(nonexistent_cols)} columns from CSV are missing: {", ".join(nonexistent_cols)}')
206
-
207
- # END: Validation ----->>
208
-
209
- # <<----- START: Upload to GCS
210
-
211
- gcs = GCS(self.project)
212
- tmp_dir = f'tmp/upload__{current_datetime_str()}'
213
-
214
- # This will compress while splitting the compressed file to a certain bytes size because of GCS 4GB file limitation
215
- # A single file can produce more than one compressed file in GCS
216
- def producer(src_file: str):
217
- for dst_file in compress(src_file, keep=True, max_size_bytes=ByteSize.GB * 3):
218
- yield (dst_file, )
219
-
220
- def consumer(dst_file: str):
221
- remote_file_name = f'{tmp_dir}/{replace_nonnumeric(os.path.basename(dst_file), "_").lower()}.csv.gz'
222
- logger.debug(f'Uploading {dst_file} to {remote_file_name}...')
223
- blob = gcs.upload(dst_file, remote_file_name, mv=True)
224
- return blob
225
-
226
- blobs: list[storage.Blob]
227
- _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
228
-
229
- # END: Upload to GCS ----->>
230
-
231
- # <<----- START: Load to BQ
232
-
307
+ # Execute
308
+ logger.debug('Executing query ...')
309
+ self.execute_query(query=query, parameters=parameters)
310
+
311
+ def upload_csv(
312
+ self,
313
+ src_filename: str,
314
+ dst_table_fqn: str,
315
+ schema: list[dict] | None = None,
316
+ gcs_bucket: str | None = None,
317
+ partition_by: str = None,
318
+ cluster_cols: list[str] = None,
319
+ compression: DataFileCompression | None = None,
320
+ load_strategy: LoadStrategy = LoadStrategy.APPEND,
321
+ ):
322
+ self.raise_for_invalid_table_fqn(dst_table_fqn)
323
+
324
+ if compression == DataFileCompression.GZIP and not src_filename.endswith('.gz'):
325
+ raise ValueError('Please provide file path with .gz extension if using compression = GZIP')
326
+ elif not src_filename.endswith('.csv'):
327
+ raise ValueError('Please provide file path with .csv extension')
328
+
329
+ # # <<----- START: Upload to GCS
330
+
331
+ # gcs = GCS(self.project_id)
332
+ # tmp_dir = f'tmp/upload__{current_datetime_str()}'
333
+
334
+ # # This will compress while splitting the compressed file to a certain bytes size because of GCS 4GB file limitation
335
+ # # A single file can produce more than one compressed file in GCS
336
+ # def producer(src_file: str):
337
+ # for dst_file in compress(src_file,
338
+ # keep=True, max_size_bytes=ByteSize.GB * 3):
339
+ # yield (dst_file, )
340
+
341
+ # def consumer(dst_file: str):
342
+ # remote_file_name = f'{tmp_dir}/{replace_nonnumeric(os.path.basename(dst_file), "_").lower()}.csv.gz'
343
+ # logger.debug(f'Uploading {dst_file} to {remote_file_name}...')
344
+ # blob = gcs.upload(dst_file, remote_file_name, move=True)
345
+ # return blob
346
+
347
+ # blobs: list[storage.Blob]
348
+ # _, blobs = ThreadingQ().add_producer(producer, src_filename).add_consumer(consumer).execute()
349
+
350
+ # # END: Upload to GCS ----->>
351
+
352
+ # Upload to GCS
353
+ # TODO: Re-implement the producer-consumer model to upload multiple files
354
+ gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
355
+ dst_blobpath = f'tmp/my_bq/{my_datetime.get_current_datetime_str()}/{my_string.replace_nonnumeric(os.path.basename(src_filename), "_").lower()}'
356
+ gcs.upload(src_filename, dst_blobpath)
357
+
358
+ # Load to BQ
233
359
  try:
234
- gcs_filename_fqns = [f'gs://{blob.bucket.name}/{blob.name}' for blob in blobs]
235
- match load_strategy:
236
- case LoadStrategy.OVERWRITE:
237
- self.load_data_into(bq_table_fqn, gcs_filename_fqns, cols, partition_col=partition_col, cluster_cols=cluster_cols, overwrite=True)
238
- case LoadStrategy.APPEND:
239
- self.load_data_into(bq_table_fqn, gcs_filename_fqns, cols, partition_col=partition_col, cluster_cols=cluster_cols)
240
- case _:
241
- return ValueError(f'Load strategy not recognized: {load_strategy}')
242
- except Exception as e:
243
- raise e
360
+ self.load_data(dst_blobpath, dst_table_fqn, schema=schema, partition_by=partition_by, cluster_cols=cluster_cols, format=DataFileFormat.CSV, compression=compression, load_strategy=load_strategy)
361
+ except:
362
+ raise
244
363
  finally:
245
- [GCS.remove_blob(blob) for blob in blobs]
246
-
247
- # END: Load to BQ ----->>
248
-
249
- def download_csv(self, query: str, dst_filepath: str, row_limit: int | None = None):
364
+ gcs.delete_blob(dst_blobpath)
365
+
366
+ def download_csv(
367
+ self,
368
+ query: str,
369
+ dst_filepath: str,
370
+ *,
371
+ gcs_bucket: str | None = None,
372
+ query_parameters: dict = {},
373
+ csv_row_limit: int | None = None,
374
+ ) -> str | list[str]:
250
375
  if not dst_filepath.endswith('.csv'):
251
376
  raise ValueError('Destination filename must ends with .csv')
252
377
 
253
- dst_filepath = os.path.expanduser(dst_filepath) # /path/to/file.csv
254
-
255
- query_job = self.execute_query(query)
256
- query_job_result = query_job.result()
257
- row_count = 0
258
- file_index = 1
259
-
260
- # Stream-download-split result
261
- def open_file(f):
262
- if f:
263
- f.close()
264
- dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{file_index:06}.csv' if row_limit else dst_filepath
265
- logger.info(f'Writing into file: {dst_filepath_part} ...')
266
- f = open(dst_filepath_part, 'w', newline='', encoding='utf-8')
267
- writer = csv.writer(f)
268
- writer.writerow([field.name for field in query_job_result.schema]) # Write header
269
-
270
- return f, writer
271
-
272
- f, writer = open_file(None)
273
- for row in query_job_result:
274
- writer.writerow(row)
275
-
276
- if row_limit:
277
- row_count += 1
278
- if row_count >= row_limit:
279
- row_count = 0
280
- file_index += 1
281
- f, writer = open_file(f)
282
- if f:
283
- f.close()
378
+ # Init
379
+ gcs = my_gcs.GCS(bucket=gcs_bucket, project_id=self.client.project)
380
+
381
+ # Generic function to export-download-combine csv file from BQ->GCS->local
382
+ def _export_download_combine(query: str, dst_gcs_prefix: str, dst_filepath: str, query_parameters: dict = {}):
383
+ # Init tmp directory
384
+ tmp_dirname = f'/tmp/my_bq_{my_datetime.get_current_datetime_str()}'
385
+ if os.path.exists(tmp_dirname):
386
+ shutil.rmtree(tmp_dirname, ignore_errors=True)
387
+ os.makedirs(tmp_dirname, exist_ok=True)
388
+ logger.debug(f'Temporary directory created: {tmp_dirname}')
389
+
390
+ try:
391
+ # Export to GCS
392
+ dst_gcs_uri = f'gs://{gcs.bucket.name}/{dst_gcs_prefix}/*.csv.gz'
393
+ self.export_data(query, dst_gcs_uri, parameters=query_parameters, format=DataFileFormat.CSV, compression=DataFileCompression.GZIP)
394
+
395
+ # Download from GCS
396
+ local_tmp_filepaths = []
397
+ for tmp_blobs in gcs.list_blobs(dst_gcs_prefix):
398
+ local_tmp_filepath = os.path.join(tmp_dirname, tmp_blobs.name.split('/')[-1])
399
+ gcs.download(tmp_blobs, local_tmp_filepath, move=True)
400
+ logger.debug(f'Downloaded {tmp_blobs.name} to {local_tmp_filepath}')
401
+ local_tmp_filepaths.append(local_tmp_filepath)
402
+
403
+ # Combine downloaded files
404
+ my_csv.combine(local_tmp_filepaths, dst_filepath, gzip=True, delete=True)
405
+ except:
406
+ raise
407
+ finally:
408
+ shutil.rmtree(tmp_dirname, ignore_errors=True) # Remove local folder
409
+ [gcs.delete_blob(blob_filepath) for blob_filepath in gcs.list_blobs(dst_gcs_prefix)] # Remove temporary GCS files
410
+
411
+ logger.info(f'Export-download-combine done: {dst_filepath}')
412
+
413
+ # Limited csv rows
414
+ if csv_row_limit:
415
+ tmp_table_fqn: str | None = None
416
+ tmp_table_fqn_rn: str | None = None
417
+ try:
418
+ # Create temporary table
419
+ query_job = self.execute_query(query, temporary_table=True)
420
+ tmp_table_fqn = str(query_job.destination)
421
+ logger.debug(f'Create temp table: {tmp_table_fqn}')
422
+
423
+ # Create another temporary table for row numbering
424
+ query_job = self.execute_query(f'SELECT *, ROW_NUMBER() OVER() AS _rn FROM `{tmp_table_fqn}`', temporary_table=True)
425
+ tmp_table_fqn_rn = str(query_job.destination)
426
+ logger.debug(f'Create temp table (rn): {tmp_table_fqn_rn}')
427
+
428
+ # Process parts
429
+ count = list(self.execute_query(f'SELECT COUNT(1) FROM `{tmp_table_fqn_rn}`').result())[0][0]
430
+ parts = math.ceil(count / csv_row_limit)
431
+ logger.info(f'Total part: {count} / {csv_row_limit} = {parts}')
432
+ dst_filepaths = []
433
+ for part in range(parts):
434
+ dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{part + 1:06}.csv'
435
+ _export_download_combine(
436
+ f'SELECT * EXCEPT(_rn) FROM `{tmp_table_fqn_rn}` WHERE _rn BETWEEN {(part * csv_row_limit) + 1} AND {(part + 1) * csv_row_limit}',
437
+ dst_gcs_prefix=gcs.build_tmp_dirpath(),
438
+ dst_filepath=dst_filepath_part,
439
+ )
440
+ dst_filepaths.append(dst_filepath_part)
441
+ return dst_filepaths
442
+ except:
443
+ raise
444
+ finally:
445
+ # Drop temporary tables
446
+ if tmp_table_fqn_rn:
447
+ self.drop_table(tmp_table_fqn_rn)
448
+ if tmp_table_fqn:
449
+ self.drop_table(tmp_table_fqn)
450
+
451
+ # Unlimited csv rows
452
+ else:
453
+ _export_download_combine(query, gcs.build_tmp_dirpath(), dst_filepath, query_parameters=query_parameters)
454
+ return dst_filepath
455
+
456
+ # query_job_result = query_job.result()
457
+ # row_count = 0
458
+ # file_index = 1
459
+
460
+ # # Stream-download-split result
461
+ # def open_file(f):
462
+ # if f:
463
+ # f.close()
464
+ # dst_filepath_part = f'{dst_filepath.removesuffix(".csv")}_{file_index:06}.csv' if row_limit else dst_filepath
465
+ # logger.info(f'Writing into file: {dst_filepath_part} ...')
466
+ # f = open(dst_filepath_part, 'w', newline='', encoding='utf-8')
467
+ # writer = csv.writer(f)
468
+ # writer.writerow([field.name for field in query_job_result.schema]) # Write header
469
+
470
+ # return f, writer
471
+
472
+ # f, writer = open_file(None)
473
+ # for row in query_job_result:
474
+ # writer.writerow(row)
475
+
476
+ # if row_limit:
477
+ # row_count += 1
478
+ # if row_count >= row_limit:
479
+ # row_count = 0
480
+ # file_index += 1
481
+ # f, writer = open_file(f)
482
+ # if f:
483
+ # f.close()
284
484
 
285
485
  def download_xlsx(self, src_table_fqn: str, dst_filename: str, xlsx_row_limit: int = 950000):
286
486
  if not dst_filename.endswith('.xlsx'):
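The reworked BQ surface: execute_query() now builds ScalarQueryParameter/ArrayQueryParameter objects from a plain parameters dict (types mapped via PY_DATA_TYPE__BQ_DATA_TYPE), load_data() and export_data() generate LOAD DATA / EXPORT DATA statements, and download_csv() routes results through GCS and my_csv.combine() instead of streaming rows. A hedged usage sketch: the project, dataset, table, bucket, and file names below are placeholders, and working credentials (ADC or the env settings above) are assumed.

# Usage sketch of the reworked BQ API; all resource names are placeholders.
from utill.my_bq import BQ, DataFileFormat, DataFileCompression

bq = BQ(location='asia-southeast2', project_id='my-project')

# Scalars become ScalarQueryParameter, lists become ArrayQueryParameter.
bq.execute_query(
    'SELECT * FROM `my-project.my_dataset.orders` WHERE status = @status AND id IN UNNEST(@ids)',
    parameters={'status': 'PAID', 'ids': [1, 2, 3]},
)

# Export a query result to GCS as gzipped CSV (URI must end with .gz).
bq.export_data(
    'SELECT id, status FROM `my-project.my_dataset.orders`',
    'gs://my-bucket/tmp/orders/*.csv.gz',
    format=DataFileFormat.CSV,
    compression=DataFileCompression.GZIP,
)

# Download a query result to local CSV, split into parts of at most 1M rows.
parts = bq.download_csv(
    'SELECT * FROM `my-project.my_dataset.orders`',
    'orders.csv',
    gcs_bucket='my-bucket',
    csv_row_limit=1_000_000,
)

bq.close()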
@@ -302,60 +502,94 @@ class BQ():
302
502
  file_path_tmp = f'{dst_filename}_part{part + 1}'
303
503
  file_path_tmp_csv = f'{file_path_tmp}.csv'
304
504
  self.download_csv(f'SELECT * EXCEPT(_rn) FROM `{table_name_tmp}` WHERE _rn BETWEEN {(part * xlsx_row_limit) + 1} AND {(part + 1) * xlsx_row_limit}', f'{file_path_tmp}{os.sep}')
305
- csv_to_xlsx(file_path_tmp_csv, f'{file_path_tmp}.xlsx')
505
+ my_xlsx.csv_to_xlsx(file_path_tmp_csv, f'{file_path_tmp}.xlsx')
306
506
  os.remove(file_path_tmp_csv)
307
507
  except Exception as e:
308
508
  raise e
309
509
  finally:
310
510
  self.execute_query(f'DROP TABLE IF EXISTS `{table_name_tmp}`')
311
511
 
312
- def copy_table(self, src_table_id: str, dst_table_id: str, drop: bool = False):
313
- # Create or replace
314
- self.client.delete_table(dst_table_id, not_found_ok=True)
315
- self.client.copy_table(src_table_id, dst_table_id).result()
316
- logger.debug(f'Table {src_table_id} copied to {dst_table_id}')
317
-
318
- if drop:
319
- self.client.delete_table(src_table_id)
320
- logger.debug(f'Table {src_table_id} dropped')
321
-
322
- def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
323
- src_project_id, src_dataset_id, _ = src_view_id.split('.')
324
- dst_project_id, dst_dataset_id, _ = dst_view_id.split('.')
325
-
326
- # Create or replace
327
- src_view = self.client.get_table(src_view_id)
328
- dst_view = bigquery.Table(dst_view_id)
329
- dst_view.view_query = src_view.view_query.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
330
- self.client.delete_table(dst_view, not_found_ok=True)
331
- self.client.create_table(dst_view)
332
- logger.debug(f'View {src_view_id} copied to {dst_view}')
333
-
334
- if drop:
335
- self.client.delete_table(src_view_id)
336
- logger.debug(f'View {src_view_id} dropped')
337
-
338
- def copy_routine(self, src_routine_id: str, dst_routine_id: str, drop: bool = False):
339
- src_project_id, src_dataset_id, _ = src_routine_id.split('.')
340
- dst_project_id, dst_dataset_id, _ = dst_routine_id.split('.')
341
-
342
- # Create or replace
343
- src_routine = self.client.get_routine(src_routine_id)
344
- dst_routine = bigquery.Routine(dst_routine_id)
345
- dst_routine.body = src_routine.body.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
346
- dst_routine.type_ = src_routine.type_
347
- dst_routine.description = src_routine.description
348
- dst_routine.language = src_routine.language
349
- dst_routine.arguments = src_routine.arguments
350
- dst_routine.return_type = src_routine.return_type
351
- self.client.delete_routine(dst_routine, not_found_ok=True)
352
- self.client.create_routine(dst_routine)
353
- logger.debug(f'Routine {src_routine_id} copied to {dst_routine_id}')
354
-
355
- if drop:
356
- self.client.delete_routine(src_routine_id)
357
- logger.debug(f'Routine {src_routine_id} dropped')
358
-
359
- def close_client(self):
512
+ # def copy_view(self, src_view_id: str, dst_view_id: str, drop: bool = False):
513
+ # src_project_id, src_dataset_id, _ = src_view_id.split('.')
514
+ # dst_project_id, dst_dataset_id, _ = dst_view_id.split('.')
515
+
516
+ # # Create or replace
517
+ # src_view = self.client.get_table(src_view_id)
518
+ # dst_view = bigquery.Table(dst_view_id)
519
+ # dst_view.view_query = src_view.view_query.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
520
+ # self.client.delete_table(dst_view, not_found_ok=True)
521
+ # self.client.create_table(dst_view)
522
+ # logger.debug(f'View {src_view_id} copied to {dst_view}')
523
+
524
+ # if drop:
525
+ # self.client.delete_table(src_view_id)
526
+ # logger.debug(f'View {src_view_id} dropped')
527
+
528
+ # def copy_routine(self, src_routine_id: str, dst_routine_id: str, drop: bool = False):
529
+ # src_project_id, src_dataset_id, _ = src_routine_id.split('.')
530
+ # dst_project_id, dst_dataset_id, _ = dst_routine_id.split('.')
531
+
532
+ # # Create or replace
533
+ # src_routine = self.client.get_routine(src_routine_id)
534
+ # dst_routine = bigquery.Routine(dst_routine_id)
535
+ # dst_routine.body = src_routine.body.replace(f'{src_project_id}.{src_dataset_id}', f'{dst_project_id}.{dst_dataset_id}')
536
+ # dst_routine.type_ = src_routine.type_
537
+ # dst_routine.description = src_routine.description
538
+ # dst_routine.language = src_routine.language
539
+ # dst_routine.arguments = src_routine.arguments
540
+ # dst_routine.return_type = src_routine.return_type
541
+ # self.client.delete_routine(dst_routine, not_found_ok=True)
542
+ # self.client.create_routine(dst_routine)
543
+ # logger.debug(f'Routine {src_routine_id} copied to {dst_routine_id}')
544
+
545
+ # if drop:
546
+ # self.client.delete_routine(src_routine_id)
547
+ # logger.debug(f'Routine {src_routine_id} dropped')
548
+
549
+ # MARK: Utilities
550
+
551
+ @staticmethod
552
+ def get_table_fqn_parts(name: str | list[str]) -> list[str] | list[list[str]]:
553
+ """Get fully qualified table name, following this format `<projectid>.<datasetid>.<tableid>`
554
+
555
+ Args:
556
+ name (str | list[str]): Input name (can be multiple)
557
+
558
+ Returns:
559
+ list[str] | list[list[str]]: The FQN parts. If the input is list then returns list of FQN parts instead.
560
+ """
561
+
562
+ if isinstance(name, list):
563
+ return [BQ.get_table_fqn_parts(x) for x in name]
564
+
565
+ split = name.split('.')
566
+ if len(split) == 3:
567
+ return split
568
+ else:
569
+ raise ValueError(f'{name} is not a valid table FQN')
570
+
571
+ @staticmethod
572
+ def raise_for_invalid_table_fqn(name: str | list[str]):
573
+ """Raise an error if the provided name is not a fully qualified table name
574
+
575
+ Args:
576
+ name (str | list[str]): Input name (can be multiple)
577
+
578
+ Raises:
579
+ ValueError: If name is not a fully qualified table name
580
+ """
581
+
582
+ if not BQ.get_table_fqn_parts(name):
583
+ raise ValueError(f'{name} is not a valid table FQN')
584
+
585
+ def is_table_exists(self, table_fqn: str) -> bool:
586
+ self.raise_for_invalid_table_fqn(table_fqn)
587
+ try:
588
+ self.client.get_table(table_fqn)
589
+ return True
590
+ except NotFound:
591
+ return False
592
+
593
+ def close(self):
360
594
  self.client.close()
361
595
  logger.debug('BQ client close')
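The new static helpers treat `<projectid>.<datasetid>.<tableid>` as the canonical fully qualified name: get_table_fqn_parts() splits it (or raises ValueError), raise_for_invalid_table_fqn() wraps that check, and is_table_exists() probes the table with client.get_table(), treating NotFound as absence. A small sketch with illustrative table names; is_table_exists() still needs real credentials to run.

# Sketch of the new FQN helpers; table names are illustrative only.
from utill.my_bq import BQ

project_id, dataset_id, table_id = BQ.get_table_fqn_parts('my-project.my_dataset.orders')

BQ.raise_for_invalid_table_fqn('my-project.my_dataset.orders')  # valid, returns silently
try:
    BQ.raise_for_invalid_table_fqn('my_dataset.orders')          # missing the project id
except ValueError as e:
    print(e)  # my_dataset.orders is not a valid table FQN

bq = BQ(project_id='my-project')
if not bq.is_table_exists('my-project.my_dataset.orders'):
    print('table does not exist yet')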
utill/my_csv.py CHANGED
@@ -57,13 +57,13 @@ def compress(src_filename: str, keep: bool = False, max_size_bytes=ByteSize.GB,
57
57
  yield dst_filename
58
58
 
59
59
 
60
- def combine(src_filenames: list[str], dst_filename: str) -> None:
60
+ def combine(src_filenames: list[str], dst_filename: str, gzip: bool = False, delete: bool = False) -> None:
61
61
  csv.field_size_limit(min(sys.maxsize, 2147483646)) # FIX: _csv.Error: field larger than field limit (131072)
62
62
 
63
63
  if not dst_filename.endswith('.csv'):
64
64
  raise ValueError('Output filename must ends with \'.csv\'!')
65
65
 
66
- first_file = True
66
+ first_src_file = True
67
67
  with open(dst_filename, 'w') as fout:
68
68
  csvwriter = csv.writer(fout)
69
69
 
@@ -71,21 +71,25 @@ def combine(src_filenames: list[str], dst_filename: str) -> None:
71
71
  src_filename = os.path.expanduser(src_filename)
72
72
 
73
73
  # Decompress gzipped csv
74
- if src_filename.endswith('.csv.gz'):
74
+ if gzip:
75
75
  src_filename = decompress(src_filename)
76
76
 
77
- # Copy
77
+ # Write content into file
78
78
  with open(src_filename, 'r') as fin:
79
79
  csvreader = csv.reader(fin)
80
80
 
81
- # Copy the header if this is the first file
82
- if first_file:
81
+ # Write header only at first file
82
+ if first_src_file:
83
83
  csvwriter.writerow(next(csvreader))
84
- first_file = False
85
- # Else, skip the header
84
+ first_src_file = False
86
85
  else:
87
86
  next(csvreader)
88
87
 
88
+ # Write body
89
89
  [csvwriter.writerow(row) for row in csvreader]
90
90
 
91
- logger.info(f'Combine {src_filename}')
91
+ logger.debug(f'Combine {src_filename}')
92
+
93
+ if delete:
94
+ os.remove(src_filename)
95
+ logger.debug(f'Delete {src_filename}')
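combine() now takes gzip and delete flags: with gzip=True every source part is decompressed first, the header row is written only once, and delete=True removes each part after its rows are copied (this is how download_csv() stitches the exported GCS parts back together). A short sketch; the part filenames are placeholders.

# Merge gzipped CSV parts into a single file and clean up the parts.
from utill import my_csv

parts = ['orders_000001.csv.gz', 'orders_000002.csv.gz']  # placeholder part files
my_csv.combine(parts, 'orders.csv', gzip=True, delete=True)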
utill/my_datetime.py CHANGED
@@ -11,7 +11,7 @@ def get_current_date_str(use_separator: bool = False) -> str:
11
11
  return datetime.now().strftime('%Y-%m-%d' if use_separator else '%Y%m%d')
12
12
 
13
13
 
14
- def current_datetime_str(use_separator: bool = False) -> str:
14
+ def get_current_datetime_str(use_separator: bool = False) -> str:
15
15
  return datetime.now().strftime('%Y-%m-%d %H:%M:%S' if use_separator else '%Y%m%d%H%M%S')
16
16
 
17
17
 
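Only a rename here: current_datetime_str() becomes get_current_datetime_str(), matching get_current_date_str(); callers in my_bq and my_gcs were updated accordingly. For reference:

# The renamed helper; the values below are examples of the two formats.
from utill.my_datetime import get_current_datetime_str

get_current_datetime_str()                     # e.g. '20240131235959'
get_current_datetime_str(use_separator=True)   # e.g. '2024-01-31 23:59:59'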
utill/my_env.py CHANGED
@@ -49,6 +49,7 @@ def init_mb_file():
49
49
  class Envs(BaseSettings):
50
50
 
51
51
  GCP_PROJECT_ID: Optional[str] = None
52
+ GCP_REGION: Optional[str] = None
52
53
  GCS_BUCKET: Optional[str] = None
53
54
 
54
55
  def set_var(self, k: str, v: str):
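Envs extends pydantic's BaseSettings, so the new optional GCP_REGION field can also come from the process environment rather than the interactive init; BQ() then uses it as the default query location. A hedged sketch assuming the default BaseSettings behaviour of reading environment variables when the envs singleton is constructed; the region value is a placeholder.

# Supply GCP_REGION via the environment before utill.my_env is first imported.
import os
os.environ['GCP_REGION'] = 'asia-southeast2'  # placeholder region

from utill.my_env import envs
print(envs.GCP_REGION)  # picked up by BQ() as the default location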
utill/my_file.py CHANGED
@@ -28,7 +28,7 @@ def decompress(src_file: str, keep: bool = False):
28
28
  dst_file = src_file.removesuffix('.gz')
29
29
 
30
30
  os.remove(dst_file) if os.path.exists(dst_file) else None
31
- logger.debug(f'📄 Decompress {src_file} --> {dst_file}')
31
+ logger.debug(f'Decompress {src_file} to {dst_file}')
32
32
  with gzip.open(src_file, 'rb') as f_in:
33
33
  with open(dst_file, 'wb') as f_out:
34
34
  shutil.copyfileobj(f_in, f_out)
utill/my_gcs.py CHANGED
@@ -1,123 +1,79 @@
1
- import os
2
- import re
3
-
1
+ from .my_datetime import get_current_datetime_str
2
+ from .my_env import envs
4
3
  from google.cloud import storage
5
4
  from loguru import logger
6
-
7
- from .my_env import envs
5
+ import os
8
6
 
9
7
 
10
8
  class GCS:
11
9
 
12
- def __init__(self, project: str = None, service_account_filename: str = None, bucket_name: str = None):
13
- self.project = project if project is not None else envs.GCP_PROJECT_ID
14
-
15
- if service_account_filename is not None:
16
- self.client = storage.Client.from_service_account_json(service_account_filename)
17
- else:
18
- self.client = storage.Client(project=self.project)
10
+ def __init__(self, bucket: str | None = None, project_id: str | None = None):
11
+ if project_id is None and envs.GCP_PROJECT_ID is None:
12
+ logger.warning('Using ADC for GCS authentication')
19
13
 
20
- bucket_name_parts = (bucket_name or envs.GCS_BUCKET).split('/')
21
- self.change_bucket(bucket_name_parts[0])
22
- self.base_path = '/'.join(bucket_name_parts[1:]) if len(bucket_name_parts) > 1 else None
23
- not self.base_path or logger.debug(f'Base path: {self.base_path}')
14
+ if bucket is None and envs.GCS_BUCKET is None:
15
+ raise ValueError('Bucket name must be provided either as an argument or set in environment variables.')
24
16
 
17
+ self.client = storage.Client(project=project_id or envs.GCP_PROJECT_ID)
18
+ self.bucket = self.client.bucket(bucket or envs.GCS_BUCKET)
25
19
  logger.debug(f'GCS client open, project: {self.client.project}')
26
20
 
27
- def __enter__(self):
28
- return self
29
-
30
- def __exit__(self, exc_type, exc_value, exc_tb):
31
- self.close_client()
32
-
33
- def _construct_path(self, path: str) -> str:
34
- return f'{self.base_path}/{path}' if self.base_path else path
35
-
36
- def change_bucket(self, bucket_name: str):
37
- if not bucket_name:
38
- raise ValueError('Bucket name needed')
39
- self.bucket = self.client.bucket(bucket_name)
40
- logger.debug(f'Change bucket to {self.bucket.name}')
41
-
42
- def get(self, path: str) -> storage.Blob:
43
- path = self._construct_path(path)
44
- return self.bucket.blob(path)
45
-
46
- def list(self, path: str) -> list[storage.Blob]:
47
- path = self._construct_path(path)
48
- if '*' in path:
49
- path_prefix = path.split('*')[0]
50
- regex_pattern = '^' + re.escape(path).replace('\\*', '.*') + '$'
51
- regex = re.compile(regex_pattern)
52
- return [x for x in self.bucket.list_blobs(prefix=path_prefix) if regex.match(x.name)]
53
-
54
- return list(self.bucket.list_blobs(prefix=path))
55
-
56
- def copy(self, src_path: str, dst_path: str, mv: bool = False):
57
- src_blob = self.get(src_path)
58
- dst_blob = self.get(dst_path)
59
-
60
- dst_blob.rewrite(src_blob)
21
+ def get_blob(self, blobpath: str) -> storage.Blob:
22
+ return self.bucket.blob(blobpath)
61
23
 
62
- logger.debug(f'✅ Copy gs://{src_blob.bucket.name}/{src_blob.name} to gs://{dst_blob.bucket.name}/{dst_blob.name}')
24
+ def list_blobs(self, prefix: str) -> list[storage.Blob]:
25
+ return self.bucket.list_blobs(prefix=prefix)
63
26
 
64
- not mv or GCS.remove_blob(src_blob)
27
+ def delete_blob(self, blobpath: str | storage.Blob) -> storage.Blob:
28
+ blob = self.get_blob(blobpath) if isinstance(blobpath, str) else blobpath
29
+ return blob.delete()
65
30
 
66
- return dst_blob
31
+ def copy(self, src_blobpath: str, dst_blobpath: str, dst_bucket: str = None, move: bool = False):
32
+ src_bucket = self.bucket
33
+ src_blob = self.get_blob(src_blobpath)
34
+ dst_bucket = dst_bucket or src_bucket.name
67
35
 
68
- def copy_to_other_gcs(self, src_blob: storage.Blob, dst_gcs: "GCS", dst_path: str, mv: bool = False):
69
- self.bucket.copy_blob(src_blob, dst_gcs.bucket, dst_path)
70
- dst_blob = dst_gcs.get(dst_path)
36
+ self.bucket.copy_blob(src_blob, dst_bucket, dst_blobpath)
71
37
 
72
- not mv or GCS.remove_blob(src_blob)
73
-
74
- return dst_blob
75
-
76
- def upload(self, local_path: str, remote_path: str, mv: bool = False):
77
- local_path = os.path.expanduser(local_path)
78
-
79
- if not os.path.exists(local_path):
80
- raise FileNotFoundError(f'File not found: {local_path}')
81
-
82
- blob = self.get(remote_path)
83
- blob.upload_from_filename(local_path)
84
-
85
- logger.debug(f'✅ Upload {local_path} to gs://{self.bucket.name}/{blob.name}')
86
-
87
- not mv or os.remove(local_path)
88
-
89
- return blob
90
-
91
- def download(self, obj: str | storage.Blob, local_path: str, mv: bool = False):
92
- local_path = os.path.expanduser(local_path)
93
- is_blob = type(obj) == storage.Blob
94
-
95
- if os.path.isdir(local_path):
96
- local_path = os.path.join(local_path, obj.name.split('/')[-1] if is_blob else os.path.basename(obj))
97
- if not os.path.dirname(local_path):
98
- raise FileNotFoundError(f'Destination directory not found: {os.path.dirname(local_path)}')
99
-
100
- blob = obj if is_blob else self.get(obj)
101
- blob.download_to_filename(local_path)
38
+ # Move mode
39
+ if move:
40
+ self.delete_blob(src_blobpath)
41
+ logger.debug(f'Moved gs://{src_bucket}/{src_blobpath} to gs://{dst_bucket}/{dst_blobpath}')
42
+ # Copy mode
43
+ else:
44
+ logger.debug(f'Copied gs://{src_bucket}/{src_blobpath} to gs://{dst_bucket}/{dst_blobpath}')
102
45
 
103
- logger.debug(f'✅ Download gs://{self.bucket.name}/{blob.name} to {local_path}')
46
+ def upload(self, src_filepath: str, dst_blobpath: str, move: bool = False):
47
+ blob = self.get_blob(dst_blobpath)
48
+ blob.upload_from_filename(src_filepath)
104
49
 
105
- not mv or GCS.remove_blob(blob)
50
+ # Move mode
51
+ if move:
52
+ os.remove(src_filepath)
53
+ logger.debug(f'Moved {src_filepath} to gs://{self.bucket.name}/{blob.name}')
54
+ # Copy mode
55
+ else:
56
+ logger.debug(f'Uploaded {src_filepath} to gs://{self.bucket.name}/{blob.name}')
106
57
 
107
- return blob
58
+ def download(self, src_blobpath: str | storage.Blob, dst_filepath: str, move: bool = False):
59
+ blob = self.get_blob(src_blobpath) if isinstance(src_blobpath, str) else src_blobpath
60
+ blob.download_to_filename(dst_filepath)
108
61
 
109
- def remove(self, remote_path: str):
110
- blob = self.get(remote_path)
62
+ if move:
63
+ self.delete_blob(blob)
64
+ logger.debug(f'Moved gs://{self.bucket.name}/{blob.name} to {dst_filepath}')
65
+ else:
66
+ logger.debug(f'Copied gs://{self.bucket.name}/{blob.name} to {dst_filepath}')
111
67
 
112
- GCS.remove_blob(blob)
68
+ # MARK: Utilities
113
69
 
114
- return blob
70
+ @staticmethod
71
+ def build_tmp_dirpath(prefix: str = 'tmp') -> str:
72
+ """
73
+ Builds a temporary directory path in the GCS bucket.
74
+ """
75
+ return f'{prefix}/{get_current_datetime_str()}'
115
76
 
116
- def close_client(self):
77
+ def close(self):
117
78
  self.client.close()
118
- logger.debug('GCS client close')
119
-
120
- @staticmethod
121
- def remove_blob(blob: storage.Blob):
122
- blob.delete()
123
- logger.debug(f'🗑️ Remove gs://{blob.bucket.name}/{blob.name}')
79
+ logger.debug('GCS client closed')
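The rewritten wrapper drops the base_path and wildcard-listing logic: the constructor takes bucket and project_id, and the blob operations are get_blob(), list_blobs(), upload(), download(), copy(), and delete_blob(), with a move flag where it applies. A usage sketch; the bucket, project, and paths are placeholders and ADC credentials (or the env settings) are assumed.

# Usage sketch of the simplified GCS wrapper; names are placeholders.
from utill.my_gcs import GCS

gcs = GCS(bucket='my-bucket', project_id='my-project')

gcs.upload('report.csv', 'tmp/report.csv')         # local -> gs://my-bucket/tmp/report.csv
for blob in gcs.list_blobs('tmp/'):
    print(blob.name)
gcs.download('tmp/report.csv', 'report_copy.csv')  # GCS -> local copy
gcs.delete_blob('tmp/report.csv')

print(GCS.build_tmp_dirpath())                     # e.g. 'tmp/20240131235959'
gcs.close()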
utill/my_queue.py CHANGED
@@ -1,7 +1,82 @@
1
- import queue
1
+ from loguru import logger
2
+ from typing import Callable
2
3
  import concurrent.futures
4
+ import queue
5
+
6
+ class StreamingQ:
7
+ def __init__(self, producer_func: Callable, producer_args: tuple, consumer_func: Callable, max_queue_size: int = 0):
8
+ self.producer_func = producer_func
9
+ self.producer_args = producer_args
10
+ self.consumer_func = consumer_func
11
+
12
+ # Use maxsize for backpressure control (0 = unlimited)
13
+ self.q = queue.Queue(maxsize=max_queue_size)
14
+
15
+ def execute(self):
16
+ """
17
+ Execute producer and consumer with true streaming using generators.
18
+ Yields consumer results as they become available.
19
+ """
20
+ def producer():
21
+ try:
22
+ for item in self.producer_func(*self.producer_args):
23
+ self.q.put(item)
24
+ logger.debug(f'🌾 Produced {item}')
25
+ except Exception as e:
26
+ logger.error(f'Producer error: {e}')
27
+ self.q.put(('ERROR', e))
28
+ finally:
29
+ # Signal end of production
30
+ self.q.put(None)
31
+ logger.debug('🌾 Producer finished')
32
+
33
+ def consumer():
34
+ while True:
35
+ item = self.q.get()
36
+
37
+ if item is None:
38
+ # End of stream signal
39
+ self.q.task_done()
40
+ break
41
+
42
+ if isinstance(item, tuple) and item[0] == 'ERROR':
43
+ # Propagate producer error
44
+ self.q.task_done()
45
+ raise item[1]
46
+
47
+ try:
48
+ # Unpack item if it's a tuple, otherwise pass as single arg
49
+ if isinstance(item, tuple):
50
+ result = self.consumer_func(*item)
51
+ else:
52
+ result = self.consumer_func(item)
53
+
54
+ self.q.task_done()
55
+ logger.debug(f'🔥 Consumed {item} -> {result}')
56
+ yield result
57
+
58
+ except Exception as e:
59
+ self.q.task_done()
60
+ logger.error(f'Consumer error processing {item}: {e}')
61
+ raise
62
+
63
+ with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
64
+ # Start producer in background
65
+ future_producer = executor.submit(producer)
66
+
67
+ try:
68
+ # Yield results as they become available
69
+ for result in consumer():
70
+ yield result
71
+
72
+ # Wait for producer to complete
73
+ future_producer.result()
74
+
75
+ except Exception as e:
76
+ # Cancel producer if consumer fails
77
+ future_producer.cancel()
78
+ raise
3
79
 
4
- from loguru import logger
5
80
 
6
81
 
7
82
  class ThreadingQ:
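StreamingQ is the new piece here: the producer generator's items go onto a bounded queue.Queue, a None sentinel marks the end of production, and execute() is itself a generator that unpacks tuple items into the consumer and yields each result as it becomes available, so callers can stream results instead of waiting for the whole batch. A self-contained sketch with toy producer/consumer functions.

# Minimal StreamingQ example with toy producer/consumer functions.
from utill.my_queue import StreamingQ

def produce_numbers(n):
    for i in range(n):
        yield (i,)              # tuples are unpacked into consumer arguments

def square(i):
    return i * i

q = StreamingQ(produce_numbers, (5,), square, max_queue_size=2)
for result in q.execute():
    print(result)               # prints 0, 1, 4, 9, 16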