toolsbq-0.1.0.tar.gz

toolsbq-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 MH
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
toolsbq-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,387 @@
+ Metadata-Version: 2.4
+ Name: toolsbq
+ Version: 0.1.0
+ Summary: Helpers for Google BigQuery: client creation, schema helpers, and a convenience BqTools wrapper.
+ Author: MH
+ License-Expression: MIT
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: google-cloud-bigquery>=3.0.0
+ Requires-Dist: google-auth>=2.0.0
+ Dynamic: license-file
+
+
+ # toolsbq
+
+ Utilities for working with **Google BigQuery** in Python.
+
+ This package was extracted from a standalone script and packaged for easy reuse.
+
+ ## Install
+
+ ```bash
+ pip install toolsbq
+ ```
+
+ ## Quick start
+
+ ```python
+ from toolsbq import bq_get_client, BqTools
+
+ client = bq_get_client()  # uses ADC by default (recommended on Cloud Run / Functions)
+ bq = BqTools(client)
+ ```
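+
+ From here, a minimal query roundtrip (a sketch based on the `runsql` method and `sql_result` attribute documented in the example notes below) looks like:
+
+ ```python
+ bq.runsql("SELECT 1 AS ok")  # runs the statement and stores the result on the wrapper
+ for row in bq.sql_result:
+     print(row)
+ ```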
+
+ ## Authentication options
+
+ `bq_get_client()` supports:
+
+ - **ADC (Application Default Credentials)**: the default when no credentials are provided
+ - **Service account key file**: pass `path_keyfile="~/.config/.../key.json"` (supports `~` and `$HOME` expansion)
+ - **Service account info dict**: pass `keyfile_json={...}` (already-parsed JSON)
+
+ Examples:
+
+ ```python
+ from toolsbq import bq_get_client
+
+ # 1) ADC
+ client = bq_get_client(project_id="YOUR_PROJECT_ID")
+
+ # 2) Service account key file (path normalization supported)
+ client = bq_get_client(path_keyfile="~/.config/gcloud/sa-keys/keyname.json", project_id="YOUR_PROJECT_ID")
+
+ # 3) Service account info dict
+ client = bq_get_client(keyfile_json={"type": "service_account", "project_id": "YOUR_PROJECT_ID", "...": "..."})
+ ```
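+
+ For local development without a key file, ADC is typically bootstrapped once with the standard `gcloud` CLI (general Google Cloud tooling, not part of this package):
+
+ ```bash
+ # cache user credentials where Application Default Credentials can find them
+ gcloud auth application-default login
+
+ # optionally pin the default project picked up by client libraries
+ gcloud config set project YOUR_PROJECT_ID
+ ```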
+
+ ## Examples
+
+ The original script contained the following guidance and example notes:
+
+ ```text
+ # ===============================================================================
+ # 0) Define overall variables for uploads
+ # ===============================================================================
+
+ datetime_system = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ # datetime_utc = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')
+ datetime_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+ print("Current datetime system:", datetime_system)
+ print("Current datetime UTC   :", datetime_utc)
+
+ # ===============================================================================
+ # 1) Provide BQ auth via file path / via JSON string
+ # ===============================================================================
+
+ # path_keyfile = "~/.config/gcloud/sa-keys/keyfile.json"
+ #
+ # # client = bq_get_client(sql_keyfile_json=sql_keyfile_json)
+ # client = bq_get_client(path_keyfile=path_keyfile)
+ # # client = bq_get_client(keyfile_json=keyfile_json)
+ #
+ # # pass None for a test (does not create an actual client)
+ # # client = None
+
+ # NEW default: ADC
+ client = bq_get_client()
+
+ # ===============================================================================
+ # 2) Example fields_schema fields to copy over
+ # ===============================================================================
+
+ # bq_upload = BqTools(
+ #     bq_client=client,
+ #     table_id="",
+ #     fields_schema=[
+ #         # fields list: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
+ #         {"name": "", "type": "INT64", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "INT64", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "STRING", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "STRING", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "DATE", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "DATE", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "DATETIME", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "DATETIME", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "TIMESTAMP", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "TIMESTAMP", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "NUMERIC", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "NUMERIC", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "BOOL", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "BOOL", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "", "type": "JSON", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "", "type": "JSON", "isKey": 0, "mode": "required", "default": None},
+
+ #         {"name": "last_updated", "type": "TIMESTAMP", "isKey": 0, "mode": "required", "default": "current_timestamp"},
+ #     ],
+ #     # https://cloud.google.com/bigquery/docs/creating-partitioned-tables#python
+ #     # https://cloud.google.com/bigquery/docs/creating-clustered-tables
+ #     # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TimePartitioning
+ #     table_options={
+ #         "partition_field": None,
+ #         "cluster_fields": [],  # max 4 fields - by order provided
+ #         "partition_expiration_days": None,  # number of days for expiration (0.08 = 2 hours) -> creates options
+ #         # fields to define expiring partition by ingestion -> need partition_expiration_days too
+ #         "is_expiring_partition_ingestion_hour": None,  # defines expiring partitioning by ingestion time - by hour
+ #         "is_expiring_partition_ingestion_date": None,  # defines expiring partitioning by ingestion time - by date
+ #     },
+ #     table_suffix="xxxxxx"
+ # )
+
+ # ===============================================================================
+ # 3) Simplest, most basic Tools connection to run a query / pull data / get total rows
+ # ===============================================================================
+
+ # # to simply run a query without doing anything else
+ # bq_pull = BqTools(
+ #     bq_client=client,
+ # )
+ #
+ # query = """
+ # SELECT * FROM testdb.testproject.testtable LIMIT 5;
+ # """
+ #
+ # print("Total rows in table:", bq_pull.get_row_count("testdb.testproject.testtable"))
+ # # quit()
+ #
+ # bq_pull.runsql(query)
+ # print(bq_pull.sql_result)
+ # for row in bq_pull.sql_result:
+ #     print(row)
+
+ # ===============================================================================
+ # 4) Create a table by defining a schema and then running the create-table query
+ # ===============================================================================
+ # client = None
+
+ # bq_new_table = BqTools(
+ #     bq_client=client,
+ #     table_id="testdb.testproject.testtable",
+ #     fields_schema=[
+ #         {
+ #             "name": "employee_id",
+ #             "type": "int64",
+ #             "isKey": 1,
+ #             "mode": "nullable",
+ #             "default": None,
+ #         },
+ #         {"name": "stats_date", "type": "date", "isKey": 1, "mode": "nullable", "default": None},
+ #         {
+ #             "name": "annual_ctc",
+ #             "type": "int64",
+ #             "isKey": 0,
+ #             "mode": "nullable",
+ #             "default": None,
+ #         },
+ #         {
+ #             "name": "last_updated",
+ #             "type": "timestamp",
+ #             "isKey": 0,
+ #             "mode": "required",
+ #             "default": "current_timestamp",
+ #         },
+ #     ],
+ #     # table_options={
+ #     #     "time_partition_field": None,  # uses the _PARTITIONTIME pseudo-column if no field is set
+ #     #     "time_partitioning_type": "HOUR",  # day, hour, month, year -> nothing: day
+ #     #     "expiration_ms": 3600000,  # 1 hour
+ #     #     "cluster_fields": [],  # max 4 fields - by order provided
+ #     # },
+ #     table_options={
+ #         "partition_field": "stats_date",
+ #         "cluster_fields": ["employee_id"],  # max 4 fields - by order provided
+ #         "partition_expiration_days": None,  # number of days for expiration (0.08 = 2 hours) -> creates options
+ #         # fields to define expiring partition by ingestion -> need partition_expiration_days too
+ #         "is_expiring_partition_ingestion_hour": None,  # defines expiring partitioning by ingestion time - by hour
+ #         "is_expiring_partition_ingestion_date": None,  # defines expiring partitioning by ingestion time - by date
+ #     },
+ #     table_suffix="xxxxxx",
+ # )
+ #
+ # print(bq_new_table.create_table_query)
+ # print(bq_new_table.merge_query)
+ # print(bq_new_table.table_id_temp)
+ # # quit()
+ #
+ # bq_new_table.run_create_table_main()
+ # quit()
+
+ # # drop table via manual query
+ # # bq_new_table.runsql("drop table if exists {}".format(bq_new_table.table_id))
+ # # print("table dropped")
+
+ # ===============================================================================
+ # 5) Simple client to insert all rows into an existing table (creating duplicates, no upsert), no schema needed
+ # ===============================================================================
+
+ # rows_to_insert = [
+ #     {"employee_id": 157, "annual_ctc": 182},
+ #     {"employee_id": 158, "annual_ctc": 183},
+ #     {"employee_id": 159, "annual_ctc": 184},
+ #     {"employee_id": 160, "annual_ctc": 1840},
+ #     {"employee_id": 161, "annual_ctc": 1840},
+ #     {"employee_id": 1000, "annual_ctc": 5000},
+ # ]
+ # print("number of rows:", len(rows_to_insert))
+
+ # # 5a) generic -> define table name in function call
+ # bq_insert = BqTools(
+ #     bq_client=client,
+ # )
+ # bq_insert.insert_stream_generic("testdb.testproject.testtable", rows_to_insert, max_rows_per_request=1000)
+
+ # # 5b) table_id in class definition
+ # bq_insert = BqTools(
+ #     bq_client=client,
+ #     table_id="testdb.testproject.testtable",
+ # )
+ # bq_insert.insert_stream_table_main(rows_to_insert, max_rows_per_request=1000)
+
+ # ===============================================================================
+ # 6) Upsert example: define schema, insert all values into a temp table, use a specific suffix and UUID
+ # ===============================================================================
+
+ # rows_to_insert = [
+ #     {"employee_id": 1579, "annual_ctc": 182},
+ #     {"employee_id": 1589, "annual_ctc": 183},
+ #     {"employee_id": 1599, "annual_ctc": 1840},
+ #     {"employee_id": 160, "annual_ctc": 18400},
+ #     {"employee_id": 161, "annual_ctc": 18400},
+ #     {"employee_id": 1000, "annual_ctc": 50000},
+ # ]
+ # print("number of rows:", len(rows_to_insert))
+
+ # bq_upsert = BqTools(
+ #     bq_client=client,
+ #     table_id="testdb.testproject.testtable",
+ #     fields_schema=[
+ #         {"name": "employee_id", "type": "int64", "isKey": 1, "mode": "nullable", "default": None},
+ #         {"name": "stats_date", "type": "date", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "annual_ctc", "type": "int64", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "last_updated", "type": "timestamp", "isKey": 0, "mode": "required", "default": "current_timestamp"},
+ #     ],
+ #     table_options={
+ #         # "partition_field": "stats_date",
+ #         "cluster_fields": ["employee_id"],  # max 4 fields - by order provided
+ #     },
+ #     # run_uuid="xxx-xxx-xxx-xxx",  # a UUID can be passed in to re-use a connection while the upsert keeps working
+ #     # table_suffix=None,
+ #     table_suffix="skoeis",  # use a different table_suffix for each upsert definition (e.g. when a different number of columns is updated)
+ # )
+
+ # # Generate a UUID in normal code, if we want to pass it over in the tools definition
+ # # uuid_test = uuid4()
+ # # print(uuid_test)
+
+ # print("the uuid is:", bq_upsert.run_uuid)
+ # print(bq_upsert.table_id)
+ # # print(json.dumps(bq_upsert.fields_schema, indent=2))
+ # print(bq_upsert.table_id_temp)
+ # print("schema is safe:", bq_upsert.schema_is_safe)
+ # # print(json.dumps(bq_upsert.fields_schema_temp, indent=2))
+ # # print("create main table:", bq_upsert.create_table_query)
+ # # bq_upsert.run_create_table_main()
+ # # print("create temp table:", bq_upsert.create_table_query_temp)
+ # print("merge query:", bq_upsert.merge_query)
+
+ # # run the upsert
+ # bq_upsert.run_upsert(rows_to_insert)
+
+ # # check run_uuid and merge query after the upsert (they should have changed now)
+ # print("the uuid is:", bq_upsert.run_uuid)
+ # print("merge query:", bq_upsert.merge_query)
+
+ # # force-run only the merge query --> run_uuid must be set to the proper run_uuid first!
+ # # bq_upsert.run_merge()
+
+ # ===============================================================================
+ # 7) Load job with a defined schema into a new/existing table (from a mysql results dict)
+ # ===============================================================================
+
+ # bq_load = BqTools(
+ #     bq_client=client,
+ #     table_id="testdb.testproject.testtable",
+ #     fields_schema=[
+ #         {"name": "employee_id", "type": "int64", "isKey": 1, "mode": "nullable", "default": None},
+ #         {"name": "stats_date", "type": "date", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "annual_ctc", "type": "int64", "isKey": 0, "mode": "nullable", "default": None},
+ #         {"name": "last_updated", "type": "timestamp", "isKey": 0, "mode": "required", "default": "current_timestamp"},
+ #     ],
+ #     table_options={
+ #         "partition_field": None,
+ #         "cluster_fields": ["stats_date"],
+ #     },
+ # )
+
+ # # use mysql to run a test sql -> into sql_results / rows_to_insert (has exactly the same layout)
+ # # need to pass all fields, including required ones such as last_updated! -> add to dict
+ # rows_to_insert = [
+ #     {"employee_id": 1579, "annual_ctc": 182},
+ #     {"employee_id": 1589, "annual_ctc": 183},
+ #     {"employee_id": 1599, "annual_ctc": 1840},
+ #     {"employee_id": 160, "annual_ctc": 18400},
+ #     {"employee_id": 161, "annual_ctc": 18400},
+ #     {"employee_id": 1000, "annual_ctc": 50000},
+ # ]
+
+ # # Attention: required fields have to be passed via the load job!
+ # # add the additional field to all items in the results dict, e.g. the last_updated date
+ # for i in range(0, len(rows_to_insert)):
+ #     rows_to_insert[i].update({"last_updated": datetime_utc})
+
+ # # drop the existing table first -> this ensures it is empty
+ # bq_load.runsql("drop table if exists {}".format(bq_load.table_id))
+ # print("table dropped")
+
+ # # run the upload from the mysql dict -> load job (table is created via the schema if it doesn't exist)
+ # bq_load.load_job_from_json(rows_to_insert, convert_dict_json=True)
+
+ # ===============================================================================
+ # 8) Load job with autodetected schema into a new table (from a mysql results dict)
+ # ===============================================================================
+
+ # bq_load = BqTools(
+ #     bq_client=client,
+ #     table_id="testdb.testproject.testtable",
+ # )
+
+ # # use mysql to run a test sql -> into sql_results / rows_to_insert (has exactly the same layout)
+ # rows_to_insert = [
+ #     {"employee_id": 1579, "annual_ctc": 182},
+ #     {"employee_id": 1589, "annual_ctc": 183},
+ # ]
+
+ # # drop the existing table first -> this ensures it is empty
+ # bq_load.runsql("drop table if exists {}".format(bq_load.table_id))
+ # print("table dropped")
+
+ # # run the upload from the mysql dict -> load job (schema is autodetected, table created if it doesn't exist)
+ # bq_load.load_job_from_json(rows_to_insert, convert_dict_json=True, autodetect_schema=True)
+
+ ```
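+
+ For convenience, here is the upsert pattern from section 6 collected into one short, uncommented sketch. It assumes the `BqTools` behavior described in the notes above (key columns marked with `isKey` drive the merge); project and table names are placeholders:
+
+ ```python
+ from toolsbq import bq_get_client, BqTools
+
+ client = bq_get_client()  # ADC; see "Authentication options" for key-file variants
+
+ bq_upsert = BqTools(
+     bq_client=client,
+     table_id="testdb.testproject.testtable",
+     fields_schema=[
+         {"name": "employee_id", "type": "int64", "isKey": 1, "mode": "nullable", "default": None},
+         {"name": "annual_ctc", "type": "int64", "isKey": 0, "mode": "nullable", "default": None},
+         {"name": "last_updated", "type": "timestamp", "isKey": 0, "mode": "required", "default": "current_timestamp"},
+     ],
+ )
+
+ rows = [
+     {"employee_id": 157, "annual_ctc": 182},
+     {"employee_id": 158, "annual_ctc": 183},
+ ]
+
+ # inserts new keys, updates existing ones via the generated merge query
+ bq_upsert.run_upsert(rows)
+ print("rows in table:", bq_upsert.get_row_count("testdb.testproject.testtable"))
+ ```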
+
+ ## Development
+
+ Build locally:
+
+ ```bash
+ python -m pip install --upgrade build twine
+ python -m build
+ twine check dist/*
+ ```
+
+ Publish (manual):
+
+ ```bash
+ twine upload dist/*
+ ```
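+
+ To rehearse a release against the test index first, `twine` ships with a built-in `testpypi` repository alias:
+
+ ```bash
+ twine upload --repository testpypi dist/*
+ ```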