pyarrow-bigquery 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ from .read import reader, read_table, reader_query, read_query # noqa
2
+ from .write import writer, write_table # noqa
@@ -0,0 +1,232 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import time
6
+ import tempfile
7
+ import multiprocessing
8
+ import threading
9
+ import shutil
10
+
11
+ from google.cloud import bigquery_storage
12
+ from google.cloud import bigquery
13
+ from google.cloud.exceptions import NotFound
14
+
15
+ import pyarrow as pa
16
+ import pyarrow.feather as fa
17
+
18
+ from . import some_itertools
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def _bq_table_exists(project: str, location: str):
    """Check that the BigQuery table ``location`` exists.

    A ``bigquery.Client`` bound to ``project`` is created and asked for the
    table. Nothing is returned; the function only succeeds or raises.

    Args:
        project: Project used to construct the BigQuery client.
        location: Table identifier passed to ``client.get_table``.

    Raises:
        NotFound: Re-raised unchanged when the table lookup fails.
    """
    client = bigquery.Client(project=project)

    try:
        client.get_table(location)
    except NotFound:
        # The message must be an f-string, otherwise the literal text
        # "{location}" is logged instead of the table name.
        logger.debug(f"Table {location} is not found")
        raise

    logger.debug(f"Table {location} already exists")
33
+
34
+
35
def _bq_read_create_strems(
    read_client: bigquery_storage.BigQueryReadClient,
    parent: str,
    location: str,
    selected_fields: list | None,
    row_restrictions: str | None,
    max_stream_count: int,
) -> tuple[list[str], pa.Schema]:
    """Open an Arrow read session on a table and return its streams and schema.

    Args:
        read_client: Storage read client used to create the session.
        parent: Project id; sent as ``projects/{parent}``.
        location: Dotted ``project.dataset.table`` identifier (must split into
            exactly three parts).
        selected_fields: Forwarded as the ``selected_fields`` read option.
        row_restrictions: Forwarded as the ``row_restriction`` read option.
        max_stream_count: Upper bound on streams requested from the service.

    Returns:
        ``(streams, schema)`` where ``streams`` is ``read_session.streams`` and
        ``schema`` is the Arrow schema decoded from the session's serialized
        schema.
    """
    table_project, table_dataset, table_name = location.split(".")
    table_path = f"projects/{table_project}/datasets/{table_dataset}/tables/{table_name}"

    read_options = {
        "selected_fields": selected_fields,
        "row_restriction": row_restrictions,
    }
    requested_session = bigquery_storage.ReadSession(
        table=table_path,
        data_format=bigquery_storage.DataFormat.ARROW,
        read_options=read_options,
    )

    session = read_client.create_read_session(
        parent=f"projects/{parent}",
        read_session=requested_session,
        max_stream_count=max_stream_count,
    )

    # The session carries the schema as serialized Arrow IPC bytes.
    serialized_schema = pa.py_buffer(session.arrow_schema.serialized_schema)
    arrow_schema = pa.ipc.read_schema(serialized_schema)

    return session.streams, arrow_schema
64
+
65
+
66
def _stream_worker(read_client, read_streams, table_schema, batch_size, queue_results, temp_dir):
    """Read the given streams and hand the rows over as Feather files.

    Record batches from every stream are accumulated; each time at least
    ``batch_size`` rows are buffered they are written to Feather files of
    exactly ``batch_size`` rows inside ``temp_dir`` and the file paths are put
    on ``queue_results``. Leftover rows are written as one final file. A
    ``None`` sentinel is always put last to signal that this worker is done.

    Args:
        read_client: Client whose ``read_rows(stream.name)`` yields messages.
        read_streams: Streams (objects with a ``name``) assigned to this worker.
        table_schema: Arrow schema used to decode each serialized record batch.
        batch_size: Number of rows per emitted table.
        queue_results: Queue receiving file paths and the final ``None``.
        temp_dir: Directory in which the temporary Feather files are created.
    """

    def _publish(table):
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name and leaves a window for another writer to claim it.
        fd, path = tempfile.mkstemp(dir=temp_dir)
        os.close(fd)
        fa.write_feather(table, path)
        queue_results.put(path)

    batches = []
    buffered_rows = 0  # running total, avoids re-summing all batches per message

    for stream in read_streams:
        t = time.time()

        for message in read_client.read_rows(stream.name):
            record_batch = pa.ipc.read_record_batch(
                message.arrow_record_batch.serialized_record_batch, table_schema
            )
            batches.append(record_batch)
            buffered_rows += record_batch.num_rows

            # The batch_size > 0 guard keeps the drain loop below finite.
            if 0 < batch_size <= buffered_rows:
                table = pa.Table.from_batches(batches)

                # Drain every full batch; a single message may hold several
                # multiples of batch_size.
                while table.num_rows >= batch_size:
                    _publish(table[:batch_size])
                    table = table[batch_size:]

                batches = table.to_batches()
                buffered_rows = table.num_rows

        logger.debug(f"Stream {stream.name} done in {time.time()-t:.2f} seconds")

    if batches:
        _publish(pa.Table.from_batches(batches))

    queue_results.put(None)
95
+
96
+
97
def reader(
    source: str,
    *,
    project: str | None = None,
    columns: list | None = None,
    row_restrictions: str | None = None,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Yield a BigQuery table piece by piece as PyArrow tables.

    The table's read streams are split between workers; each worker writes
    Feather files into a temporary directory and reports their paths through a
    queue. This generator loads each file, deletes it and yields the table.

    Args:
        source: Dotted table identifier; its first component is the default
            project.
        project: Project used for the existence check and the read session.
            Falls back to the first component of ``source`` when falsy.
        columns: Passed on as the selected fields of the read session.
        row_restrictions: Passed on as the row restriction of the read session.
        worker_count: Requested number of workers; three streams per worker
            are requested from the service.
        worker_type: Class used to start each worker (``target``/``args``).
        batch_size: Row count handed to every worker.

    Yields:
        ``pa.Table`` objects read back from the workers' Feather files.

    Raises:
        AssertionError: When the read session returns no streams.
    """
    started_at = time.time()

    if not project:
        project = source.split(".")[0]

    queue_results = multiprocessing.Queue()
    read_client = bigquery_storage.BigQueryReadClient()

    _bq_table_exists(project, source)

    streams, streams_schema = _bq_read_create_strems(
        read_client=read_client,
        parent=project,
        location=source,
        selected_fields=columns,
        row_restrictions=row_restrictions,
        max_stream_count=worker_count * 3,
    )

    assert streams, "No streams to read, Table might be empty"

    logger.debug(f"Number of workers: {worker_count}, number of streams: {len(streams)}")

    # Never start more workers than there are streams to hand out.
    actual_worker_count = min(worker_count, len(streams))

    logger.debug(f"Actual worker count: {actual_worker_count}")

    temp_dir = tempfile.mkdtemp()
    finished_workers = 0

    try:
        for stream_group in some_itertools.to_split(streams, actual_worker_count):
            worker_args = (
                read_client,
                stream_group,
                streams_schema,
                batch_size,
                queue_results,
                temp_dir,
            )
            worker_type(target=_stream_worker, args=worker_args).start()

        while True:
            item = queue_results.get()

            if item:
                loaded = fa.read_table(item)
                os.remove(item)
                yield loaded
                continue

            # A falsy item is a worker's end-of-work sentinel.
            finished_workers += 1
            if finished_workers == actual_worker_count:
                break
    finally:
        cleanup_started = time.time()
        shutil.rmtree(temp_dir, ignore_errors=True)
        logger.debug(f"Time to cleanup temp directory: {time.time()-cleanup_started:.2f}")
        logger.debug(f"Time taken to read: {time.time()-started_at:.2f}")
170
+
171
+
172
def reader_query(
    project: str,
    query: str,
    *,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Run ``query`` and stream its destination table through ``reader``.

    Args:
        project: Project used for the BigQuery client and for reading.
        query: SQL text submitted with ``client.query``.
        worker_count: Forwarded to ``reader``.
        worker_type: Forwarded to ``reader``.
        batch_size: Forwarded to ``reader``.

    Returns:
        The generator returned by ``reader`` for the job's destination table.
    """
    job = bigquery.Client(project=project).query(query)
    # Block until the job is finished before reading its destination table.
    job.result()

    destination = job.destination
    source = f"{destination.project}.{destination.dataset_id}.{destination.table_id}"

    return reader(
        source=source,
        project=project,
        worker_count=worker_count,
        worker_type=worker_type,
        batch_size=batch_size,
    )
192
+
193
+
194
def read_table(
    source: str,
    *,
    project: str | None = None,
    columns: list | None = None,
    row_restrictions: str | None = None,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Read a whole BigQuery table into a single PyArrow table.

    All arguments are forwarded unchanged to ``reader``; the tables it yields
    are concatenated with ``pa.concat_tables``.

    Returns:
        The concatenation of every table produced by ``reader``.
    """
    parts = reader(
        source=source,
        project=project,
        columns=columns,
        row_restrictions=row_restrictions,
        worker_count=worker_count,
        worker_type=worker_type,
        batch_size=batch_size,
    )
    return pa.concat_tables(parts)
215
+
216
def read_query(
    project: str,
    query: str,
    *,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Run ``query`` and return its full result as a single PyArrow table.

    All arguments are forwarded unchanged to ``reader_query``; the tables it
    yields are concatenated with ``pa.concat_tables``.

    Returns:
        The concatenation of every table produced by ``reader_query``.
    """
    parts = reader_query(
        project=project,
        query=query,
        worker_count=worker_count,
        worker_type=worker_type,
        batch_size=batch_size,
    )
    return pa.concat_tables(parts)
@@ -0,0 +1,12 @@
1
def to_chunks(table, chunk_size):
    """Yield consecutive slices of ``table`` holding at most ``chunk_size`` items.

    ``table`` only needs to support ``len()`` and slicing. The last slice may
    be shorter than ``chunk_size``; an empty ``table`` yields nothing.
    """
    offsets = range(0, len(table), chunk_size)
    yield from (table[start : start + chunk_size] for start in offsets)
4
+
5
+
6
def to_split(table, split_number):
    """Split ``table`` into ``split_number`` contiguous, near-equal slices.

    The first ``len(table) % split_number`` slices receive one extra item.
    Slices may be empty when ``split_number`` exceeds ``len(table)``.

    Returns:
        A generator over the slices, in order.
    """
    base_size, remainder = divmod(len(table), split_number)

    def _start(index):
        # Each earlier slice holds base_size items, plus one for the first
        # ``remainder`` slices.
        return index * base_size + min(index, remainder)

    return (table[_start(part) : _start(part + 1)] for part in range(split_number))
9
+
10
+
11
def first(iterable, condition):
    """Return the first item for which ``condition(item)`` is truthy.

    Returns ``None`` when no item matches or ``iterable`` is empty.
    """
    for candidate in iterable:
        if condition(candidate):
            return candidate
    return None
@@ -0,0 +1,238 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ import datetime
6
+ import multiprocessing
7
+ import threading
8
+ import shutil
9
+ import tempfile
10
+ import logging
11
+ import collections
12
+
13
+
14
+ from google.cloud import bigquery
15
+ from google.cloud import bigquery_storage_v1
16
+ from google.cloud.bigquery_storage_v1.writer import AppendRowsStream
17
+ from google.protobuf import descriptor_pb2
18
+ from google.api_core.exceptions import Unknown, NotFound
19
+ from google.api_core import retry
20
+
21
+ import pyarrow as pa
22
+ import pyarrow.feather as fa
23
+
24
+ from . import pa_to_bq
25
+ from . import pa_to_pb
26
+ from . import upload
27
+ from .. import some_itertools
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ # Pairs the AppendRowsStream used to send rows with the WriteStream it appends to.
+ Stream = collections.namedtuple("Stream", ["append_rows_stream", "write_stream"])
33
+
34
+
35
def _bq_create_table(*, project, location, schema, expire, overwrite):
    """Create the BigQuery table ``location`` from a PyArrow schema.

    Args:
        project: Project used to construct the BigQuery client.
        location: Identifier of the table to create.
        schema: PyArrow schema, converted with ``pa_to_bq.generate``.
        expire: Seconds from now after which the table expires; a falsy value
            leaves the table without an expiration.
        overwrite: When true, an existing table at ``location`` is deleted
            first (a missing table is not an error).
    """
    client = bigquery.Client(project=project)

    if overwrite:
        client.delete_table(location, not_found_ok=True)

    bq_schema = pa_to_bq.generate(schema)

    table = bigquery.Table(location, schema=bq_schema)

    client.create_table(table)

    if expire:
        # Use an aware UTC timestamp: the BigQuery client treats naive
        # datetimes as UTC, so a naive local time would shift the expiration
        # by the machine's UTC offset.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        table.expires = now_utc + datetime.timedelta(seconds=expire)
        client.update_table(table, ["expires"])

    logger.debug(f"Created BigQuery table '{location}'")
52
+
53
+
54
def _bq_write_create_stream(write_client: bigquery_storage_v1.BigQueryWriteClient, parent, protobuf_definition):
    """Create a PENDING write stream and an append stream bound to it.

    Args:
        write_client: Storage write client used for both streams.
        parent: Resource path under which the write stream is created.
        protobuf_definition: Message descriptor describing the rows; copied
            into the request template as the writer schema.

    Returns:
        A ``Stream`` holding the created write stream and its
        ``AppendRowsStream``.
    """
    storage_types = bigquery_storage_v1.types

    # Creation is retried on Unknown and NotFound errors.
    creation_retry = retry.Retry(predicate=retry.if_exception_type(Unknown, NotFound))
    write_stream = write_client.create_write_stream(
        parent=parent,
        write_stream=storage_types.WriteStream(type=storage_types.WriteStream.Type.PENDING),
        retry=creation_retry,
    )

    descriptor = descriptor_pb2.DescriptorProto()
    protobuf_definition.CopyToProto(descriptor)

    writer_schema = storage_types.ProtoSchema()
    writer_schema.proto_descriptor = descriptor

    template_rows = storage_types.AppendRowsRequest.ProtoData()
    template_rows.writer_schema = writer_schema

    # Template reused for every append: it names the stream and the schema.
    template = storage_types.AppendRowsRequest()
    template.write_stream = write_stream.name
    template.proto_rows = template_rows

    return Stream(
        append_rows_stream=AppendRowsStream(write_client, template),
        write_stream=write_stream,
    )
79
+
80
+
81
def _bq_storage_close_stream(write_client, stream, parent):
    """Close, finalize and commit a single write stream.

    Args:
        write_client: Storage write client used to finalize and commit.
        stream: ``Stream`` whose append stream is closed and whose write
            stream is finalized and committed.
        parent: Resource path set as the parent of the commit request.
    """
    stream_name = stream.write_stream.name

    stream.append_rows_stream.close()
    write_client.finalize_write_stream(name=stream_name)

    commit_request = bigquery_storage_v1.types.BatchCommitWriteStreamsRequest()
    commit_request.parent = parent
    commit_request.write_streams = [stream_name]
    write_client.batch_commit_write_streams(commit_request)

    logger.debug(f"Stream '{stream.write_stream.name}' closed")
92
+
93
+
94
def _stream_worker(
    write_client: bigquery_storage_v1.BigQueryWriteClient,
    parent: str,
    schema_protobuf,
    queue_results,
):
    """Upload queued Feather files through one dedicated write stream.

    The worker creates its own write stream, then consumes file paths from
    ``queue_results`` until it receives ``None``. Each file is loaded,
    uploaded at the running row offset and deleted. The stream is closed and
    committed once the sentinel arrives.

    Args:
        write_client: Storage write client shared with the caller.
        parent: Resource path of the destination table.
        schema_protobuf: Message descriptor used to serialize the rows.
        queue_results: Queue delivering file paths followed by ``None``.
    """
    stream = _bq_write_create_stream(write_client, parent, schema_protobuf)

    rows_uploaded = 0

    # iter() with a sentinel stops as soon as the queue hands out None.
    for path in iter(queue_results.get, None):
        table = fa.read_table(path)
        upload.upload_data(stream, table, schema_protobuf, rows_uploaded)
        os.remove(path)
        rows_uploaded += table.num_rows

    _bq_storage_close_stream(write_client, stream, parent)
118
+
119
+
120
class writer:
    """Context manager that writes PyArrow data to a BigQuery table.

    On entry the destination table is optionally created and ``worker_count``
    workers are started, each consuming Feather file paths from a shared
    queue. ``write_table`` and ``write_batch`` spill data into a temporary
    directory and enqueue the paths. On exit one ``None`` sentinel per worker
    is queued, the workers are joined and the temporary directory is removed.
    """

    def __init__(
        self,
        schema: pa.Schema,
        where: str,
        *,
        project: str | None = None,
        table_create: bool = True,
        table_expire: int | None = None,
        table_overwrite: bool = False,
        worker_count: int = multiprocessing.cpu_count(),
        worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
        batch_size: int = 100,
    ):
        """Store the configuration; no BigQuery call is made here.

        Args:
            schema: PyArrow schema of the data to be written.
            where: Dotted ``project.dataset.table`` destination (must split
                into exactly three parts).
            project: Project used when creating the table; defaults to the
                project component of ``where``.
            table_create: Whether ``__enter__`` creates the table.
            table_expire: Expiration in seconds passed to table creation.
            table_overwrite: Whether an existing table is deleted first.
            worker_count: Number of workers started in ``__enter__``.
            worker_type: Class used to start each worker.
            batch_size: Maximum rows per chunk produced by ``write_table``.
        """
        self.project = project
        self.where = where
        self.schema = schema

        self.table_create = table_create
        self.table_expire = table_expire
        self.table_overwrite = table_overwrite

        self.worker_count = worker_count
        self.worker_type = worker_type

        self.batch_size = batch_size

        project_id, dataset_id, table_id = where.split(".")

        self.parent = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"

        if not self.project:
            self.project = project_id

    def __enter__(self):
        """Create the table if requested and start the upload workers."""
        self.t0 = time.time()
        self.temp_dir = tempfile.mkdtemp()
        self.schema_protobuf = pa_to_pb.generate(self.schema)

        if self.table_create:
            _bq_create_table(
                project=self.project,
                location=self.where,
                schema=self.schema,
                expire=self.table_expire,
                overwrite=self.table_overwrite,
            )

        self.queue_results = multiprocessing.Queue()
        self.workers = []
        write_client = bigquery_storage_v1.BigQueryWriteClient()

        for _ in range(self.worker_count):
            worker = self.worker_type(
                target=_stream_worker,
                args=(
                    write_client,
                    self.parent,
                    self.schema_protobuf,
                    self.queue_results,
                ),
            )
            worker.start()
            self.workers.append(worker)

        return self

    def _enqueue(self, table):
        """Write ``table`` to a fresh Feather file and queue its path."""
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name and leaves a window for another writer to claim it.
        fd, path = tempfile.mkstemp(dir=self.temp_dir)
        os.close(fd)
        fa.write_feather(table, path)
        self.queue_results.put(path)

    def write_table(self, table):
        """Queue ``table`` for upload in chunks of at most ``batch_size`` rows."""
        for table_chunk in some_itertools.to_chunks(table, self.batch_size):
            self._enqueue(table_chunk)

    def write_batch(self, batch):
        """Queue a single record batch for upload as a one-batch table."""
        self._enqueue(pa.Table.from_batches([batch]))

    def __exit__(self, *_, **__):
        """Signal the workers to stop, wait for them and clean up."""
        # One sentinel per worker: each worker stops at the first None it sees.
        for _ in range(self.worker_count):
            self.queue_results.put(None)

        for w in self.workers:
            w.join()

        shutil.rmtree(self.temp_dir, ignore_errors=True)

        logger.debug(f"Time taken: {time.time() - self.t0}")
210
+
211
+
212
def write_table(
    table: pa.Table,
    where: str,
    *,
    project: str | None = None,
    table_create: bool = True,
    table_expire: int | None = None,
    table_overwrite: bool = False,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Write a complete PyArrow table to BigQuery using a ``writer``.

    The table is split into ``worker_count`` parts and each part is handed to
    ``writer.write_table``. All keyword arguments are forwarded to ``writer``.

    Args:
        table: Table to upload; its schema defines the destination schema.
        where: Destination table identifier.

    Raises:
        AssertionError: When ``table`` has no rows.
    """
    assert table.num_rows > 0, "Table is empty"

    writer_options = {
        "project": project,
        "table_create": table_create,
        "table_expire": table_expire,
        "table_overwrite": table_overwrite,
        "worker_count": worker_count,
        "worker_type": worker_type,
        "batch_size": batch_size,
    }

    with writer(where=where, schema=table.schema, **writer_options) as w:
        for part in some_itertools.to_split(table, w.worker_count):
            w.write_table(part)
@@ -0,0 +1,48 @@
1
+ import pyarrow as pa
2
+
3
+ from google.cloud import bigquery
4
+
5
+ from .type_mapping import TYPES_MAPPING
6
+ from ..some_itertools import first
7
+
8
+
9
def emit(schema):
    """Translate PyArrow fields into a list of ``bigquery.SchemaField``.

    List fields become ``REPEATED`` fields of their value type, struct fields
    become ``RECORD`` fields whose sub-fields are translated recursively, and
    all other types are looked up in ``TYPES_MAPPING``.

    Args:
        schema: Iterable of PyArrow fields (a schema or a struct type).

    Returns:
        The translated schema fields, in input order.

    Raises:
        TypeError: For a list of lists or a type without a mapping.
    """
    fields = []

    for field in schema:
        field_type = field.type

        if pa.types.is_list(field_type):
            field_mode = "REPEATED"
            field_type = field_type.value_type

            if pa.types.is_list(field_type):
                raise TypeError("Nested lists are not supported")
        else:
            field_mode = "NULLABLE" if field.nullable else "REQUIRED"

        if pa.types.is_struct(field_type):
            record = bigquery.SchemaField(
                name=field.name,
                field_type="RECORD",
                mode=field_mode,
                fields=emit(field_type),
            )
            fields.append(record)
            continue

        # The mapping is keyed by type predicates; take the first that matches.
        type_check = first(TYPES_MAPPING, lambda check: check(field_type))
        if not type_check:
            raise TypeError(f"Unsupported type {field_type}")

        scalar = bigquery.SchemaField(
            name=field.name,
            field_type=TYPES_MAPPING[type_check].bq,
            mode=field_mode,
        )
        fields.append(scalar)

    return fields
45
+
46
+
47
def generate(schema):
    """Return the BigQuery schema fields for a PyArrow ``schema``.

    Entry point of this module; delegates to ``emit``.
    """
    bq_fields = emit(schema)
    return bq_fields
@@ -0,0 +1,151 @@
1
+ import logging
2
+ import time
3
+ import random
4
+ import string
5
+
6
+ from google.protobuf.descriptor_pb2 import DescriptorProto, FieldDescriptorProto
7
+ from google.protobuf import descriptor_pb2, descriptor_pool, message_factory
8
+
9
+ import pyarrow as pa
10
+ from .type_mapping import TYPES_MAPPING
11
+ from ..some_itertools import first
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ # Type predicates for Arrow types that ammend_schema() replaces with strings.
+ FORCE_STRING = [
17
+ pa.types.is_date,
18
+ pa.types.is_decimal,
19
+ pa.types.is_time,
20
+ pa.types.is_timestamp,
21
+ ]
22
+
23
+ # 10 MiB (10 * 1024 * 1024 bytes); serialize() starts a new batch of rows
+ # once the accumulated message size exceeds this value.
+ GRPC_UPLOAD_LIMIT = 10485760
24
+
25
+
26
def random_string(length):
    """Return ``length`` random characters drawn from ASCII letters and digits.

    Uses the non-cryptographic ``random`` module.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)
29
+
30
+
31
def emit(schema, message_name):
    """Build a protobuf ``DescriptorProto`` describing PyArrow fields.

    Field numbers follow the field order, starting at 1. List fields become
    repeated fields of their value type; struct fields become nested message
    types named ``{message_name}_{field.name}``; other types are looked up in
    ``TYPES_MAPPING``.

    Args:
        schema: Iterable of PyArrow fields (a schema or a struct type).
        message_name: Name given to the generated message.

    Returns:
        The populated message descriptor.

    Raises:
        TypeError: For a list of lists or a type without a mapping.
    """
    descriptor = DescriptorProto()
    descriptor.name = message_name

    for number, field in enumerate(schema, 1):
        value_type = field.type
        repeated = pa.types.is_list(value_type)

        if repeated:
            value_type = value_type.value_type

            if pa.types.is_list(value_type):
                raise TypeError("Nested lists are not supported")

        # Repeated wins over nullability; non-nullable scalars are required.
        if repeated:
            label = FieldDescriptorProto.LABEL_REPEATED
        elif field.nullable:
            label = FieldDescriptorProto.LABEL_OPTIONAL
        else:
            label = FieldDescriptorProto.LABEL_REQUIRED

        if pa.types.is_struct(value_type):
            nested_name = f"{message_name}_{field.name}"
            nested = emit(schema=value_type, message_name=nested_name)
            descriptor.nested_type.extend([nested])

            descriptor.field.add(
                name=field.name,
                number=number,
                label=label,
                type=FieldDescriptorProto.TYPE_MESSAGE,
                type_name=nested_name,
            )
            continue

        # The mapping is keyed by type predicates; take the first that matches.
        type_check = first(TYPES_MAPPING, lambda check: check(value_type))
        if not type_check:
            raise TypeError(f"Unsupported type {value_type}")

        descriptor.field.add(
            name=field.name,
            number=number,
            label=label,
            type=TYPES_MAPPING[type_check].pb,
        )

    return descriptor
87
+
88
+
89
def generate(schema):
    """Register a message type for ``schema`` and return its descriptor.

    The message is built with ``emit``, wrapped in a file descriptor and
    added to the default descriptor pool under randomly generated names.

    Returns:
        The message descriptor looked up from the pool by its generated name.
    """
    # NOTE (original author's unverified assumption): the default pool is
    # shared, so both the message name and the file name added to it must be
    # unique; random suffixes are used for both.
    message_name = f"Message_{random_string(10)}"
    file_name = f"{random_string(10)}.proto"

    file_proto = descriptor_pb2.FileDescriptorProto(
        name=file_name,
        message_type=[emit(schema, message_name=message_name)],
    )

    pool = descriptor_pool.Default()
    pool.AddSerializedFile(file_proto.SerializeToString())

    return pool.FindMessageTypeByName(message_name)
107
+
108
+
109
def ammend_schema(schema):
    """Return a copy of ``schema`` with ``FORCE_STRING`` types turned into strings.

    Struct and list fields are rebuilt recursively so that nested date,
    decimal, time and timestamp types are replaced as well. Fields of any
    other type are returned unchanged.
    """

    def _cast_field(field):
        field_type = field.type

        if pa.types.is_struct(field_type):
            children = [_cast_field(child) for child in field_type]
            return pa.field(field.name, pa.struct(children))

        if pa.types.is_list(field_type):
            converted_value = _cast_field(field_type.value_field)
            return pa.field(field.name, pa.list_(converted_value.type))

        if any(check(field_type) for check in FORCE_STRING):
            return pa.field(field.name, pa.string())

        return field

    return pa.schema([_cast_field(field) for field in schema])
126
+
127
+
128
def serialize(pa_table, protobuf_definition):
    """Yield the table's rows as lists of serialized protobuf messages.

    The table is first cast to the schema returned by ``ammend_schema``. Rows
    are then turned into messages and grouped so that the summed
    ``ByteSize()`` of each yielded list stays within ``GRPC_UPLOAD_LIMIT``
    whenever the individual rows allow it.

    Args:
        pa_table: PyArrow table to serialize.
        protobuf_definition: Message descriptor used to build the row class.

    Yields:
        Non-empty lists of serialized messages, in row order.

    Raises:
        AssertionError: When a row exceeds the limit while no rows are
            buffered.
    """
    Message = message_factory.GetMessageClass(protobuf_definition)

    pa_table = pa_table.cast(ammend_schema(pa_table.schema))

    rows = []
    size = 0  # summed ByteSize() of the messages currently held in ``rows``

    for element in pa_table.to_pylist():
        t0 = time.time()
        message = Message(**element)
        message_size = message.ByteSize()

        logger.debug(f"Time taken to serialize: {(time.time() - t0):.4f}")

        if size + message_size > GRPC_UPLOAD_LIMIT:
            assert rows, "Row is too large to fit in a single request"
            yield rows
            rows = []
            # Start counting afresh; without the reset every later row would
            # be flushed in a batch of its own.
            size = 0

        rows.append(message.SerializeToString())
        size += message_size

    if rows:
        yield rows
@@ -0,0 +1,21 @@
1
+ import collections
2
+
3
+ import pyarrow as pa
4
+ from google.protobuf.descriptor_pb2 import FieldDescriptorProto
5
+ from google.cloud import bigquery
6
+
7
+
8
+ # Target types for one Arrow type: ``bq`` is the BigQuery SQL type name and
+ # ``pb`` is the protobuf field type.
+ TypeMapping = collections.namedtuple("TypeMapping", ["bq", "pb"])
9
+
10
+
11
+ # Keyed by pyarrow type predicates; callers pick the first predicate that
+ # returns true for a given Arrow type.
+ TYPES_MAPPING = {
12
+ pa.types.is_binary: TypeMapping(bigquery.SqlTypeNames.BYTES, FieldDescriptorProto.TYPE_BYTES),
13
+ pa.types.is_boolean: TypeMapping(bigquery.SqlTypeNames.BOOLEAN, FieldDescriptorProto.TYPE_BOOL),
14
+ pa.types.is_date: TypeMapping(bigquery.SqlTypeNames.DATE, FieldDescriptorProto.TYPE_STRING),
15
+ pa.types.is_decimal: TypeMapping(bigquery.SqlTypeNames.DECIMAL, FieldDescriptorProto.TYPE_STRING),
16
+ pa.types.is_floating: TypeMapping(bigquery.SqlTypeNames.FLOAT64, FieldDescriptorProto.TYPE_DOUBLE),
17
+ pa.types.is_integer: TypeMapping(bigquery.SqlTypeNames.INT64, FieldDescriptorProto.TYPE_INT64),
18
+ pa.types.is_string: TypeMapping(bigquery.SqlTypeNames.STRING, FieldDescriptorProto.TYPE_STRING),
19
+ pa.types.is_time: TypeMapping(bigquery.SqlTypeNames.TIME, FieldDescriptorProto.TYPE_STRING),
20
+ pa.types.is_timestamp: TypeMapping(bigquery.SqlTypeNames.TIMESTAMP, FieldDescriptorProto.TYPE_STRING),
21
+ }
@@ -0,0 +1,30 @@
1
+ import tenacity
2
+
3
+ from google.cloud.bigquery_storage_v1 import types
4
+ from google.api_core.exceptions import Unknown
5
+
6
+ from . import pa_to_pb
7
+
8
+
9
@tenacity.retry(
    retry=tenacity.retry_if_exception_type(Unknown),
    stop=tenacity.stop_after_attempt(5),
)
def _send(stream, serialized_rows, offset):
    """Append one batch of serialized rows and wait for the result.

    The call is attempted up to five times when it raises ``Unknown``.

    Args:
        stream: Object whose ``append_rows_stream`` sends the request.
        serialized_rows: Serialized protobuf messages to append.
        offset: Offset set on the append request.
    """
    rows = types.ProtoRows()
    rows.serialized_rows.extend(serialized_rows)

    payload = types.AppendRowsRequest.ProtoData()
    payload.rows = rows

    request = types.AppendRowsRequest()
    request.offset = offset
    request.proto_rows = payload

    # result() blocks until the append has been answered.
    pending = stream.append_rows_stream.send(request)
    pending.result()
24
+
25
+
26
def upload_data(stream, pa_table, protobuf_definition, offset):
    """Serialize ``pa_table`` and append it to ``stream`` batch by batch.

    Args:
        stream: Destination passed through to ``_send``.
        pa_table: PyArrow table to upload.
        protobuf_definition: Message descriptor used for serialization.
        offset: Offset of the table's first row within the stream.
    """
    next_offset = offset
    for chunk in pa_to_pb.serialize(pa_table, protobuf_definition):
        _send(stream, chunk, next_offset)
        # Each batch starts where the previous one ended.
        next_offset += len(chunk)
@@ -0,0 +1,262 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyarrow-bigquery
3
+ Version: 0.1.0
4
+ Summary: A simple library to **write to** and **download from** BigQuery tables as PyArrow tables.
5
+ Author-email: Sebastian Pawluś <sebastian.pawlus@gmail.com>
6
+ License: MIT
7
+ Keywords: pyarrow,bigquery
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: google-cloud-bigquery <5,>=3
10
+ Requires-Dist: google-cloud-bigquery-storage <3,>=2
11
+ Requires-Dist: pyarrow <17,>=16
12
+ Requires-Dist: tenacity
13
+
14
+ # pyarrow-bigquery
15
+
16
+ A simple library to **write to** and **download from** BigQuery tables as PyArrow tables.
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install pyarrow-bigquery
22
+ ```
23
+
24
+ ## Quick Start
25
+
26
+ This guide will help you quickly get started with `pyarrow-bigquery`, a library that allows you to **read** from and **write** to Google BigQuery using PyArrow.
27
+
28
+ ### Reading from BigQuery
29
+
30
+ `pyarrow-bigquery` exposes two methods to read BigQuery tables as PyArrow tables. Depending on your use case or the size of the table, you might want to use one method over the other.
31
+
32
+ #### Read the Whole Table
33
+
34
+ When the table is small enough to fit in memory, you can read it directly using `bq.read_table`.
35
+
36
+ ```python
37
+ import pyarrow.bigquery as bq
38
+
39
+ table = bq.read_table("gcp_project.dataset.small_table")
40
+
41
+ print(table.num_rows)
42
+ ```
43
+
44
+ #### Read with Batches
45
+
46
+ If the target table is larger than memory or you have other reasons not to fetch the whole table at once, you can use the `bq.reader` iterator method along with the `batch_size` parameter to limit how much data is fetched per iteration.
47
+
48
+ ```python
49
+ import pyarrow.bigquery as bq
50
+
51
+ for table in bq.reader("gcp_project.dataset.big_table", batch_size=100):
52
+ print(table.num_rows)
53
+ ```
54
+
55
+ ### Writing to BigQuery
56
+
57
+ Similarly, the package exposes two methods to write to BigQuery. Depending on your use case or the size of the table, you might want to use one method over the other.
58
+
59
+ #### Write the Whole Table
60
+
61
+ When you want to write a complete table at once, you can use the `bq.write_table` method.
62
+
63
+ ```python
64
+ import pyarrow as pa
65
+ import pyarrow.bigquery as bq
66
+
67
+ table = pa.Table.from_arrays([[1, 2, 3, 4]], names=['integers'])
68
+
69
+ bq.write_table(table, 'gcp_project.dataset.table')
70
+ ```
71
+
72
+ #### Write in Batches (Smaller Chunks)
73
+
74
+ If you need to write data in smaller chunks, you can use the `bq.writer` method with the `schema` parameter to define the table structure.
75
+
76
+ ```python
77
+ import pyarrow as pa
78
+ import pyarrow.bigquery as bq
79
+
80
+ schema = pa.schema([
81
+ ("integers", pa.int64())
82
+ ])
83
+
84
+ with bq.writer("gcp_project.dataset.table", schema=schema) as w:
85
+ w.write_batch(record_batch)
86
+ w.write_table(table)
87
+ ```
88
+
89
+ ## API Reference
90
+
91
+ ### `pyarrow.bigquery.write_table`
92
+
93
+ Write a PyArrow Table to a BigQuery Table. No return value.
94
+
95
+ **Parameters:**
96
+
97
+ - `table`: `pa.Table`
98
+ PyArrow table.
99
+
100
+ - `where`: `str`
101
+ Destination location in BigQuery catalog.
102
+
103
+ - `project`: `str`, *default* `None`
104
+ BigQuery execution project, also the billing project. If not provided, it will be extracted from `where`.
105
+
106
+ - `table_create`: `bool`, *default* `True`
107
+ Specifies if the BigQuery table should be created.
108
+
109
+ - `table_expire`: `None | int`, *default* `None`
110
+ Number of seconds after which the created table will expire. Used only if `table_create` is `True`. Set to `None` to disable expiration.
111
+
112
+ - `table_overwrite`: `bool`, *default* `False`
113
+ If the table already exists, destroy it and create a new one.
114
+
115
+ - `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
116
+ Worker backend for writing data.
117
+
118
+ - `worker_count`: `int`, *default* `os.cpu_count()`
119
+ Number of threads or processes to use for writing data to BigQuery.
120
+
121
+ - `batch_size`: `int`, *default* `100`
122
+ Batch size used for writes. The table is automatically split into chunks of at most this many rows.
123
+
124
+ ```python
125
+ bq.write_table(table, 'gcp_project.dataset.table')
126
+ ```
127
+
128
+ ### `pyarrow.bigquery.writer`
129
+
130
+ Context manager version of the write method. Useful when the PyArrow table is larger than memory size or the table is available in chunks.
131
+
132
+ **Parameters:**
133
+
134
+ - `schema`: `pa.Schema`
135
+ PyArrow schema.
136
+
137
+ - `where`: `str`
138
+ Destination location in BigQuery catalog.
139
+
140
+ - `project`: `str`, *default* `None`
141
+ BigQuery execution project, also the billing project. If not provided, it will be extracted from `where`.
142
+
143
+ - `table_create`: `bool`, *default* `True`
144
+ Specifies if the BigQuery table should be created.
145
+
146
+ - `table_expire`: `None | int`, *default* `None`
147
+ Number of seconds after which the created table will expire. Used only if `table_create` is `True`. Set to `None` to disable expiration.
148
+
149
+ - `table_overwrite`: `bool`, *default* `False`
150
+ If the table already exists, destroy it and create a new one.
151
+
152
+ - `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
153
+ Worker backend for writing data.
154
+
155
+ - `worker_count`: `int`, *default* `os.cpu_count()`
156
+ Number of threads or processes to use for writing data to BigQuery.
157
+
158
+ - `batch_size`: `int`, *default* `100`
159
+ Batch size used for writes. Table will be automatically split to this value.
160
+
161
+ Depending on the use case, you might want to use one of the methods below to write your data to a BigQuery table, using either `pa.Table` or `pa.RecordBatch`.
162
+
163
+ #### `pyarrow.bigquery.writer.write_table`
164
+
165
+ Context manager method to write a table.
166
+
167
+ **Parameters:**
168
+
169
+ - `table`: `pa.Table`
170
+ PyArrow table.
171
+
172
+ ```python
173
+ import pyarrow as pa
174
+ import pyarrow.bigquery as bq
175
+
176
+ schema = pa.schema([("value", pa.list_(pa.int64()))])
177
+
178
+ with bq.writer("gcp_project.dataset.table", schema=schema) as w:
179
+ for a in range(1000):
180
+ w.write_table(pa.Table.from_pylist([{'value': [a] * 10}]))
181
+ ```
182
+
183
+ #### `pyarrow.bigquery.writer.write_batch`
184
+
185
+ Context manager method to write a record batch.
186
+
187
+ **Parameters:**
188
+
189
+ - `batch`: `pa.RecordBatch`
190
+ PyArrow record batch.
191
+
192
+ ```python
193
+ import pyarrow as pa
194
+ import pyarrow.bigquery as bq
195
+
196
+ schema = pa.schema([("value", pa.list_(pa.int64()))])
197
+
198
+ with bq.writer("gcp_project.dataset.table", schema=schema) as w:
199
+ for a in range(1000):
200
+ w.write_batch(pa.RecordBatch.from_pylist([{'value': [1] * 10}]))
201
+ ```
202
+
203
+ ### `pyarrow.bigquery.read_table`
204
+
205
+ **Parameters:**
206
+
207
+ - `source`: `str`
208
+ BigQuery table location.
209
+
210
+ - `project`: `str`, *default* `None`
211
+ BigQuery execution project, also the billing project. If not provided, it will be extracted from `source`.
212
+
213
+ - `columns`: `list`, *default* `None`
214
+ Columns to download. When not provided, all available columns will be downloaded.
215
+
216
+ - `row_restrictions`: `str`, *default* `None`
217
+ Row level filtering executed on the BigQuery side. More in [BigQuery documentation](https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1beta1).
218
+
219
+ - `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
220
+ Worker backend for fetching data.
221
+
222
+ - `worker_count`: `int`, *default* `os.cpu_count()`
223
+ Number of threads or processes to use for fetching data from BigQuery.
224
+
225
+ - `batch_size`: `int`, *default* `100`
226
+ Batch size used for fetching. Table will be automatically split to this value.
227
+
228
+ ### `pyarrow.bigquery.reader`
229
+
230
+ **Parameters:**
231
+
232
+ - `source`: `str`
233
+ BigQuery table location.
234
+
235
+ - `project`: `str`, *default* `None`
236
+ BigQuery execution project, also the billing project. If not provided, it will be extracted from `source`.
237
+
238
+ - `columns`: `list`, *default* `None`
239
+ Columns to download. When not provided, all available columns will be downloaded.
240
+
241
+ - `row_restrictions`: `str`, *default* `None`
242
+ Row level filtering executed on the BigQuery side. More in [BigQuery documentation](https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1beta1).
243
+
244
+ - `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
245
+ Worker backend for fetching data.
246
+
247
+ - `worker_count`: `int`, *default* `os.cpu_count()`
248
+ Number of threads or processes to use for fetching data from BigQuery.
249
+
250
+ - `batch_size`: `int`, *default* `100`
251
+ Batch size used for fetching. Table will be automatically split to this value.
252
+
253
+ ```python
254
+ import pyarrow as pa
255
+ import pyarrow.bigquery as bq
256
+
257
+ parts = []
258
+ for part in bq.reader("gcp_project.dataset.table"):
259
+ parts.append(part)
260
+
261
+ table = pa.concat_tables(parts)
262
+ ```
@@ -0,0 +1,12 @@
1
+ pyarrow/bigquery/__init__.py,sha256=BZBKPSDDcVkf3s6d91hx4OHoeJm927BN4T33rY_xMqs,118
2
+ pyarrow/bigquery/read.py,sha256=MgKhvkeKqq0ha6XSN-Fe59EgPdGOHVLZ-PTCWpmqMwA,6524
3
+ pyarrow/bigquery/some_itertools.py,sha256=XCbKojLfSYqV6TDAtCV6uQXxxuuU9Jlk02fjxCv8UVI,394
4
+ pyarrow/bigquery/write/__init__.py,sha256=u3gntAxTAMRCDj6ogTyp3OicnAjsgK7LKMgVIPyQBOs,7098
5
+ pyarrow/bigquery/write/pa_to_bq.py,sha256=NADdA7bywjpMvS7kCLnTCOdDyTzWiQQm6wFFeqYm5dw,1309
6
+ pyarrow/bigquery/write/pa_to_pb.py,sha256=6UUWpGudPs8aVEhdrsYTg6OJ9PdI8UUxFi55WM2nfl0,4427
7
+ pyarrow/bigquery/write/type_mapping.py,sha256=O8ellbgF6_yEET4E9gQvtSKz4Q0750BMwRctTli8aXI,1138
8
+ pyarrow/bigquery/write/upload.py,sha256=uVmKNUAtoFneACAgFWanefNrJNjSE4D6kQdYw4B8pDk,912
9
+ pyarrow_bigquery-0.1.0.dist-info/METADATA,sha256=7kCiT33XF3QLKbNWcD8TzGn_8burhpBy8AcO0ju3jDI,8204
10
+ pyarrow_bigquery-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
11
+ pyarrow_bigquery-0.1.0.dist-info/top_level.txt,sha256=fPLFY23J70iLX3TKZtbNM2WS9DlDdIA5d9WX0dloJVY,8
12
+ pyarrow_bigquery-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.43.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ pyarrow