pyarrow-bigquery 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyarrow/bigquery/__init__.py +2 -0
- pyarrow/bigquery/read.py +232 -0
- pyarrow/bigquery/some_itertools.py +12 -0
- pyarrow/bigquery/write/__init__.py +238 -0
- pyarrow/bigquery/write/pa_to_bq.py +48 -0
- pyarrow/bigquery/write/pa_to_pb.py +151 -0
- pyarrow/bigquery/write/type_mapping.py +21 -0
- pyarrow/bigquery/write/upload.py +30 -0
- pyarrow_bigquery-0.1.0.dist-info/METADATA +262 -0
- pyarrow_bigquery-0.1.0.dist-info/RECORD +12 -0
- pyarrow_bigquery-0.1.0.dist-info/WHEEL +5 -0
- pyarrow_bigquery-0.1.0.dist-info/top_level.txt +1 -0
pyarrow/bigquery/read.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import tempfile
|
|
7
|
+
import multiprocessing
|
|
8
|
+
import threading
|
|
9
|
+
import shutil
|
|
10
|
+
|
|
11
|
+
from google.cloud import bigquery_storage
|
|
12
|
+
from google.cloud import bigquery
|
|
13
|
+
from google.cloud.exceptions import NotFound
|
|
14
|
+
|
|
15
|
+
import pyarrow as pa
|
|
16
|
+
import pyarrow.feather as fa
|
|
17
|
+
|
|
18
|
+
from . import some_itertools
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _bq_table_exists(project: str, location: str):
    """Verify that the BigQuery table ``location`` can be fetched.

    Args:
        project: Project used to construct the BigQuery client.
        location: Table identifier handed to ``Client.get_table``.

    Raises:
        NotFound: Re-raised unchanged when ``Client.get_table`` reports that
            the table does not exist.
    """
    client = bigquery.Client(project=project)

    try:
        client.get_table(location)
    except NotFound:
        # The original message lacked the f-prefix and logged the literal
        # text "{location}" instead of the table name.
        logger.debug(f"Table {location} is not found")
        # Bare raise keeps the original traceback intact.
        raise

    logger.debug(f"Table {location} already exists")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _bq_read_create_strems(
    read_client: bigquery_storage.BigQueryReadClient,
    parent: str,
    location: str,
    selected_fields: list | None,
    row_restrictions: str | None,
    max_stream_count: int,
) -> tuple[list[str], pa.Schema]:
    """Open an Arrow-format read session and return its streams and schema.

    Args:
        read_client: Storage read client used to create the session.
        parent: Project id; sent as ``projects/{parent}``.
        location: Dotted ``project.dataset.table`` identifier of the table.
        selected_fields: Forwarded as the ``selected_fields`` read option.
        row_restrictions: Forwarded as the ``row_restriction`` read option.
        max_stream_count: Upper bound on streams requested from the service.

    Returns:
        A pair of the session's ``streams`` and the Arrow schema decoded from
        the session's serialized schema.

    Raises:
        ValueError: If ``location`` does not split into exactly three
            dot-separated parts.
    """
    project, dataset, table = location.split(".")
    table_path = f"projects/{project}/datasets/{dataset}/tables/{table}"

    requested_session = bigquery_storage.ReadSession(
        table=table_path,
        data_format=bigquery_storage.DataFormat.ARROW,
        read_options={
            "selected_fields": selected_fields,
            "row_restriction": row_restrictions,
        },
    )

    session = read_client.create_read_session(
        parent=f"projects/{parent}",
        read_session=requested_session,
        max_stream_count=max_stream_count,
    )

    # The service ships the schema as serialized Arrow IPC bytes.
    serialized_schema = session.arrow_schema.serialized_schema
    schema = pa.ipc.read_schema(pa.py_buffer(serialized_schema))

    return session.streams, schema
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _stream_worker(read_client, read_streams, table_schema, batch_size, queue_results, temp_dir):
    """Read the given streams and spool the rows to Feather files.

    Rows are buffered until at least ``batch_size`` are available, then cut
    into tables of exactly ``batch_size`` rows. Each table is written to a
    file inside ``temp_dir`` and the file path is put on ``queue_results``.
    Leftover rows are written as a final, smaller table. A ``None`` sentinel
    is put on the queue once all streams are exhausted.

    Args:
        read_client: Client whose ``read_rows`` is called with each stream name.
        read_streams: Streams to consume; only ``.name`` is read from each.
        table_schema: Arrow schema used to decode each serialized record batch.
        batch_size: Number of rows per emitted table. A non-positive value
            disables splitting and everything is emitted as one final table.
        queue_results: Queue receiving file paths and the final ``None``.
        temp_dir: Directory in which the Feather files are created.
    """

    def _spool(table):
        # mkstemp creates the file atomically; mktemp (used previously) is
        # deprecated because the name it returns can be claimed by someone
        # else before the file is opened.
        fd, element = tempfile.mkstemp(dir=temp_dir)
        os.close(fd)
        fa.write_feather(table, element)
        queue_results.put(element)

    batches = []
    buffered_rows = 0  # running total, avoids re-summing every buffered batch

    for stream in read_streams:
        t = time.time()

        for message in read_client.read_rows(stream.name):
            record_batch = pa.ipc.read_record_batch(
                message.arrow_record_batch.serialized_record_batch, table_schema
            )

            batches.append(record_batch)
            buffered_rows += record_batch.num_rows

            if batch_size > 0 and buffered_rows >= batch_size:
                table = pa.Table.from_batches(batches)

                # A single message may carry several batch_size worth of rows;
                # keep cutting until less than one full batch remains.
                while table.num_rows >= batch_size:
                    _spool(table[:batch_size])
                    table = table[batch_size:]

                batches = table.to_batches()
                buffered_rows = table.num_rows

        logger.debug(f"Stream {stream.name} done in {time.time()-t:.2f} seconds")

    if batches:
        _spool(pa.Table.from_batches(batches))

    # Sentinel: tells the consumer this worker has finished.
    queue_results.put(None)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def reader(
    source: str,
    *,
    project: str | None = None,
    columns: list | None = None,
    row_restrictions: str | None = None,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Iterate over a BigQuery table as a sequence of PyArrow tables.

    Workers download the table's read streams in parallel, spool the rows to
    Feather files in a temporary directory, and hand the file paths back over
    a queue. This generator loads each file, deletes it, and yields the table.
    The temporary directory is removed when the generator finishes or is closed.

    Args:
        source: Dotted ``project.dataset.table`` identifier.
        project: Project for the client and read session; defaults to the
            first dot-separated component of ``source``.
        columns: Optional list of column names to read.
        row_restrictions: Optional row filter forwarded to the read session.
        worker_count: Maximum number of workers; three streams per worker
            are requested from the service.
        worker_type: ``threading.Thread`` or ``multiprocessing.Process``.
        batch_size: Row count passed to each worker for splitting its output.

    Yields:
        ``pyarrow.Table`` objects read back from the spooled files.

    Raises:
        AssertionError: If the read session contains no streams.
        RuntimeError: If every worker has exited but not all of them posted
            their completion sentinel (a worker died mid-read).
    """
    import queue  # local import: only needed for queue.Empty below

    t0 = time.time()
    project_id, *_ = source.split(".")

    if not project:
        project = project_id

    queue_results = multiprocessing.Queue()
    read_client = bigquery_storage.BigQueryReadClient()

    _bq_table_exists(project, source)

    streams, streams_schema = _bq_read_create_strems(
        read_client=read_client,
        parent=project,
        location=source,
        selected_fields=columns,
        row_restrictions=row_restrictions,
        max_stream_count=worker_count * 3,
    )

    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if not streams:
        raise AssertionError("No streams to read, Table might be empty")

    logger.debug(f"Number of workers: {worker_count}, number of streams: {len(streams)}")

    actual_worker_count = min(worker_count, len(streams))

    logger.debug(f"Actual worker count: {actual_worker_count}")

    temp_dir = tempfile.mkdtemp()
    workers = []
    workers_done = 0

    try:
        for worker_streams in some_itertools.to_split(streams, actual_worker_count):
            worker = worker_type(
                target=_stream_worker,
                args=(
                    read_client,
                    worker_streams,
                    streams_schema,
                    batch_size,
                    queue_results,
                    temp_dir,
                ),
            )
            worker.start()
            workers.append(worker)

        while workers_done < actual_worker_count:
            try:
                element = queue_results.get(timeout=1)
            except queue.Empty:
                if any(w.is_alive() for w in workers):
                    continue

                # Every worker has exited. Items they queued just before
                # exiting may still be in flight, so wait once more before
                # concluding that a sentinel will never arrive.
                try:
                    element = queue_results.get(timeout=1)
                except queue.Empty:
                    raise RuntimeError(
                        "Reader workers exited before finishing "
                        f"({workers_done} of {actual_worker_count} completed)"
                    ) from None

            if element is None:
                workers_done += 1
            else:
                table = fa.read_table(element)
                os.remove(element)
                yield table
    finally:
        t = time.time()
        shutil.rmtree(temp_dir, ignore_errors=True)
        logger.debug(f"Time to cleanup temp directory: {time.time()-t:.2f}")
        logger.debug(f"Time taken to read: {time.time()-t0:.2f}")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def reader_query(
    project: str,
    query: str,
    *,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Run ``query`` and iterate over its destination table with ``reader``.

    Args:
        project: Project used for the BigQuery client and for reading.
        query: SQL text submitted through ``Client.query``.
        worker_count: Forwarded to ``reader``.
        worker_type: Forwarded to ``reader``.
        batch_size: Forwarded to ``reader``.

    Returns:
        The generator returned by ``reader`` for the query's destination table.
    """
    query_job = bigquery.Client(project=project).query(query)
    query_job.result()  # block until the job has finished

    destination = query_job.destination
    source = f"{destination.project}.{destination.dataset_id}.{destination.table_id}"

    return reader(
        source=source,
        project=project,
        worker_count=worker_count,
        worker_type=worker_type,
        batch_size=batch_size,
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def read_table(
    source: str,
    *,
    project: str | None = None,
    columns: list | None = None,
    row_restrictions: str | None = None,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Read a whole BigQuery table into a single PyArrow table.

    All arguments are forwarded unchanged to ``reader``; the tables it yields
    are concatenated with ``pa.concat_tables``.

    Returns:
        The concatenation of every table produced by ``reader``.
    """
    partial_tables = reader(
        source=source,
        project=project,
        columns=columns,
        row_restrictions=row_restrictions,
        worker_count=worker_count,
        worker_type=worker_type,
        batch_size=batch_size,
    )
    return pa.concat_tables(partial_tables)
|
|
215
|
+
|
|
216
|
+
def read_query(
    project: str,
    query: str,
    *,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Run ``query`` and return its full result as one PyArrow table.

    All arguments are forwarded unchanged to ``reader_query``; the tables it
    yields are concatenated with ``pa.concat_tables``.

    Returns:
        The concatenation of every table produced by ``reader_query``.
    """
    partial_tables = reader_query(
        project=project,
        query=query,
        worker_count=worker_count,
        worker_type=worker_type,
        batch_size=batch_size,
    )
    return pa.concat_tables(partial_tables)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
def to_chunks(table, chunk_size):
    """Yield consecutive slices of ``table`` holding up to ``chunk_size`` items.

    Works on anything that supports ``len()`` and slice indexing. The final
    slice is shorter when the length is not a multiple of ``chunk_size``.
    """
    yield from (
        table[start : start + chunk_size]
        for start in range(0, len(table), chunk_size)
    )
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def to_split(table, split_number):
    """Split ``table`` into ``split_number`` contiguous, near-equal slices.

    The first ``len(table) % split_number`` slices receive one extra item.
    Slices are empty when there are fewer items than ``split_number``.

    Returns:
        A generator expression producing the slices in order.
    """
    base, extra = divmod(len(table), split_number)

    def _bounds(index):
        # Slices before ``extra`` are one item longer, shifting later starts.
        start = index * base + min(index, extra)
        stop = (index + 1) * base + min(index + 1, extra)
        return start, stop

    return (
        table[_bounds(index)[0] : _bounds(index)[1]]
        for index in range(split_number)
    )
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def first(iterable, condition):
    """Return the first item of ``iterable`` for which ``condition`` is truthy.

    Returns ``None`` when no item matches.
    """
    for item in iterable:
        if condition(item):
            return item
    return None
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
import datetime
|
|
6
|
+
import multiprocessing
|
|
7
|
+
import threading
|
|
8
|
+
import shutil
|
|
9
|
+
import tempfile
|
|
10
|
+
import logging
|
|
11
|
+
import collections
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from google.cloud import bigquery
|
|
15
|
+
from google.cloud import bigquery_storage_v1
|
|
16
|
+
from google.cloud.bigquery_storage_v1.writer import AppendRowsStream
|
|
17
|
+
from google.protobuf import descriptor_pb2
|
|
18
|
+
from google.api_core.exceptions import Unknown, NotFound
|
|
19
|
+
from google.api_core import retry
|
|
20
|
+
|
|
21
|
+
import pyarrow as pa
|
|
22
|
+
import pyarrow.feather as fa
|
|
23
|
+
|
|
24
|
+
from . import pa_to_bq
|
|
25
|
+
from . import pa_to_pb
|
|
26
|
+
from . import upload
|
|
27
|
+
from .. import some_itertools
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Bundles the two objects that make up one open write stream: the
# AppendRowsStream used to send rows and the WriteStream it targets.
Stream = collections.namedtuple("Stream", ["append_rows_stream", "write_stream"])
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _bq_create_table(*, project, location, schema, expire, overwrite):
    """Create the BigQuery table ``location`` from a PyArrow schema.

    Args:
        project: Project used to construct the BigQuery client.
        location: Table identifier of the table to create.
        schema: PyArrow schema, converted with ``pa_to_bq.generate``.
        expire: Lifetime of the table in seconds; falsy disables expiration.
        overwrite: When true, delete an existing table first (a missing
            table is not an error).
    """
    client = bigquery.Client(project=project)

    if overwrite:
        client.delete_table(location, not_found_ok=True)

    bq_schema = pa_to_bq.generate(schema)
    table = bigquery.Table(location, schema=bq_schema)

    client.create_table(table)

    if expire:
        # Use an aware UTC timestamp. A naive local ``datetime.now()`` would
        # shift the expiry by the machine's UTC offset if the client treats
        # naive datetimes as UTC.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        table.expires = now_utc + datetime.timedelta(seconds=expire)
        client.update_table(table, ["expires"])

    logger.debug(f"Created BigQuery table '{location}'")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _bq_write_create_stream(write_client: bigquery_storage_v1.BigQueryWriteClient, parent, protobuf_definition):
    """Create a PENDING write stream and an append-rows stream bound to it.

    Args:
        write_client: Storage write client used for both objects.
        parent: Table resource path the write stream is created under.
        protobuf_definition: Message descriptor; copied into the writer
            schema of the request template.

    Returns:
        A ``Stream`` holding the created write stream and append-rows stream.
    """
    types = bigquery_storage_v1.types

    # Creation is retried on Unknown and NotFound errors.
    created_stream = write_client.create_write_stream(
        parent=parent,
        write_stream=types.WriteStream(type=types.WriteStream.Type.PENDING),
        retry=retry.Retry(predicate=retry.if_exception_type(Unknown, NotFound)),
    )

    # Describe the row layout so the service can decode serialized rows.
    descriptor = descriptor_pb2.DescriptorProto()
    protobuf_definition.CopyToProto(descriptor)
    writer_schema = types.ProtoSchema()
    writer_schema.proto_descriptor = descriptor

    schema_payload = types.AppendRowsRequest.ProtoData()
    schema_payload.writer_schema = writer_schema

    # Template carrying the stream name and schema for the append stream.
    template = types.AppendRowsRequest()
    template.write_stream = created_stream.name
    template.proto_rows = schema_payload

    return Stream(
        write_stream=created_stream,
        append_rows_stream=AppendRowsStream(write_client, template),
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _bq_storage_close_stream(write_client, stream, parent):
    """Close, finalize and commit a single write stream.

    Args:
        write_client: Storage write client used to finalize and commit.
        stream: ``Stream`` tuple whose append stream is closed and whose
            write stream is finalized and committed.
        parent: Table resource path the commit request is issued against.
    """
    stream_name = stream.write_stream.name

    stream.append_rows_stream.close()
    write_client.finalize_write_stream(name=stream_name)

    commit_request = bigquery_storage_v1.types.BatchCommitWriteStreamsRequest()
    commit_request.parent = parent
    commit_request.write_streams = [stream_name]
    write_client.batch_commit_write_streams(commit_request)

    logger.debug(f"Stream '{stream.write_stream.name}' closed")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _stream_worker(
    write_client: bigquery_storage_v1.BigQueryWriteClient,
    parent: str,
    schema_protobuf,
    queue_results,
):
    """Upload spooled Feather files from the queue through one write stream.

    The worker opens its own write stream, then takes file paths from
    ``queue_results`` until it receives ``None``. Each file is loaded,
    uploaded at the current row offset and deleted. The stream is closed
    and committed once the sentinel arrives.

    Args:
        write_client: Storage write client shared by the workers.
        parent: Table resource path the stream is created under.
        schema_protobuf: Message descriptor used to serialize rows.
        queue_results: Queue of Feather file paths, ended by ``None``.
    """
    stream = _bq_write_create_stream(write_client, parent, schema_protobuf)
    rows_sent = 0  # offset of the next row within this worker's stream

    # iter(callable, sentinel) keeps calling get() until it returns None.
    for element in iter(queue_results.get, None):
        table = fa.read_table(element)
        upload.upload_data(stream, table, schema_protobuf, rows_sent)
        os.remove(element)
        rows_sent += table.num_rows

    _bq_storage_close_stream(write_client, stream, parent)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class writer:
    """Context manager that writes PyArrow data to a BigQuery table.

    On entry the destination table is optionally created and
    ``worker_count`` workers are started, each with its own write stream.
    ``write_table`` and ``write_batch`` spool data to Feather files in a
    temporary directory and queue the file paths for the workers. On exit one
    ``None`` sentinel per worker is queued, the workers are joined and the
    temporary directory is removed.

    Args:
        schema: PyArrow schema of the data to be written.
        where: Destination as ``project.dataset.table``.
        project: Project for table creation; defaults to the project
            component of ``where``.
        table_create: Whether to create the destination table on entry.
        table_expire: Table lifetime in seconds, or ``None`` for no expiry.
        table_overwrite: Whether to delete an existing table before creating.
        worker_count: Number of upload workers to start.
        worker_type: ``threading.Thread`` or ``multiprocessing.Process``.
        batch_size: Maximum rows per spooled chunk in ``write_table``.

    Raises:
        ValueError: If ``where`` is not made of exactly three dot-separated
            parts.
    """

    def __init__(
        self,
        schema: pa.Schema,
        where: str,
        *,
        project: str | None = None,
        table_create: bool = True,
        table_expire: int | None = None,
        table_overwrite: bool = False,
        worker_count: int = multiprocessing.cpu_count(),
        worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
        batch_size: int = 100,
    ):
        # Validate up front so a malformed destination yields a clear message
        # instead of an opaque tuple-unpacking error (still a ValueError).
        parts = where.split(".")
        if len(parts) != 3:
            raise ValueError(
                f"'where' must have the form 'project.dataset.table', got {where!r}"
            )
        project_id, dataset_id, table_id = parts

        self.project = project
        self.where = where
        self.schema = schema

        self.table_create = table_create
        self.table_expire = table_expire
        self.table_overwrite = table_overwrite

        self.worker_count = worker_count
        self.worker_type = worker_type

        self.batch_size = batch_size

        self.parent = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"

        if not self.project:
            self.project = project_id

    def __enter__(self):
        """Create the table if requested, start the workers, return ``self``."""
        self.t0 = time.time()
        self.schema_protobuf = pa_to_pb.generate(self.schema)

        if self.table_create:
            _bq_create_table(
                project=self.project,
                location=self.where,
                schema=self.schema,
                expire=self.table_expire,
                overwrite=self.table_overwrite,
            )

        # Created only after the steps above succeeded, so a failing schema
        # conversion or table creation does not leave a stray directory.
        self.temp_dir = tempfile.mkdtemp()

        self.queue_results = multiprocessing.Queue()
        self.workers = []
        write_client = bigquery_storage_v1.BigQueryWriteClient()

        for _ in range(self.worker_count):
            worker = self.worker_type(
                target=_stream_worker,
                args=(
                    write_client,
                    self.parent,
                    self.schema_protobuf,
                    self.queue_results,
                ),
            )
            worker.start()
            self.workers.append(worker)

        return self

    def _spool(self, table):
        """Write ``table`` to a fresh Feather file and queue its path."""
        # mkstemp creates the file atomically; mktemp is deprecated because
        # the returned name can be taken before the file is opened.
        fd, element = tempfile.mkstemp(dir=self.temp_dir)
        os.close(fd)
        fa.write_feather(table, element)
        self.queue_results.put(element)

    def write_table(self, table):
        """Queue ``table`` for upload in chunks of at most ``batch_size`` rows."""
        for table_chunk in some_itertools.to_chunks(table, self.batch_size):
            self._spool(table_chunk)

    def write_batch(self, batch):
        """Queue a single record batch for upload as one chunk."""
        self._spool(pa.Table.from_batches([batch]))

    def __exit__(self, *_, **__):
        """Signal the workers to stop, wait for them and clean up."""
        # One sentinel per worker: each worker stops at the first None it sees.
        for _worker in self.workers:
            self.queue_results.put(None)

        for worker in self.workers:
            worker.join()

        shutil.rmtree(self.temp_dir, ignore_errors=True)

        logger.debug(f"Time taken: {time.time() - self.t0}")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def write_table(
    table: pa.Table,
    where: str,
    *,
    project: str | None = None,
    table_create: bool = True,
    table_expire: int | None = None,
    table_overwrite: bool = False,
    worker_count: int = multiprocessing.cpu_count(),
    worker_type: type[threading.Thread] | type[multiprocessing.Process] = threading.Thread,
    batch_size: int = 100,
):
    """Write a complete PyArrow table to BigQuery using ``writer``.

    The table is split into ``worker_count`` parts and each part is handed to
    ``writer.write_table``. All keyword options are forwarded to ``writer``.

    Args:
        table: Non-empty PyArrow table to upload.
        where: Destination as ``project.dataset.table``.

    Raises:
        AssertionError: If ``table`` has no rows.
    """
    assert table.num_rows > 0, "Table is empty"

    writer_options = {
        "project": project,
        "table_create": table_create,
        "table_expire": table_expire,
        "table_overwrite": table_overwrite,
        "worker_count": worker_count,
        "worker_type": worker_type,
        "batch_size": batch_size,
    }

    with writer(where=where, schema=table.schema, **writer_options) as w:
        for part in some_itertools.to_split(table, w.worker_count):
            w.write_table(part)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pyarrow as pa
|
|
2
|
+
|
|
3
|
+
from google.cloud import bigquery
|
|
4
|
+
|
|
5
|
+
from .type_mapping import TYPES_MAPPING
|
|
6
|
+
from ..some_itertools import first
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def emit(schema):
    """Translate PyArrow fields into a list of ``bigquery.SchemaField``.

    List fields become ``REPEATED`` fields of their value type. Struct
    fields become ``RECORD`` fields whose sub-fields are produced by a
    recursive call. Other types are resolved through ``TYPES_MAPPING``.

    Args:
        schema: Iterable of PyArrow fields (a schema or a struct type).

    Returns:
        One ``SchemaField`` per input field, in order.

    Raises:
        TypeError: For a list of lists, or a type with no mapping entry.
    """
    bq_fields = []

    for field in schema:
        arrow_type = field.type

        if pa.types.is_list(arrow_type):
            mode = "REPEATED"
            arrow_type = arrow_type.value_type
            if pa.types.is_list(arrow_type):
                raise TypeError("Nested lists are not supported")
        elif field.nullable:
            mode = "NULLABLE"
        else:
            mode = "REQUIRED"

        if pa.types.is_struct(arrow_type):
            bq_field = bigquery.SchemaField(
                name=field.name,
                field_type="RECORD",
                mode=mode,
                fields=emit(arrow_type),
            )
        else:
            # The mapping is keyed by predicates; pick the first one matching.
            type_check = first(TYPES_MAPPING, lambda check: check(arrow_type))
            if not type_check:
                raise TypeError(f"Unsupported type {arrow_type}")
            bq_field = bigquery.SchemaField(
                name=field.name,
                field_type=TYPES_MAPPING[type_check].bq,
                mode=mode,
            )

        bq_fields.append(bq_field)

    return bq_fields
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def generate(schema):
    """Return the BigQuery schema fields for a PyArrow ``schema``.

    Thin entry point that delegates to ``emit``.
    """
    bq_fields = emit(schema)
    return bq_fields
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
import random
|
|
4
|
+
import string
|
|
5
|
+
|
|
6
|
+
from google.protobuf.descriptor_pb2 import DescriptorProto, FieldDescriptorProto
|
|
7
|
+
from google.protobuf import descriptor_pb2, descriptor_pool, message_factory
|
|
8
|
+
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
from .type_mapping import TYPES_MAPPING
|
|
11
|
+
from ..some_itertools import first
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# PyArrow type predicates for columns that ``ammend_schema`` replaces with
# string fields before rows are turned into protobuf messages.
FORCE_STRING = [
    pa.types.is_date,
    pa.types.is_decimal,
    pa.types.is_time,
    pa.types.is_timestamp,
]
|
|
22
|
+
|
|
23
|
+
# 10 MiB (10 * 1024 * 1024 bytes); ``serialize`` compares the accumulated
# message size against this value to decide when to emit a chunk of rows.
GRPC_UPLOAD_LIMIT = 10485760
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def random_string(length):
    """Return ``length`` random ASCII letters and digits as one string.

    Uses the ``random`` module, so the result is not suitable for secrets.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def emit(schema, message_name):
    """Build a protobuf ``DescriptorProto`` describing PyArrow fields.

    Fields are numbered from 1 in iteration order. List fields become
    repeated fields of their value type. Struct fields become nested
    message types named ``{message_name}_{field.name}``, built recursively.
    Other types are resolved through ``TYPES_MAPPING``.

    Args:
        schema: Iterable of PyArrow fields (a schema or a struct type).
        message_name: Name assigned to the generated message descriptor.

    Returns:
        The populated ``DescriptorProto``.

    Raises:
        TypeError: For a list of lists, or a type with no mapping entry.
    """
    descriptor = DescriptorProto()
    descriptor.name = message_name

    for number, field in enumerate(schema, 1):
        arrow_type = field.type
        repeated = pa.types.is_list(arrow_type)

        if repeated:
            arrow_type = arrow_type.value_type
            if pa.types.is_list(arrow_type):
                raise TypeError("Nested lists are not supported")

        # Repeated wins over nullability; otherwise nullable means optional.
        if repeated:
            label = FieldDescriptorProto.LABEL_REPEATED
        elif field.nullable:
            label = FieldDescriptorProto.LABEL_OPTIONAL
        else:
            label = FieldDescriptorProto.LABEL_REQUIRED

        if pa.types.is_struct(arrow_type):
            nested_name = f"{message_name}_{field.name}"
            nested_descriptor = emit(schema=arrow_type, message_name=nested_name)
            descriptor.nested_type.extend([nested_descriptor])

            descriptor.field.add(
                name=field.name,
                number=number,
                label=label,
                type=FieldDescriptorProto.TYPE_MESSAGE,
                type_name=nested_name,
            )
            continue

        # The mapping is keyed by predicates; pick the first one matching.
        type_check = first(TYPES_MAPPING, lambda check: check(arrow_type))
        if not type_check:
            raise TypeError(f"Unsupported type {arrow_type}")

        descriptor.field.add(
            name=field.name,
            number=number,
            label=label,
            type=TYPES_MAPPING[type_check].pb,
        )

    return descriptor
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def generate(schema):
    """Register a message type for ``schema`` and return its descriptor.

    The message is added to the default descriptor pool inside a new file.
    Both the message name and the file name get a random 10-character part.

    Returns:
        The descriptor looked up from the pool by the generated message name.
    """
    # NOTE (from the original author, unverified): names are randomized
    # because the shared default pool needs unique message and file names.
    message_name = f"Message_{random_string(10)}"
    file_name = f"{random_string(10)}.proto"

    file_proto = descriptor_pb2.FileDescriptorProto(
        name=file_name,
        message_type=[emit(schema, message_name=message_name)],
    )

    pool = descriptor_pool.Default()
    pool.AddSerializedFile(file_proto.SerializeToString())

    return pool.FindMessageTypeByName(message_name)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def ammend_schema(schema):
    """Return a copy of ``schema`` with ``FORCE_STRING`` types made strings.

    Struct and list types are walked recursively so nested values are
    converted too. Rebuilt fields are created from their name and new type
    only; fields that need no change are returned as they are.
    """

    def _cast_field(field):
        arrow_type = field.type

        if pa.types.is_struct(arrow_type):
            children = [_cast_field(child) for child in arrow_type]
            return pa.field(field.name, pa.struct(children))

        if pa.types.is_list(arrow_type):
            value_field = _cast_field(arrow_type.value_field)
            return pa.field(field.name, pa.list_(value_field.type))

        for type_check in FORCE_STRING:
            if type_check(arrow_type):
                return pa.field(field.name, pa.string())

        return field

    return pa.schema([_cast_field(field) for field in schema])
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def serialize(pa_table, protobuf_definition):
    """Serialize table rows to protobuf and yield them in size-bounded chunks.

    The table is first cast with ``ammend_schema``. Each row is then turned
    into a message of the class generated from ``protobuf_definition``.
    Rows are grouped so that the summed message size of a chunk does not
    exceed ``GRPC_UPLOAD_LIMIT``.

    Args:
        pa_table: PyArrow table to serialize.
        protobuf_definition: Message descriptor matching the table's schema.

    Yields:
        Non-empty lists of serialized rows (``bytes``), in table order.

    Raises:
        AssertionError: If a single row is larger than ``GRPC_UPLOAD_LIMIT``.
    """
    Message = message_factory.GetMessageClass(protobuf_definition)

    pa_table = pa_table.cast(ammend_schema(pa_table.schema))

    rows = []
    size = 0  # summed message size of the rows currently buffered

    for element in pa_table.to_pylist():
        t0 = time.time()
        message = Message(**element)
        message_size = message.ByteSize()

        # Lazy %-formatting: the string is only built when DEBUG is enabled.
        logger.debug("Time taken to serialize: %.4f", time.time() - t0)

        # Raised explicitly (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O.
        if message_size > GRPC_UPLOAD_LIMIT:
            raise AssertionError("Row is too large to fit in a single request")

        if rows and size + message_size > GRPC_UPLOAD_LIMIT:
            yield rows
            rows = []
            # Reset the running size. It was previously left untouched, so
            # every chunk after the first held exactly one row.
            size = 0

        rows.append(message.SerializeToString())
        size += message_size

    if rows:
        yield rows
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
from google.protobuf.descriptor_pb2 import FieldDescriptorProto
|
|
5
|
+
from google.cloud import bigquery
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Target types for one PyArrow type family: ``bq`` is the BigQuery SQL type
# name, ``pb`` is the protobuf field type.
TypeMapping = collections.namedtuple("TypeMapping", ["bq", "pb"])
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Maps a PyArrow type predicate to the BigQuery and protobuf types used for
# columns matching it. Date, decimal, time and timestamp columns are sent as
# protobuf strings.
TYPES_MAPPING = {
    pa.types.is_binary: TypeMapping(bigquery.SqlTypeNames.BYTES, FieldDescriptorProto.TYPE_BYTES),
    pa.types.is_boolean: TypeMapping(bigquery.SqlTypeNames.BOOLEAN, FieldDescriptorProto.TYPE_BOOL),
    pa.types.is_date: TypeMapping(bigquery.SqlTypeNames.DATE, FieldDescriptorProto.TYPE_STRING),
    pa.types.is_decimal: TypeMapping(bigquery.SqlTypeNames.DECIMAL, FieldDescriptorProto.TYPE_STRING),
    pa.types.is_floating: TypeMapping(bigquery.SqlTypeNames.FLOAT64, FieldDescriptorProto.TYPE_DOUBLE),
    pa.types.is_integer: TypeMapping(bigquery.SqlTypeNames.INT64, FieldDescriptorProto.TYPE_INT64),
    pa.types.is_string: TypeMapping(bigquery.SqlTypeNames.STRING, FieldDescriptorProto.TYPE_STRING),
    pa.types.is_time: TypeMapping(bigquery.SqlTypeNames.TIME, FieldDescriptorProto.TYPE_STRING),
    pa.types.is_timestamp: TypeMapping(bigquery.SqlTypeNames.TIMESTAMP, FieldDescriptorProto.TYPE_STRING),
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import tenacity
|
|
2
|
+
|
|
3
|
+
from google.cloud.bigquery_storage_v1 import types
|
|
4
|
+
from google.api_core.exceptions import Unknown
|
|
5
|
+
|
|
6
|
+
from . import pa_to_pb
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@tenacity.retry(
    stop=tenacity.stop_after_attempt(5),
    retry=tenacity.retry_if_exception_type(Unknown),
)
def _send(stream, serialized_rows, offset):
    """Append ``serialized_rows`` to the stream at ``offset`` and wait.

    The call is retried when it raises ``Unknown``, for up to five attempts.

    Args:
        stream: ``Stream`` tuple whose ``append_rows_stream`` sends the request.
        serialized_rows: Iterable of serialized protobuf rows.
        offset: Row offset within the write stream for this request.
    """
    rows_payload = types.ProtoRows()
    rows_payload.serialized_rows.extend(serialized_rows)

    data_payload = types.AppendRowsRequest.ProtoData()
    data_payload.rows = rows_payload

    append_request = types.AppendRowsRequest()
    append_request.offset = offset
    append_request.proto_rows = data_payload

    # send() returns a future; result() blocks until it completes.
    pending = stream.append_rows_stream.send(append_request)
    pending.result()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def upload_data(stream, pa_table, protobuf_definition, offset):
    """Serialize ``pa_table`` and send it chunk by chunk through ``stream``.

    Args:
        stream: ``Stream`` tuple passed on to ``_send``.
        pa_table: PyArrow table to upload.
        protobuf_definition: Message descriptor used for serialization.
        offset: Stream offset of the table's first row.
    """
    next_offset = offset

    for chunk in pa_to_pb.serialize(pa_table, protobuf_definition):
        _send(stream, chunk, next_offset)
        # Each chunk starts where the previous one ended.
        next_offset += len(chunk)
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pyarrow-bigquery
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A simple library to **write to** and **download from** BigQuery tables as PyArrow tables.
|
|
5
|
+
Author-email: Sebastian Pawluś <sebastian.pawlus@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: pyarrow,bigquery
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: google-cloud-bigquery <5,>=3
|
|
10
|
+
Requires-Dist: google-cloud-bigquery-storage <3,>=2
|
|
11
|
+
Requires-Dist: pyarrow <17,>=16
|
|
12
|
+
Requires-Dist: tenacity
|
|
13
|
+
|
|
14
|
+
# pyarrow-bigquery
|
|
15
|
+
|
|
16
|
+
A simple library to **write to** and **download from** BigQuery tables as PyArrow tables.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install pyarrow-bigquery
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
This guide will help you quickly get started with `pyarrow-bigquery`, a library that allows you to **read** from and **write** to Google BigQuery using PyArrow.
|
|
27
|
+
|
|
28
|
+
### Reading from BigQuery
|
|
29
|
+
|
|
30
|
+
`pyarrow-bigquery` exposes two methods to read BigQuery tables as PyArrow tables. Depending on your use case or the size of the table, you might want to use one method over the other.
|
|
31
|
+
|
|
32
|
+
#### Read the Whole Table
|
|
33
|
+
|
|
34
|
+
When the table is small enough to fit in memory, you can read it directly using `bq.read_table`.
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
import pyarrow.bigquery as bq
|
|
38
|
+
|
|
39
|
+
table = bq.read_table("gcp_project.dataset.small_table")
|
|
40
|
+
|
|
41
|
+
print(table.num_rows)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
#### Read with Batches
|
|
45
|
+
|
|
46
|
+
If the target table is larger than memory or you have other reasons not to fetch the whole table at once, you can use the `bq.reader` iterator method along with the `batch_size` parameter to limit how much data is fetched per iteration.
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import pyarrow.bigquery as bq
|
|
50
|
+
|
|
51
|
+
for table in bq.reader("gcp_project.dataset.big_table", batch_size=100):
|
|
52
|
+
print(table.num_rows)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Writing to BigQuery
|
|
56
|
+
|
|
57
|
+
Similarly, the package exposes two methods to write to BigQuery. Depending on your use case or the size of the table, you might want to use one method over the other.
|
|
58
|
+
|
|
59
|
+
#### Write the Whole Table
|
|
60
|
+
|
|
61
|
+
When you want to write a complete table at once, you can use the `bq.write_table` method.
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import pyarrow as pa
|
|
65
|
+
import pyarrow.bigquery as bq
|
|
66
|
+
|
|
67
|
+
table = pa.Table.from_arrays([[1, 2, 3, 4]], names=['integers'])
|
|
68
|
+
|
|
69
|
+
bq.write_table(table, 'gcp_project.dataset.table')
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
#### Write in Batches (Smaller Chunks)
|
|
73
|
+
|
|
74
|
+
If you need to write data in smaller chunks, you can use the `bq.writer` method with the `schema` parameter to define the table structure.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import pyarrow as pa
|
|
78
|
+
import pyarrow.bigquery as bq
|
|
79
|
+
|
|
80
|
+
schema = pa.schema([
|
|
81
|
+
("integers", pa.int64())
|
|
82
|
+
])
|
|
83
|
+
|
|
84
|
+
with bq.writer("gcp_project.dataset.table", schema=schema) as w:
|
|
85
|
+
w.write_batch(record_batch)
|
|
86
|
+
w.write_table(table)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## API Reference
|
|
90
|
+
|
|
91
|
+
### `pyarrow.bigquery.write_table`
|
|
92
|
+
|
|
93
|
+
Write a PyArrow Table to a BigQuery Table. No return value.
|
|
94
|
+
|
|
95
|
+
**Parameters:**
|
|
96
|
+
|
|
97
|
+
- `table`: `pa.Table`
|
|
98
|
+
PyArrow table.
|
|
99
|
+
|
|
100
|
+
- `where`: `str`
|
|
101
|
+
Destination location in BigQuery catalog.
|
|
102
|
+
|
|
103
|
+
- `project`: `str`, *default* `None`
|
|
104
|
+
BigQuery execution project, also the billing project. If not provided, it will be extracted from `where`.
|
|
105
|
+
|
|
106
|
+
- `table_create`: `bool`, *default* `True`
|
|
107
|
+
Specifies if the BigQuery table should be created.
|
|
108
|
+
|
|
109
|
+
- `table_expire`: `None | int`, *default* `None`
|
|
110
|
+
Number of seconds after which the created table will expire. Used only if `table_create` is `True`. Set to `None` to disable expiration.
|
|
111
|
+
|
|
112
|
+
- `table_overwrite`: `bool`, *default* `False`
|
|
113
|
+
If the table already exists, destroy it and create a new one.
|
|
114
|
+
|
|
115
|
+
- `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
|
|
116
|
+
Worker backend for writing data.
|
|
117
|
+
|
|
118
|
+
- `worker_count`: `int`, *default* `os.cpu_count()`
|
|
119
|
+
Number of threads or processes to use for writing data to BigQuery.
|
|
120
|
+
|
|
121
|
+
- `batch_size`: `int`, *default* `100`
|
|
122
|
+
Batch size used for writes. The table will be automatically split into batches of this size.
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
bq.write_table(table, 'gcp_project.dataset.table')
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### `pyarrow.bigquery.writer`
|
|
129
|
+
|
|
130
|
+
Context manager version of the write method. Useful when the PyArrow table is larger than memory or when the data is available in chunks.
|
|
131
|
+
|
|
132
|
+
**Parameters:**
|
|
133
|
+
|
|
134
|
+
- `schema`: `pa.Schema`
|
|
135
|
+
PyArrow schema.
|
|
136
|
+
|
|
137
|
+
- `where`: `str`
|
|
138
|
+
Destination location in BigQuery catalog.
|
|
139
|
+
|
|
140
|
+
- `project`: `str`, *default* `None`
|
|
141
|
+
BigQuery execution project, also the billing project. If not provided, it will be extracted from `where`.
|
|
142
|
+
|
|
143
|
+
- `table_create`: `bool`, *default* `True`
|
|
144
|
+
Specifies if the BigQuery table should be created.
|
|
145
|
+
|
|
146
|
+
- `table_expire`: `None | int`, *default* `None`
|
|
147
|
+
Number of seconds after which the created table will expire. Used only if `table_create` is `True`. Set to `None` to disable expiration.
|
|
148
|
+
|
|
149
|
+
- `table_overwrite`: `bool`, *default* `False`
|
|
150
|
+
If the table already exists, destroy it and create a new one.
|
|
151
|
+
|
|
152
|
+
- `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
|
|
153
|
+
Worker backend for writing data.
|
|
154
|
+
|
|
155
|
+
- `worker_count`: `int`, *default* `os.cpu_count()`
|
|
156
|
+
Number of threads or processes to use for writing data to BigQuery.
|
|
157
|
+
|
|
158
|
+
- `batch_size`: `int`, *default* `100`
|
|
159
|
+
Batch size used for writes. The table will be automatically split into batches of this size.
|
|
160
|
+
|
|
161
|
+
Depending on the use case, you might want to use one of the methods below to write your data to a BigQuery table, using either `pa.Table` or `pa.RecordBatch`.
|
|
162
|
+
|
|
163
|
+
#### `pyarrow.bigquery.writer.write_table`
|
|
164
|
+
|
|
165
|
+
Context manager method to write a table.
|
|
166
|
+
|
|
167
|
+
**Parameters:**
|
|
168
|
+
|
|
169
|
+
- `table`: `pa.Table`
|
|
170
|
+
PyArrow table.
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
import pyarrow as pa
|
|
174
|
+
import pyarrow.bigquery as bq
|
|
175
|
+
|
|
176
|
+
schema = pa.schema([("value", pa.list_(pa.int64()))])
|
|
177
|
+
|
|
178
|
+
with bq.writer("gcp_project.dataset.table", schema=schema) as w:
|
|
179
|
+
for a in range(1000):
|
|
180
|
+
w.write_table(pa.Table.from_pylist([{'value': [a] * 10}]))
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
#### `pyarrow.bigquery.writer.write_batch`
|
|
184
|
+
|
|
185
|
+
Context manager method to write a record batch.
|
|
186
|
+
|
|
187
|
+
**Parameters:**
|
|
188
|
+
|
|
189
|
+
- `batch`: `pa.RecordBatch`
|
|
190
|
+
PyArrow record batch.
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
import pyarrow as pa
|
|
194
|
+
import pyarrow.bigquery as bq
|
|
195
|
+
|
|
196
|
+
schema = pa.schema([("value", pa.list_(pa.int64()))])
|
|
197
|
+
|
|
198
|
+
with bq.writer("gcp_project.dataset.table", schema=schema) as w:
|
|
199
|
+
for a in range(1000):
|
|
200
|
+
w.write_batch(pa.RecordBatch.from_pylist([{'value': [1] * 10}]))
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### `pyarrow.bigquery.read_table`
|
|
204
|
+
|
|
205
|
+
**Parameters:**
|
|
206
|
+
|
|
207
|
+
- `source`: `str`
|
|
208
|
+
BigQuery table location.
|
|
209
|
+
|
|
210
|
+
- `project`: `str`, *default* `None`
|
|
211
|
+
BigQuery execution project, also the billing project. If not provided, it will be extracted from `source`.
|
|
212
|
+
|
|
213
|
+
- `columns`: `str`, *default* `None`
|
|
214
|
+
Columns to download. When not provided, all available columns will be downloaded.
|
|
215
|
+
|
|
216
|
+
- `row_restrictions`: `str`, *default* `None`
|
|
217
|
+
Row level filtering executed on the BigQuery side. More in [BigQuery documentation](https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1beta1).
|
|
218
|
+
|
|
219
|
+
- `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
|
|
220
|
+
Worker backend for fetching data.
|
|
221
|
+
|
|
222
|
+
- `worker_count`: `int`, *default* `os.cpu_count()`
|
|
223
|
+
Number of threads or processes to use for fetching data from BigQuery.
|
|
224
|
+
|
|
225
|
+
- `batch_size`: `int`, *default* `100`
|
|
226
|
+
Batch size used for fetching. The table will be automatically split into batches of this size.
|
|
227
|
+
|
|
228
|
+
### `pyarrow.bigquery.reader`
|
|
229
|
+
|
|
230
|
+
**Parameters:**
|
|
231
|
+
|
|
232
|
+
- `source`: `str`
|
|
233
|
+
BigQuery table location.
|
|
234
|
+
|
|
235
|
+
- `project`: `str`, *default* `None`
|
|
236
|
+
BigQuery execution project, also the billing project. If not provided, it will be extracted from `source`.
|
|
237
|
+
|
|
238
|
+
- `columns`: `str`, *default* `None`
|
|
239
|
+
Columns to download. When not provided, all available columns will be downloaded.
|
|
240
|
+
|
|
241
|
+
- `row_restrictions`: `str`, *default* `None`
|
|
242
|
+
Row level filtering executed on the BigQuery side. More in [BigQuery documentation](https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1beta1).
|
|
243
|
+
|
|
244
|
+
- `worker_type`: `threading.Thread | multiprocessing.Process`, *default* `threading.Thread`
|
|
245
|
+
Worker backend for fetching data.
|
|
246
|
+
|
|
247
|
+
- `worker_count`: `int`, *default* `os.cpu_count()`
|
|
248
|
+
Number of threads or processes to use for fetching data from BigQuery.
|
|
249
|
+
|
|
250
|
+
- `batch_size`: `int`, *default* `100`
|
|
251
|
+
Batch size used for fetching. The table will be automatically split into batches of this size.
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
import pyarrow as pa
|
|
255
|
+
import pyarrow.bigquery as bq
|
|
256
|
+
|
|
257
|
+
parts = []
|
|
258
|
+
for part in bq.reader("gcp_project.dataset.table"):
|
|
259
|
+
parts.append(part)
|
|
260
|
+
|
|
261
|
+
table = pa.concat_tables(parts)
|
|
262
|
+
```
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
pyarrow/bigquery/__init__.py,sha256=BZBKPSDDcVkf3s6d91hx4OHoeJm927BN4T33rY_xMqs,118
|
|
2
|
+
pyarrow/bigquery/read.py,sha256=MgKhvkeKqq0ha6XSN-Fe59EgPdGOHVLZ-PTCWpmqMwA,6524
|
|
3
|
+
pyarrow/bigquery/some_itertools.py,sha256=XCbKojLfSYqV6TDAtCV6uQXxxuuU9Jlk02fjxCv8UVI,394
|
|
4
|
+
pyarrow/bigquery/write/__init__.py,sha256=u3gntAxTAMRCDj6ogTyp3OicnAjsgK7LKMgVIPyQBOs,7098
|
|
5
|
+
pyarrow/bigquery/write/pa_to_bq.py,sha256=NADdA7bywjpMvS7kCLnTCOdDyTzWiQQm6wFFeqYm5dw,1309
|
|
6
|
+
pyarrow/bigquery/write/pa_to_pb.py,sha256=6UUWpGudPs8aVEhdrsYTg6OJ9PdI8UUxFi55WM2nfl0,4427
|
|
7
|
+
pyarrow/bigquery/write/type_mapping.py,sha256=O8ellbgF6_yEET4E9gQvtSKz4Q0750BMwRctTli8aXI,1138
|
|
8
|
+
pyarrow/bigquery/write/upload.py,sha256=uVmKNUAtoFneACAgFWanefNrJNjSE4D6kQdYw4B8pDk,912
|
|
9
|
+
pyarrow_bigquery-0.1.0.dist-info/METADATA,sha256=7kCiT33XF3QLKbNWcD8TzGn_8burhpBy8AcO0ju3jDI,8204
|
|
10
|
+
pyarrow_bigquery-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
11
|
+
pyarrow_bigquery-0.1.0.dist-info/top_level.txt,sha256=fPLFY23J70iLX3TKZtbNM2WS9DlDdIA5d9WX0dloJVY,8
|
|
12
|
+
pyarrow_bigquery-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pyarrow
|