micromegas 0.12.0__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {micromegas-0.12.0 → micromegas-0.13.0}/PKG-INFO +1 -1
- micromegas-0.13.0/micromegas/__init__.py +18 -0
- micromegas-0.13.0/micromegas/admin.py +227 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/client.py +140 -8
- {micromegas-0.12.0 → micromegas-0.13.0}/pyproject.toml +1 -1
- micromegas-0.12.0/micromegas/__init__.py +0 -9
- {micromegas-0.12.0 → micromegas-0.13.0}/README.md +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/FlightSql_pb2.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/__init__.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/time.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/perfetto.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/time.py +0 -0
micromegas-0.13.0/micromegas/__init__.py (new file)

@@ -0,0 +1,18 @@
+import grpc
+from . import time
+from . import perfetto
+from . import flightsql
+from . import admin
+
+
+def connect(preserve_dictionary=False):
+    """Connect to the analytics service using default values.
+
+    Args:
+        preserve_dictionary (bool, optional): When True, preserve dictionary encoding in
+            Arrow arrays for memory efficiency. Useful when using dictionary-encoded UDFs.
+            Defaults to False for backward compatibility.
+    """
+    return flightsql.client.FlightSQLClient(
+        "grpc://localhost:50051", preserve_dictionary=preserve_dictionary
+    )
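The new connect() helper is a thin wrapper around FlightSQLClient pointed at the default local endpoint. A minimal usage sketch, not part of the package diff (the log_entries view name is borrowed from the admin examples below and is an assumption about the deployed views):

    import datetime
    import micromegas

    # Default connection to grpc://localhost:50051
    client = micromegas.connect()

    # Opt in to dictionary preservation for memory efficiency
    dict_client = micromegas.connect(preserve_dictionary=True)

    # Bound the query in time so the server can prune partitions
    end = datetime.datetime.now(datetime.timezone.utc)
    begin = end - datetime.timedelta(hours=1)
    df = client.query("SELECT * FROM log_entries LIMIT 10", begin, end)
    print(df.head())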
micromegas-0.13.0/micromegas/admin.py (new file)

@@ -0,0 +1,227 @@
+"""Administrative utilities for Micromegas lakehouse management.
+
+This module provides functions for managing schema evolution and partition lifecycle
+in Micromegas lakehouse. These functions are intended for administrative use and
+should be used with caution as they perform potentially destructive operations.
+"""
+
+import pandas as pd
+from typing import Optional
+
+
+def list_incompatible_partitions(
+    client, view_set_name: Optional[str] = None
+) -> pd.DataFrame:
+    """List partitions with schemas incompatible with current view set schemas.
+
+    This function identifies partitions that have schema versions different from
+    the current schema version for their view set. These incompatible partitions
+    are ignored during queries but take up storage space and should be
+    retired to free storage and enable clean schema evolution.
+
+    Args:
+        client: FlightSQLClient instance for executing queries.
+        view_set_name (str, optional): Filter results to a specific view set.
+            If None, returns incompatible partitions across all view sets.
+
+    Returns:
+        pandas.DataFrame: DataFrame with incompatible partition information containing:
+            - view_set_name: Name of the view set
+            - view_instance_id: Instance ID (e.g., process_id or 'global')
+            - incompatible_schema_hash: The old schema hash in the partition
+            - current_schema_hash: The current schema hash from ViewFactory
+            - partition_count: Number of incompatible partitions with this schema
+            - total_size_bytes: Total size in bytes of all incompatible partitions
+            - file_paths: Array of file paths for each incompatible partition (for precise retirement)
+
+    Example:
+        >>> import micromegas
+        >>> import micromegas.admin
+        >>>
+        >>> client = micromegas.connect()
+        >>>
+        >>> # List all incompatible partitions across all view sets
+        >>> incompatible = micromegas.admin.list_incompatible_partitions(client)
+        >>> print(f"Found {len(incompatible)} groups of incompatible partitions")
+        >>>
+        >>> # List incompatible partitions for specific view set
+        >>> log_incompatible = micromegas.admin.list_incompatible_partitions(client, 'log_entries')
+        >>> print(f"Log entries incompatible partitions: {log_incompatible['partition_count'].sum()}")
+
+    Note:
+        This function leverages the existing list_partitions() and list_view_sets()
+        UDTFs to perform server-side JOIN and aggregation for optimal performance.
+        Schema "hashes" are actually version numbers (e.g., [4]) not cryptographic hashes.
+        SQL is executed directly by DataFusion, so no SQL injection concerns.
+    """
+    # Build view filter clause if specific view set requested
+    view_filter = ""
+    if view_set_name is not None:
+        view_filter = f"AND p.view_set_name = '{view_set_name}'"
+
+    # Construct SQL query with JOIN between list_partitions() and list_view_sets()
+    # Server-side filtering and aggregation for optimal performance
+    sql = f"""
+    SELECT
+        p.view_set_name,
+        p.view_instance_id,
+        p.file_schema_hash as incompatible_schema_hash,
+        vs.current_schema_hash,
+        COUNT(*) as partition_count,
+        SUM(p.file_size) as total_size_bytes,
+        ARRAY_AGG(p.file_path) as file_paths
+    FROM list_partitions() p
+    JOIN list_view_sets() vs ON p.view_set_name = vs.view_set_name
+    WHERE p.file_schema_hash != vs.current_schema_hash
+    {view_filter}
+    GROUP BY p.view_set_name, p.view_instance_id, p.file_schema_hash, vs.current_schema_hash
+    ORDER BY p.view_set_name, p.view_instance_id
+    """
+
+    return client.query(sql)
+
+
+def retire_incompatible_partitions(
+    client, view_set_name: Optional[str] = None
+) -> pd.DataFrame:
+    """Retire partitions with schemas incompatible with current view set schemas.
+
+    This function identifies and retires partitions that have schema versions
+    different from the current schema version for their view set. This enables
+    safe schema evolution by cleaning up old schema versions.
+
+    **SAFETY**: This function retires only the exact incompatible partitions by
+    their file paths, ensuring no compatible partitions are accidentally retired.
+
+    **WARNING**: This operation is irreversible. Retired partitions will be
+    permanently deleted from metadata and their data files removed from object storage.
+
+    Args:
+        client: FlightSQLClient instance for executing queries.
+        view_set_name (str, optional): Retire incompatible partitions only for
+            this specific view set. If None, retires incompatible partitions
+            across all view sets (use with extreme caution).
+
+    Returns:
+        pandas.DataFrame: DataFrame with retirement results containing:
+            - view_set_name: View set that was processed
+            - view_instance_id: Instance ID of partitions retired
+            - partitions_retired: Count of partitions successfully retired
+            - partitions_failed: Count of partitions that failed to retire
+            - storage_freed_bytes: Total bytes freed from storage
+            - retirement_messages: Array of detailed messages for each retirement attempt
+
+    Example:
+        >>> import micromegas
+        >>> import micromegas.admin
+        >>>
+        >>> client = micromegas.connect()
+        >>>
+        >>> # Preview what would be retired (recommended first step)
+        >>> preview = micromegas.admin.list_incompatible_partitions(client, 'log_entries')
+        >>> print(f"Would retire {preview['partition_count'].sum()} partitions")
+        >>> print(f"Would free {preview['total_size_bytes'].sum() / (1024**3):.2f} GB")
+        >>>
+        >>> # Retire incompatible partitions for specific view set
+        >>> if input("Proceed with retirement? (yes/no): ") == "yes":
+        ...     result = micromegas.admin.retire_incompatible_partitions(client, 'log_entries')
+        ...     print(f"Retired {result['partitions_retired'].sum()} partitions")
+        ...     print(f"Failed {result['partitions_failed'].sum()} partitions")
+
+    Note:
+        This function uses the retire_partition_by_file() UDF to retire each
+        partition individually by its exact file path. This ensures precise
+        targeting and eliminates the risk of accidentally retiring compatible
+        partitions that happen to exist in the same time ranges.
+    """
+    # First identify incompatible partitions
+    incompatible = list_incompatible_partitions(client, view_set_name)
+
+    if incompatible.empty:
+        # No incompatible partitions found, return empty DataFrame with expected columns
+        return pd.DataFrame(
+            columns=[
+                "view_set_name",
+                "view_instance_id",
+                "partitions_retired",
+                "partitions_failed",
+                "storage_freed_bytes",
+                "retirement_messages",
+            ]
+        )
+
+    results = []
+
+    # For each group of incompatible partitions, retire by individual file paths
+    for _, group in incompatible.iterrows():
+        file_paths = group["file_paths"]
+
+        # Convert file_paths to list if it's not already (handle different pandas array types)
+        if hasattr(file_paths, "tolist"):
+            file_paths_list = file_paths.tolist()
+        elif isinstance(file_paths, str):
+            # Single file path case
+            file_paths_list = [file_paths]
+        else:
+            file_paths_list = list(file_paths)
+
+        retirement_messages = []
+        partitions_retired = 0
+        partitions_failed = 0
+
+        # Retire each partition individually using the targeted UDF
+        for file_path in file_paths_list:
+            if not file_path or pd.isna(file_path):
+                continue
+
+            try:
+                # Use the new retire_partition_by_file UDF
+                retirement_sql = (
+                    f"SELECT retire_partition_by_file('{file_path}') as message"
+                )
+                retirement_result = client.query(retirement_sql)
+
+                if not retirement_result.empty:
+                    message = retirement_result["message"].iloc[0]
+                    retirement_messages.append(message)
+
+                    if message.startswith("SUCCESS:"):
+                        partitions_retired += 1
+                    else:
+                        partitions_failed += 1
+                        print(f"Warning: Failed to retire {file_path}: {message}")
+                else:
+                    partitions_failed += 1
+                    retirement_messages.append(
+                        f"ERROR: No result returned for {file_path}"
+                    )
+
+            except Exception as e:
+                partitions_failed += 1
+                error_msg = f"ERROR: Exception retiring {file_path}: {e}"
+                retirement_messages.append(error_msg)
+                print(f"Error retiring partition {file_path}: {e}")
+
+        # Calculate storage freed (only count successful retirements)
+        if partitions_retired > 0 and group["partition_count"] > 0:
+            # Proportional calculation based on successful retirements
+            storage_freed = int(
+                group["total_size_bytes"]
+                * (partitions_retired / group["partition_count"])
+            )
+        else:
+            storage_freed = 0
+
+        # Record retirement results for this group
+        results.append(
+            {
+                "view_set_name": group["view_set_name"],
+                "view_instance_id": group["view_instance_id"],
+                "partitions_retired": partitions_retired,
+                "partitions_failed": partitions_failed,
+                "storage_freed_bytes": storage_freed,
+                "retirement_messages": retirement_messages,
+            }
+        )
+
+    return pd.DataFrame(results)
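Both helpers are plain SQL over the list_partitions() and list_view_sets() table functions, so the same information can be inspected directly before committing to a retirement. A workflow sketch, not part of the package diff, assuming the column names used in the query above:

    import micromegas
    import micromegas.admin

    client = micromegas.connect()

    # Current schema hash per view set, straight from the UDTF joined above.
    view_sets = client.query("SELECT view_set_name, current_schema_hash FROM list_view_sets()")
    print(view_sets)

    # Preview first, then retire only after reviewing the totals.
    preview = micromegas.admin.list_incompatible_partitions(client)
    if not preview.empty:
        print(f"{preview['partition_count'].sum()} partitions, "
              f"{preview['total_size_bytes'].sum() / (1024**2):.1f} MiB reclaimable")
        target = preview['view_set_name'].iloc[0]
        result = micromegas.admin.retire_incompatible_partitions(client, target)
        print(result[['view_set_name', 'partitions_retired', 'partitions_failed']])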
{micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/client.py

@@ -31,7 +31,7 @@ class MicromegasMiddlewareFactory(flight.ClientMiddlewareFactory):
         return MicromegasMiddleware(self.headers)
 
 
-def make_call_headers(begin, end):
+def make_call_headers(begin, end, preserve_dictionary=False):
     call_headers = []
     if begin is not None:
         call_headers.append(
@@ -47,6 +47,13 @@ def make_call_headers(begin, end):
                 time.format_datetime(end).encode("utf8"),
             )
         )
+    if preserve_dictionary:
+        call_headers.append(
+            (
+                "preserve_dictionary".encode("utf8"),
+                "true".encode("utf8"),
+            )
+        )
     return call_headers
 
 
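With begin and end left as None, the updated helper only emits the new opt-in header. A quick sketch of the expected output (an editor illustration, assuming the function is imported from micromegas.flightsql.client):

    from micromegas.flightsql.client import make_call_headers

    # No time range, dictionary preservation enabled:
    # only the new (key, value) byte-string tuple is appended.
    headers = make_call_headers(None, None, preserve_dictionary=True)
    assert headers == [(b"preserve_dictionary", b"true")]

    # Default call keeps the previous behaviour (no extra header).
    assert make_call_headers(None, None) == []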
@@ -130,7 +137,7 @@ class FlightSQLClient:
     supports streaming for large result sets.
     """
 
-    def __init__(self, uri, headers=None):
+    def __init__(self, uri, headers=None, preserve_dictionary=False):
         """Initialize a FlightSQL client connection.
 
         Args:
@@ -138,6 +145,9 @@ class FlightSQLClient:
                 Use "grpc://" for unencrypted connections or "grpc+tls://" for TLS.
             headers (dict, optional): Custom headers for authentication or metadata.
                 Example: {"authorization": "Bearer token123"}
+            preserve_dictionary (bool, optional): When True, preserve dictionary encoding in
+                Arrow arrays for memory efficiency. Useful when using dictionary-encoded UDFs.
+                Defaults to False for backward compatibility.
 
         Example:
             >>> # Connect to local server
@@ -148,6 +158,12 @@ class FlightSQLClient:
             ...     "grpc+tls://remote-server:50051",
             ...     headers={"authorization": "Bearer mytoken"}
             ... )
+            >>>
+            >>> # Connect with dictionary preservation for memory efficiency
+            >>> client = FlightSQLClient(
+            ...     "grpc://localhost:50051",
+            ...     preserve_dictionary=True
+            ... )
         """
         fh = open(certifi.where(), "r")
         cert = fh.read()
@@ -156,6 +172,69 @@ class FlightSQLClient:
         self.__flight_client = flight.connect(
             location=uri, tls_root_certs=cert, middleware=[factory]
         )
+        self.__preserve_dictionary = preserve_dictionary
+
+    def _prepare_table_for_pandas(self, table):
+        """Prepare Arrow table with dictionary columns for pandas conversion.
+
+        As of PyArrow/pandas 2024-2025, dictionary-encoded complex types
+        (List, Struct, Union) cannot be converted directly to pandas due to
+        "ArrowNotImplementedError: Unification of ... dictionaries is not implemented".
+
+        This method converts problematic dictionary columns back to regular arrays
+        while preserving memory efficiency during Arrow processing.
+        """
+        import pyarrow.compute as pc
+
+        columns = []
+        column_names = []
+
+        for i, column in enumerate(table.columns):
+            column_name = table.column_names[i]
+            column_names.append(column_name)
+
+            # Check if this is a dictionary-encoded column
+            if pyarrow.types.is_dictionary(column.type):
+                value_type = column.type.value_type
+
+                # Convert dictionary-encoded complex types that pandas can't handle
+                if (
+                    pyarrow.types.is_list(value_type)
+                    or pyarrow.types.is_struct(value_type)
+                    or pyarrow.types.is_union(value_type)
+                ):
+                    # Manually decode dictionary by reconstructing the array
+                    # This works around PyArrow's casting limitations
+
+                    # Decode each chunk of the dictionary column
+                    reconstructed_chunks = []
+
+                    if hasattr(column, "chunks"):
+                        # ChunkedArray case
+                        for chunk in column.chunks:
+                            indices = chunk.indices
+                            dictionary = chunk.dictionary
+                            reconstructed_chunk = pc.take(dictionary, indices)
+                            reconstructed_chunks.append(reconstructed_chunk)
+
+                        # Create a new ChunkedArray from reconstructed chunks
+                        reconstructed = pyarrow.chunked_array(reconstructed_chunks)
+                    else:
+                        # Single Array case
+                        indices = column.indices
+                        dictionary = column.dictionary
+                        reconstructed = pc.take(dictionary, indices)
+
+                    columns.append(reconstructed)
+                else:
+                    # Keep simple dictionary types (strings, numbers) for pandas
+                    # These work fine and provide memory benefits in pandas too
+                    columns.append(column)
+            else:
+                # Non-dictionary columns are fine as-is
+                columns.append(column)
+
+        return pyarrow.Table.from_arrays(columns, names=column_names)
 
     def query(self, sql, begin=None, end=None):
         """Execute a SQL query and return results as a pandas DataFrame.
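The _prepare_table_for_pandas helper added above decodes problematic dictionary columns with pyarrow.compute.take(dictionary, indices). A standalone illustration of that decode step on a toy array (not part of the package):

    import pyarrow as pa
    import pyarrow.compute as pc

    # Dictionary-encode a small string array.
    encoded = pa.array(["error", "info", "error", "info"]).dictionary_encode()
    print(encoded.type)  # dictionary<values=string, indices=int32, ...>

    # Decoding = looking up each index in the dictionary, as the helper does per chunk.
    decoded = pc.take(encoded.dictionary, encoded.indices)
    print(decoded.type)         # string
    print(decoded.to_pylist())  # ['error', 'info', 'error', 'info']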
@@ -173,7 +252,9 @@ class FlightSQLClient:
                 together with begin for optimal performance.
 
         Returns:
-            pandas.DataFrame: Query results with appropriate column types.
+            pandas.DataFrame: Query results with appropriate column types. When the client was
+                created with preserve_dictionary=True, dictionary-encoded columns will maintain
+                their encoding for memory efficiency.
 
         Raises:
             Exception: If the query fails due to syntax errors, missing tables, or server issues.
@@ -189,14 +270,17 @@ class FlightSQLClient:
             ...     begin, end
             ... )
             >>>
-            >>> #
-            >>>
+            >>> # For dictionary preservation, create client with preserve_dictionary=True
+            >>> dict_client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)
+            >>> df = dict_client.query("SELECT dict_encoded_column FROM table")
 
         Performance Note:
             Always provide begin/end parameters when querying time-series data to enable
             partition pruning, which can improve query performance by 10-100x.
+            Use preserve_dictionary=True in client constructor with dictionary-encoded UDFs
+            for significant memory reduction.
         """
-        call_headers = make_call_headers(begin, end)
+        call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
         options = flight.FlightCallOptions(headers=call_headers)
         ticket = make_query_ticket(sql)
         reader = self.__flight_client.do_get(ticket, options=options)
@@ -204,6 +288,11 @@ class FlightSQLClient:
         for chunk in reader:
             record_batches.append(chunk.data)
         table = pyarrow.Table.from_batches(record_batches, reader.schema)
+
+        # Handle dictionary-encoded columns that pandas can't convert directly
+        if self.__preserve_dictionary:
+            table = self._prepare_table_for_pandas(table)
+
         return table.to_pandas()
 
     def query_stream(self, sql, begin=None, end=None):
@@ -220,7 +309,8 @@ class FlightSQLClient:
 
         Yields:
             pyarrow.RecordBatch: Chunks of query results. Each batch contains a subset
-                of rows with all columns from the query.
+                of rows with all columns from the query. When the client was created with
+                preserve_dictionary=True, dictionary-encoded columns will maintain their encoding.
 
         Example:
             >>> # Stream and process large dataset
@@ -233,21 +323,63 @@ class FlightSQLClient:
             ...     total_errors += len(df_chunk)
             ...     # Process chunk and release memory
             ... print(f"Total errors: {total_errors}")
+            >>>
+            >>> # Stream with dictionary preservation
+            >>> dict_client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)
+            >>> for batch in dict_client.query_stream("SELECT dict_encoded_column FROM table"):
+            ...     # Process dictionary-encoded data efficiently
+            ...     pass
 
         Performance Note:
             Streaming is recommended when:
             - Result set is larger than 100MB
            - You want to start processing before the query completes
            - Memory usage needs to be controlled
+            Use preserve_dictionary=True in client constructor with dictionary-encoded UDFs
+            for significant memory reduction.
         """
         ticket = make_query_ticket(sql)
-        call_headers = make_call_headers(begin, end)
+        call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
         options = flight.FlightCallOptions(headers=call_headers)
         reader = self.__flight_client.do_get(ticket, options=options)
         record_batches = []
         for chunk in reader:
             yield chunk.data
 
+    def query_arrow(self, sql, begin=None, end=None):
+        """Execute a SQL query and return results as an Arrow Table.
+
+        This method preserves dictionary encoding and avoids pandas conversion issues.
+        Useful for working directly with Arrow data or when pandas can't handle
+        dictionary-encoded complex types.
+
+        Args:
+            sql (str): The SQL query to execute.
+            begin (datetime or str, optional): Start time for partition pruning.
+            end (datetime or str, optional): End time for partition pruning.
+
+        Returns:
+            pyarrow.Table: Query results as Arrow Table with preserved dictionary encoding.
+
+        Example:
+            >>> # Get Arrow table with preserved dictionary encoding
+            >>> table = client.query_arrow("SELECT dict_encoded_column FROM table")
+            >>> print(table.schema)  # Shows dictionary<...> types
+            >>>
+            >>> # Work with Arrow directly to avoid pandas limitations
+            >>> for batch in table.to_batches():
+            ...     # Process Arrow data without pandas conversion
+            ...     pass
+        """
+        call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
+        options = flight.FlightCallOptions(headers=call_headers)
+        ticket = make_query_ticket(sql)
+        reader = self.__flight_client.do_get(ticket, options=options)
+        record_batches = []
+        for chunk in reader:
+            record_batches.append(chunk.data)
+        return pyarrow.Table.from_batches(record_batches, reader.schema)
+
     def prepare_statement(self, sql):
         """Create a prepared statement to retrieve query schema without executing it.
 
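Taken together, a client created with preserve_dictionary=True keeps dictionary encoding end to end, and query_arrow() skips the pandas conversion entirely. A rough comparison sketch, not part of the diff (the log_entries view name is an assumption; which columns come back dictionary-encoded depends on the server-side UDFs in use):

    import pyarrow as pa
    from micromegas.flightsql.client import FlightSQLClient

    client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)

    # Arrow path: no pandas conversion, dictionary encoding preserved as-is.
    table = client.query_arrow("SELECT * FROM log_entries LIMIT 1000")
    dict_cols = [f.name for f in table.schema if pa.types.is_dictionary(f.type)]
    print(f"dictionary-encoded columns: {dict_cols}")
    print(f"arrow footprint: {table.nbytes} bytes")

    # pandas path: complex dictionary columns are decoded first so that
    # to_pandas() does not fail on them.
    df = client.query("SELECT * FROM log_entries LIMIT 1000")
    print(df.dtypes)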