micromegas 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
micromegas/__init__.py CHANGED
@@ -2,8 +2,17 @@ import grpc
  from . import time
  from . import perfetto
  from . import flightsql
+ from . import admin
 
 
- def connect():
-     "connect to the analytics service using default values"
-     return flightsql.client.FlightSQLClient("grpc://localhost:50051")
+ def connect(preserve_dictionary=False):
+     """Connect to the analytics service using default values.
+
+     Args:
+         preserve_dictionary (bool, optional): When True, preserve dictionary encoding in
+             Arrow arrays for memory efficiency. Useful when using dictionary-encoded UDFs.
+             Defaults to False for backward compatibility.
+     """
+     return flightsql.client.FlightSQLClient(
+         "grpc://localhost:50051", preserve_dictionary=preserve_dictionary
+     )
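
A minimal usage sketch of the new connect() signature (not part of the diff; it assumes an analytics service on the default grpc://localhost:50051 and the query text is illustrative only):

import micromegas

client = micromegas.connect()  # default: dictionary columns are decoded, as before
dict_client = micromegas.connect(preserve_dictionary=True)  # opt in to dictionary preservation
df = dict_client.query("SELECT * FROM log_entries LIMIT 10")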
micromegas/admin.py ADDED
@@ -0,0 +1,227 @@
+ """Administrative utilities for Micromegas lakehouse management.
+
+ This module provides functions for managing schema evolution and partition lifecycle
+ in Micromegas lakehouse. These functions are intended for administrative use and
+ should be used with caution as they perform potentially destructive operations.
+ """
+
+ import pandas as pd
+ from typing import Optional
+
+
+ def list_incompatible_partitions(
+     client, view_set_name: Optional[str] = None
+ ) -> pd.DataFrame:
+     """List partitions with schemas incompatible with current view set schemas.
+
+     This function identifies partitions that have schema versions different from
+     the current schema version for their view set. These incompatible partitions
+     are ignored during queries but take up storage space and should be
+     retired to free storage and enable clean schema evolution.
+
+     Args:
+         client: FlightSQLClient instance for executing queries.
+         view_set_name (str, optional): Filter results to a specific view set.
+             If None, returns incompatible partitions across all view sets.
+
+     Returns:
+         pandas.DataFrame: DataFrame with incompatible partition information containing:
+             - view_set_name: Name of the view set
+             - view_instance_id: Instance ID (e.g., process_id or 'global')
+             - incompatible_schema_hash: The old schema hash in the partition
+             - current_schema_hash: The current schema hash from ViewFactory
+             - partition_count: Number of incompatible partitions with this schema
+             - total_size_bytes: Total size in bytes of all incompatible partitions
+             - file_paths: Array of file paths for each incompatible partition (for precise retirement)
+
+     Example:
+         >>> import micromegas
+         >>> import micromegas.admin
+         >>>
+         >>> client = micromegas.connect()
+         >>>
+         >>> # List all incompatible partitions across all view sets
+         >>> incompatible = micromegas.admin.list_incompatible_partitions(client)
+         >>> print(f"Found {len(incompatible)} groups of incompatible partitions")
+         >>>
+         >>> # List incompatible partitions for specific view set
+         >>> log_incompatible = micromegas.admin.list_incompatible_partitions(client, 'log_entries')
+         >>> print(f"Log entries incompatible partitions: {log_incompatible['partition_count'].sum()}")
+
+     Note:
+         This function leverages the existing list_partitions() and list_view_sets()
+         UDTFs to perform server-side JOIN and aggregation for optimal performance.
+         Schema "hashes" are actually version numbers (e.g., [4]) not cryptographic hashes.
+         SQL is executed directly by DataFusion, so no SQL injection concerns.
+     """
+     # Build view filter clause if specific view set requested
+     view_filter = ""
+     if view_set_name is not None:
+         view_filter = f"AND p.view_set_name = '{view_set_name}'"
+
+     # Construct SQL query with JOIN between list_partitions() and list_view_sets()
+     # Server-side filtering and aggregation for optimal performance
+     sql = f"""
+     SELECT
+         p.view_set_name,
+         p.view_instance_id,
+         p.file_schema_hash as incompatible_schema_hash,
+         vs.current_schema_hash,
+         COUNT(*) as partition_count,
+         SUM(p.file_size) as total_size_bytes,
+         ARRAY_AGG(p.file_path) as file_paths
+     FROM list_partitions() p
+     JOIN list_view_sets() vs ON p.view_set_name = vs.view_set_name
+     WHERE p.file_schema_hash != vs.current_schema_hash
+         {view_filter}
+     GROUP BY p.view_set_name, p.view_instance_id, p.file_schema_hash, vs.current_schema_hash
+     ORDER BY p.view_set_name, p.view_instance_id
+     """
+
+     return client.query(sql)
+
+
+ def retire_incompatible_partitions(
+     client, view_set_name: Optional[str] = None
+ ) -> pd.DataFrame:
+     """Retire partitions with schemas incompatible with current view set schemas.
+
+     This function identifies and retires partitions that have schema versions
+     different from the current schema version for their view set. This enables
+     safe schema evolution by cleaning up old schema versions.
+
+     **SAFETY**: This function retires only the exact incompatible partitions by
+     their file paths, ensuring no compatible partitions are accidentally retired.
+
+     **WARNING**: This operation is irreversible. Retired partitions will be
+     permanently deleted from metadata and their data files removed from object storage.
+
+     Args:
+         client: FlightSQLClient instance for executing queries.
+         view_set_name (str, optional): Retire incompatible partitions only for
+             this specific view set. If None, retires incompatible partitions
+             across all view sets (use with extreme caution).
+
+     Returns:
+         pandas.DataFrame: DataFrame with retirement results containing:
+             - view_set_name: View set that was processed
+             - view_instance_id: Instance ID of partitions retired
+             - partitions_retired: Count of partitions successfully retired
+             - partitions_failed: Count of partitions that failed to retire
+             - storage_freed_bytes: Total bytes freed from storage
+             - retirement_messages: Array of detailed messages for each retirement attempt
+
+     Example:
+         >>> import micromegas
+         >>> import micromegas.admin
+         >>>
+         >>> client = micromegas.connect()
+         >>>
+         >>> # Preview what would be retired (recommended first step)
+         >>> preview = micromegas.admin.list_incompatible_partitions(client, 'log_entries')
+         >>> print(f"Would retire {preview['partition_count'].sum()} partitions")
+         >>> print(f"Would free {preview['total_size_bytes'].sum() / (1024**3):.2f} GB")
+         >>>
+         >>> # Retire incompatible partitions for specific view set
+         >>> if input("Proceed with retirement? (yes/no): ") == "yes":
+         ... result = micromegas.admin.retire_incompatible_partitions(client, 'log_entries')
+         ... print(f"Retired {result['partitions_retired'].sum()} partitions")
+         ... print(f"Failed {result['partitions_failed'].sum()} partitions")
+
+     Note:
+         This function uses the retire_partition_by_file() UDF to retire each
+         partition individually by its exact file path. This ensures precise
+         targeting and eliminates the risk of accidentally retiring compatible
+         partitions that happen to exist in the same time ranges.
+     """
+     # First identify incompatible partitions
+     incompatible = list_incompatible_partitions(client, view_set_name)
+
+     if incompatible.empty:
+         # No incompatible partitions found, return empty DataFrame with expected columns
+         return pd.DataFrame(
+             columns=[
+                 "view_set_name",
+                 "view_instance_id",
+                 "partitions_retired",
+                 "partitions_failed",
+                 "storage_freed_bytes",
+                 "retirement_messages",
+             ]
+         )
+
+     results = []
+
+     # For each group of incompatible partitions, retire by individual file paths
+     for _, group in incompatible.iterrows():
+         file_paths = group["file_paths"]
+
+         # Convert file_paths to list if it's not already (handle different pandas array types)
+         if hasattr(file_paths, "tolist"):
+             file_paths_list = file_paths.tolist()
+         elif isinstance(file_paths, str):
+             # Single file path case
+             file_paths_list = [file_paths]
+         else:
+             file_paths_list = list(file_paths)
+
+         retirement_messages = []
+         partitions_retired = 0
+         partitions_failed = 0
+
+         # Retire each partition individually using the targeted UDF
+         for file_path in file_paths_list:
+             if not file_path or pd.isna(file_path):
+                 continue
+
+             try:
+                 # Use the new retire_partition_by_file UDF
+                 retirement_sql = (
+                     f"SELECT retire_partition_by_file('{file_path}') as message"
+                 )
+                 retirement_result = client.query(retirement_sql)
+
+                 if not retirement_result.empty:
+                     message = retirement_result["message"].iloc[0]
+                     retirement_messages.append(message)
+
+                     if message.startswith("SUCCESS:"):
+                         partitions_retired += 1
+                     else:
+                         partitions_failed += 1
+                         print(f"Warning: Failed to retire {file_path}: {message}")
+                 else:
+                     partitions_failed += 1
+                     retirement_messages.append(
+                         f"ERROR: No result returned for {file_path}"
+                     )
+
+             except Exception as e:
+                 partitions_failed += 1
+                 error_msg = f"ERROR: Exception retiring {file_path}: {e}"
+                 retirement_messages.append(error_msg)
+                 print(f"Error retiring partition {file_path}: {e}")
+
+         # Calculate storage freed (only count successful retirements)
+         if partitions_retired > 0 and group["partition_count"] > 0:
+             # Proportional calculation based on successful retirements
+             storage_freed = int(
+                 group["total_size_bytes"]
+                 * (partitions_retired / group["partition_count"])
+             )
+         else:
+             storage_freed = 0
+
+         # Record retirement results for this group
+         results.append(
+             {
+                 "view_set_name": group["view_set_name"],
+                 "view_instance_id": group["view_instance_id"],
+                 "partitions_retired": partitions_retired,
+                 "partitions_failed": partitions_failed,
+                 "storage_freed_bytes": storage_freed,
+                 "retirement_messages": retirement_messages,
+             }
+         )
+
+     return pd.DataFrame(results)
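
Taken together, the two functions support a preview-then-retire workflow; a short sketch (not part of the diff; 'log_entries' is just an example view set name):

import micromegas
import micromegas.admin

client = micromegas.connect()

# Preview first: list_incompatible_partitions() only reads metadata, nothing is deleted.
preview = micromegas.admin.list_incompatible_partitions(client, "log_entries")
print(preview[["view_set_name", "partition_count", "total_size_bytes"]])

# Retire only after reviewing the preview; this permanently removes the listed partitions.
if not preview.empty:
    result = micromegas.admin.retire_incompatible_partitions(client, "log_entries")
    print(result[["partitions_retired", "partitions_failed", "storage_freed_bytes"]])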
micromegas/flightsql/client.py CHANGED
@@ -31,7 +31,7 @@ class MicromegasMiddlewareFactory(flight.ClientMiddlewareFactory):
          return MicromegasMiddleware(self.headers)
 
 
- def make_call_headers(begin, end):
+ def make_call_headers(begin, end, preserve_dictionary=False):
      call_headers = []
      if begin is not None:
          call_headers.append(
@@ -47,6 +47,13 @@ def make_call_headers(begin, end):
                  time.format_datetime(end).encode("utf8"),
              )
          )
+     if preserve_dictionary:
+         call_headers.append(
+             (
+                 "preserve_dictionary".encode("utf8"),
+                 "true".encode("utf8"),
+             )
+         )
      return call_headers
 
 
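In practice the new flag just adds one gRPC call header; a quick sketch (not part of the diff) of what the helper produces when no time range is given:

headers = make_call_headers(None, None, preserve_dictionary=True)
# -> [(b"preserve_dictionary", b"true")]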
@@ -130,7 +137,7 @@ class FlightSQLClient:
      supports streaming for large result sets.
      """
 
-     def __init__(self, uri, headers=None):
+     def __init__(self, uri, headers=None, preserve_dictionary=False):
          """Initialize a FlightSQL client connection.
 
          Args:
@@ -138,6 +145,9 @@
                  Use "grpc://" for unencrypted connections or "grpc+tls://" for TLS.
              headers (dict, optional): Custom headers for authentication or metadata.
                  Example: {"authorization": "Bearer token123"}
+             preserve_dictionary (bool, optional): When True, preserve dictionary encoding in
+                 Arrow arrays for memory efficiency. Useful when using dictionary-encoded UDFs.
+                 Defaults to False for backward compatibility.
 
          Example:
              >>> # Connect to local server
@@ -148,6 +158,12 @@
              ... "grpc+tls://remote-server:50051",
              ... headers={"authorization": "Bearer mytoken"}
              ... )
+             >>>
+             >>> # Connect with dictionary preservation for memory efficiency
+             >>> client = FlightSQLClient(
+             ... "grpc://localhost:50051",
+             ... preserve_dictionary=True
+             ... )
          """
          fh = open(certifi.where(), "r")
          cert = fh.read()
@@ -156,6 +172,69 @@
          self.__flight_client = flight.connect(
              location=uri, tls_root_certs=cert, middleware=[factory]
          )
+         self.__preserve_dictionary = preserve_dictionary
+
+     def _prepare_table_for_pandas(self, table):
+         """Prepare Arrow table with dictionary columns for pandas conversion.
+
+         As of PyArrow/pandas 2024-2025, dictionary-encoded complex types
+         (List, Struct, Union) cannot be converted directly to pandas due to
+         "ArrowNotImplementedError: Unification of ... dictionaries is not implemented".
+
+         This method converts problematic dictionary columns back to regular arrays
+         while preserving memory efficiency during Arrow processing.
+         """
+         import pyarrow.compute as pc
+
+         columns = []
+         column_names = []
+
+         for i, column in enumerate(table.columns):
+             column_name = table.column_names[i]
+             column_names.append(column_name)
+
+             # Check if this is a dictionary-encoded column
+             if pyarrow.types.is_dictionary(column.type):
+                 value_type = column.type.value_type
+
+                 # Convert dictionary-encoded complex types that pandas can't handle
+                 if (
+                     pyarrow.types.is_list(value_type)
+                     or pyarrow.types.is_struct(value_type)
+                     or pyarrow.types.is_union(value_type)
+                 ):
+                     # Manually decode dictionary by reconstructing the array
+                     # This works around PyArrow's casting limitations
+
+                     # Decode each chunk of the dictionary column
+                     reconstructed_chunks = []
+
+                     if hasattr(column, "chunks"):
+                         # ChunkedArray case
+                         for chunk in column.chunks:
+                             indices = chunk.indices
+                             dictionary = chunk.dictionary
+                             reconstructed_chunk = pc.take(dictionary, indices)
+                             reconstructed_chunks.append(reconstructed_chunk)
+
+                         # Create a new ChunkedArray from reconstructed chunks
+                         reconstructed = pyarrow.chunked_array(reconstructed_chunks)
+                     else:
+                         # Single Array case
+                         indices = column.indices
+                         dictionary = column.dictionary
+                         reconstructed = pc.take(dictionary, indices)
+
+                     columns.append(reconstructed)
+                 else:
+                     # Keep simple dictionary types (strings, numbers) for pandas
+                     # These work fine and provide memory benefits in pandas too
+                     columns.append(column)
+             else:
+                 # Non-dictionary columns are fine as-is
+                 columns.append(column)
+
+         return pyarrow.Table.from_arrays(columns, names=column_names)
 
      def query(self, sql, begin=None, end=None):
          """Execute a SQL query and return results as a pandas DataFrame.
@@ -173,7 +252,9 @@
              together with begin for optimal performance.
 
          Returns:
-             pandas.DataFrame: Query results with appropriate column types.
+             pandas.DataFrame: Query results with appropriate column types. When the client was
+                 created with preserve_dictionary=True, dictionary-encoded columns will maintain
+                 their encoding for memory efficiency.
 
          Raises:
              Exception: If the query fails due to syntax errors, missing tables, or server issues.
@@ -189,14 +270,17 @@
              ... begin, end
              ... )
              >>>
-             >>> # Query without time range (less efficient for time-series data)
-             >>> processes = client.query("SELECT * FROM processes LIMIT 10")
+             >>> # For dictionary preservation, create client with preserve_dictionary=True
+             >>> dict_client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)
+             >>> df = dict_client.query("SELECT dict_encoded_column FROM table")
 
          Performance Note:
              Always provide begin/end parameters when querying time-series data to enable
              partition pruning, which can improve query performance by 10-100x.
+             Use preserve_dictionary=True in client constructor with dictionary-encoded UDFs
+             for significant memory reduction.
          """
-         call_headers = make_call_headers(begin, end)
+         call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
          options = flight.FlightCallOptions(headers=call_headers)
          ticket = make_query_ticket(sql)
          reader = self.__flight_client.do_get(ticket, options=options)
@@ -204,6 +288,11 @@
          for chunk in reader:
              record_batches.append(chunk.data)
          table = pyarrow.Table.from_batches(record_batches, reader.schema)
+
+         # Handle dictionary-encoded columns that pandas can't convert directly
+         if self.__preserve_dictionary:
+             table = self._prepare_table_for_pandas(table)
+
          return table.to_pandas()
 
      def query_stream(self, sql, begin=None, end=None):
@@ -220,7 +309,8 @@
 
          Yields:
              pyarrow.RecordBatch: Chunks of query results. Each batch contains a subset
-                 of rows with all columns from the query.
+                 of rows with all columns from the query. When the client was created with
+                 preserve_dictionary=True, dictionary-encoded columns will maintain their encoding.
 
          Example:
              >>> # Stream and process large dataset
@@ -233,21 +323,63 @@
              ... total_errors += len(df_chunk)
              ... # Process chunk and release memory
              ... print(f"Total errors: {total_errors}")
+             >>>
+             >>> # Stream with dictionary preservation
+             >>> dict_client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)
+             >>> for batch in dict_client.query_stream("SELECT dict_encoded_column FROM table"):
+             ... # Process dictionary-encoded data efficiently
+             ... pass
 
          Performance Note:
              Streaming is recommended when:
              - Result set is larger than 100MB
              - You want to start processing before the query completes
              - Memory usage needs to be controlled
+             Use preserve_dictionary=True in client constructor with dictionary-encoded UDFs
+             for significant memory reduction.
          """
          ticket = make_query_ticket(sql)
-         call_headers = make_call_headers(begin, end)
+         call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
          options = flight.FlightCallOptions(headers=call_headers)
          reader = self.__flight_client.do_get(ticket, options=options)
          record_batches = []
          for chunk in reader:
              yield chunk.data
 
+     def query_arrow(self, sql, begin=None, end=None):
+         """Execute a SQL query and return results as an Arrow Table.
+
+         This method preserves dictionary encoding and avoids pandas conversion issues.
+         Useful for working directly with Arrow data or when pandas can't handle
+         dictionary-encoded complex types.
+
+         Args:
+             sql (str): The SQL query to execute.
+             begin (datetime or str, optional): Start time for partition pruning.
+             end (datetime or str, optional): End time for partition pruning.
+
+         Returns:
+             pyarrow.Table: Query results as Arrow Table with preserved dictionary encoding.
+
+         Example:
+             >>> # Get Arrow table with preserved dictionary encoding
+             >>> table = client.query_arrow("SELECT dict_encoded_column FROM table")
+             >>> print(table.schema)  # Shows dictionary<...> types
+             >>>
+             >>> # Work with Arrow directly to avoid pandas limitations
+             >>> for batch in table.to_batches():
+             ... # Process Arrow data without pandas conversion
+             ... pass
+         """
+         call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
+         options = flight.FlightCallOptions(headers=call_headers)
+         ticket = make_query_ticket(sql)
+         reader = self.__flight_client.do_get(ticket, options=options)
+         record_batches = []
+         for chunk in reader:
+             record_batches.append(chunk.data)
+         return pyarrow.Table.from_batches(record_batches, reader.schema)
+
      def prepare_statement(self, sql):
          """Create a prepared statement to retrieve query schema without executing it.
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: micromegas
- Version: 0.12.0
+ Version: 0.13.0
  Summary: Python analytics client for https://github.com/madesroches/micromegas/
  Author: Marc-Antoine Desroches
  Author-email: madesroches@gmail.com
@@ -1,10 +1,11 @@
- micromegas/__init__.py,sha256=oh0BAfNUVpHtFgufYWxvPwRfnqBTXY_nTPrNW1cwn-s,225
+ micromegas/__init__.py,sha256=hE1qxCVR1ddDdZHfNdYqHq9_897vKTkCBGgNCXTJqMQ,584
+ micromegas/admin.py,sha256=iEq1VFfc3c0E5qPReVUyOFwPZHUphbAO8faKTikamT4,9913
  micromegas/flightsql/FlightSql_pb2.py,sha256=3L_CtRVjjNppQE5gfXKF2AxgST7_kDc6dQEnA7fr_9A,28725
  micromegas/flightsql/__init__.py,sha256=So1GnY60k8QQPDOqocXbvr5KxWWn4KqJrXr2Q8zYOow,75
- micromegas/flightsql/client.py,sha256=SET-NVGzUkF9jmB72-sMH5cwr0B0HBlTBqi-ifPLhig,28590
+ micromegas/flightsql/client.py,sha256=7Wj5GJ-4RxsPDjWL3LliRAxP_Bdf2hdv62-ld3GuYPY,35000
  micromegas/flightsql/time.py,sha256=k2jYOT3Vab7l6A8hxBmwZP69olKjELNw9xBjALYde0I,583
  micromegas/perfetto.py,sha256=-IwcZ3PB4Dm6odHYW2w2ZUzxr9gtE9SCPHxzA3I7csg,3576
  micromegas/time.py,sha256=h4xv180XQh7z7LIYGmHjIp5vqC1tNw87fFE8kbeDixg,4358
- micromegas-0.12.0.dist-info/METADATA,sha256=LhnS-XplCTAxcGDpQv3PI_S3drN8EW-ojwf7hk7A5mM,6055
- micromegas-0.12.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- micromegas-0.12.0.dist-info/RECORD,,
+ micromegas-0.13.0.dist-info/METADATA,sha256=ErqRiBZftd8Nf1iyj_vVCzOwvXPsPJwMe_tgUYJn-Gs,6055
+ micromegas-0.13.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ micromegas-0.13.0.dist-info/RECORD,,