micromegas 0.12.0__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {micromegas-0.12.0 → micromegas-0.13.0}/PKG-INFO +1 -1
- micromegas-0.13.0/micromegas/__init__.py +18 -0
- micromegas-0.13.0/micromegas/admin.py +227 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/client.py +140 -8
- {micromegas-0.12.0 → micromegas-0.13.0}/pyproject.toml +1 -1
- micromegas-0.12.0/micromegas/__init__.py +0 -9
- {micromegas-0.12.0 → micromegas-0.13.0}/README.md +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/FlightSql_pb2.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/__init__.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/time.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/perfetto.py +0 -0
- {micromegas-0.12.0 → micromegas-0.13.0}/micromegas/time.py +0 -0
micromegas-0.13.0/micromegas/__init__.py (new file)

@@ -0,0 +1,18 @@
+import grpc
+from . import time
+from . import perfetto
+from . import flightsql
+from . import admin
+
+
+def connect(preserve_dictionary=False):
+    """Connect to the analytics service using default values.
+
+    Args:
+        preserve_dictionary (bool, optional): When True, preserve dictionary encoding in
+            Arrow arrays for memory efficiency. Useful when using dictionary-encoded UDFs.
+            Defaults to False for backward compatibility.
+    """
+    return flightsql.client.FlightSQLClient(
+        "grpc://localhost:50051", preserve_dictionary=preserve_dictionary
+    )
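The new connect() helper is a thin wrapper around FlightSQLClient pointed at the default local endpoint. A minimal usage sketch, not part of the package diff (the log_entries view name is borrowed from the admin examples below and is an assumption about the deployed views):

    import datetime
    import micromegas

    # Default connection to grpc://localhost:50051
    client = micromegas.connect()

    # Opt in to dictionary preservation for memory efficiency
    dict_client = micromegas.connect(preserve_dictionary=True)

    # Bound the query in time so the server can prune partitions
    end = datetime.datetime.now(datetime.timezone.utc)
    begin = end - datetime.timedelta(hours=1)
    df = client.query("SELECT * FROM log_entries LIMIT 10", begin, end)
    print(df.head())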
micromegas-0.13.0/micromegas/admin.py (new file)

@@ -0,0 +1,227 @@
+"""Administrative utilities for Micromegas lakehouse management.
+
+This module provides functions for managing schema evolution and partition lifecycle
+in Micromegas lakehouse. These functions are intended for administrative use and
+should be used with caution as they perform potentially destructive operations.
+"""
+
+import pandas as pd
+from typing import Optional
+
+
+def list_incompatible_partitions(
+    client, view_set_name: Optional[str] = None
+) -> pd.DataFrame:
+    """List partitions with schemas incompatible with current view set schemas.
+
+    This function identifies partitions that have schema versions different from
+    the current schema version for their view set. These incompatible partitions
+    are ignored during queries but take up storage space and should be
+    retired to free storage and enable clean schema evolution.
+
+    Args:
+        client: FlightSQLClient instance for executing queries.
+        view_set_name (str, optional): Filter results to a specific view set.
+            If None, returns incompatible partitions across all view sets.
+
+    Returns:
+        pandas.DataFrame: DataFrame with incompatible partition information containing:
+            - view_set_name: Name of the view set
+            - view_instance_id: Instance ID (e.g., process_id or 'global')
+            - incompatible_schema_hash: The old schema hash in the partition
+            - current_schema_hash: The current schema hash from ViewFactory
+            - partition_count: Number of incompatible partitions with this schema
+            - total_size_bytes: Total size in bytes of all incompatible partitions
+            - file_paths: Array of file paths for each incompatible partition (for precise retirement)
+
+    Example:
+        >>> import micromegas
+        >>> import micromegas.admin
+        >>>
+        >>> client = micromegas.connect()
+        >>>
+        >>> # List all incompatible partitions across all view sets
+        >>> incompatible = micromegas.admin.list_incompatible_partitions(client)
+        >>> print(f"Found {len(incompatible)} groups of incompatible partitions")
+        >>>
+        >>> # List incompatible partitions for specific view set
+        >>> log_incompatible = micromegas.admin.list_incompatible_partitions(client, 'log_entries')
+        >>> print(f"Log entries incompatible partitions: {log_incompatible['partition_count'].sum()}")
+
+    Note:
+        This function leverages the existing list_partitions() and list_view_sets()
+        UDTFs to perform server-side JOIN and aggregation for optimal performance.
+        Schema "hashes" are actually version numbers (e.g., [4]) not cryptographic hashes.
+        SQL is executed directly by DataFusion, so no SQL injection concerns.
+    """
+    # Build view filter clause if specific view set requested
+    view_filter = ""
+    if view_set_name is not None:
+        view_filter = f"AND p.view_set_name = '{view_set_name}'"
+
+    # Construct SQL query with JOIN between list_partitions() and list_view_sets()
+    # Server-side filtering and aggregation for optimal performance
+    sql = f"""
+    SELECT
+        p.view_set_name,
+        p.view_instance_id,
+        p.file_schema_hash as incompatible_schema_hash,
+        vs.current_schema_hash,
+        COUNT(*) as partition_count,
+        SUM(p.file_size) as total_size_bytes,
+        ARRAY_AGG(p.file_path) as file_paths
+    FROM list_partitions() p
+    JOIN list_view_sets() vs ON p.view_set_name = vs.view_set_name
+    WHERE p.file_schema_hash != vs.current_schema_hash
+    {view_filter}
+    GROUP BY p.view_set_name, p.view_instance_id, p.file_schema_hash, vs.current_schema_hash
+    ORDER BY p.view_set_name, p.view_instance_id
+    """
+
+    return client.query(sql)
+
+
+def retire_incompatible_partitions(
+    client, view_set_name: Optional[str] = None
+) -> pd.DataFrame:
+    """Retire partitions with schemas incompatible with current view set schemas.
+
+    This function identifies and retires partitions that have schema versions
+    different from the current schema version for their view set. This enables
+    safe schema evolution by cleaning up old schema versions.
+
+    **SAFETY**: This function retires only the exact incompatible partitions by
+    their file paths, ensuring no compatible partitions are accidentally retired.
+
+    **WARNING**: This operation is irreversible. Retired partitions will be
+    permanently deleted from metadata and their data files removed from object storage.
+
+    Args:
+        client: FlightSQLClient instance for executing queries.
+        view_set_name (str, optional): Retire incompatible partitions only for
+            this specific view set. If None, retires incompatible partitions
+            across all view sets (use with extreme caution).
+
+    Returns:
+        pandas.DataFrame: DataFrame with retirement results containing:
+            - view_set_name: View set that was processed
+            - view_instance_id: Instance ID of partitions retired
+            - partitions_retired: Count of partitions successfully retired
+            - partitions_failed: Count of partitions that failed to retire
+            - storage_freed_bytes: Total bytes freed from storage
+            - retirement_messages: Array of detailed messages for each retirement attempt
+
+    Example:
+        >>> import micromegas
+        >>> import micromegas.admin
+        >>>
+        >>> client = micromegas.connect()
+        >>>
+        >>> # Preview what would be retired (recommended first step)
+        >>> preview = micromegas.admin.list_incompatible_partitions(client, 'log_entries')
+        >>> print(f"Would retire {preview['partition_count'].sum()} partitions")
+        >>> print(f"Would free {preview['total_size_bytes'].sum() / (1024**3):.2f} GB")
+        >>>
+        >>> # Retire incompatible partitions for specific view set
+        >>> if input("Proceed with retirement? (yes/no): ") == "yes":
+        ...     result = micromegas.admin.retire_incompatible_partitions(client, 'log_entries')
+        ...     print(f"Retired {result['partitions_retired'].sum()} partitions")
+        ...     print(f"Failed {result['partitions_failed'].sum()} partitions")
+
+    Note:
+        This function uses the retire_partition_by_file() UDF to retire each
+        partition individually by its exact file path. This ensures precise
+        targeting and eliminates the risk of accidentally retiring compatible
+        partitions that happen to exist in the same time ranges.
+    """
+    # First identify incompatible partitions
+    incompatible = list_incompatible_partitions(client, view_set_name)
+
+    if incompatible.empty:
+        # No incompatible partitions found, return empty DataFrame with expected columns
+        return pd.DataFrame(
+            columns=[
+                "view_set_name",
+                "view_instance_id",
+                "partitions_retired",
+                "partitions_failed",
+                "storage_freed_bytes",
+                "retirement_messages",
+            ]
+        )
+
+    results = []
+
+    # For each group of incompatible partitions, retire by individual file paths
+    for _, group in incompatible.iterrows():
+        file_paths = group["file_paths"]
+
+        # Convert file_paths to list if it's not already (handle different pandas array types)
+        if hasattr(file_paths, "tolist"):
+            file_paths_list = file_paths.tolist()
+        elif isinstance(file_paths, str):
+            # Single file path case
+            file_paths_list = [file_paths]
+        else:
+            file_paths_list = list(file_paths)
+
+        retirement_messages = []
+        partitions_retired = 0
+        partitions_failed = 0
+
+        # Retire each partition individually using the targeted UDF
+        for file_path in file_paths_list:
+            if not file_path or pd.isna(file_path):
+                continue
+
+            try:
+                # Use the new retire_partition_by_file UDF
+                retirement_sql = (
+                    f"SELECT retire_partition_by_file('{file_path}') as message"
+                )
+                retirement_result = client.query(retirement_sql)
+
+                if not retirement_result.empty:
+                    message = retirement_result["message"].iloc[0]
+                    retirement_messages.append(message)
+
+                    if message.startswith("SUCCESS:"):
+                        partitions_retired += 1
+                    else:
+                        partitions_failed += 1
+                        print(f"Warning: Failed to retire {file_path}: {message}")
+                else:
+                    partitions_failed += 1
+                    retirement_messages.append(
+                        f"ERROR: No result returned for {file_path}"
+                    )
+
+            except Exception as e:
+                partitions_failed += 1
+                error_msg = f"ERROR: Exception retiring {file_path}: {e}"
+                retirement_messages.append(error_msg)
+                print(f"Error retiring partition {file_path}: {e}")
+
+        # Calculate storage freed (only count successful retirements)
+        if partitions_retired > 0 and group["partition_count"] > 0:
+            # Proportional calculation based on successful retirements
+            storage_freed = int(
+                group["total_size_bytes"]
+                * (partitions_retired / group["partition_count"])
+            )
+        else:
+            storage_freed = 0
+
+        # Record retirement results for this group
+        results.append(
+            {
+                "view_set_name": group["view_set_name"],
+                "view_instance_id": group["view_instance_id"],
+                "partitions_retired": partitions_retired,
+                "partitions_failed": partitions_failed,
+                "storage_freed_bytes": storage_freed,
+                "retirement_messages": retirement_messages,
+            }
+        )
+
+    return pd.DataFrame(results)
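Both helpers are plain SQL over the list_partitions() and list_view_sets() table functions, so the same information can be inspected directly before committing to a retirement. A workflow sketch, not part of the package diff, assuming the column names used in the query above:

    import micromegas
    import micromegas.admin

    client = micromegas.connect()

    # Current schema hash per view set, straight from the UDTF joined above.
    view_sets = client.query("SELECT view_set_name, current_schema_hash FROM list_view_sets()")
    print(view_sets)

    # Preview first, then retire only after reviewing the totals.
    preview = micromegas.admin.list_incompatible_partitions(client)
    if not preview.empty:
        print(f"{preview['partition_count'].sum()} partitions, "
              f"{preview['total_size_bytes'].sum() / (1024**2):.1f} MiB reclaimable")
        target = preview['view_set_name'].iloc[0]
        result = micromegas.admin.retire_incompatible_partitions(client, target)
        print(result[['view_set_name', 'partitions_retired', 'partitions_failed']])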
{micromegas-0.12.0 → micromegas-0.13.0}/micromegas/flightsql/client.py

@@ -31,7 +31,7 @@ class MicromegasMiddlewareFactory(flight.ClientMiddlewareFactory):
         return MicromegasMiddleware(self.headers)
 
 
-def make_call_headers(begin, end):
+def make_call_headers(begin, end, preserve_dictionary=False):
     call_headers = []
     if begin is not None:
         call_headers.append(
@@ -47,6 +47,13 @@ def make_call_headers(begin, end):
                 time.format_datetime(end).encode("utf8"),
             )
         )
+    if preserve_dictionary:
+        call_headers.append(
+            (
+                "preserve_dictionary".encode("utf8"),
+                "true".encode("utf8"),
+            )
+        )
     return call_headers
 
 
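With begin and end left as None, the updated helper only emits the new opt-in header. A quick sketch of the expected output (an editor illustration, assuming the function is imported from micromegas.flightsql.client):

    from micromegas.flightsql.client import make_call_headers

    # No time range, dictionary preservation enabled:
    # only the new (key, value) byte-string tuple is appended.
    headers = make_call_headers(None, None, preserve_dictionary=True)
    assert headers == [(b"preserve_dictionary", b"true")]

    # Default call keeps the previous behaviour (no extra header).
    assert make_call_headers(None, None) == []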
@@ -130,7 +137,7 @@ class FlightSQLClient:
     supports streaming for large result sets.
     """
 
-    def __init__(self, uri, headers=None):
+    def __init__(self, uri, headers=None, preserve_dictionary=False):
         """Initialize a FlightSQL client connection.
 
         Args:
@@ -138,6 +145,9 @@ class FlightSQLClient:
                 Use "grpc://" for unencrypted connections or "grpc+tls://" for TLS.
             headers (dict, optional): Custom headers for authentication or metadata.
                 Example: {"authorization": "Bearer token123"}
+            preserve_dictionary (bool, optional): When True, preserve dictionary encoding in
+                Arrow arrays for memory efficiency. Useful when using dictionary-encoded UDFs.
+                Defaults to False for backward compatibility.
 
         Example:
             >>> # Connect to local server
@@ -148,6 +158,12 @@ class FlightSQLClient:
             ...     "grpc+tls://remote-server:50051",
             ...     headers={"authorization": "Bearer mytoken"}
             ... )
+            >>>
+            >>> # Connect with dictionary preservation for memory efficiency
+            >>> client = FlightSQLClient(
+            ...     "grpc://localhost:50051",
+            ...     preserve_dictionary=True
+            ... )
         """
         fh = open(certifi.where(), "r")
         cert = fh.read()
@@ -156,6 +172,69 @@ class FlightSQLClient:
         self.__flight_client = flight.connect(
             location=uri, tls_root_certs=cert, middleware=[factory]
         )
+        self.__preserve_dictionary = preserve_dictionary
+
+    def _prepare_table_for_pandas(self, table):
+        """Prepare Arrow table with dictionary columns for pandas conversion.
+
+        As of PyArrow/pandas 2024-2025, dictionary-encoded complex types
+        (List, Struct, Union) cannot be converted directly to pandas due to
+        "ArrowNotImplementedError: Unification of ... dictionaries is not implemented".
+
+        This method converts problematic dictionary columns back to regular arrays
+        while preserving memory efficiency during Arrow processing.
+        """
+        import pyarrow.compute as pc
+
+        columns = []
+        column_names = []
+
+        for i, column in enumerate(table.columns):
+            column_name = table.column_names[i]
+            column_names.append(column_name)
+
+            # Check if this is a dictionary-encoded column
+            if pyarrow.types.is_dictionary(column.type):
+                value_type = column.type.value_type
+
+                # Convert dictionary-encoded complex types that pandas can't handle
+                if (
+                    pyarrow.types.is_list(value_type)
+                    or pyarrow.types.is_struct(value_type)
+                    or pyarrow.types.is_union(value_type)
+                ):
+                    # Manually decode dictionary by reconstructing the array
+                    # This works around PyArrow's casting limitations
+
+                    # Decode each chunk of the dictionary column
+                    reconstructed_chunks = []
+
+                    if hasattr(column, "chunks"):
+                        # ChunkedArray case
+                        for chunk in column.chunks:
+                            indices = chunk.indices
+                            dictionary = chunk.dictionary
+                            reconstructed_chunk = pc.take(dictionary, indices)
+                            reconstructed_chunks.append(reconstructed_chunk)
+
+                        # Create a new ChunkedArray from reconstructed chunks
+                        reconstructed = pyarrow.chunked_array(reconstructed_chunks)
+                    else:
+                        # Single Array case
+                        indices = column.indices
+                        dictionary = column.dictionary
+                        reconstructed = pc.take(dictionary, indices)
+
+                    columns.append(reconstructed)
+                else:
+                    # Keep simple dictionary types (strings, numbers) for pandas
+                    # These work fine and provide memory benefits in pandas too
+                    columns.append(column)
+            else:
+                # Non-dictionary columns are fine as-is
+                columns.append(column)
+
+        return pyarrow.Table.from_arrays(columns, names=column_names)
 
     def query(self, sql, begin=None, end=None):
         """Execute a SQL query and return results as a pandas DataFrame.
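The _prepare_table_for_pandas helper added above decodes problematic dictionary columns with pyarrow.compute.take(dictionary, indices). A standalone illustration of that decode step on a toy array (not part of the package):

    import pyarrow as pa
    import pyarrow.compute as pc

    # Dictionary-encode a small string array.
    encoded = pa.array(["error", "info", "error", "info"]).dictionary_encode()
    print(encoded.type)  # dictionary<values=string, indices=int32, ...>

    # Decoding = looking up each index in the dictionary, as the helper does per chunk.
    decoded = pc.take(encoded.dictionary, encoded.indices)
    print(decoded.type)         # string
    print(decoded.to_pylist())  # ['error', 'info', 'error', 'info']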
@@ -173,7 +252,9 @@ class FlightSQLClient:
                 together with begin for optimal performance.
 
         Returns:
-            pandas.DataFrame: Query results with appropriate column types.
+            pandas.DataFrame: Query results with appropriate column types. When the client was
+                created with preserve_dictionary=True, dictionary-encoded columns will maintain
+                their encoding for memory efficiency.
 
         Raises:
             Exception: If the query fails due to syntax errors, missing tables, or server issues.
@@ -189,14 +270,17 @@ class FlightSQLClient:
             ...     begin, end
             ... )
             >>>
-            >>> #
-            >>>
+            >>> # For dictionary preservation, create client with preserve_dictionary=True
+            >>> dict_client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)
+            >>> df = dict_client.query("SELECT dict_encoded_column FROM table")
 
         Performance Note:
             Always provide begin/end parameters when querying time-series data to enable
             partition pruning, which can improve query performance by 10-100x.
+            Use preserve_dictionary=True in client constructor with dictionary-encoded UDFs
+            for significant memory reduction.
         """
-        call_headers = make_call_headers(begin, end)
+        call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
         options = flight.FlightCallOptions(headers=call_headers)
         ticket = make_query_ticket(sql)
         reader = self.__flight_client.do_get(ticket, options=options)
@@ -204,6 +288,11 @@ class FlightSQLClient:
         for chunk in reader:
             record_batches.append(chunk.data)
         table = pyarrow.Table.from_batches(record_batches, reader.schema)
+
+        # Handle dictionary-encoded columns that pandas can't convert directly
+        if self.__preserve_dictionary:
+            table = self._prepare_table_for_pandas(table)
+
         return table.to_pandas()
 
     def query_stream(self, sql, begin=None, end=None):
@@ -220,7 +309,8 @@ class FlightSQLClient:
 
         Yields:
             pyarrow.RecordBatch: Chunks of query results. Each batch contains a subset
-                of rows with all columns from the query.
+                of rows with all columns from the query. When the client was created with
+                preserve_dictionary=True, dictionary-encoded columns will maintain their encoding.
 
         Example:
             >>> # Stream and process large dataset
@@ -233,21 +323,63 @@ class FlightSQLClient:
             ...     total_errors += len(df_chunk)
             ...     # Process chunk and release memory
             ... print(f"Total errors: {total_errors}")
+            >>>
+            >>> # Stream with dictionary preservation
+            >>> dict_client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)
+            >>> for batch in dict_client.query_stream("SELECT dict_encoded_column FROM table"):
+            ...     # Process dictionary-encoded data efficiently
+            ...     pass
 
         Performance Note:
             Streaming is recommended when:
             - Result set is larger than 100MB
            - You want to start processing before the query completes
            - Memory usage needs to be controlled
+            Use preserve_dictionary=True in client constructor with dictionary-encoded UDFs
+            for significant memory reduction.
         """
         ticket = make_query_ticket(sql)
-        call_headers = make_call_headers(begin, end)
+        call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
         options = flight.FlightCallOptions(headers=call_headers)
         reader = self.__flight_client.do_get(ticket, options=options)
         record_batches = []
         for chunk in reader:
             yield chunk.data
 
+    def query_arrow(self, sql, begin=None, end=None):
+        """Execute a SQL query and return results as an Arrow Table.
+
+        This method preserves dictionary encoding and avoids pandas conversion issues.
+        Useful for working directly with Arrow data or when pandas can't handle
+        dictionary-encoded complex types.
+
+        Args:
+            sql (str): The SQL query to execute.
+            begin (datetime or str, optional): Start time for partition pruning.
+            end (datetime or str, optional): End time for partition pruning.
+
+        Returns:
+            pyarrow.Table: Query results as Arrow Table with preserved dictionary encoding.
+
+        Example:
+            >>> # Get Arrow table with preserved dictionary encoding
+            >>> table = client.query_arrow("SELECT dict_encoded_column FROM table")
+            >>> print(table.schema)  # Shows dictionary<...> types
+            >>>
+            >>> # Work with Arrow directly to avoid pandas limitations
+            >>> for batch in table.to_batches():
+            ...     # Process Arrow data without pandas conversion
+            ...     pass
+        """
+        call_headers = make_call_headers(begin, end, self.__preserve_dictionary)
+        options = flight.FlightCallOptions(headers=call_headers)
+        ticket = make_query_ticket(sql)
+        reader = self.__flight_client.do_get(ticket, options=options)
+        record_batches = []
+        for chunk in reader:
+            record_batches.append(chunk.data)
+        return pyarrow.Table.from_batches(record_batches, reader.schema)
+
     def prepare_statement(self, sql):
         """Create a prepared statement to retrieve query schema without executing it.
 
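Taken together, a client created with preserve_dictionary=True keeps dictionary encoding end to end, and query_arrow() skips the pandas conversion entirely. A rough comparison sketch, not part of the diff (the log_entries view name is an assumption; which columns come back dictionary-encoded depends on the server-side UDFs in use):

    import pyarrow as pa
    from micromegas.flightsql.client import FlightSQLClient

    client = FlightSQLClient("grpc://localhost:50051", preserve_dictionary=True)

    # Arrow path: no pandas conversion, dictionary encoding preserved as-is.
    table = client.query_arrow("SELECT * FROM log_entries LIMIT 1000")
    dict_cols = [f.name for f in table.schema if pa.types.is_dictionary(f.type)]
    print(f"dictionary-encoded columns: {dict_cols}")
    print(f"arrow footprint: {table.nbytes} bytes")

    # pandas path: complex dictionary columns are decoded first so that
    # to_pandas() does not fail on them.
    df = client.query("SELECT * FROM log_entries LIMIT 1000")
    print(df.dtypes)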