PyPI - supertable - Versions diffs - 2.3.5__tar.gz → 2.3.7__tar.gz - Mend

supertable 2.3.5 (tar.gz) → 2.3.7 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (187)

{supertable-2.3.5/supertable.egg-info → supertable-2.3.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: supertable
-Version: 2.3.5
+Version: 2.3.7
 Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
 Author: Levente Kupas
 Author-email: Levente Kupas <lkupas@kladnasoft.com>

{supertable-2.3.5 → supertable-2.3.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "supertable"
-version = "2.3.5"
+version = "2.3.7"
 description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
 readme = "README.md"
 requires-python = ">=3.10"

{supertable-2.3.5 → supertable-2.3.7}/setup.py RENAMED Viewed

@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
 setup(
     name="supertable",
-    version="2.3.5",
+    version="2.3.7",
     description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
     long_description=long_description,
     long_description_content_type="text/markdown",

{supertable-2.3.5 → supertable-2.3.7}/supertable/__init__.py RENAMED Viewed

@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
 project documentation for the full API surface.
 """
-__version__ = "2.3.5"
+__version__ = "2.3.7"
 # Re-export the core public surface so users can do ``from supertable import …``
 # instead of remembering submodule paths.

{supertable-2.3.5 → supertable-2.3.7}/supertable/config/settings.py RENAMED Viewed

@@ -157,6 +157,13 @@ class Settings:
     SUPERTABLE_DUCKDB_MATERIALIZE: str = "view"    # SUPERTABLE_DUCKDB_MATERIALIZE
     SUPERTABLE_DUCKDB_PRESIGNED: bool = False      # SUPERTABLE_DUCKDB_PRESIGNED
     SUPERTABLE_DUCKDB_USE_HTTPFS: bool = False     # SUPERTABLE_DUCKDB_USE_HTTPFS
+    # Write-path overwrite/delete resolution via the DuckDB pushdown probe.
+    # Disabled by default: the polars fallback reads only the projected key
+    # columns through the storage SDK and needs no httpfs extension, so it works
+    # in environments without one (or without internet to install it).  Enable
+    # only where httpfs is available and the probe's row-group skipping is worth
+    # it (e.g. very wide tables / many overlapping files).
+    SUPERTABLE_DUCKDB_WRITE_PROBE: bool = False    # SUPERTABLE_DUCKDB_WRITE_PROBE
     # Deletion-vector (tombstone) table cache.  Each entry is a small
     # `DISTINCT __rowid__` table keyed by the stable tombstone path; the
     # tombstone view ANTI JOINs it instead of re-reading the parquet every
@@ -437,6 +444,7 @@ def _build_settings() -> Settings:
         SUPERTABLE_DUCKDB_MATERIALIZE=_env_str("SUPERTABLE_DUCKDB_MATERIALIZE", "view"),
         SUPERTABLE_DUCKDB_PRESIGNED=_env_bool("SUPERTABLE_DUCKDB_PRESIGNED", False),
         SUPERTABLE_DUCKDB_USE_HTTPFS=_env_bool("SUPERTABLE_DUCKDB_USE_HTTPFS", False),
+        SUPERTABLE_DUCKDB_WRITE_PROBE=_env_bool("SUPERTABLE_DUCKDB_WRITE_PROBE", False),
         SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_MAX_PER_TABLE=_env_int("SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_MAX_PER_TABLE", 8),
         SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_TTL_SEC=_env_int("SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_TTL_SEC", 300),
         SUPERTABLE_DEBUG_TIMINGS=_env_bool("SUPERTABLE_DEBUG_TIMINGS", False),

{supertable-2.3.5 → supertable-2.3.7}/supertable/data_writer.py RENAMED Viewed

@@ -5,6 +5,7 @@ import json
 import os
 import time
 import uuid
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
 import re
@@ -343,8 +344,18 @@ class DataWriter:
             # layout and tight row-group zonemaps). Together with __rowid__ it
             # is hidden from query output by the read view's
             # ``EXCLUDE (__rowid__, __timestamp__)`` projection.
+            #
+            # System-owned, exactly like __rowid__ above: ALWAYS overwrite any
+            # caller-supplied __timestamp__ instead of preserving it.  It is a
+            # reserved internal column that is both the dedup ORDER BY key (newest
+            # per key wins) and the source of the __p_year__/month/day partition
+            # derivation (processing.py); letting a caller inject an arbitrary value
+            # (wrong dtype, non-UTC, or chosen to game which row wins) would
+            # silently corrupt partitioning and dedup.  ``newer_than`` is the
+            # supported, explicit mechanism for caller-controlled conflict
+            # resolution.
             table_config = self._get_table_config(simple_name)
-            if not delete_only and "__timestamp__" not in dataframe.columns:
+            if not delete_only:
                 dataframe = dataframe.with_columns(
                     polars.lit(datetime.now(timezone.utc)).alias("__timestamp__")
                 )
@@ -511,12 +522,21 @@ class DataWriter:
                 # Load the current deletion-vector once: used both to exclude
                 # already-tombstoned rows from this write's deletes (below) and,
                 # via prev_df, to extend the vector without a second read.
+                # required=True: a DV that exists but cannot be read must abort
+                # the write, never be treated as empty — silently dropping the
+                # carried-forward vector would resurrect previously deleted rows.
                 prev_dv_df = (
-                    _read_parquet_safe(prev_tombstone_path, profiler=profiler)
+                    _read_parquet_safe(prev_tombstone_path, profiler=profiler, required=True)
                     if prev_tombstone_path else None
                 )
+                # The rowid set is consumed only by the idempotency filter below,
+                # which runs only when this write actually tombstones rows
+                # (overwrite or delete_only).  Pure appends tombstone nothing, so
+                # skip materialising the whole deletion-vector as a Python set —
+                # prev_dv_df is still carried forward into build_tombstone_file.
                 prev_dv_rowids = set()
-                if prev_dv_df is not None and "__rowid__" in prev_dv_df.columns:
+                if (overwrite_columns or delete_only) and prev_dv_df is not None \
+                        and "__rowid__" in prev_dv_df.columns:
                     prev_dv_rowids = set(prev_dv_df.get_column("__rowid__").to_list())
                 # 1. Identify which existing rows this write deletes/replaces.
@@ -555,38 +575,87 @@ class DataWriter:
                     f"(excluded {len(prev_dv_rowids)} row(s) already in the deletion-vector)"
                 ))
-                # 2. Write the incoming rows as a new file (insert/upsert side).
-                #    delete_only carries only predicate columns — nothing to insert.
-                if not delete_only and dataframe.height > 0:
+                # 2. + 3.  Write the incoming rows as a new data file (insert/
+                #    upsert side) AND carry-forward/extend the deletion-vector
+                #    tombstone file.  These two object-store PUTs are independent:
+                #    neither reads the other's output and they write to disjoint
+                #    dirs (data/ vs tombstone/), so they run concurrently to
+                #    overlap the two round-trips.  delete_only carries only
+                #    predicate columns → nothing to insert.  No new deletes →
+                #    build_tombstone reuses the previous file (combined_df=None).
+                #
+                #    Profiler is NOT thread-safe, so each branch records into its
+                #    own sub-profiler which the parent merges after the join;
+                #    each branch also measures its own wall time so the per-phase
+                #    monitoring timings stay meaningful despite the overlap.
+                #    Footers of files written via the write_bytes path are captured
+                #    in footer_md_cache so stats extraction (step 6) reuses them
+                #    instead of re-downloading each freshly-written file.
+                footer_md_cache = {}
+                tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
+                do_insert = (not delete_only and dataframe.height > 0)
+                def _write_data_branch():
+                    sub = Profiler()
+                    t = time.perf_counter()
                     write_parquet_and_collect_resources(
                         write_df=dataframe,
                         overwrite_columns=[],
                         data_dir=simple_table.data_dir,
                         new_resources=new_resources,
                         compression_level=compression_level,
-                        profiler=profiler,
+                        profiler=sub,
+                        footer_md_out=footer_md_cache,
                     )
+                    return sub, time.perf_counter() - t
+                def _write_tombstone_branch():
+                    sub = Profiler()
+                    t = time.perf_counter()
+                    tp, cdf = build_tombstone_file(
+                        tombstone_dir=tombstone_dir,
+                        prev_tombstone_path=prev_tombstone_path,
+                        new_pairs=new_delete_pairs,
+                        compression_level=compression_level,
+                        profiler=sub,
+                        prev_df=prev_dv_df,
+                    )
+                    return tp, cdf, sub, time.perf_counter() - t
+                if do_insert:
+                    with ThreadPoolExecutor(max_workers=2) as _ex:
+                        _f_data = _ex.submit(_write_data_branch)
+                        _f_tomb = _ex.submit(_write_tombstone_branch)
+                        # .result() re-raises in the parent: a failure in either
+                        # PUT aborts the write before any snapshot commit, exactly
+                        # as the former sequential path did (an orphaned immutable
+                        # file no snapshot references is harmless garbage).
+                        data_sub, data_secs = _f_data.result()
+                        tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
+                            _f_tomb.result()
+                        )
+                    profiler.merge(data_sub)
+                    profiler.merge(tomb_sub)
                     inserted = dataframe.height
                 else:
+                    tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
+                        _write_tombstone_branch()
+                    )
+                    profiler.merge(tomb_sub)
+                    data_secs = 0.0
                     inserted = 0
-                mark("write_parquet")
+                # Assign the two per-phase timings from each branch's own measured
+                # wall time (they overlapped, so the serial mark() deltas would
+                # misattribute the time), then advance the mark() baseline.
+                timings["write_parquet"] = data_secs
+                timings["build_tombstone"] = tomb_secs
+                t_last = time.time()
                 logger.debug(lp(
                     f"step[write]: appended {inserted} incoming row(s) as {len(new_resources)} "
                     f"new immutable file(s) (no existing data file rewritten)"
                 ))
-                # 3. Carry forward + extend the deletion-vector tombstone file.
-                #    No new deletes → reuse the previous file (combined_df=None).
-                tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
-                tombstone_path, combined_tombstone_df = build_tombstone_file(
-                    tombstone_dir=tombstone_dir,
-                    prev_tombstone_path=prev_tombstone_path,
-                    new_pairs=new_delete_pairs,
-                    compression_level=compression_level,
-                    profiler=profiler,
-                    prev_df=prev_dv_df,
-                )
                 # Track the live deletion-vector row count so meta reads can
                 # deduct dead rows from the physical resource row totals.
                 # New deletes → combined_tombstone_df is the full deduped DV
@@ -596,7 +665,6 @@ class DataWriter:
                     if combined_tombstone_df is not None
                     else int(last_simple_table.get("tombstone_rows", 0) or 0)
                 )
-                mark("build_tombstone")
                 logger.debug(lp(
                     f"step[tombstone]: deletion-vector now {tombstone_rows} row(s) "
                     f"({'rewritten' if combined_tombstone_df is not None else 'carried forward unchanged'})"
@@ -745,7 +813,9 @@ class DataWriter:
                     r.get("file") for r in new_resources
                     if isinstance(r, dict) and r.get("file")
                 ]
-                new_stats_rows = extract_stats_rows(new_data_files, profiler=profiler)
+                new_stats_rows = extract_stats_rows(
+                    new_data_files, profiler=profiler, footer_md_cache=footer_md_cache
+                )
                 stats_path, combined_stats_df = build_stats_file(
                     stats_dir=stats_dir,
                     prev_stats_path=last_simple_table.get("stats_file"),
@@ -1181,8 +1251,17 @@ class DataWriter:
             # the *write* path; compact() is explicit maintenance and always
             # consumes the vector.
             tombstone_path = last_simple_table.get("tombstone")
+            # required=True: a DV that exists but cannot be read must abort the
+            # compaction, never be treated as empty. A swallowed read here would
+            # set should_run_tombstones=False, skipping both Phase A and the
+            # pointer-clear below, so Phase B would carry the dead rows into the
+            # new file while the vector kept pointing at the sunset __file__ —
+            # leaving them permanently unreclaimable. Failing loud leaves the
+            # prior snapshot + vector intact for a retry, and matches the
+            # write-path carry-forward read (required=True) above.
             tombstone_df = (
-                _read_parquet_safe(tombstone_path) if tombstone_path else None
+                _read_parquet_safe(tombstone_path, required=True)
+                if tombstone_path else None
             )
             tombstone_rows = (
                 tombstone_df.height if tombstone_df is not None else 0
@@ -1246,6 +1325,24 @@ class DataWriter:
                 r for r in (list(tomb_new_resources) + list(small_new_resources))
                 if r.get("file") not in all_sunset
             ]
+            # ``all_new_resources`` is the full set of files written by THIS
+            # compaction; it feeds stats extraction, the schema model_df and the
+            # result metrics below, all of which need every new file.
+            #
+            # For ``simple_table.update`` it must NOT be reused verbatim, though:
+            # Phase A's outputs were already spliced into
+            # ``last_simple_table["resources"]`` (the in-memory baseline that
+            # ``update`` starts from) right after Phase A ran.  ``update`` does
+            # ``(baseline - sunset) + new_resources`` with no dedup, so any
+            # Phase-A output that Phase B did NOT consume (left un-sunset because
+            # it exceeded the ``small_only`` threshold, or its read failed) would
+            # be counted once from the baseline AND once from new_resources —
+            # i.e. the same file listed twice in the new snapshot.  Hand ``update``
+            # only Phase B's brand-new files, which are the only resources genuinely
+            # absent from that baseline.
+            update_new_resources = [
+                r for r in small_new_resources if r.get("file") not in all_sunset
+            ]
             result["files_compacted"] = considered
             result["new_resources"] = len(all_new_resources)
             result["sunset_files"] = len(all_sunset)
@@ -1338,7 +1435,7 @@ class DataWriter:
                 )
                 new_snapshot_dict, new_snapshot_path = simple_table.update(
-                    all_new_resources,
+                    update_new_resources,
                     all_sunset,
                     model_df,
                     last_snapshot=last_simple_table,

{supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_common.py RENAMED Viewed

@@ -731,12 +731,66 @@ def new_duckdb_connection(
     purely local scans.
     """
     con = duckdb.connect()
-    init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
-    if for_paths and any("://" in str(p) for p in for_paths):
+    try:
+        init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
+        if for_paths and any("://" in str(p) for p in for_paths):
+            configure_httpfs_and_s3(con, for_paths)
+    except Exception:
+        # Don't leak the half-initialised connection if a pragma / httpfs load
+        # raises; re-raise so callers still fall back exactly as before.
+        con.close()
+        raise
+    return con
+# Thread-local pool for the write-side probe connection.  DuckDB connections are
+# NOT thread-safe, so each thread keeps its own; reusing it amortises the
+# ~150 ms init/warmup across writes on the same thread — the same reason the
+# read executors hold a persistent connection.
+_probe_pool = threading.local()
+def get_pooled_duckdb_connection(
+        temp_dir: str,
+        for_paths: Optional[List[str]] = None,
+        memory_limit: str = "1GB",
+) -> duckdb.DuckDBPyConnection:
+    """Return this thread's pooled probe connection, building it on first use.
+    The cold build goes through ``new_duckdb_connection`` so the pinned
+    ``home_directory`` / pragma contract is byte-for-byte identical to a
+    transient connection.  On a *warm* connection httpfs/S3 is re-applied for
+    remote paths so a connection first built for local paths can still serve a
+    later remote probe and credentials always reflect the current environment
+    (``configure_httpfs_and_s3`` re-reads env each call and is idempotent).
+    """
+    con = getattr(_probe_pool, "con", None)
+    if con is None:
+        con = new_duckdb_connection(
+            temp_dir=temp_dir, for_paths=for_paths, memory_limit=memory_limit
+        )
+        _probe_pool.con = con
+    elif for_paths and any("://" in str(p) for p in for_paths):
         configure_httpfs_and_s3(con, for_paths)
     return con
+def reset_pooled_duckdb_connections() -> None:
+    """Close and drop the calling thread's pooled probe connection.
+    A no-op when the thread has none.  Used for test determinism and as an
+    eviction hook; the pool slot is cleared before the close so a failing close
+    still leaves the thread ready to rebuild.
+    """
+    con = getattr(_probe_pool, "con", None)
+    if con is not None:
+        _probe_pool.con = None
+        try:
+            con.close()
+        except Exception:
+            pass
 def apply_runtime_pragmas(con: duckdb.DuckDBPyConnection, cfg) -> None:
     """Re-apply the session-settable DuckDB pragmas from a live engine config.

{supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/conftest.py RENAMED Viewed

@@ -86,6 +86,21 @@ def _mock_redis_catalog():
         yield
+@pytest.fixture(autouse=True)
+def _reset_probe_pool():
+    """Clear the thread-local write-probe connection pool around every test.
+    The probe now reuses a pooled connection across writes, so tests that
+    assert how many times ``new_duckdb_connection`` is built must start from a
+    cold pool; resetting afterwards keeps the connection from leaking into the
+    next test.
+    """
+    from supertable.engine.engine_common import reset_pooled_duckdb_connections
+    reset_pooled_duckdb_connections()
+    yield
+    reset_pooled_duckdb_connections()
 @pytest.fixture()
 def duckdb_con():
     """Provide a real in-memory DuckDB connection, closed after each test."""

{supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine.py RENAMED Viewed

@@ -587,6 +587,38 @@ class TestReadWriteDuckDBParity:
         # for_paths forwarded so httpfs is loaded for remote scans.
         assert "for_paths" in calls[0][1]
+    def test_probe_reuses_pooled_connection(self, tmp_path, monkeypatch):
+        # A second probe on the same thread must REUSE the pooled connection,
+        # so new_duckdb_connection is built exactly once — the ~150ms warmup is
+        # paid on the cold probe and amortised on every subsequent write.
+        import polars
+        from supertable import processing as _processing
+        monkeypatch.setattr(_processing, "_get_storage", lambda: object())
+        f1 = str(tmp_path / "f1.parquet")
+        polars.DataFrame({"__rowid__": [10, 20], "id": [1, 2]}).write_parquet(f1)
+        calls = []
+        real = _engine_common.new_duckdb_connection
+        monkeypatch.setattr(
+            _engine_common,
+            "new_duckdb_connection",
+            lambda *a, **k: (calls.append((a, k)), real(*a, **k))[1],
+        )
+        def _probe():
+            return _processing._duckdb_probe_overlap_matches(
+                overlap_true_files=[(f1, 0)],
+                overwrite_columns=["id"],
+                newer_than_col=None,
+                incoming_keys=polars.DataFrame({"id": [2]}),
+            )
+        assert _probe() is not None
+        assert _probe() is not None
+        assert len(calls) == 1  # built on the cold probe, reused on the warm one
     def test_probe_matches_rows_on_local_parquet(self, tmp_path, monkeypatch):
         import polars
         from supertable import processing as _processing

supertable 2.3.5__tar.gz → 2.3.7__tar.gz

supertable 2.3.5 (tar.gz) → 2.3.7 (tar.gz)