PyPI - supertable - Versions diffs - 2.3.1__tar.gz → 2.3.3__tar.gz - Mend

supertable 2.3.1 (tar.gz) → 2.3.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

{supertable-2.3.1/supertable.egg-info → supertable-2.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: supertable
-Version: 2.3.1
+Version: 2.3.3
 Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
 Author: Levente Kupas
 Author-email: Levente Kupas <lkupas@kladnasoft.com>

{supertable-2.3.1 → supertable-2.3.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "supertable"
-version = "2.3.1"
+version = "2.3.3"
 description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
 readme = "README.md"
 requires-python = ">=3.10"

{supertable-2.3.1 → supertable-2.3.3}/setup.py RENAMED Viewed

@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
 setup(
     name="supertable",
-    version="2.3.1",
+    version="2.3.3",
     description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
     long_description=long_description,
     long_description_content_type="text/markdown",

{supertable-2.3.1 → supertable-2.3.3}/supertable/__init__.py RENAMED Viewed

@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
 project documentation for the full API surface.
 """
-__version__ = "2.3.1"
+__version__ = "2.3.3"
 # Re-export the core public surface so users can do ``from supertable import …``
 # instead of remembering submodule paths.

{supertable-2.3.1 → supertable-2.3.3}/supertable/data_writer.py RENAMED Viewed

@@ -23,8 +23,7 @@ from supertable.utils.timer import Timer
 from supertable.utils.profiler import Profiler
 from supertable.processing import (
     find_overlapping_files,
-    filter_stale_incoming_rows,
-    identify_deleted_rowids,
+    resolve_overwrite_writes,
     identify_all_rowids,
     build_tombstone_file,
     build_stats_file,
@@ -36,6 +35,7 @@ from supertable.processing import (
     write_parquet_and_collect_resources,
     compact_resources,
     compact_tombstones,
+    should_compact_small_files,
     _max_tombstone_rows,
     _read_parquet_safe,
 )
@@ -398,60 +398,67 @@ class DataWriter:
                             logger.info(lp(f"stats pruning: skipped {pruned}/{before} candidate files"))
                 mark("stats_prune")
-            # File cache: populated by newer-than filtering, reused by process step
-            # to avoid double-reading overlapping parquet files from storage.
+            # File cache: used only by delete_only's identify_all_rowids below.
             file_cache = {}
-            # --- Newer-than filtering (skip stale/replayed rows) ---------------
-            if newer_than and overwrite_columns:
+            # --- Overwrite resolution: stale-row filtering + delete-pair -------
+            # identification in one DuckDB-pushdown probe over the overlapping
+            # files (column projection, row-group skipping, ranged GETs, native
+            # null-safe SEMI JOIN) instead of full-file polars reads.  Returns
+            # the stale-filtered incoming df plus the (file, __rowid__) delete
+            # pairs derived from the surviving keys; falls back to the polars
+            # oracle on any probe/derive failure.  delete_only (no
+            # overwrite_columns) is handled separately in the deletion block.
+            resolved_delete_pairs = None
+            if overwrite_columns:
                 pre_filter_count = dataframe.height
-                dataframe = filter_stale_incoming_rows(
+                dataframe, resolved_delete_pairs = resolve_overwrite_writes(
                     incoming_df=dataframe,
                     overlapping_files=overlapping_files,
                     overwrite_columns=overwrite_columns,
                     newer_than_col=newer_than,
-                    file_cache=file_cache,
                     profiler=profiler,
                 )
-                skipped = pre_filter_count - dataframe.height
-                if skipped > 0:
-                    logger.info(lp(f"newer_than={newer_than}: skipped {skipped}/{pre_filter_count} stale rows"))
-                if dataframe.height == 0:
-                    logger.info(lp("newer_than: all incoming rows are stale — skipping write"))
-                    mark("newer_than")
-                    total_columns = incoming_columns
-                    result_tuple = (total_columns, 0, 0, 0)
-                    stats_payload = {
-                        "query_id": qid,
-                        "recorded_at": datetime.now(timezone.utc).isoformat(),
-                        "organization": self.super_table.organization,
-                        "super_name": self.super_table.super_name,
-                        "role_name": role_name,
-                        "table_name": simple_name,
-                        "overwrite_columns": overwrite_columns,
-                        "compression_level": compression_level,
-                        "newer_than": newer_than,
-                        "delete_only": delete_only,
-                        "incoming_rows": incoming_rows,
-                        "incoming_columns": incoming_columns,
-                        "inserted": 0,
-                        "deleted": 0,
-                        "total_rows": 0,
-                        "total_columns": total_columns,
-                        "new_resources": 0,
-                        "sunset_files": 0,
-                        "skipped_stale": skipped,
-                        "lineage": _safe_json(lineage or {}),
-                        "duration": round(time.time() - t0, 6),
-                        "timings": profiler.emit_timings(),
-                        "counts": profiler.emit_counts(),
-                    }
-                    # Don't return here — fall through to finally (lock release)
-                    # and the post-finally monitoring block.  Returning inside the
-                    # try block would either skip monitoring or run it while the
-                    # Redis data lock is still held.
-                else:
-                    mark("newer_than")
+                if newer_than:
+                    skipped = pre_filter_count - dataframe.height
+                    if skipped > 0:
+                        logger.info(lp(f"newer_than={newer_than}: skipped {skipped}/{pre_filter_count} stale rows"))
+                    if dataframe.height == 0:
+                        logger.info(lp("newer_than: all incoming rows are stale — skipping write"))
+                        mark("newer_than")
+                        total_columns = incoming_columns
+                        result_tuple = (total_columns, 0, 0, 0)
+                        stats_payload = {
+                            "query_id": qid,
+                            "recorded_at": datetime.now(timezone.utc).isoformat(),
+                            "organization": self.super_table.organization,
+                            "super_name": self.super_table.super_name,
+                            "role_name": role_name,
+                            "table_name": simple_name,
+                            "overwrite_columns": overwrite_columns,
+                            "compression_level": compression_level,
+                            "newer_than": newer_than,
+                            "delete_only": delete_only,
+                            "incoming_rows": incoming_rows,
+                            "incoming_columns": incoming_columns,
+                            "inserted": 0,
+                            "deleted": 0,
+                            "total_rows": 0,
+                            "total_columns": total_columns,
+                            "new_resources": 0,
+                            "sunset_files": 0,
+                            "skipped_stale": skipped,
+                            "lineage": _safe_json(lineage or {}),
+                            "duration": round(time.time() - t0, 6),
+                            "timings": profiler.emit_timings(),
+                            "counts": profiler.emit_counts(),
+                        }
+                        # Don't return here — fall through to finally (lock release)
+                        # and the post-finally monitoring block.  Returning inside the
+                        # try block would either skip monitoring or run it while the
+                        # Redis data lock is still held.
+                    else:
+                        mark("newer_than")
             # --- Deletion-vector (tombstone) logic ----------------------------
             # Merge-on-read model: every write tombstones the __rowid__s of the
@@ -467,16 +474,12 @@ class DataWriter:
                 # 1. Identify which existing rows this write deletes/replaces.
                 #    overwrite_columns drives the anti-join key (delete + upsert);
-                #    pure appends (no overwrite_columns) tombstone nothing.
+                #    pure appends (no overwrite_columns) tombstone nothing.  The
+                #    pairs were already derived (from the surviving keys) by the
+                #    resolve_overwrite_writes probe above.
                 new_delete_pairs = []
                 if overwrite_columns:
-                    new_delete_pairs = identify_deleted_rowids(
-                        dataframe,
-                        overlapping_files,
-                        overwrite_columns,
-                        file_cache=file_cache,
-                        profiler=profiler,
-                    )
+                    new_delete_pairs = resolved_delete_pairs or []
                 elif delete_only:
                     # delete-all: no overwrite_columns → tombstone every row.
                     new_delete_pairs = identify_all_rowids(
@@ -525,28 +528,52 @@ class DataWriter:
                 )
                 mark("build_tombstone")
-                # 4. Threshold compaction: physically drop dead rows once the
-                #    deletion-vector grows past max_tombstone_rows, then clear it.
-                if (
+                # 4. Threshold compaction (two triggers, same physical step):
+                #      (a) the deletion-vector grew past max_tombstone_rows, or
+                #      (b) the small files tripped the auto-compaction gate.
+                #    Both must FIRST physically drop tombstoned rows (Phase A)
+                #    and only THEN merge small files (Phase B): compact_resources
+                #    rewrites data files WITHOUT consulting the deletion-vector,
+                #    so sunsetting a vector-referenced file would orphan its dead
+                #    rows (hidden on read, never reclaimable).  Draining first
+                #    guarantees Phase B only ever sees vector-free survivors.
+                post_write_resources = (
+                    (last_simple_table.get("resources") or []) + new_resources
+                )
+                compaction_gate = should_compact_small_files(
+                    post_write_resources, table_config
+                )
+                tombstone_threshold_hit = (
                     combined_tombstone_df is not None
                     and combined_tombstone_df.height >= _max_tombstone_rows(table_config)
-                ):
-                    removed, compact_new, compact_sunset = compact_tombstones(
-                        snapshot=last_simple_table,
-                        tombstone_df=combined_tombstone_df,
-                        data_dir=simple_table.data_dir,
-                        compression_level=compression_level,
-                        table_config=table_config,
-                        profiler=profiler,
-                    )
-                    new_resources.extend(compact_new)
-                    sunset_files |= compact_sunset
-                    tombstone_path = None  # deletion-vector fully consumed
-                    tombstone_rows = 0
-                    logger.info(lp(
-                        f"tombstone compaction removed {removed} rows "
-                        f"from {len(compact_sunset)} files"
-                    ))
+                )
+                # Phase A — drain the deletion-vector when either trigger fires
+                # and a vector is actually live (freshly built this write OR
+                # carried forward from a prior one).
+                if tombstone_threshold_hit or compaction_gate:
+                    dv_to_drain = combined_tombstone_df
+                    if dv_to_drain is None and tombstone_path:
+                        # Pure carry-forward: load the live vector so the merge
+                        # below never sunsets a file it still references.
+                        dv_to_drain = _read_parquet_safe(tombstone_path, profiler=profiler)
+                    if dv_to_drain is not None and dv_to_drain.height > 0:
+                        removed, tomb_new, tomb_sunset = compact_tombstones(
+                            snapshot=last_simple_table,
+                            tombstone_df=dv_to_drain,
+                            data_dir=simple_table.data_dir,
+                            compression_level=compression_level,
+                            table_config=table_config,
+                            profiler=profiler,
+                        )
+                        new_resources.extend(tomb_new)
+                        sunset_files |= tomb_sunset
+                        tombstone_path = None  # deletion-vector fully consumed
+                        tombstone_rows = 0
+                        logger.info(lp(
+                            f"tombstone compaction removed {removed} rows "
+                            f"from {len(tomb_sunset)} files"
+                        ))
                 # 5. Pin the (carried-forward / new / cleared) tombstone pointer
                 #    and its row count.
@@ -554,6 +581,45 @@ class DataWriter:
                 last_simple_table["tombstone_rows"] = tombstone_rows
                 mark("compact_tombstones")
+                # Phase B — auto small-file compaction.  Merge the accumulated
+                # small files (existing survivors + the file just written) once
+                # the gate is open so the file count stays bounded.  The vector
+                # was drained above, so every surviving file is safe to sunset.
+                # Result folds into the SAME snapshot commit below (new_resources
+                # / sunset_files feed build_stats and simple_table.update).
+                compaction_ran = False
+                if compaction_gate:
+                    live_resources = [
+                        r for r in (last_simple_table.get("resources") or [])
+                        if r.get("file") not in sunset_files
+                    ]
+                    live_resources += [
+                        r for r in new_resources if r.get("file") not in sunset_files
+                    ]
+                    considered, comp_rows, comp_new, comp_sunset = compact_resources(
+                        snapshot={"resources": live_resources},
+                        data_dir=simple_table.data_dir,
+                        compression_level=compression_level,
+                        table_config=table_config,
+                        small_only=True,
+                    )
+                    if comp_new or comp_sunset:
+                        sunset_files |= comp_sunset
+                        # A file written above (incoming or tombstone survivor)
+                        # may have been re-merged here; drop any new_resources
+                        # entry that is now sunset so the snapshot never lists a
+                        # file as both live and gone.
+                        new_resources = [
+                            r for r in (new_resources + comp_new)
+                            if r.get("file") not in sunset_files
+                        ]
+                        compaction_ran = True
+                        logger.info(lp(
+                            f"auto-compaction merged {considered} small files "
+                            f"into {len(comp_new)} file(s) ({comp_rows} rows)"
+                        ))
+                mark("compact_small")
                 # 6. Carry forward + extend the external column-statistics parquet.
                 #    Read the footers of the newly written data files, drop the
                 #    rows of any sunset file, and append the new ones. No new
@@ -612,7 +678,18 @@ class DataWriter:
                 # model_df would shrink schema / schemaString to that partial
                 # shape even though all parquet files still have full schema.
                 # See docs/03_data_model.md "Schema Field Semantics".
-                schema_model_df = None if delete_only else dataframe
+                #
+                # When auto-compaction merged files this write, derive the
+                # schema from the compacted output instead: a merged file may
+                # union in columns from older files that the incoming frame
+                # lacks (schema-evolving tables), so `dataframe` would narrow
+                # the metadata even though the Parquet is wider.
+                if compaction_ran:
+                    schema_model_df = self._build_compact_model_df(
+                        new_resources, last_simple_table
+                    )
+                else:
+                    schema_model_df = None if delete_only else dataframe
                 new_snapshot_dict, new_snapshot_path = simple_table.update(
                     new_resources, sunset_files, schema_model_df,
                     last_snapshot=last_simple_table,
@@ -718,7 +795,7 @@ class DataWriter:
                         f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
                         f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
                         f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
-                        f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
+                        f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
                         f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
                         f"mirror={timings.get('mirror', 0):.3f} | prepare_monitor={timings.get('prepare_monitor', 0):.3f}"
                     )

{supertable-2.3.1 → supertable-2.3.3}/supertable/processing.py RENAMED Viewed

@@ -6,6 +6,7 @@ import os
 import io
 import time
 import threading
+import uuid
 from collections import OrderedDict
 from datetime import datetime, date, timezone
 from typing import Dict, List, Set, Tuple, Optional
@@ -291,6 +292,33 @@ def prune_not_overlapping_files_by_threshold(
     return result
+def should_compact_small_files(
+        resources: List[Dict],
+        table_config: Optional[dict] = None,
+) -> bool:
+    """Return True when accumulated small files trip the auto-compaction gate.
+    Mirrors the threshold in ``prune_not_overlapping_files_by_threshold``: a
+    file is "small" when its ``file_size`` is strictly smaller than
+    ``max_memory_chunk_size``.  The gate opens when EITHER the small-file count
+    reaches ``max_overlapping_files`` OR the combined small-file size exceeds
+    ``max_memory_chunk_size``.  Files already at/above the chunk size are big
+    enough on their own and are never counted.
+    ``resources`` is a snapshot's resource list (dicts with ``file`` /
+    ``file_size``).  Limits resolve per-table via ``_resolve_limits``.
+    """
+    max_mem, max_files = _resolve_limits(table_config)
+    small_sizes = [
+        int(r.get("file_size") or 0)
+        for r in (resources or [])
+        if r.get("file") and int(r.get("file_size") or 0) < max_mem
+    ]
+    if not small_sizes:
+        return False
+    return len(small_sizes) >= max_files or sum(small_sizes) > max_mem
 # =========================
 # Public API: Overlap selection (with compaction triggers)
 # =========================
@@ -942,6 +970,278 @@ def identify_all_rowids(
     return pairs
+# =========================
+# Pushdown overwrite resolution (DuckDB probe, polars fallback)
+# =========================
+#
+# The legacy path (``filter_stale_incoming_rows`` + ``identify_deleted_rowids``)
+# reads EVERY overlapping data file FULLY (all columns, all rows) into polars,
+# then group/join over the whole table — cost O(table size), independent of how
+# few rows are actually written.  ``resolve_overwrite_writes`` replaces both with
+# ONE column-projected DuckDB ``parquet_scan`` that reads only the key /
+# ``__rowid__`` / newer-than columns and only the rows whose key matches an
+# incoming key (null-safe SEMI JOIN), then derives both results in-memory from
+# that small matched set.  The two legacy functions are retained as the exact
+# semantic oracle and the fallback for any environment/schema the probe can't
+# handle.
+def _storage_duckdb_path(storage, key: str) -> str:
+    """Resolve a storage key to a path string DuckDB can read directly.
+    Object stores expose ``to_duckdb_path`` (→ ``s3://`` or ``http(s)://``);
+    local storage has none, so the on-disk path is already DuckDB-readable and
+    returned unchanged.  Anything already a URL passes through untouched.
+    """
+    if not key or "://" in key:
+        return key
+    fn = getattr(storage, "to_duckdb_path", None)
+    if callable(fn):
+        try:
+            url = fn(key)
+            if isinstance(url, str) and url:
+                return url
+        except NotImplementedError:
+            pass
+        except Exception as e:
+            logging.debug(f"[write-probe] to_duckdb_path failed for {key}: {e}")
+    return key
+def _duckdb_probe_overlap_matches(
+        overlap_true_files: List[Tuple[str, int]],
+        overwrite_columns: List[str],
+        newer_than_col: Optional[str],
+        incoming_keys: polars.DataFrame,
+        profiler: Optional[Profiler] = None,
+) -> Optional[polars.DataFrame]:
+    """Column-projected pushdown probe over the overlapping data files.
+    Runs one ``parquet_scan`` (union_by_name, ranged GETs, row-group skipping)
+    null-safe ``SEMI JOIN``-ed against the unique *incoming_keys*, projecting only
+    ``__rowid__`` + the overwrite columns (+ *newer_than_col* when given) plus the
+    source ``filename``.  Returns a polars frame with columns ``__file__`` (the
+    original storage key), ``__rowid__``, the overwrite columns and the
+    newer-than column — i.e. every existing row whose key matches an incoming
+    key.  Returns ``None`` on any failure or unsupported schema (e.g. a referenced
+    column absent from EVERY candidate file → DuckDB binder error), signalling the
+    caller to fall back to the polars full-read path.
+    """
+    p = profiler or get_null_profiler()
+    if not overlap_true_files or not overwrite_columns:
+        return None
+    try:
+        import duckdb
+        from supertable.engine.engine_common import (
+            configure_httpfs_and_s3,
+            escape_parquet_path,
+            quote_if_needed,
+        )
+    except Exception as e:
+        logging.info(f"[write-probe] duckdb unavailable, using polars path: {e}")
+        return None
+    storage = _get_storage()
+    duck_to_key: Dict[str, str] = {}
+    duck_paths: List[str] = []
+    for file_key, _sz in overlap_true_files:
+        dp = _storage_duckdb_path(storage, file_key)
+        duck_to_key[dp] = file_key
+        duck_paths.append(dp)
+    select_cols = ["filename", quote_if_needed(ROWID_COL)]
+    select_cols += [quote_if_needed(c) for c in overwrite_columns]
+    if newer_than_col:
+        select_cols.append(quote_if_needed(newer_than_col))
+    join_cond = " AND ".join(
+        f"src.{quote_if_needed(c)} IS NOT DISTINCT FROM k.{quote_if_needed(c)}"
+        for c in overwrite_columns
+    )
+    files_sql = ", ".join(f"'{escape_parquet_path(dp)}'" for dp in duck_paths)
+    ik_name = f"__st_ik_{uuid.uuid4().hex}"
+    con = None
+    try:
+        con = duckdb.connect()
+        if any("://" in dp for dp in duck_paths):
+            configure_httpfs_and_s3(con, duck_paths)
+        con.register(ik_name, incoming_keys.to_arrow())
+        sql = (
+            f"SELECT {', '.join(select_cols)} "
+            f"FROM parquet_scan([{files_sql}], union_by_name=TRUE, "
+            f"filename=TRUE, hive_partitioning=FALSE) AS src "
+            f"SEMI JOIN {ik_name} AS k ON {join_cond}"
+        )
+        with p.span("io.duckdb_probe"):
+            matched = con.execute(sql).pl()
+    except Exception as e:
+        logging.info(f"[write-probe] probe failed, using polars path: {e}")
+        return None
+    finally:
+        if con is not None:
+            try:
+                con.unregister(ik_name)
+            except Exception:
+                pass
+            try:
+                con.close()
+            except Exception:
+                pass
+    if matched is None or "filename" not in matched.columns:
+        return None
+    # Restore the original storage key (DuckDB's ``filename`` is the path we
+    # passed in) as __file__ via a join so the tombstone stores keys, not URLs.
+    map_df = polars.DataFrame(
+        {"filename": list(duck_to_key.keys()),
+         TOMBSTONE_FILE_COL: list(duck_to_key.values())}
+    )
+    matched = matched.join(map_df, on="filename", how="left").drop("filename")
+    if matched.get_column(TOMBSTONE_FILE_COL).null_count() > 0:
+        # A returned filename did not map back — refuse to emit ambiguous
+        # tombstones; let the caller fall back to the polars path.
+        logging.info("[write-probe] unmapped filename in probe result; using polars path")
+        return None
+    p.add("probe_files", len(duck_paths))
+    p.add("probe_rows_matched", int(matched.height))
+    return matched
+def _align_keys_to_incoming(
+        matched: polars.DataFrame,
+        incoming_df: polars.DataFrame,
+        overwrite_columns: List[str],
+        newer_than_col: Optional[str],
+) -> polars.DataFrame:
+    """Cast probe-result key / newer-than columns to the incoming df's dtypes.
+    DuckDB → Arrow → polars round-trips can yield a different (if compatible)
+    dtype than the in-memory incoming frame; polars joins/comparisons want
+    matching dtypes.  Casts are best-effort; an unrepresentable cast raises and
+    the caller falls back to the polars path.
+    """
+    casts = []
+    for c in overwrite_columns:
+        if c in matched.columns and c in incoming_df.columns:
+            if matched.schema[c] != incoming_df.schema[c]:
+                casts.append(polars.col(c).cast(incoming_df.schema[c]))
+    if newer_than_col and newer_than_col in matched.columns and newer_than_col in incoming_df.columns:
+        if matched.schema[newer_than_col] != incoming_df.schema[newer_than_col]:
+            casts.append(polars.col(newer_than_col).cast(incoming_df.schema[newer_than_col]))
+    return matched.with_columns(casts) if casts else matched
+def _derive_stale_and_deletes(
+        incoming_df: polars.DataFrame,
+        matched: polars.DataFrame,
+        overwrite_columns: List[str],
+        newer_than_col: Optional[str],
+        profiler: Optional[Profiler] = None,
+) -> Tuple[polars.DataFrame, List[Tuple[str, int]]]:
+    """Derive (filtered incoming df, delete pairs) from the probe's matched rows.
+    Mirrors the legacy two-function semantics exactly:
+      * stale filter — drop incoming rows whose newer-than value is <= the max
+        existing value for that key (null existing max ⇒ new/legacy key ⇒ keep);
+        skipped entirely when *newer_than_col* is falsy;
+      * delete pairs — ``(file, __rowid__)`` of existing rows matched by the
+        SURVIVING incoming keys (null-safe), so stale rows tombstone nothing and
+        rows without a ``__rowid__`` (legacy files) are dropped.
+    """
+    p = profiler or get_null_profiler()
+    matched = _align_keys_to_incoming(matched, incoming_df, overwrite_columns, newer_than_col)
+    if newer_than_col and newer_than_col in matched.columns:
+        with p.span("newer_than.group_agg"):
+            existing_max = matched.group_by(overwrite_columns).agg(
+                polars.col(newer_than_col).max().alias("__existing_max__")
+            )
+        with p.span("newer_than.join_filter"):
+            joined = incoming_df.join(existing_max, on=overwrite_columns, how="left")
+            filtered = joined.filter(
+                polars.col("__existing_max__").is_null()
+                | (polars.col(newer_than_col) > polars.col("__existing_max__"))
+            ).drop("__existing_max__")
+    else:
+        filtered = incoming_df
+    pairs: List[Tuple[str, int]] = []
+    if ROWID_COL in matched.columns:
+        surviving_keys = filtered.select(overwrite_columns).unique()
+        with p.span("delete.semi_join"):
+            matched_surviving = matched.join(
+                surviving_keys, on=overwrite_columns, how="semi", nulls_equal=True
+            )
+        dv = matched_surviving.select([TOMBSTONE_FILE_COL, ROWID_COL]).drop_nulls()
+        pairs = [(file, int(rid)) for file, rid in dv.iter_rows()]
+        p.add("delete_rows_matched", len(pairs))
+    return filtered, pairs
+def resolve_overwrite_writes(
+        incoming_df: polars.DataFrame,
+        overlapping_files: Set[Tuple[str, bool, int]],
+        overwrite_columns: List[str],
+        newer_than_col: Optional[str] = None,
+        profiler: Optional[Profiler] = None,
+) -> Tuple[polars.DataFrame, List[Tuple[str, int]]]:
+    """Single-pass overwrite resolution: stale filtering + delete-vector pairs.
+    Returns ``(filtered_incoming_df, delete_pairs)`` computed from ONE DuckDB
+    pushdown probe over the overlapping files.  Falls back to the original polars
+    full-read path (``filter_stale_incoming_rows`` + ``identify_deleted_rowids``)
+    when DuckDB is unavailable, the probe fails, or the file schema can't be
+    probed — semantics are identical on both paths.
+    *newer_than_col* falsy ⇒ no stale filtering (delete/upsert without conflict
+    resolution); the incoming df is returned unchanged and every overlapping row
+    matched by an incoming key is tombstoned.
+    """
+    p = profiler or get_null_profiler()
+    overlap_true = [(f, sz) for f, has_overlap, sz in overlapping_files if has_overlap]
+    if not overlap_true or not overwrite_columns:
+        return incoming_df, []
+    key_cols = [c for c in overwrite_columns if c in incoming_df.columns]
+    if key_cols != list(overwrite_columns):
+        # Incoming df lacks a key column → no existing row can match (mirrors the
+        # polars path, which returns no pairs and filters nothing).
+        return incoming_df, []
+    incoming_keys = incoming_df.select(overwrite_columns).unique()
+    matched = _duckdb_probe_overlap_matches(
+        overlap_true, overwrite_columns, newer_than_col, incoming_keys, profiler=p,
+    )
+    if matched is not None:
+        try:
+            return _derive_stale_and_deletes(
+                incoming_df, matched, overwrite_columns, newer_than_col, profiler=p,
+            )
+        except Exception as e:
+            logging.warning(f"[write-probe] derive failed, using polars path: {e}")
+    # ---- Fallback: original polars full-read path (semantics oracle) ----
+    p.add("overwrite_resolve_fallback", 1)
+    file_cache: Dict[str, polars.DataFrame] = {}
+    if newer_than_col:
+        filtered = filter_stale_incoming_rows(
+            incoming_df=incoming_df,
+            overlapping_files=overlapping_files,
+            overwrite_columns=overwrite_columns,
+            newer_than_col=newer_than_col,
+            file_cache=file_cache,
+            profiler=p,
+        )
+    else:
+        filtered = incoming_df
+    pairs = identify_deleted_rowids(
+        filtered, overlapping_files, overwrite_columns,
+        file_cache=file_cache, profiler=p,
+    )
+    return filtered, pairs
 def build_tombstone_file(
         tombstone_dir: str,
         prev_tombstone_path: Optional[str],

supertable 2.3.1__tar.gz → 2.3.3__tar.gz

supertable 2.3.1 (tar.gz) → 2.3.3 (tar.gz)