PyPI - supertable - Versions diffs - 2.3.2__tar.gz → 2.3.3__tar.gz - Mend

supertable-2.3.2.tar.gz → supertable-2.3.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

{supertable-2.3.2/supertable.egg-info → supertable-2.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: supertable
-Version: 2.3.2
+Version: 2.3.3
 Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
 Author: Levente Kupas
 Author-email: Levente Kupas <lkupas@kladnasoft.com>

{supertable-2.3.2 → supertable-2.3.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "supertable"
-version = "2.3.2"
+version = "2.3.3"
 description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
 readme = "README.md"
 requires-python = ">=3.10"

{supertable-2.3.2 → supertable-2.3.3}/setup.py RENAMED Viewed

@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
 setup(
     name="supertable",
-    version="2.3.2",
+    version="2.3.3",
     description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
     long_description=long_description,
     long_description_content_type="text/markdown",

{supertable-2.3.2 → supertable-2.3.3}/supertable/__init__.py RENAMED Viewed

@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
 project documentation for the full API surface.
 """
-__version__ = "2.3.2"
+__version__ = "2.3.3"
 # Re-export the core public surface so users can do ``from supertable import …``
 # instead of remembering submodule paths.

{supertable-2.3.2 → supertable-2.3.3}/supertable/data_writer.py RENAMED Viewed

@@ -35,6 +35,7 @@ from supertable.processing import (
     write_parquet_and_collect_resources,
     compact_resources,
     compact_tombstones,
+    should_compact_small_files,
     _max_tombstone_rows,
     _read_parquet_safe,
 )
@@ -527,28 +528,52 @@ class DataWriter:
                 )
                 mark("build_tombstone")
-                # 4. Threshold compaction: physically drop dead rows once the
-                #    deletion-vector grows past max_tombstone_rows, then clear it.
-                if (
+                # 4. Threshold compaction (two triggers, same physical step):
+                #      (a) the deletion-vector grew past max_tombstone_rows, or
+                #      (b) the small files tripped the auto-compaction gate.
+                #    Both must FIRST physically drop tombstoned rows (Phase A)
+                #    and only THEN merge small files (Phase B): compact_resources
+                #    rewrites data files WITHOUT consulting the deletion-vector,
+                #    so sunsetting a vector-referenced file would orphan its dead
+                #    rows (hidden on read, never reclaimable).  Draining first
+                #    guarantees Phase B only ever sees vector-free survivors.
+                post_write_resources = (
+                    (last_simple_table.get("resources") or []) + new_resources
+                )
+                compaction_gate = should_compact_small_files(
+                    post_write_resources, table_config
+                )
+                tombstone_threshold_hit = (
                     combined_tombstone_df is not None
                     and combined_tombstone_df.height >= _max_tombstone_rows(table_config)
-                ):
-                    removed, compact_new, compact_sunset = compact_tombstones(
-                        snapshot=last_simple_table,
-                        tombstone_df=combined_tombstone_df,
-                        data_dir=simple_table.data_dir,
-                        compression_level=compression_level,
-                        table_config=table_config,
-                        profiler=profiler,
-                    )
-                    new_resources.extend(compact_new)
-                    sunset_files |= compact_sunset
-                    tombstone_path = None  # deletion-vector fully consumed
-                    tombstone_rows = 0
-                    logger.info(lp(
-                        f"tombstone compaction removed {removed} rows "
-                        f"from {len(compact_sunset)} files"
-                    ))
+                )
+                # Phase A — drain the deletion-vector when either trigger fires
+                # and a vector is actually live (freshly built this write OR
+                # carried forward from a prior one).
+                if tombstone_threshold_hit or compaction_gate:
+                    dv_to_drain = combined_tombstone_df
+                    if dv_to_drain is None and tombstone_path:
+                        # Pure carry-forward: load the live vector so the merge
+                        # below never sunsets a file it still references.
+                        dv_to_drain = _read_parquet_safe(tombstone_path, profiler=profiler)
+                    if dv_to_drain is not None and dv_to_drain.height > 0:
+                        removed, tomb_new, tomb_sunset = compact_tombstones(
+                            snapshot=last_simple_table,
+                            tombstone_df=dv_to_drain,
+                            data_dir=simple_table.data_dir,
+                            compression_level=compression_level,
+                            table_config=table_config,
+                            profiler=profiler,
+                        )
+                        new_resources.extend(tomb_new)
+                        sunset_files |= tomb_sunset
+                        tombstone_path = None  # deletion-vector fully consumed
+                        tombstone_rows = 0
+                        logger.info(lp(
+                            f"tombstone compaction removed {removed} rows "
+                            f"from {len(tomb_sunset)} files"
+                        ))
                 # 5. Pin the (carried-forward / new / cleared) tombstone pointer
                 #    and its row count.
@@ -556,6 +581,45 @@ class DataWriter:
                 last_simple_table["tombstone_rows"] = tombstone_rows
                 mark("compact_tombstones")
+                # Phase B — auto small-file compaction.  Merge the accumulated
+                # small files (existing survivors + the file just written) once
+                # the gate is open so the file count stays bounded.  The vector
+                # was drained above, so every surviving file is safe to sunset.
+                # Result folds into the SAME snapshot commit below (new_resources
+                # / sunset_files feed build_stats and simple_table.update).
+                compaction_ran = False
+                if compaction_gate:
+                    live_resources = [
+                        r for r in (last_simple_table.get("resources") or [])
+                        if r.get("file") not in sunset_files
+                    ]
+                    live_resources += [
+                        r for r in new_resources if r.get("file") not in sunset_files
+                    ]
+                    considered, comp_rows, comp_new, comp_sunset = compact_resources(
+                        snapshot={"resources": live_resources},
+                        data_dir=simple_table.data_dir,
+                        compression_level=compression_level,
+                        table_config=table_config,
+                        small_only=True,
+                    )
+                    if comp_new or comp_sunset:
+                        sunset_files |= comp_sunset
+                        # A file written above (incoming or tombstone survivor)
+                        # may have been re-merged here; drop any new_resources
+                        # entry that is now sunset so the snapshot never lists a
+                        # file as both live and gone.
+                        new_resources = [
+                            r for r in (new_resources + comp_new)
+                            if r.get("file") not in sunset_files
+                        ]
+                        compaction_ran = True
+                        logger.info(lp(
+                            f"auto-compaction merged {considered} small files "
+                            f"into {len(comp_new)} file(s) ({comp_rows} rows)"
+                        ))
+                mark("compact_small")
                 # 6. Carry forward + extend the external column-statistics parquet.
                 #    Read the footers of the newly written data files, drop the
                 #    rows of any sunset file, and append the new ones. No new
@@ -614,7 +678,18 @@ class DataWriter:
                 # model_df would shrink schema / schemaString to that partial
                 # shape even though all parquet files still have full schema.
                 # See docs/03_data_model.md "Schema Field Semantics".
-                schema_model_df = None if delete_only else dataframe
+                #
+                # When auto-compaction merged files this write, derive the
+                # schema from the compacted output instead: a merged file may
+                # union in columns from older files that the incoming frame
+                # lacks (schema-evolving tables), so `dataframe` would narrow
+                # the metadata even though the Parquet is wider.
+                if compaction_ran:
+                    schema_model_df = self._build_compact_model_df(
+                        new_resources, last_simple_table
+                    )
+                else:
+                    schema_model_df = None if delete_only else dataframe
                 new_snapshot_dict, new_snapshot_path = simple_table.update(
                     new_resources, sunset_files, schema_model_df,
                     last_snapshot=last_simple_table,
@@ -720,7 +795,7 @@ class DataWriter:
                         f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
                         f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
                         f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
-                        f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
+                        f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
                         f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
                         f"mirror={timings.get('mirror', 0):.3f} | prepare_monitor={timings.get('prepare_monitor', 0):.3f}"
                     )

{supertable-2.3.2 → supertable-2.3.3}/supertable/processing.py RENAMED Viewed

@@ -292,6 +292,33 @@ def prune_not_overlapping_files_by_threshold(
     return result
+def should_compact_small_files(
+        resources: List[Dict],
+        table_config: Optional[dict] = None,
+) -> bool:
+    """Return True when accumulated small files trip the auto-compaction gate.
+    Mirrors the threshold in ``prune_not_overlapping_files_by_threshold``: a
+    file is "small" when its ``file_size`` is strictly smaller than
+    ``max_memory_chunk_size``.  The gate opens when EITHER the small-file count
+    reaches ``max_overlapping_files`` OR the combined small-file size exceeds
+    ``max_memory_chunk_size``.  Files already at/above the chunk size are big
+    enough on their own and are never counted.
+    ``resources`` is a snapshot's resource list (dicts with ``file`` /
+    ``file_size``).  Limits resolve per-table via ``_resolve_limits``.
+    """
+    max_mem, max_files = _resolve_limits(table_config)
+    small_sizes = [
+        int(r.get("file_size") or 0)
+        for r in (resources or [])
+        if r.get("file") and int(r.get("file_size") or 0) < max_mem
+    ]
+    if not small_sizes:
+        return False
+    return len(small_sizes) >= max_files or sum(small_sizes) > max_mem
 # =========================
 # Public API: Overlap selection (with compaction triggers)
 # =========================

{supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer.py RENAMED Viewed

@@ -48,6 +48,16 @@ _PATCH_BUILD_TOMBSTONE = f"{_MOD}.build_tombstone_file"
 _PATCH_MIRROR = f"{_MOD}.MirrorFormats"
 _PATCH_GET_MON_LOGGER = f"{_MOD}.MonitoringWriter"
 _PATCH_UUID4 = f"{_MOD}.uuid.uuid4"
+# Auto-compaction step (Phase A drain + Phase B small-file merge) wired into
+# write().  The gate (should_compact_small_files) is left UNMOCKED so tests
+# drive the REAL threshold off the snapshot's resource list; the heavy merge
+# helpers and the stats writers (which would otherwise touch storage once files
+# are sunset) are mocked so the tests pin orchestration, not Parquet I/O.
+_PATCH_COMPACT_RES = f"{_MOD}.compact_resources"
+_PATCH_COMPACT_TOMB = f"{_MOD}.compact_tombstones"
+_PATCH_READ_PARQUET = f"{_MOD}._read_parquet_safe"
+_PATCH_EXTRACT_STATS = f"{_MOD}.extract_stats_rows"
+_PATCH_BUILD_STATS = f"{_MOD}.build_stats_file"
 # ---------------------------------------------------------------------------
@@ -1814,3 +1824,277 @@ class TestWriteOverwriteResolution:
         assert kwargs["newer_than_col"] == "ts"
         # The single returned delete pair drives the deleted count.
         assert result[3] == 1
+# ====================================================================
+# 12.  DataWriter.write — Inline Auto-Compaction (small-file gate)
+# ====================================================================
+def _small_resources(n: int, *, size: int = 80 * 1024) -> List[Dict]:
+    """N small-file resource dicts that trip should_compact_small_files'
+    REAL count gate (default MAX_OVERLAPPING_FILES=100) once n >= 100.
+    Only ``file`` / ``file_size`` matter — the gate ignores everything else,
+    and the merge helper is mocked so the files are never opened."""
+    return [
+        {"file": f"small_{i}.parquet", "file_size": size, "rows": 100}
+        for i in range(n)
+    ]
+def _mk_compaction_catalog():
+    cat = MagicMock()
+    cat.reserve_rowids.return_value = 0
+    cat.get_table_config.return_value = None   # → default limits (100 / 16MB)
+    cat.acquire_simple_lock.return_value = "t"
+    cat.release_simple_lock.return_value = True
+    cat.set_leaf_payload_cas.return_value = 1
+    cat.bump_root.return_value = 1
+    return cat
+class TestWriteAutoCompaction:
+    """The user's bug: small files accumulated forever because automatic
+    compaction was never wired into write() — only the manual compact()
+    entry point merged them.  These tests pin the inline step: the gate is
+    checked on every write, draining the deletion-vector FIRST (Phase A)
+    so the small-file merge (Phase B) can never sunset a vector-referenced
+    file, and the merged output folds into the SAME snapshot commit."""
+    @patch(_PATCH_COMPACT_RES)
+    @patch(_PATCH_COMPACT_TOMB)
+    @patch(_PATCH_READ_PARQUET)
+    @patch(_PATCH_BUILD_STATS)
+    @patch(_PATCH_EXTRACT_STATS)
+    @patch(_PATCH_BUILD_TOMBSTONE)
+    @patch(_PATCH_GET_MON_LOGGER)
+    @patch(_PATCH_MIRROR)
+    @patch(_PATCH_PROCESS_OVERLAP)
+    @patch(_PATCH_RESOLVE)
+    @patch(_PATCH_FIND_OVERLAP)
+    @patch(_PATCH_SIMPLE_TABLE)
+    @patch(_PATCH_CHECK_WRITE)
+    @patch(_PATCH_POLARS_FROM_ARROW)
+    @patch(_PATCH_REDIS_CATALOG)
+    @patch(_PATCH_SUPER_TABLE)
+    def test_gate_trips_append_merges_and_folds_into_snapshot(
+        self,
+        MockST, MockCat, mock_from_arrow, mock_check_write,
+        MockSimple, mock_find_overlap, mock_resolve, mock_process,
+        MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
+        mock_build_stats, mock_read_parquet, mock_compact_tomb,
+        mock_compact_res,
+    ):
+        """100 accumulated small files → REAL gate trips → compact_resources
+        runs once and its merged output / sunset set fold into the single
+        simple_table.update() commit.  No deletes ⇒ tombstone drain is a
+        no-op (nothing to orphan)."""
+        mock_st = MagicMock(super_name="s", organization="o")
+        MockST.return_value = mock_st
+        MockCat.return_value = _mk_compaction_catalog()
+        df = _polars_df({"id": [1], "ts": [100]})
+        mock_from_arrow.return_value = df
+        snap = {"resources": _small_resources(100)}
+        mock_simple = MagicMock(data_dir="/d")
+        mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
+        mock_simple.update.return_value = ({}, "/np")
+        MockSimple.return_value = mock_simple
+        mock_find_overlap.return_value = set()
+        # Pure append: rows survive, no delete pairs, no carried-forward vector.
+        mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
+        mock_build_tomb.return_value = (None, None)
+        # The just-written file lands in new_resources (the established pattern).
+        mock_process.side_effect = lambda **kw: kw["new_resources"].append(
+            {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
+        )
+        mock_extract_stats.return_value = MagicMock()
+        mock_build_stats.return_value = (None, None)
+        mock_get_mon.return_value = MagicMock()
+        # compact_resources merges EVERY live file into one and reports them
+        # all as sunset (computed from the snapshot it actually received).
+        def _merge(**kw):
+            live = kw["snapshot"]["resources"]
+            sunset = {r["file"] for r in live}
+            return (len(live), 10_100, [{"file": "merged.parquet",
+                                          "file_size": 8_000_000,
+                                          "rows": 10_100}], sunset)
+        mock_compact_res.side_effect = _merge
+        from supertable.data_writer import DataWriter
+        dw = DataWriter("s", "o")
+        result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
+        assert result is not None
+        # Gate tripped → merge ran exactly once; append had no vector to drain.
+        mock_compact_res.assert_called_once()
+        mock_compact_tomb.assert_not_called()
+        # The merge saw the 100 existing files plus the one just written.
+        merged_snapshot = mock_compact_res.call_args.kwargs["snapshot"]
+        assert len(merged_snapshot["resources"]) == 101
+        # Folded into the SAME commit: update() lists the merged file as the
+        # sole survivor and every consumed file as sunset.
+        new_resources_arg = mock_simple.update.call_args[0][0]
+        sunset_arg = mock_simple.update.call_args[0][1]
+        assert [r["file"] for r in new_resources_arg] == ["merged.parquet"]
+        assert "new.parquet" in sunset_arg
+        assert "small_0.parquet" in sunset_arg
+        assert "merged.parquet" not in sunset_arg
+    @patch(_PATCH_COMPACT_RES)
+    @patch(_PATCH_COMPACT_TOMB)
+    @patch(_PATCH_READ_PARQUET)
+    @patch(_PATCH_BUILD_STATS)
+    @patch(_PATCH_EXTRACT_STATS)
+    @patch(_PATCH_BUILD_TOMBSTONE)
+    @patch(_PATCH_GET_MON_LOGGER)
+    @patch(_PATCH_MIRROR)
+    @patch(_PATCH_PROCESS_OVERLAP)
+    @patch(_PATCH_RESOLVE)
+    @patch(_PATCH_FIND_OVERLAP)
+    @patch(_PATCH_SIMPLE_TABLE)
+    @patch(_PATCH_CHECK_WRITE)
+    @patch(_PATCH_POLARS_FROM_ARROW)
+    @patch(_PATCH_REDIS_CATALOG)
+    @patch(_PATCH_SUPER_TABLE)
+    def test_below_threshold_does_not_compact(
+        self,
+        MockST, MockCat, mock_from_arrow, mock_check_write,
+        MockSimple, mock_find_overlap, mock_resolve, mock_process,
+        MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
+        mock_build_stats, mock_read_parquet, mock_compact_tomb,
+        mock_compact_res,
+    ):
+        """A handful of small files stays under both the count and size
+        triggers, so the write commits without invoking either compaction
+        helper — auto-compaction must not run on every write, only when the
+        gate is open."""
+        mock_st = MagicMock(super_name="s", organization="o")
+        MockST.return_value = mock_st
+        MockCat.return_value = _mk_compaction_catalog()
+        df = _polars_df({"id": [1], "ts": [100]})
+        mock_from_arrow.return_value = df
+        snap = {"resources": _small_resources(5)}
+        mock_simple = MagicMock(data_dir="/d")
+        mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
+        mock_simple.update.return_value = ({}, "/np")
+        MockSimple.return_value = mock_simple
+        mock_find_overlap.return_value = set()
+        mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
+        mock_build_tomb.return_value = (None, None)
+        mock_process.side_effect = lambda **kw: kw["new_resources"].append(
+            {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
+        )
+        mock_extract_stats.return_value = MagicMock()
+        mock_build_stats.return_value = (None, None)
+        mock_get_mon.return_value = MagicMock()
+        from supertable.data_writer import DataWriter
+        dw = DataWriter("s", "o")
+        result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
+        assert result is not None
+        mock_compact_res.assert_not_called()
+        mock_compact_tomb.assert_not_called()
+        # Write still committed the freshly written file untouched.
+        new_resources_arg = mock_simple.update.call_args[0][0]
+        assert [r["file"] for r in new_resources_arg] == ["new.parquet"]
+        assert mock_simple.update.call_args[0][1] == set()
+    @patch(_PATCH_COMPACT_RES)
+    @patch(_PATCH_COMPACT_TOMB)
+    @patch(_PATCH_READ_PARQUET)
+    @patch(_PATCH_BUILD_STATS)
+    @patch(_PATCH_EXTRACT_STATS)
+    @patch(_PATCH_BUILD_TOMBSTONE)
+    @patch(_PATCH_GET_MON_LOGGER)
+    @patch(_PATCH_MIRROR)
+    @patch(_PATCH_PROCESS_OVERLAP)
+    @patch(_PATCH_RESOLVE)
+    @patch(_PATCH_FIND_OVERLAP)
+    @patch(_PATCH_SIMPLE_TABLE)
+    @patch(_PATCH_CHECK_WRITE)
+    @patch(_PATCH_POLARS_FROM_ARROW)
+    @patch(_PATCH_REDIS_CATALOG)
+    @patch(_PATCH_SUPER_TABLE)
+    def test_carried_forward_vector_drains_before_merge(
+        self,
+        MockST, MockCat, mock_from_arrow, mock_check_write,
+        MockSimple, mock_find_overlap, mock_resolve, mock_process,
+        MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
+        mock_build_stats, mock_read_parquet, mock_compact_tomb,
+        mock_compact_res,
+    ):
+        """The ordering invariant: when a live deletion-vector is carried
+        forward (build_tombstone_file returns a path but no fresh frame) and
+        the gate trips, Phase A must LOAD and drain that vector (compact_
+        tombstones) BEFORE Phase B merges small files (compact_resources).
+        Merging first could sunset a file the vector still references and
+        permanently orphan its dead rows."""
+        mock_st = MagicMock(super_name="s", organization="o")
+        MockST.return_value = mock_st
+        MockCat.return_value = _mk_compaction_catalog()
+        df = _polars_df({"id": [1], "ts": [100]})
+        mock_from_arrow.return_value = df
+        snap = {
+            "resources": _small_resources(100),
+            "tombstone": "/d/tombstone/dv.parquet",
+            "tombstone_rows": 50,
+        }
+        mock_simple = MagicMock(data_dir="/d")
+        mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
+        mock_simple.update.return_value = ({}, "/np")
+        MockSimple.return_value = mock_simple
+        mock_find_overlap.return_value = set()
+        mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
+        # Carry-forward: pointer reused, no fresh combined frame this write.
+        mock_build_tomb.return_value = ("/d/tombstone/dv.parquet", None)
+        mock_process.side_effect = lambda **kw: kw["new_resources"].append(
+            {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
+        )
+        # Phase A loads the live vector off its pointer to drain it.
+        mock_read_parquet.return_value = _polars_df(
+            {"__rowid__": list(range(50))}
+        )
+        mock_extract_stats.return_value = MagicMock()
+        mock_build_stats.return_value = (None, None)
+        mock_get_mon.return_value = MagicMock()
+        order: List[str] = []
+        def _drain(**kw):
+            order.append("tomb")
+            return (50, [{"file": "survivor.parquet",
+                          "file_size": 70 * 1024, "rows": 50}],
+                    {"small_0.parquet"})
+        mock_compact_tomb.side_effect = _drain
+        def _merge(**kw):
+            order.append("res")
+            return (10, 5_000, [{"file": "merged.parquet",
+                                 "file_size": 4_000_000, "rows": 5_000}],
+                    set())
+        mock_compact_res.side_effect = _merge
+        from supertable.data_writer import DataWriter
+        dw = DataWriter("s", "o")
+        result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
+        assert result is not None
+        # The carried-forward vector was read off its pointer to drain it.
+        mock_read_parquet.assert_called_once()
+        assert mock_read_parquet.call_args[0][0] == "/d/tombstone/dv.parquet"
+        # Both phases ran, drain strictly before merge.
+        mock_compact_tomb.assert_called_once()
+        mock_compact_res.assert_called_once()
+        assert order == ["tomb", "res"]

{supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing_compact_resources.py RENAMED Viewed

@@ -573,3 +573,96 @@ class TestRaceTolerance:
         # Output has only the present file's rows
         after = _read_all(patched_storage, [r["file"] for r in new_res])
         assert _multiset(after) == _multiset(df)
+# ---------------------------------------------------------------------------
+# should_compact_small_files — the auto-compaction gate predicate
+# ---------------------------------------------------------------------------
+#
+# Pure function (no I/O): decides whether write() should trigger an inline
+# small-file merge.  This is the gate the user's bug report was about —
+# 170×80KB files never compacted because nothing was *checking* it on write.
+# Two independent triggers, both measured over files SMALLER than the chunk
+# size (large files are already "done" and must never force a merge):
+#   (a) the small-file COUNT reaches max_overlapping_files, or
+#   (b) the small-file total BYTES exceed max_memory_chunk_size.
+def _res(file: str | None, size: int) -> dict:
+    return {"file": file, "file_size": size, "rows": 1}
+class TestShouldCompactSmallFiles:
+    def _limits(self):
+        from supertable.processing import _resolve_limits
+        return _resolve_limits(None)  # global defaults
+    def test_empty_resources_never_compacts(self):
+        from supertable.processing import should_compact_small_files
+        assert should_compact_small_files([]) is False
+        assert should_compact_small_files(None) is False
+    def test_count_trigger_at_threshold(self):
+        from supertable.processing import should_compact_small_files
+        max_mem, max_files = self._limits()
+        # Each file small enough that the BYTE trigger stays dormant, so this
+        # isolates the COUNT trigger: sum = max_files*s = max_mem/2 <= max_mem.
+        s = max_mem // (max_files * 2)
+        at = [_res(f"f{i}.parquet", s) for i in range(max_files)]
+        below = at[:-1]
+        assert should_compact_small_files(below) is False  # max_files-1
+        assert should_compact_small_files(at) is True       # == max_files
+    def test_size_trigger_below_count(self):
+        from supertable.processing import should_compact_small_files
+        max_mem, max_files = self._limits()
+        # 5 files, each a quarter-chunk (< chunk, so "small") → 1.25 chunks
+        # total: the BYTE trigger fires even though count is far below max_files.
+        s = max_mem // 4
+        res = [_res(f"f{i}.parquet", s) for i in range(5)]
+        assert len(res) < max_files
+        assert should_compact_small_files(res) is True
+    def test_size_trigger_is_strict_greater_than(self):
+        from supertable.processing import should_compact_small_files
+        max_mem, _ = self._limits()
+        # Exactly == max_mem must NOT trip (boundary): two half-chunk files.
+        res = [_res("a.parquet", max_mem // 2), _res("b.parquet", max_mem // 2)]
+        assert sum(r["file_size"] for r in res) == max_mem
+        assert should_compact_small_files(res) is False
+    def test_large_files_are_ignored(self):
+        from supertable.processing import should_compact_small_files
+        max_mem, max_files = self._limits()
+        # Files >= chunk size are "already compacted": even max_files+50 of
+        # them must NOT trigger a merge (they are not small).
+        big = [_res(f"b{i}.parquet", max_mem) for i in range(max_files + 50)]
+        assert should_compact_small_files(big) is False
+        # A handful of small files mixed in stays below both triggers.
+        mixed = big + [_res(f"s{i}.parquet", 80 * 1024) for i in range(5)]
+        assert should_compact_small_files(mixed) is False
+    def test_per_table_config_overrides_global_count(self):
+        from supertable.processing import should_compact_small_files
+        cfg = {"max_overlapping_files": 10}
+        small = [_res(f"f{i}.parquet", 80 * 1024) for i in range(10)]
+        assert should_compact_small_files(small, cfg) is True
+        assert should_compact_small_files(small[:-1], cfg) is False
+    def test_resource_without_file_key_is_skipped(self):
+        from supertable.processing import should_compact_small_files
+        _, max_files = self._limits()
+        # Entries lacking a ``file`` path are not real files → ignored, even
+        # at max_files of them (guards against directory/placeholder rows).
+        phantom = [_res(None, 80 * 1024) for _ in range(max_files)]
+        assert should_compact_small_files(phantom) is False
+    def test_missing_file_size_does_not_crash(self):
+        from supertable.processing import should_compact_small_files
+        # ``file_size`` absent/None coerces to 0 (counts toward the COUNT
+        # trigger but contributes no bytes) — must never raise.
+        _, max_files = self._limits()
+        no_size = [{"file": f"f{i}.parquet"} for i in range(max_files)]
+        assert should_compact_small_files(no_size) is True  # count trigger
+        assert should_compact_small_files(no_size[:1]) is False

{supertable-2.3.2 → supertable-2.3.3/supertable.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: supertable
-Version: 2.3.2
+Version: 2.3.3
 Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
 Author: Levente Kupas
 Author-email: Levente Kupas <lkupas@kladnasoft.com>