PyPI - core-lens - Versions diffs - 0.1.dev89__tar.gz → 0.1.dev97__tar.gz - Mend

core-lens 0.1.dev89__tar.gz → 0.1.dev97__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/.gitignore RENAMED Viewed

@@ -367,3 +367,14 @@ context.md
 # Data
 data/
+# Profiling results
+scalene*.*
+# Results
+*.html
+output_*.parquet
+output_*.csv
+output_*.geojson
+output_*.geoparquet
+output_*.json

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: core-lens
-Version: 0.1.dev89
+Version: 0.1.dev97
 Summary: Query, analyse, and visualise CoreStack's microwatershed and Earth science data through a clean, composable Python API.
 Project-URL: Homepage, https://github.com/ApoorvaKashyap/core-lens
 Project-URL: Issues, https://github.com/ApoorvaKashyap/core-lens/issues
@@ -17,8 +17,8 @@ Requires-Python: >=3.13
 Requires-Dist: core-lens[core]
 Requires-Dist: core-lens[spatial]
 Provides-Extra: core
-Requires-Dist: polars<2,>=1.40.1; extra == 'core'
-Requires-Dist: pyarrow<25,>=24.0.0; extra == 'core'
+Requires-Dist: polars<2,>=1.39.0; extra == 'core'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'core'
 Requires-Dist: pydantic<3,>=2.13.3; extra == 'core'
 Provides-Extra: full
 Requires-Dist: duckdb<2,>=1.5.3; extra == 'full'
@@ -26,24 +26,45 @@ Requires-Dist: geopandas<2,>=1.1.3; extra == 'full'
 Requires-Dist: lonboard<1,>=0.16.0; extra == 'full'
 Requires-Dist: matplotlib<4,>=3.10.9; extra == 'full'
 Requires-Dist: plotly<7,>=6.7.0; extra == 'full'
-Requires-Dist: polars<2,>=1.40.1; extra == 'full'
-Requires-Dist: pyarrow<25,>=24.0.0; extra == 'full'
+Requires-Dist: polars<2,>=1.39.0; extra == 'full'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'full'
 Requires-Dist: pydantic<3,>=2.13.3; extra == 'full'
 Requires-Dist: pyproj<4,>=3.7.2; extra == 'full'
 Requires-Dist: scipy<2,>=1.17.1; extra == 'full'
 Requires-Dist: shapely<3,>=2.1.2; extra == 'full'
 Requires-Dist: statsmodels<1,>=0.14.6; extra == 'full'
+Provides-Extra: full-gpu
+Requires-Dist: cudf-cu13==26.6.*; extra == 'full-gpu'
+Requires-Dist: cudf-polars-cu13==26.6.*; extra == 'full-gpu'
+Requires-Dist: duckdb<2,>=1.5.3; extra == 'full-gpu'
+Requires-Dist: geopandas<2,>=1.1.3; extra == 'full-gpu'
+Requires-Dist: lonboard<1,>=0.16.0; extra == 'full-gpu'
+Requires-Dist: matplotlib<4,>=3.10.9; extra == 'full-gpu'
+Requires-Dist: plotly<7,>=6.7.0; extra == 'full-gpu'
+Requires-Dist: polars<2,>=1.39.0; extra == 'full-gpu'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'full-gpu'
+Requires-Dist: pydantic<3,>=2.13.3; extra == 'full-gpu'
+Requires-Dist: pyproj<4,>=3.7.2; extra == 'full-gpu'
+Requires-Dist: scipy<2,>=1.17.1; extra == 'full-gpu'
+Requires-Dist: shapely<3,>=2.1.2; extra == 'full-gpu'
+Requires-Dist: statsmodels<1,>=0.14.6; extra == 'full-gpu'
+Provides-Extra: gpu
+Requires-Dist: cudf-cu13==26.6.*; extra == 'gpu'
+Requires-Dist: cudf-polars-cu13==26.6.*; extra == 'gpu'
+Requires-Dist: polars<2,>=1.39.0; extra == 'gpu'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'gpu'
+Requires-Dist: pydantic<3,>=2.13.3; extra == 'gpu'
 Provides-Extra: spatial
 Requires-Dist: duckdb<2,>=1.5.3; extra == 'spatial'
 Requires-Dist: geopandas<2,>=1.1.3; extra == 'spatial'
-Requires-Dist: polars<2,>=1.40.1; extra == 'spatial'
-Requires-Dist: pyarrow<25,>=24.0.0; extra == 'spatial'
+Requires-Dist: polars<2,>=1.39.0; extra == 'spatial'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'spatial'
 Requires-Dist: pydantic<3,>=2.13.3; extra == 'spatial'
 Requires-Dist: pyproj<4,>=3.7.2; extra == 'spatial'
 Requires-Dist: shapely<3,>=2.1.2; extra == 'spatial'
 Provides-Extra: stats
-Requires-Dist: polars<2,>=1.40.1; extra == 'stats'
-Requires-Dist: pyarrow<25,>=24.0.0; extra == 'stats'
+Requires-Dist: polars<2,>=1.39.0; extra == 'stats'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'stats'
 Requires-Dist: pydantic<3,>=2.13.3; extra == 'stats'
 Requires-Dist: scipy<2,>=1.17.1; extra == 'stats'
 Requires-Dist: statsmodels<1,>=0.14.6; extra == 'stats'
@@ -51,8 +72,8 @@ Provides-Extra: viz
 Requires-Dist: lonboard<1,>=0.16.0; extra == 'viz'
 Requires-Dist: matplotlib<4,>=3.10.9; extra == 'viz'
 Requires-Dist: plotly<7,>=6.7.0; extra == 'viz'
-Requires-Dist: polars<2,>=1.40.1; extra == 'viz'
-Requires-Dist: pyarrow<25,>=24.0.0; extra == 'viz'
+Requires-Dist: polars<2,>=1.39.0; extra == 'viz'
+Requires-Dist: pyarrow<25,>=23.0.0; extra == 'viz'
 Requires-Dist: pydantic<3,>=2.13.3; extra == 'viz'
 Description-Content-Type: text/markdown

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/concepts.md RENAMED Viewed

@@ -4,4 +4,4 @@ CoreLens is built on three main layers:
 1. **AoI (Area of Interest)**: The primary entry point. It represents a spatial boundary and acts as a gateway to all registered entities that intersect that boundary.
 2. **View**: A lazy, immutable query definition. Filtering by attributes (`where`), space (`spatial_filter`), or time (`between`) returns a new `View` without reading any Parquet data.
-3. **Result**: The materialised data. Accessing `.static`, `.annual`, or `.fortnightly` on a `View` executes the query and returns a `Result` object. All statistical operations, aggregations, and plotting are done on `Result` objects.
+3. **Result**: The materialised data. Accessing `.static`, `.annual`, or `.fortnightly` on a `View` executes the query using the Polars streaming engine (automatically routed to the GPU if RAPIDS `cudf-polars` is installed) and returns a `Result` object. All statistical operations, aggregations, and plotting are done on `Result` objects.

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/intro.md RENAMED Viewed

@@ -8,6 +8,7 @@ CoreLens provides a unified interface over microwatersheds, administrative bound
 - **Area of Interest (AoI) First**: Define your spatial boundary once and instantly access all underlying entities (microwatersheds, villages, tehsils) scoped to that boundary.
 - **Lazy Evaluation**: Uses Polars for lazy evaluation and predicate pushdown. Data is only read from Parquet files when explicitly materialised.
+- **GPU Acceleration**: Zero-code GPU acceleration for query execution and aggregations via NVIDIA RAPIDS (`cudf-polars`), capable of automatically routing compatible queries to the GPU.
 - **Pluggable Entities**: Built-in support for standard units (MWS, Tehsil) with a simple plugin architecture for adding new domain entities.
 - **Temporal & Seasonal Awareness**: Native support for agronomic seasons (Kharif, Rabi, Zaid) and time-range filtering.
 - **Spatial Statistics & Analysis**: Built-in methods for anomaly detection, spatial similarity, temporal correlation, and hypothesis testing.

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/quickstart.md RENAMED Viewed

@@ -6,6 +6,12 @@
 pip install core-lens
 ```
+To enable GPU-accelerated queries (requires an NVIDIA GPU and Linux), install with the `gpu` extra. You will need to configure your package manager to use the NVIDIA PyPI index for RAPIDS dependencies:
+```bash
+pip install "core-lens[gpu]" --extra-index-url=https://pypi.nvidia.com
+```
 ## Basic Usage
 Before querying, you must register the entities you plan to use:

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/examples/demo_tehsil.py RENAMED Viewed

@@ -27,6 +27,11 @@ import polars as pl
 from core_lens import AoI, SeasonConfig
 from core_lens.entities import TehsilEntity
 from core_lens.export import geoparquet
+from core_lens.base.namespaces.stats import (
+    CorrelateMethod,
+    SimilarityMethod,
+    TestMethod,
+)
 import shapely.geometry as sgeom
@@ -163,7 +168,7 @@ print(desc_by_entity.df().head(5))
 corr = result_with_area.stats.correlate(
     columns=["area_km2", "Shape_Leng", "compactness"],
-    method="spearman",
+    method=CorrelateMethod.SPEARMAN,
     across="entity",
 )
 print("\nCorrelation (Spearman):")
@@ -180,7 +185,7 @@ try:
     test_result = result_with_area.stats.test(
         column="area_km2",
         groups="STATE",
-        method="mann-whitney",
+        method=TestMethod.MANN_WHITNEY,
     )
     print("\nHypothesis test — area by STATE:")
     print(test_result.df())
@@ -196,7 +201,7 @@ except IndexError:
 test_vs_ref = result_with_area.stats.test(
     column="area_km2",
     against=500.0,  # reference area in km²
-    method="t-test",
+    method=TestMethod.T_TEST,
 )
 print("\nOne-sample t-test vs 500 km²:")
 print(test_vs_ref.metadata)
@@ -227,7 +232,7 @@ sim = result_with_area.stats.similarity(
         "Shape_Leng": None,
         "compactness": None,
     },
-    method="euclidean",
+    method=SimilarityMethod.EUCLIDEAN,
     top_n=5,
 )
 print(f"\nMost similar tehsils to {target_id}:")

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/pyproject.toml RENAMED Viewed

@@ -44,7 +44,7 @@ venvPath = "."
 venv = ".venv"
 [project.optional-dependencies]
-core = ["polars>=1.40.1,<2", "pyarrow>=24.0.0,<25", "pydantic>=2.13.3,<3"]
+core = ["polars>=1.39.0,<2", "pyarrow>=23.0.0,<25", "pydantic>=2.13.3,<3"]
 spatial = [
     "core-lens[core]",
     "duckdb>=1.5.3,<2",
@@ -59,7 +59,9 @@ viz = [
     "plotly>=6.7.0,<7",
 ]
 stats = ["core-lens[core]", "scipy>=1.17.1,<2", "statsmodels>=0.14.6,<1"]
+gpu = ["core-lens[core]", "cudf-cu13==26.6.*", "cudf-polars-cu13==26.6.*"]
 full = ["core-lens[spatial,viz,stats]"]
+full-gpu = ["core-lens[spatial,viz,stats,gpu]"]
 [dependency-groups]
 dev = [
@@ -71,6 +73,7 @@ dev = [
     "pytest-cov>=7.1.0",
     "python-semantic-release>=10.5.3",
     "ruff>=0.15.12",
+    "scalene>=2.3.0",
     "twine>=6.2.0",
 ]
 types = ["types-geopandas>=1.1.3.20260518", "types-shapely>=2.1.0.20260518"]
@@ -95,3 +98,12 @@ disallow_incomplete_defs = false
 [[tool.mypy.overrides]]
 module = ["plotly.*"]
 ignore_missing_imports = true
+[[tool.uv.index]]
+name = "nvidia"
+url = "https://pypi.nvidia.com"
+explicit = true
+[tool.uv.sources]
+cudf-cu13 = { index = "nvidia" }
+cudf-polars-cu13 = { index = "nvidia" }

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/_version.py RENAMED Viewed

@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
 commit_id: str | None
 __commit_id__: str | None
-__version__ = version = '0.1.dev89'
-__version_tuple__ = version_tuple = (0, 1, 'dev89')
+__version__ = version = '0.1.dev97'
+__version_tuple__ = version_tuple = (0, 1, 'dev97')
 __commit_id__ = commit_id = None

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/stats.py RENAMED Viewed

@@ -10,10 +10,31 @@ from typing import TYPE_CHECKING, Any, cast
 import numpy as np
 import polars as pl
+from core_lens.utils.polars_utils import collect_lf
 if TYPE_CHECKING:
     from core_lens.base.result import Result
+def _sf(x: object) -> float:
+    """Narrow a polars scalar (mean/std/median/quantile return type) to float.
+    Polars returns a wide union that includes non-numeric types; mypy cannot
+    narrow it automatically. This helper converts the value with the built-in
+    ``float()`` and returns the result, or ``nan`` when the value is ``None``
+    (e.g. the aggregation of an empty series).
+    Args:
+        x: The scalar value returned by a Polars aggregation.
+    Returns:
+        float: The numeric value as a float, or NaN if the input is None.
+    Raises:
+        TypeError: If ``x`` is of a type that ``float()`` cannot convert.
+        ValueError: If ``x`` is a string that does not parse as a number.
+    """
+    # None is mapped to NaN explicitly because float(None) would raise.
+    if x is None:
+        return float("nan")
+    # The ignore is needed because ``object`` is wider than float() accepts.
+    return float(x)  # type: ignore[arg-type]
 class CorrelateMethod(Enum):
     """Correlation methods.
@@ -320,13 +341,14 @@ class StatsNamespace:
                     f"StatsNamespace.test: method={method!r} is not valid for single-sample test against a reference value. "
                     "Valid options: 't-test', 'wilcoxon'."
                 )
+            _s = pl.Series(all_vals)
             data = pl.DataFrame(
                 {
                     "group": ["all"],
                     "n": [len(all_vals)],
-                    "mean": [float(np.mean(all_vals))],
-                    "std": [float(np.std(all_vals, ddof=1))],
-                    "median": [float(np.median(all_vals))],
+                    "mean": [_sf(_s.mean())],
+                    "std": [_sf(_s.std(ddof=1))],
+                    "median": [_sf(_s.median())],
                 }
             )
             metadata: dict[str, Any] = {
@@ -352,9 +374,11 @@ class StatsNamespace:
                 {
                     "group": str(g),
                     "n": len(a),
-                    "mean": float(np.mean(a)) if len(a) else float("nan"),
-                    "std": float(np.std(a, ddof=1)) if len(a) > 1 else float("nan"),
-                    "median": float(np.median(a)) if len(a) else float("nan"),
+                    "mean": _sf(pl.Series(a).mean()) if len(a) else float("nan"),
+                    "std": _sf(pl.Series(a).std(ddof=1))
+                    if len(a) > 1
+                    else float("nan"),
+                    "median": _sf(pl.Series(a).median()) if len(a) else float("nan"),
                 }
                 for g, a in zip(group_names, arrays)
             ]
@@ -388,9 +412,11 @@ class StatsNamespace:
                 {
                     "group": f"{p[0]}-{p[1]}",
                     "n": len(a),
-                    "mean": float(np.mean(a)) if len(a) else float("nan"),
-                    "std": float(np.std(a, ddof=1)) if len(a) > 1 else float("nan"),
-                    "median": float(np.median(a)) if len(a) else float("nan"),
+                    "mean": _sf(pl.Series(a).mean()) if len(a) else float("nan"),
+                    "std": _sf(pl.Series(a).std(ddof=1))
+                    if len(a) > 1
+                    else float("nan"),
+                    "median": _sf(pl.Series(a).median()) if len(a) else float("nan"),
                 }
                 for p, a in zip(periods, arrays)
             ]
@@ -573,8 +599,9 @@ class StatsNamespace:
             all_vals = df[column].to_numpy().astype(float)
             if method is AnomalyCrossMethod.ZSCORE:
-                mean = float(np.nanmean(ref_vals))
-                std = float(np.nanstd(ref_vals, ddof=1))
+                _rs = pl.Series(ref_vals)
+                mean = _sf(_rs.mean())
+                std = _sf(_rs.std(ddof=1))
                 scores = (all_vals - mean) / (std or 1.0)
                 flags = np.abs(scores) > threshold
                 meta: dict[str, Any] = {
@@ -586,41 +613,43 @@ class StatsNamespace:
                 }
             elif method is AnomalyCrossMethod.IQR:
-                q1 = float(np.nanpercentile(ref_vals, 25))
-                q3 = float(np.nanpercentile(ref_vals, 75))
+                _rs = pl.Series(ref_vals)
+                q1 = _sf(_rs.quantile(0.25))
+                q3 = _sf(_rs.quantile(0.75))
                 iqr = q3 - q1
                 lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
-                med = float(np.nanmedian(ref_vals))
+                med = _sf(_rs.median())
                 scores = (all_vals - med) / (iqr or 1.0)
                 flags = (all_vals < lo) | (all_vals > hi)
                 meta = {
                     "mode": "cross_sectional",
                     "method": "iqr",
                     "baseline": baseline,
-                    "baseline_mean": float(np.nanmean(ref_vals)),
+                    "baseline_mean": _sf(_rs.mean()),
                     "q1": q1,
                     "q3": q3,
                     "iqr": iqr,
                 }
             elif method is AnomalyCrossMethod.PERCENTILE:
-                lo = float(np.nanpercentile(ref_vals, 5))
-                hi = float(np.nanpercentile(ref_vals, 95))
-                med = float(np.nanmedian(ref_vals))
-                std = float(np.nanstd(ref_vals)) or 1.0
+                _rs = pl.Series(ref_vals)
+                lo = _sf(_rs.quantile(0.05))
+                hi = _sf(_rs.quantile(0.95))
+                med = _sf(_rs.median())
+                std = _sf(_rs.std()) or 1.0
                 scores = (all_vals - med) / std
                 flags = (all_vals < lo) | (all_vals > hi)
                 meta = {
                     "mode": "cross_sectional",
                     "method": "percentile",
                     "baseline": baseline,
-                    "baseline_mean": float(np.nanmean(ref_vals)),
+                    "baseline_mean": _sf(_rs.mean()),
                     "lower_pct": lo,
                     "upper_pct": hi,
                 }
             else:  # threshold
-                mean = float(np.nanmean(ref_vals))
+                mean = _sf(pl.Series(ref_vals).mean())
                 scores = all_vals - mean
                 flags = np.abs(scores) > threshold
                 meta = {
@@ -680,15 +709,17 @@ class StatsNamespace:
                 ts_flags: list[bool] = []
                 if method is AnomalyTsMethod.MAD:
-                    med = float(np.median(base_vals))
-                    mad = float(np.median(np.abs(base_vals - med)))
+                    _bs = pl.Series(base_vals)
+                    med = _sf(_bs.median())
+                    mad = _sf(pl.Series(np.abs(base_vals - med)).median())
                     scale = (mad * 1.4826) or 1.0
                     ts_scores = [(v - med) / scale for v in eval_vals]
                     ts_flags = [abs(s) > threshold for s in ts_scores]
                 elif method is AnomalyTsMethod.CUSUM:
-                    mean = float(np.mean(base_vals))
-                    std = float(np.std(base_vals, ddof=1)) or 1.0
+                    _bs = pl.Series(base_vals)
+                    mean = _sf(_bs.mean())
+                    std = _sf(_bs.std(ddof=1)) or 1.0
                     k, h = 0.5 * std, threshold * std
                     cp, cn = 0.0, 0.0
                     for v in eval_vals:
@@ -713,7 +744,7 @@ class StatsNamespace:
                         base_len = len(base_vals)
                         base_resid = resid[:base_len]
                         eval_resid = resid[base_len : base_len + len(eval_years)]
-                        std = float(np.std(base_resid, ddof=1)) or 1.0
+                        std = _sf(pl.Series(base_resid).std(ddof=1)) or 1.0
                         ts_scores = [float(r / std) for r in eval_resid]
                         ts_flags = [abs(s) > threshold for s in ts_scores]
                     except Exception:
@@ -755,7 +786,7 @@ class StatsNamespace:
                 "mode": "timeseries",
                 "method": method.value if method is not None else None,
                 "baseline": baseline,
-                "baseline_mean": float(np.mean(global_base_vals))
+                "baseline_mean": _sf(pl.Series(global_base_vals).mean())
                 if len(global_base_vals) > 0
                 else float("nan"),
                 "baseline_fitted": True,
@@ -895,13 +926,12 @@ class StatsNamespace:
             if resolution_str == "static":
                 # Static: no grouping needed — one row per entity.
-                fetched = col_lf.select([key, col_name]).collect()
+                fetched = collect_lf(col_lf.select([key, col_name]))
             else:
-                fetched = col_lf.group_by(key).agg(agg_expr).collect()
+                fetched = collect_lf(col_lf.group_by(key).agg(agg_expr))
             feat = feat.join(fetched.select([key, col_name]), on=key, how="left")
-        # ------------------------------------------------------------------
         feature_cols = [c for c in feat.columns if c != key]
         if not feature_cols:

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/result.py RENAMED Viewed

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
 import polars as pl
 from core_lens.schema.profile import Resolution
+from core_lens.utils.polars_utils import collect_lf
 if TYPE_CHECKING:
     import geopandas as gpd
@@ -101,10 +102,8 @@ class Result:
         import shapely.wkb as wkb
         geometry_col = self.entity.geometry_col
-        geo_series = gpd.GeoSeries(
-            self.data[geometry_col].map_elements(wkb.loads, return_dtype=pl.Object),
-            crs="EPSG:4326",
-        )
+        geometries = [wkb.loads(b) for b in self.data[geometry_col].to_list()]
+        geo_series = gpd.GeoSeries(geometries, crs="EPSG:4326")
         return gpd.GeoDataFrame(
             self.data.drop(geometry_col).to_pandas(),
             geometry=geo_series,
@@ -145,7 +144,7 @@ class Result:
         key_cols = self.key_cols
         static_path = self.entity._resolve(self.entity.static_path)
-        geo_df = (
+        geo_df = collect_lf(
             pl.scan_parquet(static_path)
             .select(key_cols + [geom_col])
             .filter(
@@ -153,7 +152,6 @@ class Result:
                 if len(key_cols) == 1
                 else pl.lit(True)
             )
-            .collect()
         )
         joined = self.data.join(geo_df, on=key_cols, how="left")

{core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/view.py RENAMED Viewed

@@ -9,7 +9,7 @@ import polars as pl
 from enum import Enum
 from core_lens.schema.profile import Resolution
-from core_lens.utils.polars_utils import scan_with_key_filter
+from core_lens.utils.polars_utils import scan_with_key_filter, collect_lf
 if TYPE_CHECKING:
     import shapely
@@ -384,7 +384,7 @@ class View:
             key_values=self.keys,
             time_expr=time_expr,
         )
-        data = lf.collect()
+        data = collect_lf(lf)
         # For fortnightly results, inject temporal grouping columns so that
         # aggregate(by="year"), aggregate(by="season"), etc. work out of the

core_lens-0.1.dev97/src/core_lens/utils/polars_utils.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Polars scan helpers with predicate pushdown for entity materialisation."""
+from __future__ import annotations
+import polars as pl
+_GPU_AVAILABLE: bool | None = None  # None = not yet probed
+def _gpu_available() -> bool:
+    """Return ``True`` if ``cudf_polars`` is importable (RAPIDS GPU backend).
+    The import is attempted only while the module-level ``_GPU_AVAILABLE``
+    flag is still ``None``; the outcome is stored in that flag and returned
+    directly on later calls, so subsequent invocations are effectively free.
+    The probing call also prints a three-line banner to stdout stating
+    whether GPU or CPU mode was selected.
+    Only ``ModuleNotFoundError`` is handled; any other exception raised while
+    importing ``cudf_polars`` propagates to the caller.
+    Returns:
+        bool: The current value of the cached ``_GPU_AVAILABLE`` flag.
+    """
+    global _GPU_AVAILABLE
+    if _GPU_AVAILABLE is None:  # None means the probe has not run yet
+        try:
+            import cudf_polars  # noqa: F401  # type: ignore[import-untyped]
+            _GPU_AVAILABLE = True
+            gpu = "cudf_polars (RAPIDS)"
+            # NOTE(review): banner goes to stdout via print(); a library would
+            # normally use the logging module so callers can silence it.
+            print("=" * 50)
+            print(f"GPU : {gpu} found. Running in GPU mode.")
+            print("=" * 50)
+        except ModuleNotFoundError:
+            # NOTE(review): a package that is installed but fails to import
+            # for another reason (plain ImportError/OSError) is not caught
+            # here — confirm whether that should also select CPU mode.
+            _GPU_AVAILABLE = False
+            print("=" * 50)
+            print("GPU : None found. Running in CPU mode.")
+            print("=" * 50)
+    return _GPU_AVAILABLE
+def collect_lf(lf: pl.LazyFrame) -> pl.DataFrame:
+    """Collect a ``LazyFrame`` using the best available backend.
+    * **GPU present** — executes via the RAPIDS ``cudf_polars`` streaming
+      engine (``GPUEngine(executor="streaming")``).  Handles datasets larger
+      than VRAM through data partitioning.
+    * **No GPU** — falls back to Polars' built-in CPU streaming executor
+      (``collect(engine="streaming")``), which keeps memory usage low for
+      large Parquet scans.
+    * **GPU runtime failure** — if the GPU collect raises a
+      ``pl.exceptions.ComputeError`` whose message mentions ``cuda`` or
+      ``nvml`` (case-insensitive), the module-level ``_GPU_AVAILABLE`` flag
+      is set to ``False``, a banner is printed to stdout, and the frame is
+      collected on the CPU streaming engine instead.  Later calls then skip
+      the GPU path.
+    Use this function for all *data* scans (materialisation, geometry joins,
+    similarity fetches).  Tiny *index* scans that feed into subsequent
+    in-process joins should stay as bare ``.collect()`` calls — the streaming
+    path can occasionally change row ordering in ways that break those joins.
+    Args:
+        lf: The lazy frame to collect.
+    Returns:
+        A materialised ``pl.DataFrame``.
+    Raises:
+        pl.exceptions.ComputeError: Re-raised from the GPU path when the
+            message mentions neither ``cuda`` nor ``nvml``.
+    """
+    global _GPU_AVAILABLE
+    if _gpu_available():
+        engine = pl.GPUEngine(executor="streaming")
+        try:
+            result = lf.collect(engine=engine)
+            # Type narrowing only; note that asserts are stripped under -O.
+            assert isinstance(result, pl.DataFrame)
+            return result
+        except pl.exceptions.ComputeError as e:
+            # Only CUDA/NVML-related failures trigger the CPU fallback.
+            if "cuda" in str(e).lower() or "nvml" in str(e).lower():
+                # Disable the GPU path for the rest of the process.
+                _GPU_AVAILABLE = False
+                print("=" * 50)
+                print(f"GPU runtime error ({e}). Falling back to CPU mode.")
+                print("=" * 50)
+            else:
+                raise
+    # Reached when no GPU backend is available or the GPU attempt fell back.
+    return lf.collect(engine="streaming")
+def scan_with_key_filter(
+    path: str,
+    key_cols: list[str],
+    key_values: pl.DataFrame,
+    time_expr: pl.Expr | None = None,
+) -> pl.LazyFrame:
+    """Return a ``pl.LazyFrame`` filtered to the given keys and optional time range.
+    Uses ``pl.scan_parquet`` with two filter layers that Polars can push down
+    to the Parquet reader:
+    1. **Key filter** — restricts to entity instances whose key column(s) are
+       in ``key_values``.  For a single-column key this is one ``is_in``
+       predicate.  For composite keys each column is filtered independently
+       with its own ``is_in`` predicate, which can over-select combinations;
+       this function does not prune them — an exact join on the key columns
+       is left to the caller.
+    2. **Time filter** — an optional Polars expression applied as a further
+       ``filter`` call on the lazy frame.
+    Nothing is read here; the returned frame is still lazy.
+    Args:
+        path: Path to a Parquet file, passed to ``pl.scan_parquet``.
+        key_cols: Column name(s) that form the entity's unique key.
+        key_values: A ``pl.DataFrame`` containing the key column(s) with the
+            values to retain; each column in ``key_cols`` is read from it.
+        time_expr: An optional Polars filter expression for the time column,
+            as produced by :func:`~core_lens.utils.season.resolve_time_filter`.
+    Returns:
+        A ``pl.LazyFrame`` ready to be ``.collect()``-ed.
+    """
+    lf = pl.scan_parquet(path)
+    if len(key_cols) == 1:
+        key = key_cols[0]
+        values = key_values[key].to_list()
+        lf = lf.filter(pl.col(key).is_in(values))
+    else:
+        # Composite key: filter each column independently. A small over-selection
+        # is acceptable provided the caller follows up with an exact join on the
+        # key columns (that join is not performed in this function).
+        # NOTE(review): this loop applies the same per-column is_in filter as
+        # the single-key branch above, so the two branches could be merged.
+        for key in key_cols:
+            values = key_values[key].to_list()
+            lf = lf.filter(pl.col(key).is_in(values))
+    if time_expr is not None:
+        lf = lf.filter(time_expr)
+    return lf

core-lens 0.1.dev89__tar.gz → 0.1.dev97__tar.gz

core-lens 0.1.dev89__tar.gz → 0.1.dev97__tar.gz