core-lens 0.1.dev89__tar.gz → 0.1.dev97__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.gitignore +11 -0
  2. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/PKG-INFO +32 -11
  3. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/concepts.md +1 -1
  4. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/intro.md +1 -0
  5. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/quickstart.md +6 -0
  6. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/examples/demo_tehsil.py +9 -4
  7. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/pyproject.toml +13 -1
  8. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/_version.py +2 -2
  9. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/stats.py +60 -30
  10. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/result.py +4 -6
  11. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/view.py +2 -2
  12. core_lens-0.1.dev97/src/core_lens/utils/polars_utils.py +119 -0
  13. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/uv.lock +674 -106
  14. core_lens-0.1.dev89/src/core_lens/utils/polars_utils.py +0 -54
  15. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/pull_request_template.md +0 -0
  16. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/ci.yml +0 -0
  17. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/gh-pages.yml +0 -0
  18. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/pre-release.yml +0 -0
  19. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/release.yml +0 -0
  20. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.gitmessage +0 -0
  21. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.pre-commit-config.yaml +0 -0
  22. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.python-version +0 -0
  23. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/CONTRIBUTING.md +0 -0
  24. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/LICENSE +0 -0
  25. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/README.md +0 -0
  26. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/SKILLS.md +0 -0
  27. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/Makefile +0 -0
  28. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/make.bat +0 -0
  29. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/conf.py +0 -0
  30. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/index.rst +0 -0
  31. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/plots.md +0 -0
  32. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/plugins.md +0 -0
  33. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/queries.md +0 -0
  34. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/stats.md +0 -0
  35. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/examples/demo_mws.py +0 -0
  36. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/hooks/mypy.sh +0 -0
  37. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/hooks/no-parquet-outside-fixtures.sh +0 -0
  38. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/hooks/pytest.sh +0 -0
  39. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/__init__.py +0 -0
  40. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/__main__.py +0 -0
  41. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/aoi.py +0 -0
  42. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/__init__.py +0 -0
  43. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/entity.py +0 -0
  44. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/__init__.py +0 -0
  45. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/plot.py +0 -0
  46. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/entities/__init__.py +0 -0
  47. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/entities/mws.py +0 -0
  48. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/entities/tehsil.py +0 -0
  49. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/export/__init__.py +0 -0
  50. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/export/formats.py +0 -0
  51. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/py.typed +0 -0
  52. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/schema/__init__.py +0 -0
  53. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/schema/detection.py +0 -0
  54. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/schema/profile.py +0 -0
  55. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/utils/__init__.py +0 -0
  56. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/utils/season.py +0 -0
  57. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/utils/spatial.py +0 -0
  58. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/fixtures/generate_fixtures.py +0 -0
  59. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/conftest.py +0 -0
  60. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_aoi.py +0 -0
  61. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_entities.py +0 -0
  62. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_entity.py +0 -0
  63. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_export.py +0 -0
  64. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_main.py +0 -0
  65. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_plot.py +0 -0
  66. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_polars_utils.py +0 -0
  67. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_profile.py +0 -0
  68. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_result.py +0 -0
  69. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_schema_detection.py +0 -0
  70. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_schema_profile.py +0 -0
  71. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_season.py +0 -0
  72. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_season_config.py +0 -0
  73. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_spatial.py +0 -0
  74. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_stats.py +0 -0
  75. {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_view.py +0 -0
@@ -367,3 +367,14 @@ context.md
367
367
 
368
368
  # Data
369
369
  data/
370
+
371
+ # Profiling results
372
+ scalene*.*
373
+
374
+ # Results
375
+ *.html
376
+ output_*.parquet
377
+ output_*.csv
378
+ output_*.geojson
379
+ output_*.geoparquet
380
+ output_*.json
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: core-lens
3
- Version: 0.1.dev89
3
+ Version: 0.1.dev97
4
4
  Summary: Query, analyse, and visualise CoreStack's microwatershed and Earth science data through a clean, composable Python API.
5
5
  Project-URL: Homepage, https://github.com/ApoorvaKashyap/core-lens
6
6
  Project-URL: Issues, https://github.com/ApoorvaKashyap/core-lens/issues
@@ -17,8 +17,8 @@ Requires-Python: >=3.13
17
17
  Requires-Dist: core-lens[core]
18
18
  Requires-Dist: core-lens[spatial]
19
19
  Provides-Extra: core
20
- Requires-Dist: polars<2,>=1.40.1; extra == 'core'
21
- Requires-Dist: pyarrow<25,>=24.0.0; extra == 'core'
20
+ Requires-Dist: polars<2,>=1.39.0; extra == 'core'
21
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'core'
22
22
  Requires-Dist: pydantic<3,>=2.13.3; extra == 'core'
23
23
  Provides-Extra: full
24
24
  Requires-Dist: duckdb<2,>=1.5.3; extra == 'full'
@@ -26,24 +26,45 @@ Requires-Dist: geopandas<2,>=1.1.3; extra == 'full'
26
26
  Requires-Dist: lonboard<1,>=0.16.0; extra == 'full'
27
27
  Requires-Dist: matplotlib<4,>=3.10.9; extra == 'full'
28
28
  Requires-Dist: plotly<7,>=6.7.0; extra == 'full'
29
- Requires-Dist: polars<2,>=1.40.1; extra == 'full'
30
- Requires-Dist: pyarrow<25,>=24.0.0; extra == 'full'
29
+ Requires-Dist: polars<2,>=1.39.0; extra == 'full'
30
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'full'
31
31
  Requires-Dist: pydantic<3,>=2.13.3; extra == 'full'
32
32
  Requires-Dist: pyproj<4,>=3.7.2; extra == 'full'
33
33
  Requires-Dist: scipy<2,>=1.17.1; extra == 'full'
34
34
  Requires-Dist: shapely<3,>=2.1.2; extra == 'full'
35
35
  Requires-Dist: statsmodels<1,>=0.14.6; extra == 'full'
36
+ Provides-Extra: full-gpu
37
+ Requires-Dist: cudf-cu13==26.6.*; extra == 'full-gpu'
38
+ Requires-Dist: cudf-polars-cu13==26.6.*; extra == 'full-gpu'
39
+ Requires-Dist: duckdb<2,>=1.5.3; extra == 'full-gpu'
40
+ Requires-Dist: geopandas<2,>=1.1.3; extra == 'full-gpu'
41
+ Requires-Dist: lonboard<1,>=0.16.0; extra == 'full-gpu'
42
+ Requires-Dist: matplotlib<4,>=3.10.9; extra == 'full-gpu'
43
+ Requires-Dist: plotly<7,>=6.7.0; extra == 'full-gpu'
44
+ Requires-Dist: polars<2,>=1.39.0; extra == 'full-gpu'
45
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'full-gpu'
46
+ Requires-Dist: pydantic<3,>=2.13.3; extra == 'full-gpu'
47
+ Requires-Dist: pyproj<4,>=3.7.2; extra == 'full-gpu'
48
+ Requires-Dist: scipy<2,>=1.17.1; extra == 'full-gpu'
49
+ Requires-Dist: shapely<3,>=2.1.2; extra == 'full-gpu'
50
+ Requires-Dist: statsmodels<1,>=0.14.6; extra == 'full-gpu'
51
+ Provides-Extra: gpu
52
+ Requires-Dist: cudf-cu13==26.6.*; extra == 'gpu'
53
+ Requires-Dist: cudf-polars-cu13==26.6.*; extra == 'gpu'
54
+ Requires-Dist: polars<2,>=1.39.0; extra == 'gpu'
55
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'gpu'
56
+ Requires-Dist: pydantic<3,>=2.13.3; extra == 'gpu'
36
57
  Provides-Extra: spatial
37
58
  Requires-Dist: duckdb<2,>=1.5.3; extra == 'spatial'
38
59
  Requires-Dist: geopandas<2,>=1.1.3; extra == 'spatial'
39
- Requires-Dist: polars<2,>=1.40.1; extra == 'spatial'
40
- Requires-Dist: pyarrow<25,>=24.0.0; extra == 'spatial'
60
+ Requires-Dist: polars<2,>=1.39.0; extra == 'spatial'
61
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'spatial'
41
62
  Requires-Dist: pydantic<3,>=2.13.3; extra == 'spatial'
42
63
  Requires-Dist: pyproj<4,>=3.7.2; extra == 'spatial'
43
64
  Requires-Dist: shapely<3,>=2.1.2; extra == 'spatial'
44
65
  Provides-Extra: stats
45
- Requires-Dist: polars<2,>=1.40.1; extra == 'stats'
46
- Requires-Dist: pyarrow<25,>=24.0.0; extra == 'stats'
66
+ Requires-Dist: polars<2,>=1.39.0; extra == 'stats'
67
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'stats'
47
68
  Requires-Dist: pydantic<3,>=2.13.3; extra == 'stats'
48
69
  Requires-Dist: scipy<2,>=1.17.1; extra == 'stats'
49
70
  Requires-Dist: statsmodels<1,>=0.14.6; extra == 'stats'
@@ -51,8 +72,8 @@ Provides-Extra: viz
51
72
  Requires-Dist: lonboard<1,>=0.16.0; extra == 'viz'
52
73
  Requires-Dist: matplotlib<4,>=3.10.9; extra == 'viz'
53
74
  Requires-Dist: plotly<7,>=6.7.0; extra == 'viz'
54
- Requires-Dist: polars<2,>=1.40.1; extra == 'viz'
55
- Requires-Dist: pyarrow<25,>=24.0.0; extra == 'viz'
75
+ Requires-Dist: polars<2,>=1.39.0; extra == 'viz'
76
+ Requires-Dist: pyarrow<25,>=23.0.0; extra == 'viz'
56
77
  Requires-Dist: pydantic<3,>=2.13.3; extra == 'viz'
57
78
  Description-Content-Type: text/markdown
58
79
 
@@ -4,4 +4,4 @@ CoreLens is built on three main layers:
4
4
 
5
5
  1. **AoI (Area of Interest)**: The primary entry point. It represents a spatial boundary and acts as a gateway to all registered entities that intersect that boundary.
6
6
  2. **View**: A lazy, immutable query definition. Filtering by attributes (`where`), space (`spatial_filter`), or time (`between`) returns a new `View` without reading any Parquet data.
7
- 3. **Result**: The materialised data. Accessing `.static`, `.annual`, or `.fortnightly` on a `View` executes the query and returns a `Result` object. All statistical operations, aggregations, and plotting are done on `Result` objects.
7
+ 3. **Result**: The materialised data. Accessing `.static`, `.annual`, or `.fortnightly` on a `View` executes the query using the Polars streaming engine (automatically routed to the GPU if RAPIDS `cudf-polars` is installed) and returns a `Result` object. All statistical operations, aggregations, and plotting are done on `Result` objects.
@@ -8,6 +8,7 @@ CoreLens provides a unified interface over microwatersheds, administrative bound
8
8
 
9
9
  - **Area of Interest (AoI) First**: Define your spatial boundary once and instantly access all underlying entities (microwatersheds, villages, tehsils) scoped to that boundary.
10
10
  - **Lazy Evaluation**: Uses Polars for lazy evaluation and predicate pushdown. Data is only read from Parquet files when explicitly materialised.
11
+ - **GPU Acceleration**: Zero-code GPU acceleration for query execution and aggregations via NVIDIA RAPIDS (`cudf-polars`), capable of automatically routing compatible queries to the GPU.
11
12
  - **Pluggable Entities**: Built-in support for standard units (MWS, Tehsil) with a simple plugin architecture for adding new domain entities.
12
13
  - **Temporal & Seasonal Awareness**: Native support for agronomic seasons (Kharif, Rabi, Zaid) and time-range filtering.
13
14
  - **Spatial Statistics & Analysis**: Built-in methods for anomaly detection, spatial similarity, temporal correlation, and hypothesis testing.
@@ -6,6 +6,12 @@
6
6
  pip install core-lens
7
7
  ```
8
8
 
9
+ To enable GPU-accelerated queries (requires an NVIDIA GPU and Linux), install with the `gpu` extra. You will need to configure your package manager to use the NVIDIA PyPI index for RAPIDS dependencies:
10
+
11
+ ```bash
12
+ pip install "core-lens[gpu]" --extra-index-url=https://pypi.nvidia.com
13
+ ```
14
+
9
15
  ## Basic Usage
10
16
 
11
17
  Before querying, you must register the entities you plan to use:
@@ -27,6 +27,11 @@ import polars as pl
27
27
  from core_lens import AoI, SeasonConfig
28
28
  from core_lens.entities import TehsilEntity
29
29
  from core_lens.export import geoparquet
30
+ from core_lens.base.namespaces.stats import (
31
+ CorrelateMethod,
32
+ SimilarityMethod,
33
+ TestMethod,
34
+ )
30
35
  import shapely.geometry as sgeom
31
36
 
32
37
 
@@ -163,7 +168,7 @@ print(desc_by_entity.df().head(5))
163
168
 
164
169
  corr = result_with_area.stats.correlate(
165
170
  columns=["area_km2", "Shape_Leng", "compactness"],
166
- method="spearman",
171
+ method=CorrelateMethod.SPEARMAN,
167
172
  across="entity",
168
173
  )
169
174
  print("\nCorrelation (Spearman):")
@@ -180,7 +185,7 @@ try:
180
185
  test_result = result_with_area.stats.test(
181
186
  column="area_km2",
182
187
  groups="STATE",
183
- method="mann-whitney",
188
+ method=TestMethod.MANN_WHITNEY,
184
189
  )
185
190
  print("\nHypothesis test — area by STATE:")
186
191
  print(test_result.df())
@@ -196,7 +201,7 @@ except IndexError:
196
201
  test_vs_ref = result_with_area.stats.test(
197
202
  column="area_km2",
198
203
  against=500.0, # reference area in km²
199
- method="t-test",
204
+ method=TestMethod.T_TEST,
200
205
  )
201
206
  print("\nOne-sample t-test vs 500 km²:")
202
207
  print(test_vs_ref.metadata)
@@ -227,7 +232,7 @@ sim = result_with_area.stats.similarity(
227
232
  "Shape_Leng": None,
228
233
  "compactness": None,
229
234
  },
230
- method="euclidean",
235
+ method=SimilarityMethod.EUCLIDEAN,
231
236
  top_n=5,
232
237
  )
233
238
  print(f"\nMost similar tehsils to {target_id}:")
@@ -44,7 +44,7 @@ venvPath = "."
44
44
  venv = ".venv"
45
45
 
46
46
  [project.optional-dependencies]
47
- core = ["polars>=1.40.1,<2", "pyarrow>=24.0.0,<25", "pydantic>=2.13.3,<3"]
47
+ core = ["polars>=1.39.0,<2", "pyarrow>=23.0.0,<25", "pydantic>=2.13.3,<3"]
48
48
  spatial = [
49
49
  "core-lens[core]",
50
50
  "duckdb>=1.5.3,<2",
@@ -59,7 +59,9 @@ viz = [
59
59
  "plotly>=6.7.0,<7",
60
60
  ]
61
61
  stats = ["core-lens[core]", "scipy>=1.17.1,<2", "statsmodels>=0.14.6,<1"]
62
+ gpu = ["core-lens[core]", "cudf-cu13==26.6.*", "cudf-polars-cu13==26.6.*"]
62
63
  full = ["core-lens[spatial,viz,stats]"]
64
+ full-gpu = ["core-lens[spatial,viz,stats,gpu]"]
63
65
 
64
66
  [dependency-groups]
65
67
  dev = [
@@ -71,6 +73,7 @@ dev = [
71
73
  "pytest-cov>=7.1.0",
72
74
  "python-semantic-release>=10.5.3",
73
75
  "ruff>=0.15.12",
76
+ "scalene>=2.3.0",
74
77
  "twine>=6.2.0",
75
78
  ]
76
79
  types = ["types-geopandas>=1.1.3.20260518", "types-shapely>=2.1.0.20260518"]
@@ -95,3 +98,12 @@ disallow_incomplete_defs = false
95
98
  [[tool.mypy.overrides]]
96
99
  module = ["plotly.*"]
97
100
  ignore_missing_imports = true
101
+
102
+ [[tool.uv.index]]
103
+ name = "nvidia"
104
+ url = "https://pypi.nvidia.com"
105
+ explicit = true
106
+
107
+ [tool.uv.sources]
108
+ cudf-cu13 = { index = "nvidia" }
109
+ cudf-polars-cu13 = { index = "nvidia" }
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.1.dev89'
22
- __version_tuple__ = version_tuple = (0, 1, 'dev89')
21
+ __version__ = version = '0.1.dev97'
22
+ __version_tuple__ = version_tuple = (0, 1, 'dev97')
23
23
 
24
24
  __commit_id__ = commit_id = None
@@ -10,10 +10,31 @@ from typing import TYPE_CHECKING, Any, cast
10
10
  import numpy as np
11
11
  import polars as pl
12
12
 
13
+ from core_lens.utils.polars_utils import collect_lf
14
+
13
15
  if TYPE_CHECKING:
14
16
  from core_lens.base.result import Result
15
17
 
16
18
 
19
+ def _sf(x: object) -> float:
20
+ """Narrow a polars scalar (mean/std/median/quantile return type) to float.
21
+
22
+ Polars returns a wide union that includes non-numeric types; mypy cannot
23
+ narrow it automatically. This helper asserts the value is numeric at
24
+ runtime and returns a proper ``float``, or ``nan`` when the value is
25
+ ``None`` (empty series).
26
+
27
+ Args:
28
+ x: The scalar value returned by a Polars aggregation.
29
+
30
+ Returns:
31
+ float: The numeric value as a float, or NaN if the input is None.
32
+ """
33
+ if x is None:
34
+ return float("nan")
35
+ return float(x) # type: ignore[arg-type]
36
+
37
+
17
38
  class CorrelateMethod(Enum):
18
39
  """Correlation methods.
19
40
 
@@ -320,13 +341,14 @@ class StatsNamespace:
320
341
  f"StatsNamespace.test: method={method!r} is not valid for single-sample test against a reference value. "
321
342
  "Valid options: 't-test', 'wilcoxon'."
322
343
  )
344
+ _s = pl.Series(all_vals)
323
345
  data = pl.DataFrame(
324
346
  {
325
347
  "group": ["all"],
326
348
  "n": [len(all_vals)],
327
- "mean": [float(np.mean(all_vals))],
328
- "std": [float(np.std(all_vals, ddof=1))],
329
- "median": [float(np.median(all_vals))],
349
+ "mean": [_sf(_s.mean())],
350
+ "std": [_sf(_s.std(ddof=1))],
351
+ "median": [_sf(_s.median())],
330
352
  }
331
353
  )
332
354
  metadata: dict[str, Any] = {
@@ -352,9 +374,11 @@ class StatsNamespace:
352
374
  {
353
375
  "group": str(g),
354
376
  "n": len(a),
355
- "mean": float(np.mean(a)) if len(a) else float("nan"),
356
- "std": float(np.std(a, ddof=1)) if len(a) > 1 else float("nan"),
357
- "median": float(np.median(a)) if len(a) else float("nan"),
377
+ "mean": _sf(pl.Series(a).mean()) if len(a) else float("nan"),
378
+ "std": _sf(pl.Series(a).std(ddof=1))
379
+ if len(a) > 1
380
+ else float("nan"),
381
+ "median": _sf(pl.Series(a).median()) if len(a) else float("nan"),
358
382
  }
359
383
  for g, a in zip(group_names, arrays)
360
384
  ]
@@ -388,9 +412,11 @@ class StatsNamespace:
388
412
  {
389
413
  "group": f"{p[0]}-{p[1]}",
390
414
  "n": len(a),
391
- "mean": float(np.mean(a)) if len(a) else float("nan"),
392
- "std": float(np.std(a, ddof=1)) if len(a) > 1 else float("nan"),
393
- "median": float(np.median(a)) if len(a) else float("nan"),
415
+ "mean": _sf(pl.Series(a).mean()) if len(a) else float("nan"),
416
+ "std": _sf(pl.Series(a).std(ddof=1))
417
+ if len(a) > 1
418
+ else float("nan"),
419
+ "median": _sf(pl.Series(a).median()) if len(a) else float("nan"),
394
420
  }
395
421
  for p, a in zip(periods, arrays)
396
422
  ]
@@ -573,8 +599,9 @@ class StatsNamespace:
573
599
  all_vals = df[column].to_numpy().astype(float)
574
600
 
575
601
  if method is AnomalyCrossMethod.ZSCORE:
576
- mean = float(np.nanmean(ref_vals))
577
- std = float(np.nanstd(ref_vals, ddof=1))
602
+ _rs = pl.Series(ref_vals)
603
+ mean = _sf(_rs.mean())
604
+ std = _sf(_rs.std(ddof=1))
578
605
  scores = (all_vals - mean) / (std or 1.0)
579
606
  flags = np.abs(scores) > threshold
580
607
  meta: dict[str, Any] = {
@@ -586,41 +613,43 @@ class StatsNamespace:
586
613
  }
587
614
 
588
615
  elif method is AnomalyCrossMethod.IQR:
589
- q1 = float(np.nanpercentile(ref_vals, 25))
590
- q3 = float(np.nanpercentile(ref_vals, 75))
616
+ _rs = pl.Series(ref_vals)
617
+ q1 = _sf(_rs.quantile(0.25))
618
+ q3 = _sf(_rs.quantile(0.75))
591
619
  iqr = q3 - q1
592
620
  lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
593
- med = float(np.nanmedian(ref_vals))
621
+ med = _sf(_rs.median())
594
622
  scores = (all_vals - med) / (iqr or 1.0)
595
623
  flags = (all_vals < lo) | (all_vals > hi)
596
624
  meta = {
597
625
  "mode": "cross_sectional",
598
626
  "method": "iqr",
599
627
  "baseline": baseline,
600
- "baseline_mean": float(np.nanmean(ref_vals)),
628
+ "baseline_mean": _sf(_rs.mean()),
601
629
  "q1": q1,
602
630
  "q3": q3,
603
631
  "iqr": iqr,
604
632
  }
605
633
 
606
634
  elif method is AnomalyCrossMethod.PERCENTILE:
607
- lo = float(np.nanpercentile(ref_vals, 5))
608
- hi = float(np.nanpercentile(ref_vals, 95))
609
- med = float(np.nanmedian(ref_vals))
610
- std = float(np.nanstd(ref_vals)) or 1.0
635
+ _rs = pl.Series(ref_vals)
636
+ lo = _sf(_rs.quantile(0.05))
637
+ hi = _sf(_rs.quantile(0.95))
638
+ med = _sf(_rs.median())
639
+ std = _sf(_rs.std()) or 1.0
611
640
  scores = (all_vals - med) / std
612
641
  flags = (all_vals < lo) | (all_vals > hi)
613
642
  meta = {
614
643
  "mode": "cross_sectional",
615
644
  "method": "percentile",
616
645
  "baseline": baseline,
617
- "baseline_mean": float(np.nanmean(ref_vals)),
646
+ "baseline_mean": _sf(_rs.mean()),
618
647
  "lower_pct": lo,
619
648
  "upper_pct": hi,
620
649
  }
621
650
 
622
651
  else: # threshold
623
- mean = float(np.nanmean(ref_vals))
652
+ mean = _sf(pl.Series(ref_vals).mean())
624
653
  scores = all_vals - mean
625
654
  flags = np.abs(scores) > threshold
626
655
  meta = {
@@ -680,15 +709,17 @@ class StatsNamespace:
680
709
  ts_flags: list[bool] = []
681
710
 
682
711
  if method is AnomalyTsMethod.MAD:
683
- med = float(np.median(base_vals))
684
- mad = float(np.median(np.abs(base_vals - med)))
712
+ _bs = pl.Series(base_vals)
713
+ med = _sf(_bs.median())
714
+ mad = _sf(pl.Series(np.abs(base_vals - med)).median())
685
715
  scale = (mad * 1.4826) or 1.0
686
716
  ts_scores = [(v - med) / scale for v in eval_vals]
687
717
  ts_flags = [abs(s) > threshold for s in ts_scores]
688
718
 
689
719
  elif method is AnomalyTsMethod.CUSUM:
690
- mean = float(np.mean(base_vals))
691
- std = float(np.std(base_vals, ddof=1)) or 1.0
720
+ _bs = pl.Series(base_vals)
721
+ mean = _sf(_bs.mean())
722
+ std = _sf(_bs.std(ddof=1)) or 1.0
692
723
  k, h = 0.5 * std, threshold * std
693
724
  cp, cn = 0.0, 0.0
694
725
  for v in eval_vals:
@@ -713,7 +744,7 @@ class StatsNamespace:
713
744
  base_len = len(base_vals)
714
745
  base_resid = resid[:base_len]
715
746
  eval_resid = resid[base_len : base_len + len(eval_years)]
716
- std = float(np.std(base_resid, ddof=1)) or 1.0
747
+ std = _sf(pl.Series(base_resid).std(ddof=1)) or 1.0
717
748
  ts_scores = [float(r / std) for r in eval_resid]
718
749
  ts_flags = [abs(s) > threshold for s in ts_scores]
719
750
  except Exception:
@@ -755,7 +786,7 @@ class StatsNamespace:
755
786
  "mode": "timeseries",
756
787
  "method": method.value if method is not None else None,
757
788
  "baseline": baseline,
758
- "baseline_mean": float(np.mean(global_base_vals))
789
+ "baseline_mean": _sf(pl.Series(global_base_vals).mean())
759
790
  if len(global_base_vals) > 0
760
791
  else float("nan"),
761
792
  "baseline_fitted": True,
@@ -895,13 +926,12 @@ class StatsNamespace:
895
926
 
896
927
  if resolution_str == "static":
897
928
  # Static: no grouping needed — one row per entity.
898
- fetched = col_lf.select([key, col_name]).collect()
929
+ fetched = collect_lf(col_lf.select([key, col_name]))
899
930
  else:
900
- fetched = col_lf.group_by(key).agg(agg_expr).collect()
931
+ fetched = collect_lf(col_lf.group_by(key).agg(agg_expr))
901
932
 
902
933
  feat = feat.join(fetched.select([key, col_name]), on=key, how="left")
903
934
 
904
- # ------------------------------------------------------------------
905
935
  feature_cols = [c for c in feat.columns if c != key]
906
936
 
907
937
  if not feature_cols:
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
7
7
  import polars as pl
8
8
 
9
9
  from core_lens.schema.profile import Resolution
10
+ from core_lens.utils.polars_utils import collect_lf
10
11
 
11
12
  if TYPE_CHECKING:
12
13
  import geopandas as gpd
@@ -101,10 +102,8 @@ class Result:
101
102
  import shapely.wkb as wkb
102
103
 
103
104
  geometry_col = self.entity.geometry_col
104
- geo_series = gpd.GeoSeries(
105
- self.data[geometry_col].map_elements(wkb.loads, return_dtype=pl.Object),
106
- crs="EPSG:4326",
107
- )
105
+ geometries = [wkb.loads(b) for b in self.data[geometry_col].to_list()]
106
+ geo_series = gpd.GeoSeries(geometries, crs="EPSG:4326")
108
107
  return gpd.GeoDataFrame(
109
108
  self.data.drop(geometry_col).to_pandas(),
110
109
  geometry=geo_series,
@@ -145,7 +144,7 @@ class Result:
145
144
  key_cols = self.key_cols
146
145
  static_path = self.entity._resolve(self.entity.static_path)
147
146
 
148
- geo_df = (
147
+ geo_df = collect_lf(
149
148
  pl.scan_parquet(static_path)
150
149
  .select(key_cols + [geom_col])
151
150
  .filter(
@@ -153,7 +152,6 @@ class Result:
153
152
  if len(key_cols) == 1
154
153
  else pl.lit(True)
155
154
  )
156
- .collect()
157
155
  )
158
156
 
159
157
  joined = self.data.join(geo_df, on=key_cols, how="left")
@@ -9,7 +9,7 @@ import polars as pl
9
9
  from enum import Enum
10
10
 
11
11
  from core_lens.schema.profile import Resolution
12
- from core_lens.utils.polars_utils import scan_with_key_filter
12
+ from core_lens.utils.polars_utils import scan_with_key_filter, collect_lf
13
13
 
14
14
  if TYPE_CHECKING:
15
15
  import shapely
@@ -384,7 +384,7 @@ class View:
384
384
  key_values=self.keys,
385
385
  time_expr=time_expr,
386
386
  )
387
- data = lf.collect()
387
+ data = collect_lf(lf)
388
388
 
389
389
  # For fortnightly results, inject temporal grouping columns so that
390
390
  # aggregate(by="year"), aggregate(by="season"), etc. work out of the
@@ -0,0 +1,119 @@
1
+ """Polars scan helpers with predicate pushdown for entity materialisation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import polars as pl
6
+
7
+ _GPU_AVAILABLE: bool | None = None # None = not yet probed
8
+
9
+
10
+ def _gpu_available() -> bool:
11
+ """Return ``True`` if ``cudf_polars`` is importable (RAPIDS GPU backend).
12
+
13
+ The result is cached after the first call so subsequent invocations are
14
+ effectively free.
15
+ """
16
+ global _GPU_AVAILABLE
17
+ if _GPU_AVAILABLE is None:
18
+ try:
19
+ import cudf_polars # noqa: F401 # type: ignore[import-untyped]
20
+
21
+ _GPU_AVAILABLE = True
22
+ gpu = "cudf_polars (RAPIDS)"
23
+ print("=" * 50)
24
+ print(f"GPU : {gpu} found. Running in GPU mode.")
25
+ print("=" * 50)
26
+ except ModuleNotFoundError:
27
+ _GPU_AVAILABLE = False
28
+ print("=" * 50)
29
+ print("GPU : None found. Running in CPU mode.")
30
+ print("=" * 50)
31
+ return _GPU_AVAILABLE
32
+
33
+
34
+ def collect_lf(lf: pl.LazyFrame) -> pl.DataFrame:
35
+ """Collect a ``LazyFrame`` using the best available backend.
36
+
37
+ * **GPU present** — executes via the RAPIDS ``cudf_polars`` streaming
38
+ engine (``GPUEngine(executor="streaming")``). Handles datasets larger
39
+ than VRAM through data partitioning.
40
+ * **No GPU** — falls back to Polars' built-in CPU streaming executor
41
+ (``collect(engine="streaming")``), which keeps memory usage low for large
42
+ Parquet scans.
43
+
44
+ Use this function for all *data* scans (materialisation, geometry joins,
45
+ similarity fetches). Tiny *index* scans that feed into subsequent
46
+ in-process joins should stay as bare ``.collect()`` calls — the streaming
47
+ path can occasionally change row ordering in ways that break those joins.
48
+
49
+ Args:
50
+ lf: The lazy frame to collect.
51
+
52
+ Returns:
53
+ A materialised ``pl.DataFrame``.
54
+ """
55
+ global _GPU_AVAILABLE
56
+ if _gpu_available():
57
+ engine = pl.GPUEngine(executor="streaming")
58
+ try:
59
+ result = lf.collect(engine=engine)
60
+ assert isinstance(result, pl.DataFrame)
61
+ return result
62
+ except pl.exceptions.ComputeError as e:
63
+ if "cuda" in str(e).lower() or "nvml" in str(e).lower():
64
+ _GPU_AVAILABLE = False
65
+ print("=" * 50)
66
+ print(f"GPU runtime error ({e}). Falling back to CPU mode.")
67
+ print("=" * 50)
68
+ else:
69
+ raise
70
+ return lf.collect(engine="streaming")
71
+
72
+
73
+ def scan_with_key_filter(
74
+ path: str,
75
+ key_cols: list[str],
76
+ key_values: pl.DataFrame,
77
+ time_expr: pl.Expr | None = None,
78
+ ) -> pl.LazyFrame:
79
+ """Return a ``pl.LazyFrame`` filtered to the given keys and optional time range.
80
+
81
+ Uses ``pl.scan_parquet`` with two predicate-pushdown layers:
82
+
83
+ 1. **Key filter** — restricts to entity instances whose key column(s) are
84
+ in ``key_values``. For a single-column key this is an ``is_in``
85
+ predicate pushed down to the Parquet reader. For composite keys each
86
+ column is filtered independently (over-selects slightly, then pruned
87
+ by the join at collect time).
88
+
89
+ 2. **Time filter** — an optional Polars expression appended with ``&``,
90
+ also pushed down if the Parquet file carries column statistics.
91
+
92
+ Args:
93
+ path: Absolute path to a Parquet file.
94
+ key_cols: Column name(s) that form the entity's unique key.
95
+ key_values: A narrow ``pl.DataFrame`` containing only the key
96
+ column(s) with the exact values to retain.
97
+ time_expr: An optional Polars filter expression for the time column,
98
+ as produced by :func:`~core_lens.utils.season.resolve_time_filter`.
99
+
100
+ Returns:
101
+ A ``pl.LazyFrame`` ready to be ``.collect()``-ed.
102
+ """
103
+ lf = pl.scan_parquet(path)
104
+
105
+ if len(key_cols) == 1:
106
+ key = key_cols[0]
107
+ values = key_values[key].to_list()
108
+ lf = lf.filter(pl.col(key).is_in(values))
109
+ else:
110
+ # Composite key: filter each column independently. A small over-selection
111
+ # is acceptable because the subsequent join at collect time is exact.
112
+ for key in key_cols:
113
+ values = key_values[key].to_list()
114
+ lf = lf.filter(pl.col(key).is_in(values))
115
+
116
+ if time_expr is not None:
117
+ lf = lf.filter(time_expr)
118
+
119
+ return lf