core-lens 0.1.dev89__tar.gz → 0.1.dev97__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.gitignore +11 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/PKG-INFO +32 -11
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/concepts.md +1 -1
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/intro.md +1 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/quickstart.md +6 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/examples/demo_tehsil.py +9 -4
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/pyproject.toml +13 -1
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/_version.py +2 -2
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/stats.py +60 -30
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/result.py +4 -6
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/view.py +2 -2
- core_lens-0.1.dev97/src/core_lens/utils/polars_utils.py +119 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/uv.lock +674 -106
- core_lens-0.1.dev89/src/core_lens/utils/polars_utils.py +0 -54
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/pull_request_template.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/ci.yml +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/gh-pages.yml +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/pre-release.yml +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.github/workflows/release.yml +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.gitmessage +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.pre-commit-config.yaml +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/.python-version +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/CONTRIBUTING.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/LICENSE +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/README.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/SKILLS.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/Makefile +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/make.bat +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/conf.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/index.rst +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/plots.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/plugins.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/queries.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/docs/source/stats.md +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/examples/demo_mws.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/hooks/mypy.sh +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/hooks/no-parquet-outside-fixtures.sh +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/hooks/pytest.sh +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/__main__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/aoi.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/entity.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/base/namespaces/plot.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/entities/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/entities/mws.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/entities/tehsil.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/export/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/export/formats.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/py.typed +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/schema/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/schema/detection.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/schema/profile.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/utils/__init__.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/utils/season.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/src/core_lens/utils/spatial.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/fixtures/generate_fixtures.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/conftest.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_aoi.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_entities.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_entity.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_export.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_main.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_plot.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_polars_utils.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_profile.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_result.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_schema_detection.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_schema_profile.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_season.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_season_config.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_spatial.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_stats.py +0 -0
- {core_lens-0.1.dev89 → core_lens-0.1.dev97}/tests/unit/test_view.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: core-lens
|
|
3
|
-
Version: 0.1.dev89
|
|
3
|
+
Version: 0.1.dev97
|
|
4
4
|
Summary: Query, analyse, and visualise CoreStack's microwatershed and Earth science data through a clean, composable Python API.
|
|
5
5
|
Project-URL: Homepage, https://github.com/ApoorvaKashyap/core-lens
|
|
6
6
|
Project-URL: Issues, https://github.com/ApoorvaKashyap/core-lens/issues
|
|
@@ -17,8 +17,8 @@ Requires-Python: >=3.13
|
|
|
17
17
|
Requires-Dist: core-lens[core]
|
|
18
18
|
Requires-Dist: core-lens[spatial]
|
|
19
19
|
Provides-Extra: core
|
|
20
|
-
Requires-Dist: polars<2,>=1.
|
|
21
|
-
Requires-Dist: pyarrow<25,>=
|
|
20
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'core'
|
|
21
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'core'
|
|
22
22
|
Requires-Dist: pydantic<3,>=2.13.3; extra == 'core'
|
|
23
23
|
Provides-Extra: full
|
|
24
24
|
Requires-Dist: duckdb<2,>=1.5.3; extra == 'full'
|
|
@@ -26,24 +26,45 @@ Requires-Dist: geopandas<2,>=1.1.3; extra == 'full'
|
|
|
26
26
|
Requires-Dist: lonboard<1,>=0.16.0; extra == 'full'
|
|
27
27
|
Requires-Dist: matplotlib<4,>=3.10.9; extra == 'full'
|
|
28
28
|
Requires-Dist: plotly<7,>=6.7.0; extra == 'full'
|
|
29
|
-
Requires-Dist: polars<2,>=1.
|
|
30
|
-
Requires-Dist: pyarrow<25,>=
|
|
29
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'full'
|
|
30
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'full'
|
|
31
31
|
Requires-Dist: pydantic<3,>=2.13.3; extra == 'full'
|
|
32
32
|
Requires-Dist: pyproj<4,>=3.7.2; extra == 'full'
|
|
33
33
|
Requires-Dist: scipy<2,>=1.17.1; extra == 'full'
|
|
34
34
|
Requires-Dist: shapely<3,>=2.1.2; extra == 'full'
|
|
35
35
|
Requires-Dist: statsmodels<1,>=0.14.6; extra == 'full'
|
|
36
|
+
Provides-Extra: full-gpu
|
|
37
|
+
Requires-Dist: cudf-cu13==26.6.*; extra == 'full-gpu'
|
|
38
|
+
Requires-Dist: cudf-polars-cu13==26.6.*; extra == 'full-gpu'
|
|
39
|
+
Requires-Dist: duckdb<2,>=1.5.3; extra == 'full-gpu'
|
|
40
|
+
Requires-Dist: geopandas<2,>=1.1.3; extra == 'full-gpu'
|
|
41
|
+
Requires-Dist: lonboard<1,>=0.16.0; extra == 'full-gpu'
|
|
42
|
+
Requires-Dist: matplotlib<4,>=3.10.9; extra == 'full-gpu'
|
|
43
|
+
Requires-Dist: plotly<7,>=6.7.0; extra == 'full-gpu'
|
|
44
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'full-gpu'
|
|
45
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'full-gpu'
|
|
46
|
+
Requires-Dist: pydantic<3,>=2.13.3; extra == 'full-gpu'
|
|
47
|
+
Requires-Dist: pyproj<4,>=3.7.2; extra == 'full-gpu'
|
|
48
|
+
Requires-Dist: scipy<2,>=1.17.1; extra == 'full-gpu'
|
|
49
|
+
Requires-Dist: shapely<3,>=2.1.2; extra == 'full-gpu'
|
|
50
|
+
Requires-Dist: statsmodels<1,>=0.14.6; extra == 'full-gpu'
|
|
51
|
+
Provides-Extra: gpu
|
|
52
|
+
Requires-Dist: cudf-cu13==26.6.*; extra == 'gpu'
|
|
53
|
+
Requires-Dist: cudf-polars-cu13==26.6.*; extra == 'gpu'
|
|
54
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'gpu'
|
|
55
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'gpu'
|
|
56
|
+
Requires-Dist: pydantic<3,>=2.13.3; extra == 'gpu'
|
|
36
57
|
Provides-Extra: spatial
|
|
37
58
|
Requires-Dist: duckdb<2,>=1.5.3; extra == 'spatial'
|
|
38
59
|
Requires-Dist: geopandas<2,>=1.1.3; extra == 'spatial'
|
|
39
|
-
Requires-Dist: polars<2,>=1.
|
|
40
|
-
Requires-Dist: pyarrow<25,>=
|
|
60
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'spatial'
|
|
61
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'spatial'
|
|
41
62
|
Requires-Dist: pydantic<3,>=2.13.3; extra == 'spatial'
|
|
42
63
|
Requires-Dist: pyproj<4,>=3.7.2; extra == 'spatial'
|
|
43
64
|
Requires-Dist: shapely<3,>=2.1.2; extra == 'spatial'
|
|
44
65
|
Provides-Extra: stats
|
|
45
|
-
Requires-Dist: polars<2,>=1.
|
|
46
|
-
Requires-Dist: pyarrow<25,>=
|
|
66
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'stats'
|
|
67
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'stats'
|
|
47
68
|
Requires-Dist: pydantic<3,>=2.13.3; extra == 'stats'
|
|
48
69
|
Requires-Dist: scipy<2,>=1.17.1; extra == 'stats'
|
|
49
70
|
Requires-Dist: statsmodels<1,>=0.14.6; extra == 'stats'
|
|
@@ -51,8 +72,8 @@ Provides-Extra: viz
|
|
|
51
72
|
Requires-Dist: lonboard<1,>=0.16.0; extra == 'viz'
|
|
52
73
|
Requires-Dist: matplotlib<4,>=3.10.9; extra == 'viz'
|
|
53
74
|
Requires-Dist: plotly<7,>=6.7.0; extra == 'viz'
|
|
54
|
-
Requires-Dist: polars<2,>=1.
|
|
55
|
-
Requires-Dist: pyarrow<25,>=
|
|
75
|
+
Requires-Dist: polars<2,>=1.39.0; extra == 'viz'
|
|
76
|
+
Requires-Dist: pyarrow<25,>=23.0.0; extra == 'viz'
|
|
56
77
|
Requires-Dist: pydantic<3,>=2.13.3; extra == 'viz'
|
|
57
78
|
Description-Content-Type: text/markdown
|
|
58
79
|
|
|
@@ -4,4 +4,4 @@ CoreLens is built on three main layers:
|
|
|
4
4
|
|
|
5
5
|
1. **AoI (Area of Interest)**: The primary entry point. It represents a spatial boundary and acts as a gateway to all registered entities that intersect that boundary.
|
|
6
6
|
2. **View**: A lazy, immutable query definition. Filtering by attributes (`where`), space (`spatial_filter`), or time (`between`) returns a new `View` without reading any Parquet data.
|
|
7
|
-
3. **Result**: The materialised data. Accessing `.static`, `.annual`, or `.fortnightly` on a `View` executes the query and returns a `Result` object. All statistical operations, aggregations, and plotting are done on `Result` objects.
|
|
7
|
+
3. **Result**: The materialised data. Accessing `.static`, `.annual`, or `.fortnightly` on a `View` executes the query using the Polars streaming engine (automatically routed to the GPU if RAPIDS `cudf-polars` is installed) and returns a `Result` object. All statistical operations, aggregations, and plotting are done on `Result` objects.
|
|
@@ -8,6 +8,7 @@ CoreLens provides a unified interface over microwatersheds, administrative bound
|
|
|
8
8
|
|
|
9
9
|
- **Area of Interest (AoI) First**: Define your spatial boundary once and instantly access all underlying entities (microwatersheds, villages, tehsils) scoped to that boundary.
|
|
10
10
|
- **Lazy Evaluation**: Uses Polars for lazy evaluation and predicate pushdown. Data is only read from Parquet files when explicitly materialised.
|
|
11
|
+
- **GPU Acceleration**: Zero-code GPU acceleration for query execution and aggregations via NVIDIA RAPIDS (`cudf-polars`), capable of automatically routing compatible queries to the GPU.
|
|
11
12
|
- **Pluggable Entities**: Built-in support for standard units (MWS, Tehsil) with a simple plugin architecture for adding new domain entities.
|
|
12
13
|
- **Temporal & Seasonal Awareness**: Native support for agronomic seasons (Kharif, Rabi, Zaid) and time-range filtering.
|
|
13
14
|
- **Spatial Statistics & Analysis**: Built-in methods for anomaly detection, spatial similarity, temporal correlation, and hypothesis testing.
|
|
@@ -6,6 +6,12 @@
|
|
|
6
6
|
pip install core-lens
|
|
7
7
|
```
|
|
8
8
|
|
|
9
|
+
To enable GPU-accelerated queries (requires an NVIDIA GPU and Linux), install with the `gpu` extra. You will need to configure your package manager to use the NVIDIA PyPI index for RAPIDS dependencies:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install "core-lens[gpu]" --extra-index-url=https://pypi.nvidia.com
|
|
13
|
+
```
|
|
14
|
+
|
|
9
15
|
## Basic Usage
|
|
10
16
|
|
|
11
17
|
Before querying, you must register the entities you plan to use:
|
|
@@ -27,6 +27,11 @@ import polars as pl
|
|
|
27
27
|
from core_lens import AoI, SeasonConfig
|
|
28
28
|
from core_lens.entities import TehsilEntity
|
|
29
29
|
from core_lens.export import geoparquet
|
|
30
|
+
from core_lens.base.namespaces.stats import (
|
|
31
|
+
CorrelateMethod,
|
|
32
|
+
SimilarityMethod,
|
|
33
|
+
TestMethod,
|
|
34
|
+
)
|
|
30
35
|
import shapely.geometry as sgeom
|
|
31
36
|
|
|
32
37
|
|
|
@@ -163,7 +168,7 @@ print(desc_by_entity.df().head(5))
|
|
|
163
168
|
|
|
164
169
|
corr = result_with_area.stats.correlate(
|
|
165
170
|
columns=["area_km2", "Shape_Leng", "compactness"],
|
|
166
|
-
method=
|
|
171
|
+
method=CorrelateMethod.SPEARMAN,
|
|
167
172
|
across="entity",
|
|
168
173
|
)
|
|
169
174
|
print("\nCorrelation (Spearman):")
|
|
@@ -180,7 +185,7 @@ try:
|
|
|
180
185
|
test_result = result_with_area.stats.test(
|
|
181
186
|
column="area_km2",
|
|
182
187
|
groups="STATE",
|
|
183
|
-
method=
|
|
188
|
+
method=TestMethod.MANN_WHITNEY,
|
|
184
189
|
)
|
|
185
190
|
print("\nHypothesis test — area by STATE:")
|
|
186
191
|
print(test_result.df())
|
|
@@ -196,7 +201,7 @@ except IndexError:
|
|
|
196
201
|
test_vs_ref = result_with_area.stats.test(
|
|
197
202
|
column="area_km2",
|
|
198
203
|
against=500.0, # reference area in km²
|
|
199
|
-
method=
|
|
204
|
+
method=TestMethod.T_TEST,
|
|
200
205
|
)
|
|
201
206
|
print("\nOne-sample t-test vs 500 km²:")
|
|
202
207
|
print(test_vs_ref.metadata)
|
|
@@ -227,7 +232,7 @@ sim = result_with_area.stats.similarity(
|
|
|
227
232
|
"Shape_Leng": None,
|
|
228
233
|
"compactness": None,
|
|
229
234
|
},
|
|
230
|
-
method=
|
|
235
|
+
method=SimilarityMethod.EUCLIDEAN,
|
|
231
236
|
top_n=5,
|
|
232
237
|
)
|
|
233
238
|
print(f"\nMost similar tehsils to {target_id}:")
|
|
@@ -44,7 +44,7 @@ venvPath = "."
|
|
|
44
44
|
venv = ".venv"
|
|
45
45
|
|
|
46
46
|
[project.optional-dependencies]
|
|
47
|
-
core = ["polars>=1.
|
|
47
|
+
core = ["polars>=1.39.0,<2", "pyarrow>=23.0.0,<25", "pydantic>=2.13.3,<3"]
|
|
48
48
|
spatial = [
|
|
49
49
|
"core-lens[core]",
|
|
50
50
|
"duckdb>=1.5.3,<2",
|
|
@@ -59,7 +59,9 @@ viz = [
|
|
|
59
59
|
"plotly>=6.7.0,<7",
|
|
60
60
|
]
|
|
61
61
|
stats = ["core-lens[core]", "scipy>=1.17.1,<2", "statsmodels>=0.14.6,<1"]
|
|
62
|
+
gpu = ["core-lens[core]", "cudf-cu13==26.6.*", "cudf-polars-cu13==26.6.*"]
|
|
62
63
|
full = ["core-lens[spatial,viz,stats]"]
|
|
64
|
+
full-gpu = ["core-lens[spatial,viz,stats,gpu]"]
|
|
63
65
|
|
|
64
66
|
[dependency-groups]
|
|
65
67
|
dev = [
|
|
@@ -71,6 +73,7 @@ dev = [
|
|
|
71
73
|
"pytest-cov>=7.1.0",
|
|
72
74
|
"python-semantic-release>=10.5.3",
|
|
73
75
|
"ruff>=0.15.12",
|
|
76
|
+
"scalene>=2.3.0",
|
|
74
77
|
"twine>=6.2.0",
|
|
75
78
|
]
|
|
76
79
|
types = ["types-geopandas>=1.1.3.20260518", "types-shapely>=2.1.0.20260518"]
|
|
@@ -95,3 +98,12 @@ disallow_incomplete_defs = false
|
|
|
95
98
|
[[tool.mypy.overrides]]
|
|
96
99
|
module = ["plotly.*"]
|
|
97
100
|
ignore_missing_imports = true
|
|
101
|
+
|
|
102
|
+
[[tool.uv.index]]
|
|
103
|
+
name = "nvidia"
|
|
104
|
+
url = "https://pypi.nvidia.com"
|
|
105
|
+
explicit = true
|
|
106
|
+
|
|
107
|
+
[tool.uv.sources]
|
|
108
|
+
cudf-cu13 = { index = "nvidia" }
|
|
109
|
+
cudf-polars-cu13 = { index = "nvidia" }
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.1.
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 1, '
|
|
21
|
+
__version__ = version = '0.1.dev97'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 'dev97')
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -10,10 +10,31 @@ from typing import TYPE_CHECKING, Any, cast
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
import polars as pl
|
|
12
12
|
|
|
13
|
+
from core_lens.utils.polars_utils import collect_lf
|
|
14
|
+
|
|
13
15
|
if TYPE_CHECKING:
|
|
14
16
|
from core_lens.base.result import Result
|
|
15
17
|
|
|
16
18
|
|
|
19
|
+
def _sf(x: object) -> float:
    """Coerce a Polars aggregation scalar to a plain ``float``.

    Polars types the result of ``mean``/``std``/``median``/``quantile`` as a
    wide union that mypy cannot narrow on its own, so call sites funnel the
    value through this helper instead of repeating casts.

    Args:
        x: The scalar returned by a Polars aggregation, or ``None``.

    Returns:
        float: ``float(x)``, or NaN when ``x`` is ``None``.

    Raises:
        TypeError: If ``x`` is of a type ``float()`` cannot convert.
        ValueError: If ``x`` is a string that does not parse as a number.
    """
    # None maps to NaN so downstream arithmetic keeps working on floats.
    return float("nan") if x is None else float(x)  # type: ignore[arg-type]
|
|
36
|
+
|
|
37
|
+
|
|
17
38
|
class CorrelateMethod(Enum):
|
|
18
39
|
"""Correlation methods.
|
|
19
40
|
|
|
@@ -320,13 +341,14 @@ class StatsNamespace:
|
|
|
320
341
|
f"StatsNamespace.test: method={method!r} is not valid for single-sample test against a reference value. "
|
|
321
342
|
"Valid options: 't-test', 'wilcoxon'."
|
|
322
343
|
)
|
|
344
|
+
_s = pl.Series(all_vals)
|
|
323
345
|
data = pl.DataFrame(
|
|
324
346
|
{
|
|
325
347
|
"group": ["all"],
|
|
326
348
|
"n": [len(all_vals)],
|
|
327
|
-
"mean": [
|
|
328
|
-
"std": [
|
|
329
|
-
"median": [
|
|
349
|
+
"mean": [_sf(_s.mean())],
|
|
350
|
+
"std": [_sf(_s.std(ddof=1))],
|
|
351
|
+
"median": [_sf(_s.median())],
|
|
330
352
|
}
|
|
331
353
|
)
|
|
332
354
|
metadata: dict[str, Any] = {
|
|
@@ -352,9 +374,11 @@ class StatsNamespace:
|
|
|
352
374
|
{
|
|
353
375
|
"group": str(g),
|
|
354
376
|
"n": len(a),
|
|
355
|
-
"mean":
|
|
356
|
-
"std":
|
|
357
|
-
|
|
377
|
+
"mean": _sf(pl.Series(a).mean()) if len(a) else float("nan"),
|
|
378
|
+
"std": _sf(pl.Series(a).std(ddof=1))
|
|
379
|
+
if len(a) > 1
|
|
380
|
+
else float("nan"),
|
|
381
|
+
"median": _sf(pl.Series(a).median()) if len(a) else float("nan"),
|
|
358
382
|
}
|
|
359
383
|
for g, a in zip(group_names, arrays)
|
|
360
384
|
]
|
|
@@ -388,9 +412,11 @@ class StatsNamespace:
|
|
|
388
412
|
{
|
|
389
413
|
"group": f"{p[0]}-{p[1]}",
|
|
390
414
|
"n": len(a),
|
|
391
|
-
"mean":
|
|
392
|
-
"std":
|
|
393
|
-
|
|
415
|
+
"mean": _sf(pl.Series(a).mean()) if len(a) else float("nan"),
|
|
416
|
+
"std": _sf(pl.Series(a).std(ddof=1))
|
|
417
|
+
if len(a) > 1
|
|
418
|
+
else float("nan"),
|
|
419
|
+
"median": _sf(pl.Series(a).median()) if len(a) else float("nan"),
|
|
394
420
|
}
|
|
395
421
|
for p, a in zip(periods, arrays)
|
|
396
422
|
]
|
|
@@ -573,8 +599,9 @@ class StatsNamespace:
|
|
|
573
599
|
all_vals = df[column].to_numpy().astype(float)
|
|
574
600
|
|
|
575
601
|
if method is AnomalyCrossMethod.ZSCORE:
|
|
576
|
-
|
|
577
|
-
|
|
602
|
+
_rs = pl.Series(ref_vals)
|
|
603
|
+
mean = _sf(_rs.mean())
|
|
604
|
+
std = _sf(_rs.std(ddof=1))
|
|
578
605
|
scores = (all_vals - mean) / (std or 1.0)
|
|
579
606
|
flags = np.abs(scores) > threshold
|
|
580
607
|
meta: dict[str, Any] = {
|
|
@@ -586,41 +613,43 @@ class StatsNamespace:
|
|
|
586
613
|
}
|
|
587
614
|
|
|
588
615
|
elif method is AnomalyCrossMethod.IQR:
|
|
589
|
-
|
|
590
|
-
|
|
616
|
+
_rs = pl.Series(ref_vals)
|
|
617
|
+
q1 = _sf(_rs.quantile(0.25))
|
|
618
|
+
q3 = _sf(_rs.quantile(0.75))
|
|
591
619
|
iqr = q3 - q1
|
|
592
620
|
lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
|
|
593
|
-
med =
|
|
621
|
+
med = _sf(_rs.median())
|
|
594
622
|
scores = (all_vals - med) / (iqr or 1.0)
|
|
595
623
|
flags = (all_vals < lo) | (all_vals > hi)
|
|
596
624
|
meta = {
|
|
597
625
|
"mode": "cross_sectional",
|
|
598
626
|
"method": "iqr",
|
|
599
627
|
"baseline": baseline,
|
|
600
|
-
"baseline_mean":
|
|
628
|
+
"baseline_mean": _sf(_rs.mean()),
|
|
601
629
|
"q1": q1,
|
|
602
630
|
"q3": q3,
|
|
603
631
|
"iqr": iqr,
|
|
604
632
|
}
|
|
605
633
|
|
|
606
634
|
elif method is AnomalyCrossMethod.PERCENTILE:
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
635
|
+
_rs = pl.Series(ref_vals)
|
|
636
|
+
lo = _sf(_rs.quantile(0.05))
|
|
637
|
+
hi = _sf(_rs.quantile(0.95))
|
|
638
|
+
med = _sf(_rs.median())
|
|
639
|
+
std = _sf(_rs.std()) or 1.0
|
|
611
640
|
scores = (all_vals - med) / std
|
|
612
641
|
flags = (all_vals < lo) | (all_vals > hi)
|
|
613
642
|
meta = {
|
|
614
643
|
"mode": "cross_sectional",
|
|
615
644
|
"method": "percentile",
|
|
616
645
|
"baseline": baseline,
|
|
617
|
-
"baseline_mean":
|
|
646
|
+
"baseline_mean": _sf(_rs.mean()),
|
|
618
647
|
"lower_pct": lo,
|
|
619
648
|
"upper_pct": hi,
|
|
620
649
|
}
|
|
621
650
|
|
|
622
651
|
else: # threshold
|
|
623
|
-
mean =
|
|
652
|
+
mean = _sf(pl.Series(ref_vals).mean())
|
|
624
653
|
scores = all_vals - mean
|
|
625
654
|
flags = np.abs(scores) > threshold
|
|
626
655
|
meta = {
|
|
@@ -680,15 +709,17 @@ class StatsNamespace:
|
|
|
680
709
|
ts_flags: list[bool] = []
|
|
681
710
|
|
|
682
711
|
if method is AnomalyTsMethod.MAD:
|
|
683
|
-
|
|
684
|
-
|
|
712
|
+
_bs = pl.Series(base_vals)
|
|
713
|
+
med = _sf(_bs.median())
|
|
714
|
+
mad = _sf(pl.Series(np.abs(base_vals - med)).median())
|
|
685
715
|
scale = (mad * 1.4826) or 1.0
|
|
686
716
|
ts_scores = [(v - med) / scale for v in eval_vals]
|
|
687
717
|
ts_flags = [abs(s) > threshold for s in ts_scores]
|
|
688
718
|
|
|
689
719
|
elif method is AnomalyTsMethod.CUSUM:
|
|
690
|
-
|
|
691
|
-
|
|
720
|
+
_bs = pl.Series(base_vals)
|
|
721
|
+
mean = _sf(_bs.mean())
|
|
722
|
+
std = _sf(_bs.std(ddof=1)) or 1.0
|
|
692
723
|
k, h = 0.5 * std, threshold * std
|
|
693
724
|
cp, cn = 0.0, 0.0
|
|
694
725
|
for v in eval_vals:
|
|
@@ -713,7 +744,7 @@ class StatsNamespace:
|
|
|
713
744
|
base_len = len(base_vals)
|
|
714
745
|
base_resid = resid[:base_len]
|
|
715
746
|
eval_resid = resid[base_len : base_len + len(eval_years)]
|
|
716
|
-
std =
|
|
747
|
+
std = _sf(pl.Series(base_resid).std(ddof=1)) or 1.0
|
|
717
748
|
ts_scores = [float(r / std) for r in eval_resid]
|
|
718
749
|
ts_flags = [abs(s) > threshold for s in ts_scores]
|
|
719
750
|
except Exception:
|
|
@@ -755,7 +786,7 @@ class StatsNamespace:
|
|
|
755
786
|
"mode": "timeseries",
|
|
756
787
|
"method": method.value if method is not None else None,
|
|
757
788
|
"baseline": baseline,
|
|
758
|
-
"baseline_mean":
|
|
789
|
+
"baseline_mean": _sf(pl.Series(global_base_vals).mean())
|
|
759
790
|
if len(global_base_vals) > 0
|
|
760
791
|
else float("nan"),
|
|
761
792
|
"baseline_fitted": True,
|
|
@@ -895,13 +926,12 @@ class StatsNamespace:
|
|
|
895
926
|
|
|
896
927
|
if resolution_str == "static":
|
|
897
928
|
# Static: no grouping needed — one row per entity.
|
|
898
|
-
fetched = col_lf.select([key, col_name])
|
|
929
|
+
fetched = collect_lf(col_lf.select([key, col_name]))
|
|
899
930
|
else:
|
|
900
|
-
fetched = col_lf.group_by(key).agg(agg_expr)
|
|
931
|
+
fetched = collect_lf(col_lf.group_by(key).agg(agg_expr))
|
|
901
932
|
|
|
902
933
|
feat = feat.join(fetched.select([key, col_name]), on=key, how="left")
|
|
903
934
|
|
|
904
|
-
# ------------------------------------------------------------------
|
|
905
935
|
feature_cols = [c for c in feat.columns if c != key]
|
|
906
936
|
|
|
907
937
|
if not feature_cols:
|
|
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
7
7
|
import polars as pl
|
|
8
8
|
|
|
9
9
|
from core_lens.schema.profile import Resolution
|
|
10
|
+
from core_lens.utils.polars_utils import collect_lf
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
13
|
import geopandas as gpd
|
|
@@ -101,10 +102,8 @@ class Result:
|
|
|
101
102
|
import shapely.wkb as wkb
|
|
102
103
|
|
|
103
104
|
geometry_col = self.entity.geometry_col
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
crs="EPSG:4326",
|
|
107
|
-
)
|
|
105
|
+
geometries = [wkb.loads(b) for b in self.data[geometry_col].to_list()]
|
|
106
|
+
geo_series = gpd.GeoSeries(geometries, crs="EPSG:4326")
|
|
108
107
|
return gpd.GeoDataFrame(
|
|
109
108
|
self.data.drop(geometry_col).to_pandas(),
|
|
110
109
|
geometry=geo_series,
|
|
@@ -145,7 +144,7 @@ class Result:
|
|
|
145
144
|
key_cols = self.key_cols
|
|
146
145
|
static_path = self.entity._resolve(self.entity.static_path)
|
|
147
146
|
|
|
148
|
-
geo_df = (
|
|
147
|
+
geo_df = collect_lf(
|
|
149
148
|
pl.scan_parquet(static_path)
|
|
150
149
|
.select(key_cols + [geom_col])
|
|
151
150
|
.filter(
|
|
@@ -153,7 +152,6 @@ class Result:
|
|
|
153
152
|
if len(key_cols) == 1
|
|
154
153
|
else pl.lit(True)
|
|
155
154
|
)
|
|
156
|
-
.collect()
|
|
157
155
|
)
|
|
158
156
|
|
|
159
157
|
joined = self.data.join(geo_df, on=key_cols, how="left")
|
|
@@ -9,7 +9,7 @@ import polars as pl
|
|
|
9
9
|
from enum import Enum
|
|
10
10
|
|
|
11
11
|
from core_lens.schema.profile import Resolution
|
|
12
|
-
from core_lens.utils.polars_utils import scan_with_key_filter
|
|
12
|
+
from core_lens.utils.polars_utils import scan_with_key_filter, collect_lf
|
|
13
13
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
15
|
import shapely
|
|
@@ -384,7 +384,7 @@ class View:
|
|
|
384
384
|
key_values=self.keys,
|
|
385
385
|
time_expr=time_expr,
|
|
386
386
|
)
|
|
387
|
-
data = lf
|
|
387
|
+
data = collect_lf(lf)
|
|
388
388
|
|
|
389
389
|
# For fortnightly results, inject temporal grouping columns so that
|
|
390
390
|
# aggregate(by="year"), aggregate(by="season"), etc. work out of the
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Polars scan helpers with predicate pushdown for entity materialisation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
_GPU_AVAILABLE: bool | None = None  # None = not yet probed


def _gpu_available() -> bool:
    """Return ``True`` if ``cudf_polars`` can be imported (RAPIDS GPU backend).

    The probe runs at most once per process state: its outcome is stored in
    the module-level ``_GPU_AVAILABLE`` flag, and a status banner is printed
    to stdout when the probe actually runs. Later calls return the cached
    flag without importing or printing anything.

    Any ``ImportError`` is treated as "no GPU backend", not only
    ``ModuleNotFoundError``. An installed ``cudf_polars`` whose own imports
    fail (for example because a shared library is missing) therefore selects
    CPU mode instead of propagating the error to the caller.

    Returns:
        bool: Whether ``cudf_polars`` was importable when first probed.
    """
    global _GPU_AVAILABLE
    if _GPU_AVAILABLE is None:
        try:
            import cudf_polars  # noqa: F401  # type: ignore[import-untyped]
        except ImportError:
            # ModuleNotFoundError subclasses ImportError, so the plain
            # "package not installed" case is handled here as well.
            _GPU_AVAILABLE = False
            print("=" * 50)
            print("GPU : None found. Running in CPU mode.")
            print("=" * 50)
        else:
            _GPU_AVAILABLE = True
            gpu = "cudf_polars (RAPIDS)"
            print("=" * 50)
            print(f"GPU : {gpu} found. Running in GPU mode.")
            print("=" * 50)
    return _GPU_AVAILABLE
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def collect_lf(lf: pl.LazyFrame) -> pl.DataFrame:
    """Materialise a ``LazyFrame`` on the GPU when possible, else on the CPU.

    * **GPU probe succeeds** — the frame is collected with
      ``pl.GPUEngine(executor="streaming")``.
    * **GPU probe fails** — the frame is collected with Polars' CPU streaming
      engine, ``lf.collect(engine="streaming")``.

    If the GPU collect raises a ``ComputeError`` whose message mentions
    ``cuda`` or ``nvml`` (case-insensitive), the module-level
    ``_GPU_AVAILABLE`` flag is set to ``False``, a banner is printed, and the
    frame is collected on the CPU streaming engine instead. Because the flag
    is cached, later calls skip the GPU attempt. Any other ``ComputeError``
    is re-raised unchanged.

    Intended for *data* scans (materialisation, geometry joins, similarity
    fetches). NOTE(review): the original guidance was that tiny *index* scans
    feeding in-process joins should keep using a bare ``.collect()`` because
    the streaming path may change row ordering — confirm before relying on it.

    Args:
        lf: The lazy frame to collect.

    Returns:
        A materialised ``pl.DataFrame``.
    """
    global _GPU_AVAILABLE
    banner = "=" * 50
    if _gpu_available():
        gpu_engine = pl.GPUEngine(executor="streaming")
        try:
            frame = lf.collect(engine=gpu_engine)
        except pl.exceptions.ComputeError as e:
            message = str(e).lower()
            if not ("cuda" in message or "nvml" in message):
                # Not a GPU runtime problem: surface the query error as-is.
                raise
            # Remember the failure so later calls skip the GPU attempt.
            _GPU_AVAILABLE = False
            print(banner)
            print(f"GPU runtime error ({e}). Falling back to CPU mode.")
            print(banner)
        else:
            # Narrows the collect() return type for the type checker.
            assert isinstance(frame, pl.DataFrame)
            return frame
    return lf.collect(engine="streaming")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def scan_with_key_filter(
    path: str,
    key_cols: list[str],
    key_values: pl.DataFrame,
    time_expr: pl.Expr | None = None,
) -> pl.LazyFrame:
    """Build a lazy Parquet scan restricted to the given keys and time range.

    The scan starts from ``pl.scan_parquet(path)`` and stacks filters on it:

    1. **Key filter** — one ``is_in`` filter per column in ``key_cols``,
       using that column's values from ``key_values``. With a composite key
       the columns are filtered independently, so combinations that do not
       appear together in ``key_values`` can survive. NOTE(review): the
       original note says a later exact join prunes them; that join is not
       part of this function — confirm at the call sites.

    2. **Time filter** — ``time_expr`` is applied as a further filter when it
       is not ``None``.

    Nothing is read here; the filters are only attached to the lazy plan.

    Args:
        path: Path to the Parquet file to scan.
        key_cols: Column name(s) that form the entity's key.
        key_values: A ``pl.DataFrame`` holding, for each key column, the
            values to retain.
        time_expr: Optional Polars filter expression for the time column.

    Returns:
        A ``pl.LazyFrame`` with the filters applied, not yet collected.
    """
    lazy = pl.scan_parquet(path)

    # One loop covers both single-column and composite keys: every key column
    # gets its own membership filter.
    for col_name in key_cols:
        wanted = key_values[col_name].to_list()
        lazy = lazy.filter(pl.col(col_name).is_in(wanted))

    if time_expr is None:
        return lazy
    return lazy.filter(time_expr)
|