zagg 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zagg/__init__.py +51 -0
- zagg/__main__.py +86 -0
- zagg/_version.py +24 -0
- zagg/auth.py +66 -0
- zagg/catalog.py +622 -0
- zagg/config.py +402 -0
- zagg/configs/__init__.py +0 -0
- zagg/configs/atl06.yaml +73 -0
- zagg/processing.py +442 -0
- zagg/runner.py +519 -0
- zagg/schema.py +165 -0
- zagg/store.py +75 -0
- zagg-0.1.0.dist-info/METADATA +217 -0
- zagg-0.1.0.dist-info/RECORD +17 -0
- zagg-0.1.0.dist-info/WHEEL +4 -0
- zagg-0.1.0.dist-info/entry_points.txt +2 -0
- zagg-0.1.0.dist-info/licenses/LICENSE +21 -0
zagg/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
zagg - Multi-resolution Aggregation
|
|
3
|
+
|
|
4
|
+
Multi-resolution aggregation using morton/healpix indexing.
|
|
5
|
+
|
|
6
|
+
This package provides cloud-agnostic processing functions that can be deployed
|
|
7
|
+
to various cloud platforms (AWS Lambda, GCP Cloud Functions, Azure Functions, etc.)
|
|
8
|
+
or used for local processing.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from ._version import __version__
|
|
13
|
+
except ImportError:
|
|
14
|
+
__version__ = "0.0.0+unknown"
|
|
15
|
+
|
|
16
|
+
from .auth import get_edl_token, get_nsidc_s3_credentials
|
|
17
|
+
from .config import (
|
|
18
|
+
PipelineConfig,
|
|
19
|
+
default_config,
|
|
20
|
+
get_child_order,
|
|
21
|
+
get_driver,
|
|
22
|
+
get_store_path,
|
|
23
|
+
load_config,
|
|
24
|
+
)
|
|
25
|
+
from .processing import (
|
|
26
|
+
calculate_cell_statistics,
|
|
27
|
+
process_morton_cell,
|
|
28
|
+
write_dataframe_to_zarr,
|
|
29
|
+
)
|
|
30
|
+
from .runner import agg
|
|
31
|
+
from .schema import xdggs_spec, xdggs_zarr_template
|
|
32
|
+
from .store import open_store, parse_s3_path
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"PipelineConfig",
|
|
36
|
+
"calculate_cell_statistics",
|
|
37
|
+
"default_config",
|
|
38
|
+
"get_child_order",
|
|
39
|
+
"get_driver",
|
|
40
|
+
"get_edl_token",
|
|
41
|
+
"get_nsidc_s3_credentials",
|
|
42
|
+
"get_store_path",
|
|
43
|
+
"load_config",
|
|
44
|
+
"open_store",
|
|
45
|
+
"parse_s3_path",
|
|
46
|
+
"process_morton_cell",
|
|
47
|
+
"agg",
|
|
48
|
+
"write_dataframe_to_zarr",
|
|
49
|
+
"xdggs_spec",
|
|
50
|
+
"xdggs_zarr_template",
|
|
51
|
+
]
|
zagg/__main__.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""CLI entry point for zagg processing.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python -m zagg --config atl06.yaml --catalog catalog.json
|
|
5
|
+
python -m zagg --config atl06.yaml --catalog catalog.json --store ./test.zarr
|
|
6
|
+
python -m zagg --config atl06.yaml --catalog catalog.json --max-cells 5
|
|
7
|
+
python -m zagg --config atl06.yaml --catalog catalog.json --backend lambda
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
from zagg.config import load_config
|
|
15
|
+
from zagg.runner import agg
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def main():
    """Run the zagg command-line interface.

    Parses the command-line options, configures INFO-level logging with a
    message-only format, loads the pipeline config named by ``--config``,
    forwards the remaining options to ``agg`` and prints a short summary of
    the returned results dictionary.
    """
    # NOTE(review): whitespace inside this text is reproduced exactly as it
    # appears in the extracted source; confirm against the upstream file.
    usage_examples = """
examples:
python -m zagg --config atl06.yaml --catalog catalog.json
python -m zagg --config atl06.yaml --catalog catalog.json --store ./test.zarr
python -m zagg --config atl06.yaml --catalog catalog.json --max-cells 5
python -m zagg --config atl06.yaml --catalog catalog.json --backend lambda
"""
    cli = argparse.ArgumentParser(
        description="zagg processing runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # One (flag, settings) entry per option; registration order is the order
    # in which the options show up in ``--help``.
    option_table = [
        ("--config", dict(required=True, help="Path to pipeline config YAML")),
        ("--catalog", dict(default=None, help="Path to granule catalog JSON (overrides config)")),
        ("--store", dict(default=None, help="Output store path (overrides config)")),
        ("--backend", dict(default="local", choices=["local", "lambda"],
                           help="Execution backend (default: local)")),
        ("--driver", dict(default=None, choices=["s3", "https"],
                          help="Data access driver (default: from config, or s3)")),
        ("--max-cells", dict(type=int, default=None, help="Limit number of cells (for testing)")),
        ("--morton-cell", dict(type=str, default=None, help="Process a specific morton cell")),
        ("--max-workers", dict(type=int, default=None, help="Max concurrent workers")),
        ("--overwrite", dict(action="store_true", help="Overwrite existing Zarr template")),
        ("--dry-run", dict(action="store_true", help="Show what would be processed")),
        ("--region", dict(default="us-west-2", help="AWS region (default: us-west-2)")),
        ("--function-name", dict(
            default=os.environ.get("ZAGG_LAMBDA_FUNCTION_NAME", "process-morton-cell"),
            help="Lambda function name (default: env ZAGG_LAMBDA_FUNCTION_NAME or 'process-morton-cell')",
        )),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    args = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    config = load_config(args.config)

    # Every option except --config is handed to agg() as a keyword argument.
    run_options = {
        "catalog": args.catalog,
        "store": args.store,
        "backend": args.backend,
        "driver": args.driver,
        "max_cells": args.max_cells,
        "morton_cell": args.morton_cell,
        "max_workers": args.max_workers,
        "overwrite": args.overwrite,
        "dry_run": args.dry_run,
        "function_name": args.function_name,
        "region": args.region,
    }
    results = agg(config, **run_options)

    if args.dry_run:
        # Dry run: report the planned work only.
        print(f"\n[DRY RUN] Would process {results['total_cells']} cells")
        granule_line = (
            f" Granules per cell: min={results['granules_per_cell_min']}, "
            f"max={results['granules_per_cell_max']}, "
            f"avg={results['granules_per_cell_avg']:.1f}"
        )
        print(granule_line)
        print(f" Output: {results['store_path']}")
        return

    done_line = (
        f"\nDone: {results['cells_with_data']} cells with data, "
        f"{results['total_obs']:,} obs, {results['cells_error']} errors, "
        f"{results['wall_time_s']:.1f}s"
    )
    print(done_line)
    # The compute/cost line is printed only when the results carry an estimate.
    if "estimated_cost_usd" in results:
        cost_line = (
            f"Lambda compute: {results['lambda_time_s']:.0f}s total, "
            f"{results['gb_seconds']:.0f} GB-s, ~${results['estimated_cost_usd']:.2f}"
        )
        print(cost_line)
    print(f"Output: {results['store_path']}")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
if __name__ == "__main__":
|
|
86
|
+
main()
|
zagg/_version.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '0.1.0'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 0)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = None
|
zagg/auth.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Orchestrator authentication helpers for NASA Earthdata access.
|
|
3
|
+
|
|
4
|
+
Two credential types:
|
|
5
|
+
|
|
6
|
+
- **S3**: ``get_nsidc_s3_credentials()`` returns STS temporary credentials
|
|
7
|
+
for direct S3 access. Only works from within us-west-2.
|
|
8
|
+
- **HTTPS**: ``get_edl_token()`` returns a bearer token for HTTPS access.
|
|
9
|
+
Works from anywhere.
|
|
10
|
+
|
|
11
|
+
Call ONCE in the orchestrator before processing. Credentials are valid
|
|
12
|
+
for approximately 1 hour.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import earthaccess
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_edl_token() -> str:
    """Log in to Earthdata and return the bearer token for HTTPS data access.

    Unlike the S3 credentials, the token is not region-restricted and works
    from any network location. It is the token consumed by h5coro's
    HTTPDriver.

    Returns
    -------
    str
        The ``access_token`` entry of the login session's token.
    """
    return earthaccess.login().token["access_token"]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_nsidc_s3_credentials() -> dict:
    """
    Log in to NASA Earthdata and fetch temporary S3 credentials for NSIDC.

    Intended to be called ONCE by the orchestrator before it invokes any
    Lambda functions. The credentials last roughly 1 hour, which exceeds
    the Lambda maximum execution time of 15 minutes.

    Returns
    -------
    dict
        S3 credentials with keys:
        - accessKeyId: str
        - secretAccessKey: str
        - sessionToken: str
        - expiration: str (ISO timestamp)

    Examples
    --------

    ```python
    creds = get_nsidc_s3_credentials()
    print(f"Credentials expire: {creds.get('expiration')}")

    # Forward the credentials in the Lambda event payload
    event = {
        "parent_morton": -6134114,
        "s3_credentials": creds,
        # ... other params
    }
    ```
    """
    nsidc_credentials = earthaccess.login().get_s3_credentials(daac="NSIDC")
    return nsidc_credentials
|