cecil 0.0.28__tar.gz → 0.0.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cecil-0.0.28 → cecil-0.0.31}/PKG-INFO +1 -1
- {cecil-0.0.28 → cecil-0.0.31}/src/cecil/client.py +121 -15
- {cecil-0.0.28 → cecil-0.0.31}/src/cecil/models.py +51 -2
- cecil-0.0.31/src/cecil/version.py +1 -0
- cecil-0.0.31/src/cecil/xarray.py +415 -0
- cecil-0.0.28/src/cecil/version.py +0 -1
- cecil-0.0.28/src/cecil/xarray.py +0 -74
- {cecil-0.0.28 → cecil-0.0.31}/.editorconfig +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/.gitignore +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/CONTRIBUTING.md +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/LICENSE.txt +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/Makefile +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/README.md +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/pyproject.toml +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/src/cecil/__init__.py +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/src/cecil/errors.py +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/tests/__init__.py +0 -0
- {cecil-0.0.28 → cecil-0.0.31}/tests/test_client.py +0 -0
{cecil-0.0.28 → cecil-0.0.31}/src/cecil/client.py

```diff
@@ -1,16 +1,15 @@
 import os
+from typing import Dict, List, Optional
+from warnings import warn
 
 import pandas as pd
 import requests
 import snowflake.connector
-import
-
+from cryptography.hazmat.primitives import serialization
 from pydantic import BaseModel
 from requests import auth
-from cryptography.hazmat.primitives import serialization
-from typing import Dict, List, Optional
-from warnings import warn
 
+import xarray
 from .errors import (
     Error,
     _handle_bad_request,
```
```diff
@@ -35,11 +34,15 @@ from .models import (
     TransformationCreate,
     User,
     UserCreate,
-
-
+    SubscriptionMetadata,
+    SubscriptionParquetFiles,
+    SubscriptionListFiles,
+    Subscription,
+    SubscriptionCreate,
 )
 from .version import __version__
 from .xarray import load_xarray
+from .xarray import load_xarray_v2
 
 
 class Client:
```
```diff
@@ -69,6 +72,11 @@ class Client:
     def create_data_request(
         self, aoi_id: str, dataset_id: str, external_ref: Optional[str] = None
     ) -> DataRequest:
+        warn(
+            "create_data_request() is deprecated, use create_subscription() instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         res = self._post(
             url="/v0/data-requests",
             model=DataRequestCreate(
```
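The deprecated methods now steer callers to the subscription API via the standard library warning machinery; `stacklevel=2` attributes the warning to the caller's line rather than to SDK internals. A minimal, runnable sketch of how a caller can surface these warnings during migration (the `_Demo` class is a stand-in, not part of the package):

```python
import warnings
from warnings import warn


class _Demo:
    """Stand-in mirroring the deprecation pattern added above."""

    def create_data_request(self):
        warn(
            "create_data_request() is deprecated, use create_subscription() instead.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller's frame
        )
        return {}


client = _Demo()

# Escalate deprecations to errors to flush out call sites in a test suite.
with warnings.catch_warnings():
    warnings.simplefilter("error", DeprecationWarning)
    try:
        client.create_data_request()
    except DeprecationWarning as e:
        print(f"needs migration: {e}")
```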
```diff
@@ -78,22 +86,120 @@
         return DataRequest(**res)
 
     def get_data_request(self, id: str) -> DataRequest:
+        warn(
+            "get_data_request() is deprecated, use get_subscription() instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         res = self._get(url=f"/v0/data-requests/{id}")
         return DataRequest(**res)
 
     def list_data_requests(self) -> List[DataRequest]:
+        warn(
+            "list_data_requests() is deprecated, use list_subscriptions() instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         res = self._get(url="/v0/data-requests")
         return [DataRequest(**record) for record in res["records"]]
 
-    def
-    res = self._get(url=
-
-
+    def list_subscriptions(self) -> List[Subscription]:
+        res = self._get(url="/v0/data-requests")
+        return [Subscription(**record) for record in res["records"]]
+
+    def create_subscription(
+        self, aoi_id: str, dataset_id: str, external_ref: Optional[str] = None
+    ) -> Subscription:
+        res = self._post(
+            url="/v0/data-requests",
+            model=SubscriptionCreate(
+                aoi_id=aoi_id, dataset_id=dataset_id, external_ref=external_ref
+            ),
+        )
+
+        return Subscription(**res)
+
+    def get_subscription(self, id: str) -> Subscription:
+        res = self._get(url=f"/v0/data-requests/{id}")
+        return Subscription(**res)
+
+    def load_xarray(
+        self,
+        subscription_id: Optional[str] = None,
+        data_request_id: Optional[str] = None,
+    ) -> xarray.Dataset:
+        if subscription_id is None and data_request_id is None:
+            raise TypeError("load_xarray() missing argument: 'subscription_id'")
+
+        if subscription_id is not None and data_request_id is not None:
+            raise ValueError(
+                "load_xarray() only accepts one argument but two were provided"
+            )
+
+        if data_request_id:
+            warn(
+                "data_request_id is deprecated, use subscription_id instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            subscription_id = data_request_id
+
+        res = SubscriptionMetadata(
+            **self._get(url=f"/v0/data-requests/{subscription_id}/metadata")
+        )
+        return load_xarray(res)
+
+    def _load_xarray_v2(
+        self,
+        subscription_id: Optional[str] = None,
+        data_request_id: Optional[str] = None,
+    ) -> xarray.Dataset:
+        if subscription_id is None and data_request_id is None:
+            raise TypeError("load_xarray_v2() missing argument: 'subscription_id'")
+
+        if subscription_id is not None and data_request_id is not None:
+            raise ValueError(
+                "load_xarray_v2() only accepts one argument but two were provided"
+            )
+
+        if data_request_id:
+            warn(
+                "data_request_id is deprecated, use subscription_id instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            subscription_id = data_request_id
+
+        res = SubscriptionListFiles(
+            **self._get(url=f"/v0/data-requests/{subscription_id}/files/tiff")
+        )
+        return load_xarray_v2(res)
 
-    def load_dataframe(
-
-
-
+    def load_dataframe(
+        self,
+        subscription_id: Optional[str] = None,
+        data_request_id: Optional[str] = None,
+    ) -> pd.DataFrame:
+        if subscription_id is None and data_request_id is None:
+            raise TypeError("load_dataframe missing argument: 'subscription_id'")
+
+        if subscription_id is not None and data_request_id is not None:
+            raise ValueError(
+                "load_dataframe only accepts one argument but two were provided"
+            )
+
+        if data_request_id:
+            warn(
+                "data_request_id is deprecated, use subscription_id instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            subscription_id = data_request_id
+
+        res = SubscriptionParquetFiles(
+            **self._get(url=f"/v0/data-requests/{subscription_id}/parquet-files")
+        )
+        df = pd.concat((pd.read_parquet(f) for f in res.files))
         return df[
             [col for col in df.columns if col not in ("organisation_id", "created_at")]
         ]
```
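Taken together, this hunk adds a subscription-named surface over the same `/v0/data-requests` endpoints and lets `load_xarray`/`load_dataframe` accept either keyword during the transition. A usage sketch, assuming the package re-exports `Client` at the top level and that credentials come from the environment; the IDs below are placeholders:

```python
from cecil import Client

client = Client()  # assumption: no-argument construction works

# New naming: subscriptions instead of data requests.
sub = client.create_subscription(aoi_id="aoi-123", dataset_id="ds-456")

# Tabular access: the subscription's parquet files, concatenated.
df = client.load_dataframe(subscription_id=sub.id)

# Raster access: the subscription's files assembled into an xarray.Dataset.
ds = client.load_xarray(subscription_id=sub.id)

# The old keyword still works, but now emits a DeprecationWarning.
ds_legacy = client.load_xarray(data_request_id=sub.id)
```

Per the guards above, passing both keywords raises `ValueError` and passing neither raises `TypeError`.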
{cecil-0.0.28 → cecil-0.0.31}/src/cecil/models.py

```diff
@@ -126,7 +126,7 @@ class File(BaseModel):
     bands: List[Band]
 
 
-class DataRequestMetadata(BaseModel):
+class SubscriptionMetadata(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
     provider_name: str
     dataset_id: str
```
```diff
@@ -137,6 +137,55 @@ class DataRequestMetadata(BaseModel):
     files: List[File]
 
 
-class
+class Bucket(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    name: str
+    prefix: str
+
+
+class BucketCredentials(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    access_key_id: str
+    secret_access_key: str
+    session_token: str
+    expiration: datetime.datetime
+
+
+class FileMapping(BaseModel):
+    type: str
+    bands: List
+
+
+class SubscriptionListFiles(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    provider_name: str
+    dataset_id: str
+    dataset_name: str
+    aoi_id: str
+    data_request_id: str
+    bucket: Bucket
+    credentials: BucketCredentials
+    allowed_actions: List
+    file_mapping: Dict[str, FileMapping]
+
+
+class SubscriptionParquetFiles(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
     files: List[str]
+
+
+class Subscription(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    id: str
+    aoi_id: str
+    dataset_id: str
+    external_ref: Optional[str]
+    created_at: datetime.datetime
+    created_by: str
+
+
+class SubscriptionCreate(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    aoi_id: str
+    dataset_id: str
+    external_ref: Optional[str]
```
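All of the new models share the `to_camel` alias generator with `populate_by_name=True`, so camelCase API payloads validate into snake_case attributes and can be dumped back out in either shape. A self-contained sketch of that behavior, assuming pydantic v2's `pydantic.alias_generators.to_camel` (the payload values are illustrative):

```python
import datetime
from typing import Optional

from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class Subscription(BaseModel):
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
    id: str
    aoi_id: str
    dataset_id: str
    external_ref: Optional[str]
    created_at: datetime.datetime
    created_by: str


payload = {
    "id": "sub-1",
    "aoiId": "aoi-123",  # camelCase keys, as the API would return them
    "datasetId": "ds-456",
    "externalRef": None,
    "createdAt": "2024-01-01T00:00:00Z",
    "createdBy": "user-1",
}

sub = Subscription(**payload)  # aliases accepted on input
print(sub.aoi_id)  # snake_case attribute access
print(sub.model_dump(by_alias=True))  # camelCase on the way out
```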
cecil-0.0.31/src/cecil/version.py ADDED

```diff
@@ -0,0 +1 @@
+__version__ = "0.0.31"
```
cecil-0.0.31/src/cecil/xarray.py ADDED

```diff
@@ -0,0 +1,415 @@
+import re
+import time
+from datetime import datetime
+
+import boto3
+import dask
+import rasterio
+import rasterio.session
+import rioxarray
+import xarray
+
+from .errors import Error
+from .models import SubscriptionMetadata, SubscriptionListFiles
+
+# v1
+
+
+def load_xarray(metadata: SubscriptionMetadata) -> xarray.Dataset:
+    data_vars = {}
+
+    for f in metadata.files:
+        try:
+            dataset = _retry_with_exponential_backoff(_load_file, 5, 1, 2, f.url)
+        except Exception as e:
+            raise ValueError(f"failed to load file: {e}")
+
+        for b in f.bands:
+            band = dataset.sel(band=b.number, drop=True)
+
+            if b.time and b.time_pattern:
+                t = datetime.strptime(b.time, b.time_pattern)
+                band = band.expand_dims("time")
+                band = band.assign_coords(time=[t])
+
+            band.name = b.variable_name
+
+            if b.variable_name not in data_vars:
+                data_vars[b.variable_name] = []
+
+            data_vars[b.variable_name].append(band)
+
+    for variable_name, time_series in data_vars.items():
+        if "time" in time_series[0].dims:
+            data_vars[variable_name] = xarray.concat(
+                time_series, dim="time", join="exact"
+            )
+        else:
+            data_vars[variable_name] = time_series[0]
+
+    return xarray.Dataset(
+        data_vars=data_vars,
+        attrs={
+            "provider_name": metadata.provider_name,
+            "dataset_name": metadata.dataset_name,
+            "dataset_id": metadata.dataset_id,
+            "aoi_id": metadata.aoi_id,
+            "subscription_id": metadata.data_request_id,
+        },
+    )
+
+
+def _retry_with_exponential_backoff(
+    func, retries, start_delay, multiplier, *args, **kwargs
+):
+    delay = start_delay
+    for attempt in range(1, retries + 1):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            if attempt == retries:
+                raise e
+            time.sleep(delay)
+            delay *= multiplier
+    return None
+
+
+def _load_file(url: str):
+    return rioxarray.open_rasterio(
+        url,
+        chunks={"x": 2000, "y": 2000},
+    )
+
+
```
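The v1 loader wraps each file open in `_retry_with_exponential_backoff(_load_file, 5, 1, 2, f.url)`: five attempts with 1 s, 2 s, 4 s, and 8 s pauses between them, re-raising the final failure. A runnable sketch of those semantics, with a stand-in for the network call (the helper is reproduced from above to keep the sketch self-contained):

```python
import time


def _retry_with_exponential_backoff(func, retries, start_delay, multiplier, *args, **kwargs):
    delay = start_delay
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception:
            if attempt == retries:
                raise
            time.sleep(delay)
            delay *= multiplier


attempts = {"n": 0}


def flaky_open(url):
    # Stand-in for the rioxarray open in _load_file: fails twice, then succeeds.
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise IOError("transient S3 error")
    return f"dataset for {url}"


print(_retry_with_exponential_backoff(flaky_open, 5, 0.01, 2, "s3://bucket/key.tif"))
# -> "dataset for s3://bucket/key.tif" on the third attempt
```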
```diff
+# v2
+
+
+def load_xarray_v2(res: SubscriptionListFiles) -> xarray.Dataset:
+    session = boto3.session.Session(
+        aws_access_key_id=res.credentials.access_key_id,
+        aws_secret_access_key=res.credentials.secret_access_key,
+        aws_session_token=res.credentials.session_token,
+    )
+
+    keys = _list_keys_v2(session, res.bucket.name, res.bucket.prefix)
+
+    if not keys:
+        return xarray.Dataset()
+
+    timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
+    data_vars = {}
+
+    for key in keys:
+        try:
+            file_da = _retry_with_exponential_backoff(
+                _load_file_v2,
+                5,
+                1,
+                2,
+                session,
+                f"s3://{res.bucket.name}/{key}",
+            )
+        except Exception as e:
+            raise ValueError(f"failed to load file: {e}")
+
+        filename = key.split("/")[-1]
+
+        file_info = res.file_mapping.get(filename)
+        if not file_info:
+            continue
+
+        timestamp_str = timestamp_pattern.search(key).group()
+
+        for band_num, var_name in enumerate(file_info.bands, start=1):
+            band_da = file_da.sel(band=band_num, drop=True)
+            band_da.name = var_name
+
+            # Dataset with time dimension
+            if timestamp_str != "0000/00/00/00/00/00":
+                t = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
+                band_da = band_da.expand_dims("time")
+                band_da = band_da.assign_coords(time=[t])
+
+            if var_name not in data_vars:
+                data_vars[var_name] = []
+
+            data_vars[var_name].append(band_da)
+
+    for var_name, time_series in data_vars.items():
+        if "time" in time_series[0].dims:
+            data_vars[var_name] = xarray.concat(time_series, dim="time", join="exact")
+        else:
+            data_vars[var_name] = time_series[0]
+
+    return xarray.Dataset(
+        data_vars=data_vars,
+        attrs={
+            "provider_name": res.provider_name,
+            "dataset_name": res.dataset_name,
+            "dataset_id": res.dataset_id,
+            "aoi_id": res.aoi_id,
+            "subscription_id": res.data_request_id,
+        },
+    )
+
+
+def _list_keys_v2(session: boto3.session.Session, bucket_name, prefix) -> list[str]:
+    s3_client = session.client("s3")
+    paginator = s3_client.get_paginator("list_objects_v2")
+    page_iterator = paginator.paginate(
+        Bucket=bucket_name,
+        Prefix=prefix,
+    )
+
+    keys = []
+    for page in page_iterator:
+        for obj in page.get("Contents", []):
+            keys.append(obj["Key"])
+
+    return keys
+
+
+def _load_file_v2(aws_session: boto3.session.Session, url: str):
+    with rasterio.env.Env(
+        session=rasterio.session.AWSSession(aws_session),
+        GDAL_DISABLE_READDIR_ON_OPEN=True,
+    ):
+        return rioxarray.open_rasterio(
+            url,
+            chunks={"x": 2000, "y": 2000},
+        )
+
+
```
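Where v1 opened each file by the URL in the metadata response, v2 lists the bucket itself and threads the temporary credentials from the API (note the `session_token` and `expiration` fields on `BucketCredentials`) into GDAL through `rasterio.session.AWSSession`. The core plumbing in isolation; the bucket, key, and credential values are placeholders:

```python
import boto3
import rasterio
import rasterio.session
import rioxarray

session = boto3.session.Session(
    aws_access_key_id="ASIA...",  # placeholder temporary credentials
    aws_secret_access_key="...",
    aws_session_token="...",
)

# rasterio.env.Env activates the AWS session (plus GDAL config options) for
# GDAL operations performed within the block.
with rasterio.env.Env(
    session=rasterio.session.AWSSession(session),
    GDAL_DISABLE_READDIR_ON_OPEN=True,  # skip the sibling-file listing per open
):
    da = rioxarray.open_rasterio(
        "s3://example-bucket/prefix/file.tif",  # placeholder object
        chunks={"x": 2000, "y": 2000},  # dask-backed, 2000x2000 chunks
    )
```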
```diff
+# v3
+
+
+def load_xarray_v3(res: SubscriptionListFiles) -> xarray.Dataset:
+    session = boto3.session.Session(
+        aws_access_key_id=res.credentials.access_key_id,
+        aws_secret_access_key=res.credentials.secret_access_key,
+        aws_session_token=res.credentials.session_token,
+    )
+
+    keys = _list_keys_v3(session, res.bucket.name, res.bucket.prefix)
+
+    if not keys:
+        return xarray.Dataset()
+
+    timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
+    data_vars = {}
+
+    with rasterio.env.Env(
+        session=rasterio.session.AWSSession(session),
+        GDAL_DISABLE_READDIR_ON_OPEN=True,
+    ):
+        first_file = rioxarray.open_rasterio(
+            f"s3://{res.bucket.name}/{keys[0]}", chunks="auto"
+        )
+
+    for key in keys:
+        filename = key.split("/")[-1]
+
+        file_info = res.file_mapping.get(filename)
+        if not file_info:
+            continue
+
+        lazy_array = dask.array.from_delayed(
+            dask.delayed(_load_file_v3)(session, f"s3://{res.bucket.name}/{key}"),
+            shape=first_file.shape,
+            dtype=file_info.type,
+        )
+        lazy_da = xarray.DataArray(
+            lazy_array,
+            dims=first_file.dims,
+            coords=dict(first_file.coords),
+            # attrs=first_file.attrs.copy()  # TODO: not the same for all files
+        )
+        # lazy_da.encoding = first_file.encoding.copy()
+        # lazy_da.rio.write_crs(first_file.rio.crs, inplace=True)
+        # lazy_da.rio.write_transform(first_file.rio.transform(), inplace=True)
+
+        timestamp_str = timestamp_pattern.search(key).group()
+
+        for band_num, var_name in enumerate(file_info.bands, start=1):
+            band_da = lazy_da.sel(band=band_num, drop=True)
+            band_da.name = var_name
+
+            # Dataset with time dimension
+            if timestamp_str != "0000/00/00/00/00/00":
+                t = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
+                band_da = band_da.expand_dims("time")
+                band_da = band_da.assign_coords(time=[t])
+
+            if var_name not in data_vars:
+                data_vars[var_name] = []
+
+            data_vars[var_name].append(band_da)
+
+    for var_name, time_series in data_vars.items():
+        if "time" in time_series[0].dims:
+            data_vars[var_name] = xarray.concat(time_series, dim="time", join="exact")
+        else:
+            data_vars[var_name] = time_series[0]
+
+    return xarray.Dataset(
+        data_vars=data_vars,
+        attrs={
+            "provider_name": res.provider_name,
+            "dataset_name": res.dataset_name,
+            "dataset_id": res.dataset_id,
+            "aoi_id": res.aoi_id,
+            "subscription_id": res.data_request_id,
+        },
+    )
+
+
+def _load_file_v3(aws_session: boto3.session.Session, url: str):
+    with rasterio.env.Env(
+        session=rasterio.session.AWSSession(aws_session),
+        GDAL_DISABLE_READDIR_ON_OPEN=True,
+    ):
+        return rioxarray.open_rasterio(
+            url,
+            chunks="auto",
+        ).values
+        # ).sel(band=num_band, drop=True)
+        # ).sel(band=num_band, drop=True).values
+        # ).isel(band=num_band-1).values
+
+
+def _list_keys_v3(session: boto3.session.Session, bucket_name, prefix) -> list[str]:
+    s3_client = session.client("s3")
+    paginator = s3_client.get_paginator("list_objects_v2")
+    page_iterator = paginator.paginate(
+        Bucket=bucket_name,
+        Prefix=prefix,
+    )
+
+    keys = []
+    for page in page_iterator:
+        for obj in page.get("Contents", []):
+            keys.append(obj["Key"])
+
+    return keys
+
+
```
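v3's change is deferring every file read behind `dask.delayed`: one reference file is opened eagerly for shape/dims/coords, the rest become graph nodes, and `_load_file_v3` only runs when something is computed. The trade-off, visible in the commented-out attrs/CRS lines above, is that metadata taken from the reference file may not hold for every file. The mechanism in miniature, with a synthetic shape, dtype, and reader in place of the S3-backed ones:

```python
import dask
import dask.array
import numpy as np
import xarray


def read_band_stack(path):
    # Stand-in for _load_file_v3's rioxarray read returning raw values.
    return np.zeros((1, 512, 512), dtype="float32")


lazy = dask.array.from_delayed(
    dask.delayed(read_band_stack)("s3://example-bucket/a.tif"),
    shape=(1, 512, 512),  # must match what the reader actually returns
    dtype="float32",
)
da = xarray.DataArray(lazy, dims=("band", "y", "x"))

print(da)  # graph only; no read has happened yet
print(float(da.sum().compute()))  # the read executes here
```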
```diff
+# v4
+
+
+def load_xarray_v4(res: SubscriptionListFiles) -> xarray.Dataset:
+
+    session = boto3.session.Session(
+        aws_access_key_id=res.credentials.access_key_id,
+        aws_secret_access_key=res.credentials.secret_access_key,
+        aws_session_token=res.credentials.session_token,
+    )
+
+    keys = _list_keys_v2(session, res.bucket.name, res.bucket.prefix)
+
+    if not keys:
+        return xarray.Dataset()
+
+    first_file_metadata = _get_file_metadata_v4(session, res.bucket.name, keys[0])
+
+    timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
+
+    data_vars = {}
+    for key in keys:
+        filename = key.split("/")[-1].rsplit(".", 1)[0]
+
+        file_info = res.file_mapping.get(filename)
+        if not file_info:
+            continue
+
+        timestamp_str = timestamp_pattern.search(key).group()
+
+        for band_num, band_name in enumerate(file_info.bands, start=1):
+            array = _create_dask_array_v4(
+                session,
+                f"s3://{res.bucket.name}/{key}",
+                band_num,
+                first_file_metadata["height"],
+                first_file_metadata["width"],
+                file_info.type,
+            )
+            da = xarray.DataArray(
+                array,
+                dims=("y", "x"),
+            )
+            da.name = band_name
+
+            # Dataset with time dimension
+            if timestamp_str != "0000/00/00/00/00/00":
+                time = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
+                da = da.expand_dims("time")
+                da = da.assign_coords(time=[time])
+
+            if band_name not in data_vars:
+                data_vars[band_name] = []
+
+            data_vars[band_name].append(da)
+
+    for variable_name, time_series in data_vars.items():
+        if "time" in time_series[0].dims:
+            data_vars[variable_name] = xarray.concat(
+                time_series,
+                dim="time",
+                join="exact",
+            )
+        else:
+            data_vars[variable_name] = time_series[0]
+
+    ds = xarray.Dataset(
+        data_vars=data_vars,
+        coords={
+            "y": first_file_metadata["y"],
+            "x": first_file_metadata["x"],
+        },
+        attrs={
+            "provider_name": res.provider_name,
+            "dataset_name": res.dataset_name,
+            "dataset_id": res.dataset_id,
+            "aoi_id": res.aoi_id,
+            "subscription_id": res.data_request_id,
+        },
+    )
+    ds = ds.rio.write_crs(first_file_metadata["crs"])
+
+    return ds
+
+
+def _get_file_metadata_v4(session, bucket: str, path: str):
+    with rasterio.env.Env(
+        rasterio.session.AWSSession(session), GDAL_DISABLE_READDIR_ON_OPEN=True
+    ):
+        da = xarray.open_dataarray(f"s3://{bucket}/{path}", engine="rasterio")
+
+    return {
+        "crs": da.rio.crs,
+        "height": da.rio.height,
+        "width": da.rio.width,
+        "x": da.x.values,
+        "y": da.y.values,
+    }
+
+
+def _create_dask_array_v4(
+    session: boto3.session.Session,
+    file_path: str,
+    band_num: int,
+    height: int,
+    width: int,
+    dtype: str,
+):
+    rasterio_session = rasterio.session.AWSSession(session)
+
+    def read_chunk():
+        with rasterio.env.Env(
+            session=rasterio_session, GDAL_DISABLE_READDIR_ON_OPEN=True
+        ):
+            with rasterio.open(file_path) as src:
+                return src.read(band_num)
+
+    return dask.array.from_delayed(
+        dask.delayed(read_chunk)(), shape=(height, width), dtype=dtype
+    )
```
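v4 pushes the delayed boundary down to a single `src.read(band_num)` per band, with shape, dtype, coordinates, and CRS all supplied from one metadata probe of the first file, so the Dataset is assembled without reading pixel data beyond that probe. Downstream, everything stays lazy until computed; a consumption sketch, where `"ndvi"` is a hypothetical variable name (real names come from the `file_mapping` bands):

```python
import xarray

ds = xarray.Dataset()  # stand-in for load_xarray_v4(res)

if "ndvi" in ds.data_vars:  # hypothetical variable name
    ndvi = ds["ndvi"]
    # Selection and reduction stay lazy on the dask-backed arrays...
    monthly_mean = ndvi.sel(time=slice("2024-01-01", "2024-06-30")).mean("time")
    # ...and the per-band S3 reads only happen at compute time.
    result = monthly_mean.compute()
```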
cecil-0.0.28/src/cecil/version.py DELETED

```diff
@@ -1 +0,0 @@
-__version__ = "0.0.28"
```
cecil-0.0.28/src/cecil/xarray.py DELETED

```diff
@@ -1,74 +0,0 @@
-import os
-import rioxarray
-import xarray
-
-from datetime import datetime
-
-from .errors import Error
-from .models import DataRequestMetadata
-
-os.environ["GDAL_NUM_THREADS"] = "1"
-os.environ["GDAL_DISABLE_READDIR_ON_OPEN"] = "FALSE"
-
-
-def align_pixel_grids(time_series):
-    # Use the first timestep as reference
-    reference_da = time_series[0]
-    aligned_series = [reference_da]
-
-    # Align all other timesteps to the reference grid
-    for i, da in enumerate(time_series[1:], 1):
-        try:
-            aligned_da = da.rio.reproject_match(reference_da)
-            aligned_series.append(aligned_da)
-        except Exception as e:
-            raise Error
-            continue
-
-    return aligned_series
-
-
-def load_xarray(metadata: DataRequestMetadata) -> xarray.Dataset:
-    data_vars = {}
-
-    for f in metadata.files:
-        dataset = rioxarray.open_rasterio(
-            f.url,
-            chunks={"x": 2000, "y": 2000},
-        )
-
-        for b in f.bands:
-            band = dataset.sel(band=b.number, drop=True)
-
-            if b.time and b.time_pattern:
-                time = datetime.strptime(b.time, b.time_pattern)
-                band = band.expand_dims("time")
-                band = band.assign_coords(time=[time])
-
-            band.name = b.variable_name
-
-            if b.variable_name not in data_vars:
-                data_vars[b.variable_name] = []
-
-            data_vars[b.variable_name].append(band)
-
-    for variable_name, time_series in data_vars.items():
-        if "time" in time_series[0].dims:
-            # time_series = align_pixel_grids(time_series)
-            data_vars[variable_name] = xarray.concat(
-                time_series, dim="time", join="exact"
-            )
-        else:
-            data_vars[variable_name] = time_series[0]
-
-    return xarray.Dataset(
-        data_vars=data_vars,
-        attrs={
-            "provider_name": metadata.provider_name,
-            "dataset_id": metadata.dataset_id,
-            "dataset_name": metadata.dataset_name,
-            "dataset_crs": metadata.dataset_crs,
-            "aoi_id": metadata.aoi_id,
-            "data_request_id": metadata.data_request_id,
-        },
-    )
```
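The deleted module also drops `align_pixel_grids`, which resampled each timestep onto the first timestep's grid via rioxarray's `reproject_match` before the strict `join="exact"` concat (note the unreachable `continue` after `raise Error` in the removed code). For reference, a sketch of what `reproject_match` does, on synthetic arrays standing in for real rasters:

```python
import numpy as np
import rioxarray  # noqa: F401  (registers the .rio accessor)
import xarray

reference = xarray.DataArray(
    np.zeros((10, 10), dtype="float32"),
    dims=("y", "x"),
    coords={"y": np.arange(10.0), "x": np.arange(10.0)},
).rio.write_crs("EPSG:4326")

# Same area, but on a half-pixel-shifted grid.
shifted = xarray.DataArray(
    np.ones((10, 10), dtype="float32"),
    dims=("y", "x"),
    coords={"y": np.arange(10.0) + 0.5, "x": np.arange(10.0) + 0.5},
).rio.write_crs("EPSG:4326")

# Resample onto the reference grid; CRS, transform, and shape all match
# afterwards, which is what xarray.concat(..., join="exact") requires.
aligned = shifted.rio.reproject_match(reference)
assert aligned.shape == reference.shape
```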