cecil 0.0.31__tar.gz → 0.0.35__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
@@ -9,3 +9,4 @@ __pycache__
  dist
  tmp
  venv
+ .venv
@@ -0,0 +1,23 @@
+ ## Development installation
+
+ Install packaging/distribution tools and linter:
+
+ ```shell
+ pip install hatch twine black
+ ```
+
+ From top-level repo directory, install the package in editable mode:
+
+ ```shell
+ pip install -e .
+ ```
+
+ Local edits to the package will immediately take effect.
+
+ Get the PyPI Test API Key from 1Password and add it to `~/.pypirc`:
+
+ ```bash
+ [testpypi]
+ username = __token__
+ password = <PyPI Test API Key>
+ ```
cecil-0.0.35/PKG-INFO ADDED
@@ -0,0 +1,24 @@
+ Metadata-Version: 2.4
+ Name: cecil
+ Version: 0.0.35
+ Summary: Python SDK for Cecil Earth
+ License-Expression: MIT
+ License-File: LICENSE.txt
+ Classifier: Development Status :: 4 - Beta
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.10
+ Requires-Dist: dask==2025.11.0
+ Requires-Dist: pydantic<3.0.0,>=2.11.9
+ Requires-Dist: requests<3.0.0,>=2.32.5
+ Requires-Dist: rioxarray==0.19.0
+ Requires-Dist: snowflake-connector-python[pandas]<4.0.0,>=3.17.4
+ Requires-Dist: xarray==2025.11.0
+ Description-Content-Type: text/markdown
+
+ # Cecil SDK
+
+ Please refer to the Cecil documentation:
+
+ https://docs.cecil.earth
cecil-0.0.35/README.md ADDED
@@ -0,0 +1,5 @@
+ # Cecil SDK
+
+ Please refer to the Cecil documentation:
+
+ https://docs.cecil.earth
@@ -16,12 +16,12 @@ classifiers = [
  "Operating System :: OS Independent",
  ]
  dependencies = [
- "dask==2025.9.1",
+ "dask==2025.11.0",
  "pydantic>=2.11.9,<3.0.0",
  "requests>=2.32.5,<3.0.0",
  "rioxarray==0.19.0",
  "snowflake-connector-python[pandas]>=3.17.4,<4.0.0",
- "xarray==2025.6.1"
+ "xarray==2025.11.0"
  ]

  [tool.hatch.version]
@@ -104,14 +104,14 @@ class Client:
  return [DataRequest(**record) for record in res["records"]]

  def list_subscriptions(self) -> List[Subscription]:
- res = self._get(url="/v0/data-requests")
+ res = self._get(url="/v0/subscriptions")
  return [Subscription(**record) for record in res["records"]]

  def create_subscription(
  self, aoi_id: str, dataset_id: str, external_ref: Optional[str] = None
  ) -> Subscription:
  res = self._post(
- url="/v0/data-requests",
+ url="/v0/subscriptions",
  model=SubscriptionCreate(
  aoi_id=aoi_id, dataset_id=dataset_id, external_ref=external_ref
  ),
@@ -120,7 +120,7 @@ class Client:
  return Subscription(**res)

  def get_subscription(self, id: str) -> Subscription:
- res = self._get(url=f"/v0/data-requests/{id}")
+ res = self._get(url=f"/v0/subscriptions/{id}")
  return Subscription(**res)

  def load_xarray(
@@ -145,7 +145,7 @@ class Client:
  subscription_id = data_request_id

  res = SubscriptionMetadata(
- **self._get(url=f"/v0/data-requests/{subscription_id}/metadata")
+ **self._get(url=f"/v0/subscriptions/{subscription_id}/metadata")
  )
  return load_xarray(res)

@@ -171,7 +171,7 @@ class Client:
  subscription_id = data_request_id

  res = SubscriptionListFiles(
- **self._get(url=f"/v0/data-requests/{subscription_id}/files/tiff")
+ **self._get(url=f"/v0/subscriptions/{subscription_id}/files/tiff")
  )
  return load_xarray_v2(res)

@@ -197,7 +197,7 @@ class Client:
  subscription_id = data_request_id

  res = SubscriptionParquetFiles(
- **self._get(url=f"/v0/data-requests/{subscription_id}/parquet-files")
+ **self._get(url=f"/v0/subscriptions/{subscription_id}/parquet-files")
  )
  df = pd.concat((pd.read_parquet(f) for f in res.files))
  return df[
@@ -308,12 +308,12 @@ class Client:
  def update_organisation_settings(
  self,
  *,
- monthly_data_request_limit,
+ monthly_subscription_limit,
  ) -> OrganisationSettings:
  res = self._post(
  url="/v0/organisation/settings",
  model=OrganisationSettings(
- monthly_data_request_limit=monthly_data_request_limit,
+ monthly_subscription_limit=monthly_subscription_limit,
  ),
  )
  return OrganisationSettings(**res)
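
The hunks above migrate every subscription call from the legacy `/v0/data-requests` endpoints to `/v0/subscriptions`; the Python method names and signatures are unchanged, so existing callers are unaffected. As a rough usage sketch (the IDs are placeholders, and the `.id` attribute is assumed to behave as in the 0.0.31 data-request examples):

```python
import cecil

client = cecil.Client()  # authenticates via the CECIL_API_KEY environment variable

# Placeholder IDs; real values come from create_aoi() and docs.cecil.earth -> Datasets.
sub = client.create_subscription(aoi_id="<aoi-id>", dataset_id="<dataset-id>")

print(client.get_subscription(sub.id))  # now served by GET /v0/subscriptions/{id}
print(client.list_subscriptions())      # now served by GET /v0/subscriptions
```
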
@@ -1,7 +1,7 @@
  import datetime
  from typing import Dict, Optional, List

- from pydantic import BaseModel, ConfigDict, SecretStr
+ from pydantic import BaseModel, ConfigDict, Field, SecretStr
  from pydantic.alias_generators import to_camel


@@ -49,7 +49,9 @@ class DataRequestCreate(BaseModel):

  class OrganisationSettings(BaseModel):
  model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
- monthly_data_request_limit: Optional[int] = None
+ monthly_subscription_limit: Optional[int] = Field(
+ alias="monthlyDataRequestLimit",
+ )


  class RecoverAPIKey(BaseModel):
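
Note how the rename is kept wire-compatible: the explicit `Field(alias=...)` overrides the `to_camel` alias generator, so `monthly_subscription_limit` still maps to the old `monthlyDataRequestLimit` JSON key. A minimal, self-contained sketch of that pydantic v2 behaviour (model trimmed to the one field):

```python
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field
from pydantic.alias_generators import to_camel


class OrganisationSettings(BaseModel):
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
    # The explicit alias wins over to_camel, preserving the legacy API field name.
    monthly_subscription_limit: Optional[int] = Field(
        alias="monthlyDataRequestLimit",
    )


# populate_by_name=True lets callers use the new Python name...
settings = OrganisationSettings(monthly_subscription_limit=100)
# ...while serialization still emits the old camelCase key the API expects.
assert settings.model_dump(by_alias=True) == {"monthlyDataRequestLimit": 100}
```
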
@@ -0,0 +1 @@
+ __version__ = "0.0.35"
@@ -0,0 +1,193 @@
+ import re
+ import time
+ from datetime import datetime
+
+ import boto3
+ import dask
+ import rasterio
+ import rasterio.session
+ import rioxarray
+ import xarray
+
+ from .models import SubscriptionMetadata, SubscriptionListFiles
+
+ # v1
+
+
+ def load_xarray(metadata: SubscriptionMetadata) -> xarray.Dataset:
+ data_vars = {}
+
+ for f in metadata.files:
+ try:
+ dataset = _retry_with_exponential_backoff(_load_file, 5, 1, 2, f.url)
+ except Exception as e:
+ raise ValueError(f"failed to load file: {e}")
+
+ for b in f.bands:
+ band = dataset.sel(band=b.number, drop=True)
+
+ if b.time and b.time_pattern:
+ t = datetime.strptime(b.time, b.time_pattern)
+ band = band.expand_dims("time")
+ band = band.assign_coords(time=[t])
+
+ band.name = b.variable_name
+
+ if b.variable_name not in data_vars:
+ data_vars[b.variable_name] = []
+
+ data_vars[b.variable_name].append(band)
+
+ for variable_name, time_series in data_vars.items():
+ if "time" in time_series[0].dims:
+ data_vars[variable_name] = xarray.concat(
+ time_series, dim="time", join="exact"
+ )
+ else:
+ data_vars[variable_name] = time_series[0]
+
+ return xarray.Dataset(
+ data_vars=data_vars,
+ attrs={
+ "provider_name": metadata.provider_name,
+ "dataset_name": metadata.dataset_name,
+ "dataset_id": metadata.dataset_id,
+ "aoi_id": metadata.aoi_id,
+ "subscription_id": metadata.data_request_id,
+ },
+ )
+
+
+ def _retry_with_exponential_backoff(
+ func, retries, start_delay, multiplier, *args, **kwargs
+ ):
+ delay = start_delay
+ for attempt in range(1, retries + 1):
+ try:
+ return func(*args, **kwargs)
+ except Exception as e:
+ if attempt == retries:
+ raise e
+ time.sleep(delay)
+ delay *= multiplier
+ return None
+
+
+ def _load_file(url: str):
+ return rioxarray.open_rasterio(
+ url,
+ chunks={"x": 2000, "y": 2000},
+ )
+
+
+ # v2
+
+
+ def load_xarray_v2(res: SubscriptionListFiles) -> xarray.Dataset:
+ session = boto3.session.Session(
+ aws_access_key_id=res.credentials.access_key_id,
+ aws_secret_access_key=res.credentials.secret_access_key,
+ aws_session_token=res.credentials.session_token,
+ )
+
+ keys = _list_keys_v2(session, res.bucket.name, res.bucket.prefix)
+
+ if not keys:
+ return xarray.Dataset()
+
+ timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
+ data_vars = {}
+
+ with rasterio.env.Env(
+ session=rasterio.session.AWSSession(session),
+ ):
+ first_file = rioxarray.open_rasterio(
+ f"s3://{res.bucket.name}/{keys[0]}", chunks="auto"
+ )
+
+ for key in keys:
+ filename = key.split("/")[-1]
+
+ file_info = res.file_mapping.get(filename)
+ if not file_info:
+ continue
+
+ timestamp_str = timestamp_pattern.search(key).group()
+
+ for band_num, var_name in enumerate(file_info.bands, start=1):
+ lazy_array = dask.array.from_delayed(
+ dask.delayed(_load_file_v2)(
+ session, f"s3://{res.bucket.name}/{key}", band_num
+ ),
+ shape=(
+ first_file.rio.height,
+ first_file.rio.width,
+ ),
+ dtype=file_info.type,
+ )
+ band_da = xarray.DataArray(
+ lazy_array,
+ dims=("y", "x"),
+ coords={
+ "y": first_file.y.values,
+ "x": first_file.x.values,
+ },
+ # attrs=first_file.attrs.copy() # TODO: is it the same for all files?
+ )
+ # band_da.encoding = first_file.encoding.copy() # TODO: is it the same for all files?
+ band_da.rio.write_crs(first_file.rio.crs, inplace=True)
+ band_da.rio.write_transform(first_file.rio.transform(), inplace=True)
+
+ band_da.name = var_name
+
+ # Dataset with time dimension
+ if timestamp_str != "0000/00/00/00/00/00":
+ t = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
+ band_da = band_da.expand_dims("time")
+ band_da = band_da.assign_coords(time=[t])
+
+ if var_name not in data_vars:
+ data_vars[var_name] = []
+
+ data_vars[var_name].append(band_da)
+
+ for var_name, time_series in data_vars.items():
+ if "time" in time_series[0].dims:
+ data_vars[var_name] = xarray.concat(time_series, dim="time", join="exact")
+ else:
+ data_vars[var_name] = time_series[0]
+
+ return xarray.Dataset(
+ data_vars=data_vars,
+ attrs={
+ "provider_name": res.provider_name,
+ "dataset_name": res.dataset_name,
+ "dataset_id": res.dataset_id,
+ "aoi_id": res.aoi_id,
+ "subscription_id": res.data_request_id,
+ },
+ )
+
+
+ def _load_file_v2(aws_session: boto3.session.Session, url: str, band_num: int):
+ with rasterio.env.Env(
+ session=rasterio.session.AWSSession(aws_session),
+ ):
+ with rasterio.open(url) as src:
+ return src.read(band_num)
+
+
+ def _list_keys_v2(session: boto3.session.Session, bucket_name, prefix) -> list[str]:
+ s3_client = session.client("s3")
+ paginator = s3_client.get_paginator("list_objects_v2")
+ page_iterator = paginator.paginate(
+ Bucket=bucket_name,
+ Prefix=prefix,
+ )
+
+ keys = []
+ for page in page_iterator:
+ for obj in page.get("Contents", []):
+ keys.append(obj["Key"])
+
+ return keys
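
Compared with 0.0.31, this v2 loader no longer opens every GeoTIFF eagerly: it opens only the first file for shape, coordinates, CRS, and transform, then wraps each band read in `dask.delayed` so pixels are fetched from S3 only when the result is computed. The core pattern, reduced to a runnable sketch with a dummy reader standing in for the rasterio/S3 call:

```python
import dask
import dask.array as da
import numpy as np
import xarray as xr


def read_band(url: str, band_num: int) -> np.ndarray:
    # Stand-in for _load_file_v2's rasterio.open(url) + src.read(band_num);
    # nothing runs until the dask graph is computed.
    return np.zeros((4, 5), dtype="float32")


lazy = da.from_delayed(
    dask.delayed(read_band)("s3://bucket/prefix/file.tif", 1),  # hypothetical key
    shape=(4, 5),     # known up front from the first file's metadata
    dtype="float32",  # known up front from the file mapping
)
band = xr.DataArray(lazy, dims=("y", "x"))
print(band.compute())  # only now is the band actually read
```

The trade-off is one deferred read per band per file, in exchange for a dataset that assembles immediately without touching S3.
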
@@ -1,21 +0,0 @@
- ## Development installation
-
- Install packaging/distribution tools:
-
- ```shell
- pip install hatch twine
- ```
-
- Install linter
-
- ```shell
- pip install black
- ```
-
- From top-level repo directory, install the package in editable mode:
-
- ```shell
- pip install -e .
- ```
-
- Local edits to the package will immediately take effect.
cecil-0.0.31/PKG-INFO DELETED
@@ -1,122 +0,0 @@
- Metadata-Version: 2.4
- Name: cecil
- Version: 0.0.31
- Summary: Python SDK for Cecil Earth
- License-Expression: MIT
- License-File: LICENSE.txt
- Classifier: Development Status :: 4 - Beta
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.10
- Requires-Dist: dask==2025.9.1
- Requires-Dist: pydantic<3.0.0,>=2.11.9
- Requires-Dist: requests<3.0.0,>=2.32.5
- Requires-Dist: rioxarray==0.19.0
- Requires-Dist: snowflake-connector-python[pandas]<4.0.0,>=3.17.4
- Requires-Dist: xarray==2025.6.1
- Description-Content-Type: text/markdown
-
- # Cecil SDK
-
- [![PyPI - Version](https://img.shields.io/pypi/v/cecil-sdk.svg)](https://pypi.org/project/cecil-sdk)
- [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cecil-sdk.svg)](https://pypi.org/project/cecil-sdk)
-
- -----
-
- ## Table of Contents
-
- - [Installation](#installation)
- - [Authentication](#authentication)
- - [License](#license)
- - [Examples](#examples)
-
- ## Installation
-
- ```shell
- pip install cecil
- ```
-
- ## Authentication
-
- Set `CECIL_API_KEY` environment variable to your Cecil API key.
-
- ## Examples
-
- ### Create an AOI and data request using the Cecil client
-
- ```python
- import cecil
-
- client = cecil.Client()
-
- my_aoi = client.create_aoi(
- name="My AOI",
- geometry={
- "type": "Polygon",
- "coordinates": [
- [
- [145.410408835, -42.004083838],
- [145.410408835, -42.004203978],
- [145.410623191, -42.004203978],
- [145.410623191, -42.004083838],
- [145.410408835, -42.004083838],
- ]
- ],
- },
- )
-
- # Get dataset ID from docs.cecil.earth -> Datasets
- planet_forest_carbon_diligence_id = "c2dd4f55-56f6-4d05-aae3-ba7c1dcd812f"
-
- my_data_request = client.create_data_request(
- aoi_id=my_aoi.id,
- dataset_id=planet_forest_carbon_diligence_id,
- )
-
- print(client.get_data_request(my_data_request.id))
- ```
-
- ### Create a transformation using the Cecil client
-
- ```python
- my_transformation = client.create_transformation(
- data_request_id=my_data_request.id,
- crs="EPSG:4326",
- spatial_resolution=0.005,
- )
-
- print(client.get_transformation(my_transformation.id))
- ```
-
- ### Query data (once transformation is completed)
-
- ```python
- df = client.query(f'''
- SELECT *
- FROM
- planet.forest_carbon_diligence
- WHERE
- transformation_id = '{my_transformation.id}'
- ''')
- ```
-
- ### Other client methods:
-
- ```python
- client.list_aois()
-
- client.get_aoi(my_aoi.id)
-
- client.list_data_requests()
-
- client.get_data_request(my_data_request.id)
-
- client.list_transformations()
-
- client.get_transformation(my_transformation.id)
- ```
-
- ## License
-
- `cecil` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
cecil-0.0.31/README.md DELETED
@@ -1,103 +0,0 @@
- # Cecil SDK
-
- [![PyPI - Version](https://img.shields.io/pypi/v/cecil-sdk.svg)](https://pypi.org/project/cecil-sdk)
- [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cecil-sdk.svg)](https://pypi.org/project/cecil-sdk)
-
- -----
-
- ## Table of Contents
-
- - [Installation](#installation)
- - [Authentication](#authentication)
- - [License](#license)
- - [Examples](#examples)
-
- ## Installation
-
- ```shell
- pip install cecil
- ```
-
- ## Authentication
-
- Set `CECIL_API_KEY` environment variable to your Cecil API key.
-
- ## Examples
-
- ### Create an AOI and data request using the Cecil client
-
- ```python
- import cecil
-
- client = cecil.Client()
-
- my_aoi = client.create_aoi(
- name="My AOI",
- geometry={
- "type": "Polygon",
- "coordinates": [
- [
- [145.410408835, -42.004083838],
- [145.410408835, -42.004203978],
- [145.410623191, -42.004203978],
- [145.410623191, -42.004083838],
- [145.410408835, -42.004083838],
- ]
- ],
- },
- )
-
- # Get dataset ID from docs.cecil.earth -> Datasets
- planet_forest_carbon_diligence_id = "c2dd4f55-56f6-4d05-aae3-ba7c1dcd812f"
-
- my_data_request = client.create_data_request(
- aoi_id=my_aoi.id,
- dataset_id=planet_forest_carbon_diligence_id,
- )
-
- print(client.get_data_request(my_data_request.id))
- ```
-
- ### Create a transformation using the Cecil client
-
- ```python
- my_transformation = client.create_transformation(
- data_request_id=my_data_request.id,
- crs="EPSG:4326",
- spatial_resolution=0.005,
- )
-
- print(client.get_transformation(my_transformation.id))
- ```
-
- ### Query data (once transformation is completed)
-
- ```python
- df = client.query(f'''
- SELECT *
- FROM
- planet.forest_carbon_diligence
- WHERE
- transformation_id = '{my_transformation.id}'
- ''')
- ```
-
- ### Other client methods:
-
- ```python
- client.list_aois()
-
- client.get_aoi(my_aoi.id)
-
- client.list_data_requests()
-
- client.get_data_request(my_data_request.id)
-
- client.list_transformations()
-
- client.get_transformation(my_transformation.id)
- ```
-
- ## License
-
- `cecil` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
@@ -1 +0,0 @@
- __version__ = "0.0.31"
@@ -1,415 +0,0 @@
- import re
- import time
- from datetime import datetime
-
- import boto3
- import dask
- import rasterio
- import rasterio.session
- import rioxarray
- import xarray
-
- from .errors import Error
- from .models import SubscriptionMetadata, SubscriptionListFiles
-
- # v1
-
-
- def load_xarray(metadata: SubscriptionMetadata) -> xarray.Dataset:
- data_vars = {}
-
- for f in metadata.files:
- try:
- dataset = _retry_with_exponential_backoff(_load_file, 5, 1, 2, f.url)
- except Exception as e:
- raise ValueError(f"failed to load file: {e}")
-
- for b in f.bands:
- band = dataset.sel(band=b.number, drop=True)
-
- if b.time and b.time_pattern:
- t = datetime.strptime(b.time, b.time_pattern)
- band = band.expand_dims("time")
- band = band.assign_coords(time=[t])
-
- band.name = b.variable_name
-
- if b.variable_name not in data_vars:
- data_vars[b.variable_name] = []
-
- data_vars[b.variable_name].append(band)
-
- for variable_name, time_series in data_vars.items():
- if "time" in time_series[0].dims:
- data_vars[variable_name] = xarray.concat(
- time_series, dim="time", join="exact"
- )
- else:
- data_vars[variable_name] = time_series[0]
-
- return xarray.Dataset(
- data_vars=data_vars,
- attrs={
- "provider_name": metadata.provider_name,
- "dataset_name": metadata.dataset_name,
- "dataset_id": metadata.dataset_id,
- "aoi_id": metadata.aoi_id,
- "subscription_id": metadata.data_request_id,
- },
- )
-
-
- def _retry_with_exponential_backoff(
- func, retries, start_delay, multiplier, *args, **kwargs
- ):
- delay = start_delay
- for attempt in range(1, retries + 1):
- try:
- return func(*args, **kwargs)
- except Exception as e:
- if attempt == retries:
- raise e
- time.sleep(delay)
- delay *= multiplier
- return None
-
-
- def _load_file(url: str):
- return rioxarray.open_rasterio(
- url,
- chunks={"x": 2000, "y": 2000},
- )
-
-
- # v2
-
-
- def load_xarray_v2(res: SubscriptionListFiles) -> xarray.Dataset:
- session = boto3.session.Session(
- aws_access_key_id=res.credentials.access_key_id,
- aws_secret_access_key=res.credentials.secret_access_key,
- aws_session_token=res.credentials.session_token,
- )
-
- keys = _list_keys_v2(session, res.bucket.name, res.bucket.prefix)
-
- if not keys:
- return xarray.Dataset()
-
- timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
- data_vars = {}
-
- for key in keys:
- try:
- file_da = _retry_with_exponential_backoff(
- _load_file_v2,
- 5,
- 1,
- 2,
- session,
- f"s3://{res.bucket.name}/{key}",
- )
- except Exception as e:
- raise ValueError(f"failed to load file: {e}")
-
- filename = key.split("/")[-1]
-
- file_info = res.file_mapping.get(filename)
- if not file_info:
- continue
-
- timestamp_str = timestamp_pattern.search(key).group()
-
- for band_num, var_name in enumerate(file_info.bands, start=1):
- band_da = file_da.sel(band=band_num, drop=True)
- band_da.name = var_name
-
- # Dataset with time dimension
- if timestamp_str != "0000/00/00/00/00/00":
- t = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
- band_da = band_da.expand_dims("time")
- band_da = band_da.assign_coords(time=[t])
-
- if var_name not in data_vars:
- data_vars[var_name] = []
-
- data_vars[var_name].append(band_da)
-
- for var_name, time_series in data_vars.items():
- if "time" in time_series[0].dims:
- data_vars[var_name] = xarray.concat(time_series, dim="time", join="exact")
- else:
- data_vars[var_name] = time_series[0]
-
- return xarray.Dataset(
- data_vars=data_vars,
- attrs={
- "provider_name": res.provider_name,
- "dataset_name": res.dataset_name,
- "dataset_id": res.dataset_id,
- "aoi_id": res.aoi_id,
- "subscription_id": res.data_request_id,
- },
- )
-
-
- def _list_keys_v2(session: boto3.session.Session, bucket_name, prefix) -> list[str]:
- s3_client = session.client("s3")
- paginator = s3_client.get_paginator("list_objects_v2")
- page_iterator = paginator.paginate(
- Bucket=bucket_name,
- Prefix=prefix,
- )
-
- keys = []
- for page in page_iterator:
- for obj in page.get("Contents", []):
- keys.append(obj["Key"])
-
- return keys
-
-
- def _load_file_v2(aws_session: boto3.session.Session, url: str):
- with rasterio.env.Env(
- session=rasterio.session.AWSSession(aws_session),
- GDAL_DISABLE_READDIR_ON_OPEN=True,
- ):
- return rioxarray.open_rasterio(
- url,
- chunks={"x": 2000, "y": 2000},
- )
-
-
- # v3
-
-
- def load_xarray_v3(res: SubscriptionListFiles) -> xarray.Dataset:
- session = boto3.session.Session(
- aws_access_key_id=res.credentials.access_key_id,
- aws_secret_access_key=res.credentials.secret_access_key,
- aws_session_token=res.credentials.session_token,
- )
-
- keys = _list_keys_v3(session, res.bucket.name, res.bucket.prefix)
-
- if not keys:
- return xarray.Dataset()
-
- timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
- data_vars = {}
-
- with rasterio.env.Env(
- session=rasterio.session.AWSSession(session),
- GDAL_DISABLE_READDIR_ON_OPEN=True,
- ):
- first_file = rioxarray.open_rasterio(
- f"s3://{res.bucket.name}/{keys[0]}", chunks="auto"
- )
-
- for key in keys:
- filename = key.split("/")[-1]
-
- file_info = res.file_mapping.get(filename)
- if not file_info:
- continue
-
- lazy_array = dask.array.from_delayed(
- dask.delayed(_load_file_v3)(session, f"s3://{res.bucket.name}/{key}"),
- shape=first_file.shape,
- dtype=file_info.type,
- )
- lazy_da = xarray.DataArray(
- lazy_array,
- dims=first_file.dims,
- coords=dict(first_file.coords),
- # attrs=first_file.attrs.copy() # TODO: not the same for all files
- )
- # lazy_da.encoding = first_file.encoding.copy()
- # lazy_da.rio.write_crs(first_file.rio.crs, inplace=True)
- # lazy_da.rio.write_transform(first_file.rio.transform(), inplace=True)
-
- timestamp_str = timestamp_pattern.search(key).group()
-
- for band_num, var_name in enumerate(file_info.bands, start=1):
- band_da = lazy_da.sel(band=band_num, drop=True)
- band_da.name = var_name
-
- # Dataset with time dimension
- if timestamp_str != "0000/00/00/00/00/00":
- t = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
- band_da = band_da.expand_dims("time")
- band_da = band_da.assign_coords(time=[t])
-
- if var_name not in data_vars:
- data_vars[var_name] = []
-
- data_vars[var_name].append(band_da)
-
- for var_name, time_series in data_vars.items():
- if "time" in time_series[0].dims:
- data_vars[var_name] = xarray.concat(time_series, dim="time", join="exact")
- else:
- data_vars[var_name] = time_series[0]
-
- return xarray.Dataset(
- data_vars=data_vars,
- attrs={
- "provider_name": res.provider_name,
- "dataset_name": res.dataset_name,
- "dataset_id": res.dataset_id,
- "aoi_id": res.aoi_id,
- "subscription_id": res.data_request_id,
- },
- )
-
-
- def _load_file_v3(aws_session: boto3.session.Session, url: str):
- with rasterio.env.Env(
- session=rasterio.session.AWSSession(aws_session),
- GDAL_DISABLE_READDIR_ON_OPEN=True,
- ):
- return rioxarray.open_rasterio(
- url,
- chunks="auto",
- ).values
- # ).sel(band=num_band, drop=True)
- # ).sel(band=num_band, drop=True).values
- # ).isel(band=num_band-1).values
-
-
- def _list_keys_v3(session: boto3.session.Session, bucket_name, prefix) -> list[str]:
- s3_client = session.client("s3")
- paginator = s3_client.get_paginator("list_objects_v2")
- page_iterator = paginator.paginate(
- Bucket=bucket_name,
- Prefix=prefix,
- )
-
- keys = []
- for page in page_iterator:
- for obj in page.get("Contents", []):
- keys.append(obj["Key"])
-
- return keys
-
-
- # v4
-
-
- def load_xarray_v4(res: SubscriptionListFiles) -> xarray.Dataset:
-
- session = boto3.session.Session(
- aws_access_key_id=res.credentials.access_key_id,
- aws_secret_access_key=res.credentials.secret_access_key,
- aws_session_token=res.credentials.session_token,
- )
-
- keys = _list_keys_v2(session, res.bucket.name, res.bucket.prefix)
-
- if not keys:
- return xarray.Dataset()
-
- first_file_metadata = _get_file_metadata_v4(session, res.bucket.name, keys[0])
-
- timestamp_pattern = re.compile(r"\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}")
-
- data_vars = {}
- for key in keys:
- filename = key.split("/")[-1].rsplit(".", 1)[0]
-
- file_info = res.file_mapping.get(filename)
- if not file_info:
- continue
-
- timestamp_str = timestamp_pattern.search(key).group()
-
- for band_num, band_name in enumerate(file_info.bands, start=1):
- array = _create_dask_array_v4(
- session,
- f"s3://{res.bucket.name}/{key}",
- band_num,
- first_file_metadata["height"],
- first_file_metadata["width"],
- file_info.type,
- )
- da = xarray.DataArray(
- array,
- dims=("y", "x"),
- )
- da.name = band_name
-
- # Dataset with time dimension
- if timestamp_str != "0000/00/00/00/00/00":
- time = datetime.strptime(timestamp_str, "%Y/%m/%d/%H/%M/%S")
- da = da.expand_dims("time")
- da = da.assign_coords(time=[time])
-
- if band_name not in data_vars:
- data_vars[band_name] = []
-
- data_vars[band_name].append(da)
-
- for variable_name, time_series in data_vars.items():
- if "time" in time_series[0].dims:
- data_vars[variable_name] = xarray.concat(
- time_series,
- dim="time",
- join="exact",
- )
- else:
- data_vars[variable_name] = time_series[0]
-
- ds = xarray.Dataset(
- data_vars=data_vars,
- coords={
- "y": first_file_metadata["y"],
- "x": first_file_metadata["x"],
- },
- attrs={
- "provider_name": res.provider_name,
- "dataset_name": res.dataset_name,
- "dataset_id": res.dataset_id,
- "aoi_id": res.aoi_id,
- "subscription_id": res.data_request_id,
- },
- )
- ds = ds.rio.write_crs(first_file_metadata["crs"])
-
- return ds
-
-
- def _get_file_metadata_v4(session, bucket: str, path: str):
- with rasterio.env.Env(
- rasterio.session.AWSSession(session), GDAL_DISABLE_READDIR_ON_OPEN=True
- ):
- da = xarray.open_dataarray(f"s3://{bucket}/{path}", engine="rasterio")
-
- return {
- "crs": da.rio.crs,
- "height": da.rio.height,
- "width": da.rio.width,
- "x": da.x.values,
- "y": da.y.values,
- }
-
-
- def _create_dask_array_v4(
- session: boto3.session.Session,
- file_path: str,
- band_num: int,
- height: int,
- width: int,
- dtype: str,
- ):
- rasterio_session = rasterio.session.AWSSession(session)
-
- def read_chunk():
- with rasterio.env.Env(
- session=rasterio_session, GDAL_DISABLE_READDIR_ON_OPEN=True
- ):
- with rasterio.open(file_path) as src:
- return src.read(band_num)
-
- return dask.array.from_delayed(
- dask.delayed(read_chunk)(), shape=(height, width), dtype=dtype
- )