rslearn 0.0.1__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rslearn/arg_parser.py +31 -0
- rslearn/config/__init__.py +6 -12
- rslearn/config/dataset.py +520 -401
- rslearn/const.py +9 -15
- rslearn/data_sources/__init__.py +8 -23
- rslearn/data_sources/aws_landsat.py +242 -98
- rslearn/data_sources/aws_open_data.py +111 -151
- rslearn/data_sources/aws_sentinel1.py +131 -0
- rslearn/data_sources/climate_data_store.py +471 -0
- rslearn/data_sources/copernicus.py +884 -12
- rslearn/data_sources/data_source.py +43 -12
- rslearn/data_sources/earthdaily.py +484 -0
- rslearn/data_sources/earthdata_srtm.py +282 -0
- rslearn/data_sources/eurocrops.py +242 -0
- rslearn/data_sources/gcp_public_data.py +578 -222
- rslearn/data_sources/google_earth_engine.py +461 -135
- rslearn/data_sources/local_files.py +219 -150
- rslearn/data_sources/openstreetmap.py +51 -89
- rslearn/data_sources/planet.py +24 -60
- rslearn/data_sources/planet_basemap.py +275 -0
- rslearn/data_sources/planetary_computer.py +798 -0
- rslearn/data_sources/usda_cdl.py +195 -0
- rslearn/data_sources/usgs_landsat.py +115 -83
- rslearn/data_sources/utils.py +249 -61
- rslearn/data_sources/vector_source.py +1 -0
- rslearn/data_sources/worldcereal.py +449 -0
- rslearn/data_sources/worldcover.py +144 -0
- rslearn/data_sources/worldpop.py +153 -0
- rslearn/data_sources/xyz_tiles.py +150 -107
- rslearn/dataset/__init__.py +8 -2
- rslearn/dataset/add_windows.py +2 -2
- rslearn/dataset/dataset.py +40 -51
- rslearn/dataset/handler_summaries.py +131 -0
- rslearn/dataset/manage.py +313 -74
- rslearn/dataset/materialize.py +431 -107
- rslearn/dataset/remap.py +29 -4
- rslearn/dataset/storage/__init__.py +1 -0
- rslearn/dataset/storage/file.py +202 -0
- rslearn/dataset/storage/storage.py +140 -0
- rslearn/dataset/window.py +181 -44
- rslearn/lightning_cli.py +454 -0
- rslearn/log_utils.py +24 -0
- rslearn/main.py +384 -181
- rslearn/models/anysat.py +215 -0
- rslearn/models/attention_pooling.py +177 -0
- rslearn/models/clay/clay.py +231 -0
- rslearn/models/clay/configs/metadata.yaml +295 -0
- rslearn/models/clip.py +68 -0
- rslearn/models/component.py +111 -0
- rslearn/models/concatenate_features.py +103 -0
- rslearn/models/conv.py +63 -0
- rslearn/models/croma.py +306 -0
- rslearn/models/detr/__init__.py +5 -0
- rslearn/models/detr/box_ops.py +103 -0
- rslearn/models/detr/detr.py +504 -0
- rslearn/models/detr/matcher.py +107 -0
- rslearn/models/detr/position_encoding.py +114 -0
- rslearn/models/detr/transformer.py +429 -0
- rslearn/models/detr/util.py +24 -0
- rslearn/models/dinov3.py +177 -0
- rslearn/models/faster_rcnn.py +30 -28
- rslearn/models/feature_center_crop.py +53 -0
- rslearn/models/fpn.py +19 -8
- rslearn/models/galileo/__init__.py +5 -0
- rslearn/models/galileo/galileo.py +595 -0
- rslearn/models/galileo/single_file_galileo.py +1678 -0
- rslearn/models/module_wrapper.py +65 -0
- rslearn/models/molmo.py +69 -0
- rslearn/models/multitask.py +384 -28
- rslearn/models/olmoearth_pretrain/__init__.py +1 -0
- rslearn/models/olmoearth_pretrain/model.py +421 -0
- rslearn/models/olmoearth_pretrain/norm.py +86 -0
- rslearn/models/panopticon.py +170 -0
- rslearn/models/panopticon_data/sensors/drone.yaml +32 -0
- rslearn/models/panopticon_data/sensors/enmap.yaml +904 -0
- rslearn/models/panopticon_data/sensors/goes.yaml +9 -0
- rslearn/models/panopticon_data/sensors/himawari.yaml +9 -0
- rslearn/models/panopticon_data/sensors/intuition.yaml +606 -0
- rslearn/models/panopticon_data/sensors/landsat8.yaml +84 -0
- rslearn/models/panopticon_data/sensors/modis_terra.yaml +99 -0
- rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +34 -0
- rslearn/models/panopticon_data/sensors/sentinel1.yaml +85 -0
- rslearn/models/panopticon_data/sensors/sentinel2.yaml +97 -0
- rslearn/models/panopticon_data/sensors/superdove.yaml +60 -0
- rslearn/models/panopticon_data/sensors/wv23.yaml +63 -0
- rslearn/models/pick_features.py +17 -10
- rslearn/models/pooling_decoder.py +60 -7
- rslearn/models/presto/__init__.py +5 -0
- rslearn/models/presto/presto.py +297 -0
- rslearn/models/presto/single_file_presto.py +926 -0
- rslearn/models/prithvi.py +1147 -0
- rslearn/models/resize_features.py +59 -0
- rslearn/models/sam2_enc.py +13 -9
- rslearn/models/satlaspretrain.py +38 -18
- rslearn/models/simple_time_series.py +188 -77
- rslearn/models/singletask.py +24 -13
- rslearn/models/ssl4eo_s12.py +40 -30
- rslearn/models/swin.py +44 -32
- rslearn/models/task_embedding.py +250 -0
- rslearn/models/terramind.py +256 -0
- rslearn/models/trunk.py +139 -0
- rslearn/models/unet.py +68 -22
- rslearn/models/upsample.py +48 -0
- rslearn/models/use_croma.py +508 -0
- rslearn/template_params.py +26 -0
- rslearn/tile_stores/__init__.py +41 -18
- rslearn/tile_stores/default.py +409 -0
- rslearn/tile_stores/tile_store.py +236 -132
- rslearn/train/all_patches_dataset.py +530 -0
- rslearn/train/callbacks/adapters.py +53 -0
- rslearn/train/callbacks/freeze_unfreeze.py +348 -17
- rslearn/train/callbacks/gradients.py +129 -0
- rslearn/train/callbacks/peft.py +116 -0
- rslearn/train/data_module.py +444 -20
- rslearn/train/dataset.py +588 -235
- rslearn/train/lightning_module.py +192 -62
- rslearn/train/model_context.py +88 -0
- rslearn/train/optimizer.py +31 -0
- rslearn/train/prediction_writer.py +319 -84
- rslearn/train/scheduler.py +92 -0
- rslearn/train/tasks/classification.py +55 -28
- rslearn/train/tasks/detection.py +132 -76
- rslearn/train/tasks/embedding.py +120 -0
- rslearn/train/tasks/multi_task.py +28 -14
- rslearn/train/tasks/per_pixel_regression.py +291 -0
- rslearn/train/tasks/regression.py +161 -44
- rslearn/train/tasks/segmentation.py +428 -53
- rslearn/train/tasks/task.py +6 -5
- rslearn/train/transforms/__init__.py +1 -1
- rslearn/train/transforms/concatenate.py +54 -10
- rslearn/train/transforms/crop.py +29 -11
- rslearn/train/transforms/flip.py +18 -6
- rslearn/train/transforms/mask.py +78 -0
- rslearn/train/transforms/normalize.py +101 -17
- rslearn/train/transforms/pad.py +19 -7
- rslearn/train/transforms/resize.py +83 -0
- rslearn/train/transforms/select_bands.py +76 -0
- rslearn/train/transforms/sentinel1.py +75 -0
- rslearn/train/transforms/transform.py +89 -70
- rslearn/utils/__init__.py +2 -6
- rslearn/utils/array.py +8 -6
- rslearn/utils/feature.py +2 -2
- rslearn/utils/fsspec.py +90 -1
- rslearn/utils/geometry.py +347 -7
- rslearn/utils/get_utm_ups_crs.py +2 -3
- rslearn/utils/grid_index.py +5 -5
- rslearn/utils/jsonargparse.py +178 -0
- rslearn/utils/mp.py +4 -3
- rslearn/utils/raster_format.py +268 -116
- rslearn/utils/rtree_index.py +64 -17
- rslearn/utils/sqlite_index.py +7 -1
- rslearn/utils/vector_format.py +252 -97
- {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/METADATA +532 -283
- rslearn-0.0.21.dist-info/RECORD +167 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/WHEEL +1 -1
- rslearn-0.0.21.dist-info/licenses/NOTICE +115 -0
- rslearn/data_sources/raster_source.py +0 -309
- rslearn/models/registry.py +0 -5
- rslearn/tile_stores/file.py +0 -242
- rslearn/utils/mgrs.py +0 -24
- rslearn/utils/utils.py +0 -22
- rslearn-0.0.1.dist-info/RECORD +0 -88
- /rslearn/{data_sources/geotiff.py → py.typed} +0 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/entry_points.txt +0 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info/licenses}/LICENSE +0 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/top_level.txt +0 -0
rslearn/data_sources/gcp_public_data.py

@@ -1,34 +1,36 @@
 """Data source for raster data on public Cloud Storage buckets."""
 
-import csv
-import gzip
 import io
 import json
+import os
+import random
 import tempfile
 import xml.etree.ElementTree as ET
 from collections.abc import Generator
-from
+from dataclasses import dataclass
+from datetime import datetime
 from typing import Any, BinaryIO
 
 import dateutil.parser
-import pytimeparse
 import rasterio
 import shapely
 import tqdm
-from google.cloud import storage
+from google.cloud import bigquery, storage
 from upath import UPath
 
-
-from rslearn.config import LayerConfig, QueryConfig, RasterLayerConfig
+from rslearn.config import QueryConfig
 from rslearn.const import WGS84_PROJECTION
-from rslearn.data_sources import DataSource, Item
+from rslearn.data_sources import DataSource, DataSourceContext, Item
 from rslearn.data_sources.utils import match_candidate_items_to_window
-from rslearn.
-from rslearn.
+from rslearn.log_utils import get_logger
+from rslearn.tile_stores import TileStoreWithLayer
 from rslearn.utils.fsspec import join_upath, open_atomic
+from rslearn.utils.geometry import STGeometry, flatten_shape, split_at_antimeridian
+from rslearn.utils.raster_format import get_raster_projection_and_bounds
 
-from .copernicus import get_harmonize_callback
-
+from .copernicus import get_harmonize_callback, get_sentinel2_tiles
+
+logger = get_logger(__name__)
 
 
 class Sentinel2Item(Item):
@@ -57,7 +59,7 @@ class Sentinel2Item(Item):
         return d
 
     @staticmethod
-    def deserialize(d: dict[str, Any]) ->
+    def deserialize(d: dict[str, Any]) -> "Sentinel2Item":
         """Deserializes an item from a JSON-decoded dictionary."""
         item = super(Sentinel2Item, Sentinel2Item).deserialize(d)
         return Sentinel2Item(
@@ -68,6 +70,45 @@ class Sentinel2Item(Item):
         )
 
 
+class CorruptItemException(Exception):
+    """A Sentinel-2 scene is corrupted or otherwise unreadable for a known reason."""
+
+    def __init__(self, message: str) -> None:
+        """Create a new CorruptItemException.
+
+        Args:
+            message: error message.
+        """
+        self.message = message
+
+
+class MissingXMLException(Exception):
+    """Exception for when an item's XML file does not exist in GCS.
+
+    Some items that appear in the index on BigQuery, or that have a folder, lack an XML
+    file, and so in those cases this exception can be ignored.
+    """
+
+    def __init__(self, item_name: str):
+        """Create a new MissingXMLException.
+
+        Args:
+            item_name: the name of the item (Sentinel-2 scene) that is missing its XML
+                file in the GCS bucket.
+        """
+        self.item_name = item_name
+
+
+@dataclass
+class ParsedProductXML:
+    """Result of parsing a Sentinel-2 product XML file."""
+
+    blob_prefix: str
+    shp: shapely.Polygon
+    start_time: datetime
+    cloud_cover: float
+
+
 class Sentinel2(DataSource):
     """A data source for Sentinel-2 data on Google Cloud Storage.
 
@@ -80,11 +121,12 @@ class Sentinel2(DataSource):
     The bucket is public and free so no credentials are needed.
     """
 
-
+    BUCKET_NAME = "gcp-public-data-sentinel-2"
 
-
+    # Name of BigQuery table containing index of Sentinel-2 scenes in the bucket.
+    TABLE_NAME = "bigquery-public-data.cloud_storage_geo_index.sentinel_2_index"
 
-
+    BANDS = [
         ("B01.jp2", ["B01"]),
         ("B02.jp2", ["B02"]),
         ("B03.jp2", ["B03"]),
@@ -101,144 +143,270 @@ class Sentinel2(DataSource):
         ("TCI.jp2", ["R", "G", "B"]),
     ]
 
+    # Possible prefixes of the product name that may appear on GCS, before the year
+    # appears in the product name. For example, a product may start with
+    # "S2A_MSIL1C_20230101..." so S2A_MSIL1C appears here. This list is used when
+    # enumerating the list of products on GCS that fall in a certain year: because the
+    # year comes after this prefix, filtering in the object list operation requires
+    # including this prefix first followed by the year.
+    VALID_PRODUCT_PREFIXES = ["S2A_MSIL1C", "S2B_MSIL1C", "S2C_MSIL1C"]
+
+    # The name of the L1C product metadata XML file.
+    METADATA_FILENAME = "MTD_MSIL1C.xml"
+
     def __init__(
         self,
-
-        index_cache_dir: UPath,
-        max_time_delta: timedelta = timedelta(days=30),
+        index_cache_dir: str,
         sort_by: str | None = None,
         use_rtree_index: bool = True,
        harmonize: bool = False,
         rtree_time_range: tuple[datetime, datetime] | None = None,
+        rtree_cache_dir: str | None = None,
+        use_bigquery: bool | None = None,
+        bands: list[str] | None = None,
+        context: DataSourceContext = DataSourceContext(),
     ):
         """Initialize a new Sentinel2 instance.
 
         Args:
-
-
-                well as individual product metadata files. Defaults to None in which
-                case products are looked up from the cloud storage directly.
-            max_time_delta: maximum time before a query start time or after a
-                query end time to look for products. This is required due to the large
-                number of available products, and defaults to 30 days.
+            index_cache_dir: local directory to cache the index contents, as well as
+                individual product metadata files.
             sort_by: can be "cloud_cover", default arbitrary order; only has effect for
                 SpaceMode.WITHIN.
             use_rtree_index: whether to create an rtree index to enable faster lookups
-                (default true)
+                (default true). rtree will take several hours if it is not restricted
+                to a short time range using rtree_time_range.
            harmonize: harmonize pixel values across different processing baselines,
                 see https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR_HARMONIZED
             rtree_time_range: only populate the rtree index with scenes within this
-                time range
+                time range. Restricting to a few months significantly speeds up rtree
+                creation time.
+            rtree_cache_dir: by default, if use_rtree_index is enabled, the rtree is
+                stored in index_cache_dir (where product XML files are also stored). If
+                rtree_cache_dir is set, then the rtree is stored here instead (so
+                index_cache_dir is only used to cache product XML files).
+            use_bigquery: whether to use the BigQuery index over the scenes in the
+                bucket. This must be enabled if use_rtree_index is enabled, since we
+                only support populating the rtree index from BigQuery. Note that
+                BigQuery requires GCP credentials to be setup; to avoid the need for
+                credentials, set use_bigquery=False and use_rtree_index=False. The
+                default value is None which enables BigQuery when use_rtree_index=True
+                and disables when use_rtree_index=False.
+            bands: the bands to download, or None to download all bands. This is only
+                used if the layer config is not in the context.
+            context: the data source context.
         """
-
-
-
+        if use_bigquery is None:
+            use_bigquery = use_rtree_index
+        if not use_bigquery and use_rtree_index:
+            raise ValueError(
+                "use_bigquery must be enabled if use_rtree_index is enabled"
+            )
+
+        # Resolve index_cache_dir and rtree_cache_dir depending on dataset context.
+        if context.ds_path is not None:
+            self.index_cache_dir = join_upath(context.ds_path, index_cache_dir)
+        else:
+            self.index_cache_dir = UPath(index_cache_dir)
+
+        if rtree_cache_dir is None:
+            self.rtree_cache_dir = self.index_cache_dir
+        elif context.ds_path is not None:
+            self.rtree_cache_dir = join_upath(context.ds_path, rtree_cache_dir)
+        else:
+            self.rtree_cache_dir = UPath(rtree_cache_dir)
+
         self.sort_by = sort_by
         self.harmonize = harmonize
+        self.use_bigquery = use_bigquery
 
         self.index_cache_dir.mkdir(parents=True, exist_ok=True)
 
-
+        # Determine the subset of bands that are needed based on the layer config.
+        self.needed_bands: list[tuple[str, list[str]]]
+        if context.layer_config is not None:
+            self.needed_bands = []
+            for fname, cur_bands in self.BANDS:
+                # See if the bands provided by this file intersect with the bands in at
+                # least one configured band set.
+                for band_set in context.layer_config.band_sets:
+                    if not set(band_set.bands).intersection(cur_bands):
+                        continue
+                    self.needed_bands.append((fname, cur_bands))
+                    break
+        elif bands is not None:
+            self.needed_bands = []
+            for fname, cur_bands in self.BANDS:
+                if not set(bands).intersection(cur_bands):
+                    continue
+                self.needed_bands.append((fname, cur_bands))
+        else:
+            self.needed_bands = list(self.BANDS)
 
+        self.bucket = storage.Client.create_anonymous_client().bucket(self.BUCKET_NAME)
+        self.rtree_index: Any | None = None
         if use_rtree_index:
             from rslearn.utils.rtree_index import RtreeIndex, get_cached_rtree
 
-
+            self.rtree_cache_dir.mkdir(parents=True, exist_ok=True)
+
+            def build_fn(index: RtreeIndex) -> None:
                 """Build the RtreeIndex from items in the data source."""
-                for item in self.
+                for item in self._read_bigquery(
                     desc="Building rtree index", time_range=rtree_time_range
                 ):
-
-
-            self.rtree_tmp_dir = tempfile.TemporaryDirectory()
-            self.rtree_index = get_cached_rtree(
-                self.index_cache_dir, self.rtree_tmp_dir.name, build_fn
-            )
-        else:
-            self.rtree_index = None
-
-    @staticmethod
-    def from_config(config: LayerConfig, ds_path: UPath) -> "Sentinel2":
-        """Creates a new Sentinel2 instance from a configuration dictionary."""
-        assert isinstance(config, RasterLayerConfig)
-        d = config.data_source.config_dict
-        kwargs = dict(
-            config=config,
-            index_cache_dir=join_upath(ds_path, d["index_cache_dir"]),
-        )
-
-        if "max_time_delta" in d:
-            kwargs["max_time_delta"] = timedelta(
-                seconds=pytimeparse.parse(d["max_time_delta"])
-            )
-        simple_optionals = ["sort_by", "use_rtree_index", "harmonize"]
-        for k in simple_optionals:
-            if k in d:
-                kwargs[k] = d[k]
+                    for shp in flatten_shape(item.geometry.shp):
+                        index.insert(shp.bounds, json.dumps(item.serialize()))
 
-
+            self.rtree_index = get_cached_rtree(self.rtree_cache_dir, build_fn)
 
-    def
-        self,
-
-
+    def _read_bigquery(
+        self,
+        desc: str | None = None,
+        time_range: tuple[datetime, datetime] | None = None,
+        wgs84_bbox: tuple[float, float, float, float] | None = None,
+    ) -> Generator[Sentinel2Item, None, None]:
+        """Read Sentinel-2 scenes from BigQuery table.
 
-        The
+        The table only contains the bounding box of each image and not the exact
        geometry, which can be retrieved from individual product metadata
        (MTD_MSIL1C.xml) files.
 
        Args:
            desc: description to include with tqdm progress bar.
            time_range: optional time_range to restrict the reading.
+            wgs84_bbox: optional bounding box in WGS-84 coordinates to restrict the
+                reading.
        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        query_str = f"""
+        SELECT source_url, base_url, product_id, sensing_time, granule_id,
+            east_lon, south_lat, west_lon, north_lat, cloud_cover
+        FROM `{self.TABLE_NAME}`
+        """
+        clauses = []
+        if time_range is not None:
+            clauses.append(f"""(
+                sensing_time >= "{time_range[0]}" AND sensing_time <= "{time_range[1]}"
+            )""")
+        if wgs84_bbox is not None:
+            clauses.append(f"""(
+                west_lon < {wgs84_bbox[2]} AND
+                east_lon > {wgs84_bbox[0]} AND
+                south_lat < {wgs84_bbox[3]} AND
+                north_lat > {wgs84_bbox[1]}
+            )""")
+        if clauses:
+            query_str += " WHERE " + " AND ".join(clauses)
+
+        client = bigquery.Client()
+        result = client.query(query_str)
+        if desc is not None:
+            result = tqdm.tqdm(result, desc=desc)
+
+        for row in result:
+            # Validate product ID has correct number of sections and that it is MSIL1C.
+            # Example product IDs:
+            # - S2B_MSIL1C_20180210T200549_N0206_R128_T08VPK_20180210T215722
+            # - S2A_OPER_PRD_MSIL1C_PDMC_20160315T180002_R091_V20160315T060423_20160315T060423
+            # We must do this before checking source_url because we want to skip the
+            # products that say OPER instead of MSIL1C (occasionally the OPER products
+            # are missing other fields in the CSV).
+            # For example, the OPER product above has:
+            # - source_url = https://storage.googleapis.com/gcp-public-data-sentinel-2/index.csv.gz
+            # - base_url = None
+            product_id = row["product_id"]
+            product_id_parts = product_id.split("_")
+            if len(product_id_parts) < 7:
+                continue
+            product_type = product_id_parts[1]
+            if product_type != "MSIL1C":
+                continue
+            time_str = product_id_parts[2]
+            tile_id = product_id_parts[5]
+            assert tile_id[0] == "T"
+
+            # Figure out what the product folder is for this entry.
+            # Some entries have source_url correct and others have base_url correct.
+            # If base_url is correct, then it seems the source_url always ends in
+            # index.csv.gz.
+            # Example 1:
+            # - source_url = https://storage.googleapis.com/gcp-public-data-sentinel-2/index.csv.gz
+            # - base_url = gs://gcp-public-data-sentinel-2/tiles/54/U/VV/S2A_MSIL1C_20160219T015301_N0201_R017_T54UVV_20160222T152042.SAFE
+            # Example 2:
+            # - source_url = gs://gcp-public-data-sentinel-2/tiles/15/C/WM/S2B_MSIL1C_20250101T121229_N0511_R080_T15CWM_20250101T150509.SAFE
+            # - base_url = None
+            if row["source_url"] and not row["source_url"].endswith("index.csv.gz"):
+                product_folder = row["source_url"].split(f"gs://{self.BUCKET_NAME}/")[1]
+            elif row["base_url"] is not None and row["base_url"] != "":
+                product_folder = row["base_url"].split(f"gs://{self.BUCKET_NAME}/")[1]
+            else:
+                raise ValueError(
+                    f"Unexpected value '{row['source_url']}' in column 'source_url'"
+                    + f" and '{row['base_url']} in column 'base_url'"
+                    + f"for product {row['product_id']}"
+                )
 
-
-
-
-
+            # Build the blob prefix based on the product ID and granule ID.
+            # The blob prefix is the prefix to the JP2 image files on GCS.
+            granule_id = row["granule_id"]
+            blob_prefix = (
+                f"{product_folder}/GRANULE/{granule_id}/IMG_DATA/{tile_id}_{time_str}_"
+            )
 
-
+            # Extract the spatial and temporal bounds of the image.
+            bounds = (
+                float(row["west_lon"]),
+                float(row["south_lat"]),
+                float(row["east_lon"]),
+                float(row["north_lat"]),
+            )
+            shp = shapely.box(*bounds)
+            sensing_time = row["sensing_time"]
+            geometry = STGeometry(WGS84_PROJECTION, shp, (sensing_time, sensing_time))
+            geometry = split_at_antimeridian(geometry)
 
-
-
-
-
-
-
-
-
-
-            WGS84_PROJECTION, shp, (sensing_time, sensing_time)
-        )
+            cloud_cover = float(row["cloud_cover"])
+
+            yield Sentinel2Item(product_id, geometry, blob_prefix, cloud_cover)
+
+    def _build_cell_folder_name(self, cell_id: str) -> str:
+        """Get the prefix on GCS containing the product files in the provided cell.
+
+        The Sentinel-2 cell ID is based on MGRS and is a way of splitting up the world
+        into large tiles.
 
-
+        Args:
+            cell_id: the 5-character cell ID. Note that the product name includes the
+                cell ID with a "T" prefix, the T should be removed.
+
+        Returns:
+            the path on GCS of the folder corresponding to this Sentinel-2 cell.
+        """
+        return f"tiles/{cell_id[0:2]}/{cell_id[2:3]}/{cell_id[3:5]}/"
+
+    def _build_product_folder_name(self, item_name: str) -> str:
+        """Get the folder containing the given Sentinel-2 scene ID on GCS.
 
-
+        Args:
+            item_name: the item name (Sentinel-2 scene ID).
 
-
+        Returns:
+            the path on GCS of the .SAFE folder corresponding to this item.
+        """
+        parts = item_name.split("_")
+        cell_id_with_prefix = parts[5]
+        if len(cell_id_with_prefix) != 6:
+            raise ValueError(
+                f"cell ID should be 6 characters but got {cell_id_with_prefix}"
+            )
+        if cell_id_with_prefix[0] != "T":
+            raise ValueError(
+                f"cell ID should start with T but got {cell_id_with_prefix}"
+            )
+        cell_id = cell_id_with_prefix[1:]
+        return self._build_cell_folder_name(cell_id) + f"{item_name}.SAFE/"
+
+    def _get_xml_by_name(self, name: str) -> "ET.ElementTree[ET.Element[str]]":
         """Gets the metadata XML of an item by its name.
 
         Args:
@@ -247,76 +415,224 @@ class Sentinel2(DataSource):
         Returns:
             the parsed XML ElementTree
         """
-        parts = name.split("_")
-        assert len(parts[5]) == 6
-        assert parts[5][0] == "T"
-        cell_id = parts[5][1:]
-        base_url = f"tiles/{cell_id[0:2]}/{cell_id[2:3]}/{cell_id[3:5]}/{name}.SAFE/"
-
         cache_xml_fname = self.index_cache_dir / (name + ".xml")
         if not cache_xml_fname.exists():
-
+            product_folder = self._build_product_folder_name(name)
+            metadata_blob_path = product_folder + self.METADATA_FILENAME
+            logger.debug("reading metadata XML from %s", metadata_blob_path)
            blob = self.bucket.blob(metadata_blob_path)
+            if not blob.exists():
+                raise MissingXMLException(name)
            with open_atomic(cache_xml_fname, "wb") as f:
                blob.download_to_file(f)
 
        with cache_xml_fname.open("rb") as f:
            return ET.parse(f)
 
-    def
-        """
+    def _parse_xml(self, name: str) -> ParsedProductXML:
+        """Parse a Sentinel-2 product XML file.
 
-
-
-        the product (not just the bounding box).
+        This extracts the blob prefix in the GCS bucket, the polygon extent, sensing
+        start time, and cloud cover.
 
        Args:
-            name: the
-
-        Returns:
-            the item object
+            name: the Sentinel-2 scene name.
        """
-
-        assert len(parts[5]) == 6
-        assert parts[5][0] == "T"
-        cell_id = parts[5][1:]
-        base_url = f"tiles/{cell_id[0:2]}/{cell_id[2:3]}/{cell_id[3:5]}/{name}.SAFE/"
-
+        # Get the XML. This helper function handles caching the XML file.
        tree = self._get_xml_by_name(name)
 
+        # Now parse the XML, starting with the detailed geometry of the image.
        # The EXT_POS_LIST tag has flat list of polygon coordinates.
        elements = list(tree.iter("EXT_POS_LIST"))
        assert len(elements) == 1
-
+        if elements[0].text is None:
+            raise ValueError(f"EXT_POS_LIST is empty for {name}")
+        coords_text = elements[0].text.strip().split(" ")
        # Convert flat list of lat1 lon1 lat2 lon2 ...
        # into (lon1, lat1), (lon2, lat2), ...
        # Then we can get the shapely geometry.
        coords = [
-            [float(
+            [float(coords_text[i + 1]), float(coords_text[i])]
+            for i in range(0, len(coords_text), 2)
        ]
        shp = shapely.Polygon(coords)
 
-        # Get blob prefix which is a subfolder of the
+        # Get blob prefix which is a subfolder of the product folder.
+        # The blob prefix is the prefix to the JP2 image files on GCS.
+        product_folder = self._build_product_folder_name(name)
        elements = list(tree.iter("IMAGE_FILE"))
-        elements = [
+        elements = [
+            el for el in elements if el.text is not None and el.text.endswith("_B01")
+        ]
        assert len(elements) == 1
-
+        if elements[0].text is None:
+            raise ValueError(f"IMAGE_FILE is empty for {name}")
+        blob_prefix = product_folder + elements[0].text.split("B01")[0]
 
+        # Get the sensing start time.
        elements = list(tree.iter("PRODUCT_START_TIME"))
        assert len(elements) == 1
+        if elements[0].text is None:
+            raise ValueError(f"PRODUCT_START_TIME is empty for {name}")
        start_time = dateutil.parser.isoparse(elements[0].text)
 
+        # Get the cloud cover.
        elements = list(tree.iter("Cloud_Coverage_Assessment"))
        assert len(elements) == 1
+        if elements[0].text is None:
+            raise ValueError(f"Cloud_Coverage_Assessment is empty for {name}")
        cloud_cover = float(elements[0].text)
 
+        return ParsedProductXML(
+            blob_prefix=blob_prefix,
+            shp=shp,
+            start_time=start_time,
+            cloud_cover=cloud_cover,
+        )
+
+    def _get_item_by_name(self, name: str) -> Sentinel2Item:
+        """Gets an item by name.
+
+        This implements the main logic of processing the product metadata file
+        without the caching logic in get_item_by_name, see that function for details.
+
+        Args:
+            name: the Sentinel-2 scene ID.
+        """
+        product_xml = self._parse_xml(name)
+
+        # Some Sentinel-2 scenes in the bucket are missing a subset of image files. So
+        # here we verify that all the bands we know about are intact.
+        expected_suffixes = {t[0] for t in self.BANDS}
+        for blob in self.bucket.list_blobs(prefix=product_xml.blob_prefix):
+            assert blob.name.startswith(product_xml.blob_prefix)
+            suffix = blob.name[len(product_xml.blob_prefix) :]
+            if suffix in expected_suffixes:
+                expected_suffixes.remove(suffix)
+        if len(expected_suffixes) > 0:
+            raise CorruptItemException(
+                f"item is missing image files: {expected_suffixes}"
+            )
+
+        time_range = (product_xml.start_time, product_xml.start_time)
+        geometry = STGeometry(WGS84_PROJECTION, product_xml.shp, time_range)
+        geometry = split_at_antimeridian(geometry)
+
+        # Sometimes the geometry is not valid.
+        # We just apply make_valid on it to correct issues.
+        if not geometry.shp.is_valid:
+            geometry.shp = shapely.make_valid(geometry.shp)
+
+        # Some rasters have zero-area geometry due to incorrect geometry. For example,
+        # S2B_MSIL1C_20190111T193659_N0207_R056_T08MLS_20190111T205033.SAFE.
+        # So here we add a check for that and mark it corrupt if so.
+        if geometry.shp.area == 0:
+            raise CorruptItemException(
+                f"XML for item {name} shows geometry with zero area"
+            )
+
        return Sentinel2Item(
-            name,
-
-            blob_prefix,
-            cloud_cover,
+            name=name,
+            geometry=geometry,
+            blob_prefix=product_xml.blob_prefix,
+            cloud_cover=product_xml.cloud_cover,
        )
 
+    def get_item_by_name(self, name: str) -> Sentinel2Item:
+        """Gets an item by name.
+
+        Reads the individual product metadata file (MTD_MSIL1C.xml) to get both the
+        expected blob path where images are stored as well as the detailed geometry of
+        the product (not just the bounding box).
+
+        Args:
+            name: the name of the item to get
+
+        Returns:
+            the item object
+        """
+        # The main logic for getting the item is implemented in _get_item_by_name.
+        # Here, we implement caching logic so that, if we have already seen this item
+        # before, then we can just deserialize it from a JSON file.
+        # We want to cache the item if it is successful, but also cache the
+        # CorruptItemException if it is raised.
+        cache_item_fname = self.index_cache_dir / (name + ".json")
+
+        if cache_item_fname.exists():
+            with cache_item_fname.open() as f:
+                d = json.load(f)
+
+            if "error" in d:
+                raise CorruptItemException(d["error"])
+
+            return Sentinel2Item.deserialize(d)
+
+        try:
+            item = self._get_item_by_name(name)
+        except CorruptItemException as e:
+            with open_atomic(cache_item_fname, "w") as f:
+                json.dump({"error": e.message}, f)
+            raise
+
+        with open_atomic(cache_item_fname, "w") as f:
+            json.dump(item.serialize(), f)
+        return item
+
+    def _read_products_for_cell_year(
+        self, cell_id: str, year: int
+    ) -> list[Sentinel2Item]:
+        """Read items for the given cell and year directly from the GCS bucket.
+
+        This helper function is used by self._read_products which then caches the
+        items together in one file.
+        """
+        items = []
+
+        for product_prefix in self.VALID_PRODUCT_PREFIXES:
+            cell_folder = self._build_cell_folder_name(cell_id)
+            blob_prefix = f"{cell_folder}{product_prefix}_{year}"
+            blobs = self.bucket.list_blobs(prefix=blob_prefix, delimiter="/")
+
+            # Need to consume the iterator to obtain folder names.
+            # See https://cloud.google.com/storage/docs/samples/storage-list-files-with-prefix#storage_list_files_with_prefix-python # noqa: E501
+            # Previously we checked for .SAFE_$folder$ blobs here, but those do
+            # not exist for some years like 2017.
+            for _ in blobs:
+                pass
+
+            logger.debug(
+                "under %s, found %d folders to scan",
+                blob_prefix,
+                len(blobs.prefixes),
+            )
+
+            for prefix in blobs.prefixes:
+                folder_name = prefix.split("/")[-2]
+                expected_suffix = ".SAFE"
+                assert folder_name.endswith(expected_suffix)
+                item_name = folder_name.split(expected_suffix)[0]
+
+                try:
+                    item = self.get_item_by_name(item_name)
+                except CorruptItemException as e:
+                    logger.warning("skipping corrupt item %s: %s", item_name, e.message)
+                    continue
+                except MissingXMLException:
+                    # Sometimes there is a .SAFE folder but some files like the
+                    # XML file are just missing for whatever reason. Since we
+                    # know this happens occasionally, we just ignore the error
+                    # here.
+                    logger.warning(
+                        "no metadata XML for Sentinel-2 folder %s/%s",
+                        blob_prefix,
+                        folder_name,
+                    )
+                    continue
+
+                items.append(item)
+
+        return items
+
    def _read_products(
        self, needed_cell_years: set[tuple[str, int]]
    ) -> Generator[Sentinel2Item, None, None]:
@@ -326,39 +642,20 @@ class Sentinel2(DataSource):
            needed_cell_years: set of (mgrs grid cell, year) where we need to search
                for images.
        """
-
+        # Read the product infos in random order so in case there are multiple jobs
+        # reading similar cells, they are more likely to work on different cells/years
+        # in parallel.
+        needed_cell_years_list = list(needed_cell_years)
+        random.shuffle(needed_cell_years_list)
+
+        for cell_id, year in tqdm.tqdm(
+            needed_cell_years_list, desc="Reading product infos"
+        ):
            assert len(cell_id) == 5
            cache_fname = self.index_cache_dir / f"{cell_id}_{year}.json"
 
            if not cache_fname.exists():
-
-                cell_part2 = cell_id[2:3]
-                cell_part3 = cell_id[3:5]
-
-                items = []
-
-                for product_prefix in ["S2A_MSIL1C", "S2B_MSIL1C"]:
-                    blob_prefix = (
-                        f"tiles/{cell_part1}/{cell_part2}/{cell_part3}/"
-                        + f"{product_prefix}_{year}"
-                    )
-                    blobs = self.bucket.list_blobs(prefix=blob_prefix, delimiter="/")
-
-                    # Need to consume the iterator to obtain folder names.
-                    # See https://cloud.google.com/storage/docs/samples/storage-list-files-with-prefix#storage_list_files_with_prefix-python # noqa: E501
-                    # Previously we checked for .SAFE_$folder$ blobs here, but those do
-                    # not exist for some years like 2017.
-                    for _ in blobs:
-                        pass
-
-                    for prefix in blobs.prefixes:
-                        folder_name = prefix.split("/")[-2]
-                        expected_suffix = ".SAFE"
-                        assert folder_name.endswith(expected_suffix)
-                        item_name = folder_name.split(expected_suffix)[0]
-                        item = self.get_item_by_name(item_name)
-                        items.append(item)
-
+                items = self._read_products_for_cell_year(cell_id, year)
                with open_atomic(cache_fname, "w") as f:
                    json.dump([item.serialize() for item in items], f)
 
@@ -366,22 +663,26 @@ class Sentinel2(DataSource):
            with cache_fname.open() as f:
                items = [Sentinel2Item.deserialize(d) for d in json.load(f)]
 
-
-                yield item
+            yield from items
 
    def _get_candidate_items_index(
        self, wgs84_geometries: list[STGeometry]
-    ) -> list[list[
-        """List relevant items using rtree index.
-
+    ) -> list[list[Sentinel2Item]]:
+        """List relevant items using rtree index.
+
+        Args:
+            wgs84_geometries: the geometries to query.
+        """
+        candidates: list[list[Sentinel2Item]] = [[] for _ in wgs84_geometries]
        for idx, geometry in enumerate(wgs84_geometries):
            time_range = None
            if geometry.time_range:
                time_range = (
-                    geometry.time_range[0]
-                    geometry.time_range[1]
+                    geometry.time_range[0],
+                    geometry.time_range[1],
                )
-
+            if self.rtree_index is None:
+                raise ValueError("rtree_index is required")
            encoded_items = self.rtree_index.query(geometry.shp.bounds)
            for encoded_item in encoded_items:
                item = Sentinel2Item.deserialize(json.loads(encoded_item))
@@ -389,7 +690,23 @@ class Sentinel2(DataSource):
                    continue
                if not item.geometry.shp.intersects(geometry.shp):
                    continue
-
+
+                # Get the item from XML to get its exact geometry (the index only
+                # knows the bounding box of the item).
+                try:
+                    item = self.get_item_by_name(item.name)
+                except CorruptItemException as e:
+                    logger.warning("skipping corrupt item %s: %s", item.name, e.message)
+                    continue
+                except MissingXMLException:
+                    # Sometimes a scene that appears in the BigQuery index does not
+                    # actually have an XML file on GCS. Since we know this happens
+                    # occasionally, we ignore the error here.
+                    logger.warning(
+                        "skipping item %s that is missing XML file", item.name
+                    )
+                    continue
+
                if not item.geometry.shp.intersects(geometry.shp):
                    continue
                candidates[idx].append(item)
@@ -397,22 +714,26 @@ class Sentinel2(DataSource):
 
    def _get_candidate_items_direct(
        self, wgs84_geometries: list[STGeometry]
-    ) -> list[list[
-        """Use _read_products to list
+    ) -> list[list[Sentinel2Item]]:
+        """Use _read_products to list matching items directly from the bucket.
+
+        Args:
+            wgs84_geometries: the geometries to query.
+        """
        needed_cell_years = set()
        for wgs84_geometry in wgs84_geometries:
            if wgs84_geometry.time_range is None:
                raise ValueError(
                    "Sentinel2 on GCP requires geometry time ranges to be set"
                )
-            for cell_id in
+            for cell_id in get_sentinel2_tiles(wgs84_geometry, self.index_cache_dir):
                for year in range(
-
-
+                    wgs84_geometry.time_range[0].year,
+                    wgs84_geometry.time_range[1].year + 1,
                ):
                    needed_cell_years.add((cell_id, year))
 
-        items_by_cell = {}
+        items_by_cell: dict[str, list[Sentinel2Item]] = {}
        for item in self._read_products(needed_cell_years):
            cell_id = "".join(item.blob_prefix.split("/")[1:4])
            assert len(cell_id) == 5
@@ -420,9 +741,9 @@ class Sentinel2(DataSource):
                items_by_cell[cell_id] = []
            items_by_cell[cell_id].append(item)
 
-        candidates = [[] for _ in wgs84_geometries]
+        candidates: list[list[Sentinel2Item]] = [[] for _ in wgs84_geometries]
        for idx, geometry in enumerate(wgs84_geometries):
-            for cell_id in
+            for cell_id in get_sentinel2_tiles(geometry, self.index_cache_dir):
                for item in items_by_cell.get(cell_id, []):
                    if not geometry.shp.intersects(item.geometry.shp):
                        continue
@@ -430,9 +751,27 @@ class Sentinel2(DataSource):
 
        return candidates
 
+    def _get_candidate_items_bigquery(
+        self, wgs84_geometries: list[STGeometry]
+    ) -> list[list[Sentinel2Item]]:
+        """Use _read_bigquery to list matching items by querying the BigQuery table.
+
+        Args:
+            wgs84_geometries: the geometries to query.
+        """
+        candidates: list[list[Sentinel2Item]] = [[] for _ in wgs84_geometries]
+        for idx, geometry in enumerate(wgs84_geometries):
+            wgs84_bbox = geometry.shp.bounds
+            for item in self._read_bigquery(
+                time_range=geometry.time_range, wgs84_bbox=wgs84_bbox
+            ):
+                candidates[idx].append(item)
+
+        return candidates
+
    def get_items(
        self, geometries: list[STGeometry], query_config: QueryConfig
-    ) -> list[list[list[
+    ) -> list[list[list[Sentinel2Item]]]:
        """Get a list of items in the data source intersecting the given geometries.
 
        Args:
@@ -448,6 +787,8 @@ class Sentinel2(DataSource):
 
        if self.rtree_index:
            candidates = self._get_candidate_items_index(wgs84_geometries)
+        elif self.use_bigquery:
+            candidates = self._get_candidate_items_bigquery(wgs84_geometries)
        else:
            candidates = self._get_candidate_items_direct(wgs84_geometries)
 
@@ -463,14 +804,16 @@ class Sentinel2(DataSource):
            groups.append(cur_groups)
        return groups
 
-    def deserialize_item(self, serialized_item: Any) ->
+    def deserialize_item(self, serialized_item: Any) -> Sentinel2Item:
        """Deserializes an item from JSON-decoded data."""
        assert isinstance(serialized_item, dict)
        return Sentinel2Item.deserialize(serialized_item)
 
-    def retrieve_item(
+    def retrieve_item(
+        self, item: Sentinel2Item
+    ) -> Generator[tuple[str, BinaryIO], None, None]:
        """Retrieves the rasters corresponding to an item as file streams."""
-        for suffix, _ in self.
+        for suffix, _ in self.BANDS:
            blob_path = item.blob_prefix + suffix
            fname = blob_path.split("/")[-1]
            buf = io.BytesIO()
@@ -483,8 +826,8 @@ class Sentinel2(DataSource):
 
    def ingest(
        self,
-        tile_store:
-        items: list[
+        tile_store: TileStoreWithLayer,
+        items: list[Sentinel2Item],
        geometries: list[list[STGeometry]],
    ) -> None:
        """Ingest items into the given tile store.
@@ -494,36 +837,49 @@ class Sentinel2(DataSource):
            items: the items to ingest
            geometries: a list of geometries needed for each item
        """
-        for item
-
-
-                harmonize_callback = get_harmonize_callback(
-                    self._get_xml_by_name(item.name)
-                )
-
-            for suffix, band_names in self.bands:
-                cur_tile_store = PrefixedTileStore(
-                    tile_store, (item.name, "_".join(band_names))
-                )
-                needed_projections = get_needed_projections(
-                    cur_tile_store, band_names, self.config.band_sets, cur_geometries
-                )
-                if not needed_projections:
+        for item in items:
+            for suffix, band_names in self.needed_bands:
+                if tile_store.is_raster_ready(item.name, band_names):
                    continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    fname = os.path.join(tmp_dir, suffix)
+                    blob = self.bucket.blob(item.blob_prefix + suffix)
+                    logger.debug(
+                        "gcp_public_data downloading raster file %s",
+                        item.blob_prefix + suffix,
+                    )
+                    blob.download_to_filename(fname)
+                    logger.debug(
+                        "gcp_public_data ingesting raster file %s into tile store",
+                        item.blob_prefix + suffix,
+                    )
+
+                    # Harmonize values if needed.
+                    # TCI does not need harmonization.
+                    harmonize_callback = None
+                    if self.harmonize and suffix != "TCI.jp2":
+                        harmonize_callback = get_harmonize_callback(
+                            self._get_xml_by_name(item.name)
+                        )
+
+                    if harmonize_callback is not None:
+                        # In this case we need to read the array, convert the pixel
+                        # values, and pass modified array directly to the TileStore.
+                        with rasterio.open(fname) as src:
+                            array = src.read()
+                            projection, bounds = get_raster_projection_and_bounds(src)
+                        array = harmonize_callback(array)
+                        tile_store.write_raster(
+                            item.name, band_names, projection, bounds, array
                        )
+
+                    else:
+                        tile_store.write_raster_file(
+                            item.name, band_names, UPath(fname)
+                        )
+
+                    logger.debug(
+                        "gcp_public_data done ingesting raster file %s",
+                        item.blob_prefix + suffix,
+                    )