rslearn 0.0.1__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. rslearn/arg_parser.py +31 -0
  2. rslearn/config/__init__.py +6 -12
  3. rslearn/config/dataset.py +520 -401
  4. rslearn/const.py +9 -15
  5. rslearn/data_sources/__init__.py +8 -23
  6. rslearn/data_sources/aws_landsat.py +242 -98
  7. rslearn/data_sources/aws_open_data.py +111 -151
  8. rslearn/data_sources/aws_sentinel1.py +131 -0
  9. rslearn/data_sources/climate_data_store.py +471 -0
  10. rslearn/data_sources/copernicus.py +884 -12
  11. rslearn/data_sources/data_source.py +43 -12
  12. rslearn/data_sources/earthdaily.py +484 -0
  13. rslearn/data_sources/earthdata_srtm.py +282 -0
  14. rslearn/data_sources/eurocrops.py +242 -0
  15. rslearn/data_sources/gcp_public_data.py +578 -222
  16. rslearn/data_sources/google_earth_engine.py +461 -135
  17. rslearn/data_sources/local_files.py +219 -150
  18. rslearn/data_sources/openstreetmap.py +51 -89
  19. rslearn/data_sources/planet.py +24 -60
  20. rslearn/data_sources/planet_basemap.py +275 -0
  21. rslearn/data_sources/planetary_computer.py +798 -0
  22. rslearn/data_sources/usda_cdl.py +195 -0
  23. rslearn/data_sources/usgs_landsat.py +115 -83
  24. rslearn/data_sources/utils.py +249 -61
  25. rslearn/data_sources/vector_source.py +1 -0
  26. rslearn/data_sources/worldcereal.py +449 -0
  27. rslearn/data_sources/worldcover.py +144 -0
  28. rslearn/data_sources/worldpop.py +153 -0
  29. rslearn/data_sources/xyz_tiles.py +150 -107
  30. rslearn/dataset/__init__.py +8 -2
  31. rslearn/dataset/add_windows.py +2 -2
  32. rslearn/dataset/dataset.py +40 -51
  33. rslearn/dataset/handler_summaries.py +131 -0
  34. rslearn/dataset/manage.py +313 -74
  35. rslearn/dataset/materialize.py +431 -107
  36. rslearn/dataset/remap.py +29 -4
  37. rslearn/dataset/storage/__init__.py +1 -0
  38. rslearn/dataset/storage/file.py +202 -0
  39. rslearn/dataset/storage/storage.py +140 -0
  40. rslearn/dataset/window.py +181 -44
  41. rslearn/lightning_cli.py +454 -0
  42. rslearn/log_utils.py +24 -0
  43. rslearn/main.py +384 -181
  44. rslearn/models/anysat.py +215 -0
  45. rslearn/models/attention_pooling.py +177 -0
  46. rslearn/models/clay/clay.py +231 -0
  47. rslearn/models/clay/configs/metadata.yaml +295 -0
  48. rslearn/models/clip.py +68 -0
  49. rslearn/models/component.py +111 -0
  50. rslearn/models/concatenate_features.py +103 -0
  51. rslearn/models/conv.py +63 -0
  52. rslearn/models/croma.py +306 -0
  53. rslearn/models/detr/__init__.py +5 -0
  54. rslearn/models/detr/box_ops.py +103 -0
  55. rslearn/models/detr/detr.py +504 -0
  56. rslearn/models/detr/matcher.py +107 -0
  57. rslearn/models/detr/position_encoding.py +114 -0
  58. rslearn/models/detr/transformer.py +429 -0
  59. rslearn/models/detr/util.py +24 -0
  60. rslearn/models/dinov3.py +177 -0
  61. rslearn/models/faster_rcnn.py +30 -28
  62. rslearn/models/feature_center_crop.py +53 -0
  63. rslearn/models/fpn.py +19 -8
  64. rslearn/models/galileo/__init__.py +5 -0
  65. rslearn/models/galileo/galileo.py +595 -0
  66. rslearn/models/galileo/single_file_galileo.py +1678 -0
  67. rslearn/models/module_wrapper.py +65 -0
  68. rslearn/models/molmo.py +69 -0
  69. rslearn/models/multitask.py +384 -28
  70. rslearn/models/olmoearth_pretrain/__init__.py +1 -0
  71. rslearn/models/olmoearth_pretrain/model.py +421 -0
  72. rslearn/models/olmoearth_pretrain/norm.py +86 -0
  73. rslearn/models/panopticon.py +170 -0
  74. rslearn/models/panopticon_data/sensors/drone.yaml +32 -0
  75. rslearn/models/panopticon_data/sensors/enmap.yaml +904 -0
  76. rslearn/models/panopticon_data/sensors/goes.yaml +9 -0
  77. rslearn/models/panopticon_data/sensors/himawari.yaml +9 -0
  78. rslearn/models/panopticon_data/sensors/intuition.yaml +606 -0
  79. rslearn/models/panopticon_data/sensors/landsat8.yaml +84 -0
  80. rslearn/models/panopticon_data/sensors/modis_terra.yaml +99 -0
  81. rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +34 -0
  82. rslearn/models/panopticon_data/sensors/sentinel1.yaml +85 -0
  83. rslearn/models/panopticon_data/sensors/sentinel2.yaml +97 -0
  84. rslearn/models/panopticon_data/sensors/superdove.yaml +60 -0
  85. rslearn/models/panopticon_data/sensors/wv23.yaml +63 -0
  86. rslearn/models/pick_features.py +17 -10
  87. rslearn/models/pooling_decoder.py +60 -7
  88. rslearn/models/presto/__init__.py +5 -0
  89. rslearn/models/presto/presto.py +297 -0
  90. rslearn/models/presto/single_file_presto.py +926 -0
  91. rslearn/models/prithvi.py +1147 -0
  92. rslearn/models/resize_features.py +59 -0
  93. rslearn/models/sam2_enc.py +13 -9
  94. rslearn/models/satlaspretrain.py +38 -18
  95. rslearn/models/simple_time_series.py +188 -77
  96. rslearn/models/singletask.py +24 -13
  97. rslearn/models/ssl4eo_s12.py +40 -30
  98. rslearn/models/swin.py +44 -32
  99. rslearn/models/task_embedding.py +250 -0
  100. rslearn/models/terramind.py +256 -0
  101. rslearn/models/trunk.py +139 -0
  102. rslearn/models/unet.py +68 -22
  103. rslearn/models/upsample.py +48 -0
  104. rslearn/models/use_croma.py +508 -0
  105. rslearn/template_params.py +26 -0
  106. rslearn/tile_stores/__init__.py +41 -18
  107. rslearn/tile_stores/default.py +409 -0
  108. rslearn/tile_stores/tile_store.py +236 -132
  109. rslearn/train/all_patches_dataset.py +530 -0
  110. rslearn/train/callbacks/adapters.py +53 -0
  111. rslearn/train/callbacks/freeze_unfreeze.py +348 -17
  112. rslearn/train/callbacks/gradients.py +129 -0
  113. rslearn/train/callbacks/peft.py +116 -0
  114. rslearn/train/data_module.py +444 -20
  115. rslearn/train/dataset.py +588 -235
  116. rslearn/train/lightning_module.py +192 -62
  117. rslearn/train/model_context.py +88 -0
  118. rslearn/train/optimizer.py +31 -0
  119. rslearn/train/prediction_writer.py +319 -84
  120. rslearn/train/scheduler.py +92 -0
  121. rslearn/train/tasks/classification.py +55 -28
  122. rslearn/train/tasks/detection.py +132 -76
  123. rslearn/train/tasks/embedding.py +120 -0
  124. rslearn/train/tasks/multi_task.py +28 -14
  125. rslearn/train/tasks/per_pixel_regression.py +291 -0
  126. rslearn/train/tasks/regression.py +161 -44
  127. rslearn/train/tasks/segmentation.py +428 -53
  128. rslearn/train/tasks/task.py +6 -5
  129. rslearn/train/transforms/__init__.py +1 -1
  130. rslearn/train/transforms/concatenate.py +54 -10
  131. rslearn/train/transforms/crop.py +29 -11
  132. rslearn/train/transforms/flip.py +18 -6
  133. rslearn/train/transforms/mask.py +78 -0
  134. rslearn/train/transforms/normalize.py +101 -17
  135. rslearn/train/transforms/pad.py +19 -7
  136. rslearn/train/transforms/resize.py +83 -0
  137. rslearn/train/transforms/select_bands.py +76 -0
  138. rslearn/train/transforms/sentinel1.py +75 -0
  139. rslearn/train/transforms/transform.py +89 -70
  140. rslearn/utils/__init__.py +2 -6
  141. rslearn/utils/array.py +8 -6
  142. rslearn/utils/feature.py +2 -2
  143. rslearn/utils/fsspec.py +90 -1
  144. rslearn/utils/geometry.py +347 -7
  145. rslearn/utils/get_utm_ups_crs.py +2 -3
  146. rslearn/utils/grid_index.py +5 -5
  147. rslearn/utils/jsonargparse.py +178 -0
  148. rslearn/utils/mp.py +4 -3
  149. rslearn/utils/raster_format.py +268 -116
  150. rslearn/utils/rtree_index.py +64 -17
  151. rslearn/utils/sqlite_index.py +7 -1
  152. rslearn/utils/vector_format.py +252 -97
  153. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/METADATA +532 -283
  154. rslearn-0.0.21.dist-info/RECORD +167 -0
  155. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/WHEEL +1 -1
  156. rslearn-0.0.21.dist-info/licenses/NOTICE +115 -0
  157. rslearn/data_sources/raster_source.py +0 -309
  158. rslearn/models/registry.py +0 -5
  159. rslearn/tile_stores/file.py +0 -242
  160. rslearn/utils/mgrs.py +0 -24
  161. rslearn/utils/utils.py +0 -22
  162. rslearn-0.0.1.dist-info/RECORD +0 -88
  163. /rslearn/{data_sources/geotiff.py → py.typed} +0 -0
  164. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/entry_points.txt +0 -0
  165. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info/licenses}/LICENSE +0 -0
  166. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/top_level.txt +0 -0
rslearn/data_sources/gcp_public_data.py
@@ -1,34 +1,36 @@
 """Data source for raster data on public Cloud Storage buckets."""
 
-import csv
-import gzip
 import io
 import json
+import os
+import random
 import tempfile
 import xml.etree.ElementTree as ET
 from collections.abc import Generator
-from datetime import datetime, timedelta
+from dataclasses import dataclass
+from datetime import datetime
 from typing import Any, BinaryIO
 
 import dateutil.parser
-import pytimeparse
 import rasterio
 import shapely
 import tqdm
-from google.cloud import storage
+from google.cloud import bigquery, storage
 from upath import UPath
 
-import rslearn.utils.mgrs
-from rslearn.config import LayerConfig, QueryConfig, RasterLayerConfig
+from rslearn.config import QueryConfig
 from rslearn.const import WGS84_PROJECTION
-from rslearn.data_sources import DataSource, Item
+from rslearn.data_sources import DataSource, DataSourceContext, Item
 from rslearn.data_sources.utils import match_candidate_items_to_window
-from rslearn.tile_stores import PrefixedTileStore, TileStore
-from rslearn.utils import STGeometry
+from rslearn.log_utils import get_logger
+from rslearn.tile_stores import TileStoreWithLayer
 from rslearn.utils.fsspec import join_upath, open_atomic
+from rslearn.utils.geometry import STGeometry, flatten_shape, split_at_antimeridian
+from rslearn.utils.raster_format import get_raster_projection_and_bounds
 
-from .copernicus import get_harmonize_callback
-from .raster_source import get_needed_projections, ingest_raster
+from .copernicus import get_harmonize_callback, get_sentinel2_tiles
+
+logger = get_logger(__name__)
 
 
 class Sentinel2Item(Item):
@@ -57,7 +59,7 @@ class Sentinel2Item(Item):
         return d
 
     @staticmethod
-    def deserialize(d: dict[str, Any]) -> Item:
+    def deserialize(d: dict[str, Any]) -> "Sentinel2Item":
         """Deserializes an item from a JSON-decoded dictionary."""
         item = super(Sentinel2Item, Sentinel2Item).deserialize(d)
         return Sentinel2Item(
@@ -68,6 +70,45 @@ class Sentinel2Item(Item):
         )
 
 
+class CorruptItemException(Exception):
+    """A Sentinel-2 scene is corrupted or otherwise unreadable for a known reason."""
+
+    def __init__(self, message: str) -> None:
+        """Create a new CorruptItemException.
+
+        Args:
+            message: error message.
+        """
+        self.message = message
+
+
+class MissingXMLException(Exception):
+    """Exception for when an item's XML file does not exist in GCS.
+
+    Some items that appear in the index on BigQuery, or that have a folder, lack an XML
+    file, and so in those cases this exception can be ignored.
+    """
+
+    def __init__(self, item_name: str):
+        """Create a new MissingXMLException.
+
+        Args:
+            item_name: the name of the item (Sentinel-2 scene) that is missing its XML
+                file in the GCS bucket.
+        """
+        self.item_name = item_name
+
+
+@dataclass
+class ParsedProductXML:
+    """Result of parsing a Sentinel-2 product XML file."""
+
+    blob_prefix: str
+    shp: shapely.Polygon
+    start_time: datetime
+    cloud_cover: float
+
+
 class Sentinel2(DataSource):
     """A data source for Sentinel-2 data on Google Cloud Storage.
 
@@ -80,11 +121,12 @@ class Sentinel2(DataSource):
     The bucket is public and free so no credentials are needed.
     """
 
-    bucket_name = "gcp-public-data-sentinel-2"
+    BUCKET_NAME = "gcp-public-data-sentinel-2"
 
-    index_fname = "index.csv.gz"
+    # Name of BigQuery table containing index of Sentinel-2 scenes in the bucket.
+    TABLE_NAME = "bigquery-public-data.cloud_storage_geo_index.sentinel_2_index"
 
-    bands = [
+    BANDS = [
         ("B01.jp2", ["B01"]),
         ("B02.jp2", ["B02"]),
         ("B03.jp2", ["B03"]),
@@ -101,144 +143,270 @@ class Sentinel2(DataSource):
         ("TCI.jp2", ["R", "G", "B"]),
     ]
 
+    # Possible prefixes of the product name that may appear on GCS, before the year
+    # appears in the product name. For example, a product may start with
+    # "S2A_MSIL1C_20230101..." so S2A_MSIL1C appears here. This list is used when
+    # enumerating the list of products on GCS that fall in a certain year: because the
+    # year comes after this prefix, filtering in the object list operation requires
+    # including this prefix first followed by the year.
+    VALID_PRODUCT_PREFIXES = ["S2A_MSIL1C", "S2B_MSIL1C", "S2C_MSIL1C"]
+
+    # The name of the L1C product metadata XML file.
+    METADATA_FILENAME = "MTD_MSIL1C.xml"
+
     def __init__(
         self,
-        config: LayerConfig,
-        index_cache_dir: UPath,
-        max_time_delta: timedelta = timedelta(days=30),
+        index_cache_dir: str,
         sort_by: str | None = None,
         use_rtree_index: bool = True,
         harmonize: bool = False,
         rtree_time_range: tuple[datetime, datetime] | None = None,
+        rtree_cache_dir: str | None = None,
+        use_bigquery: bool | None = None,
+        bands: list[str] | None = None,
+        context: DataSourceContext = DataSourceContext(),
     ):
         """Initialize a new Sentinel2 instance.
 
         Args:
-            config: the LayerConfig of the layer containing this data source.
-            index_cache_dir: local directory to cache the index.csv.gz contents, as
-                well as individual product metadata files. Defaults to None in which
-                case products are looked up from the cloud storage directly.
-            max_time_delta: maximum time before a query start time or after a
-                query end time to look for products. This is required due to the large
-                number of available products, and defaults to 30 days.
+            index_cache_dir: local directory to cache the index contents, as well as
+                individual product metadata files.
             sort_by: can be "cloud_cover", default arbitrary order; only has effect for
                 SpaceMode.WITHIN.
             use_rtree_index: whether to create an rtree index to enable faster lookups
-                (default true)
+                (default true). rtree will take several hours if it is not restricted
+                to a short time range using rtree_time_range.
             harmonize: harmonize pixel values across different processing baselines,
                 see https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR_HARMONIZED
             rtree_time_range: only populate the rtree index with scenes within this
-                time range
+                time range. Restricting to a few months significantly speeds up rtree
+                creation time.
+            rtree_cache_dir: by default, if use_rtree_index is enabled, the rtree is
+                stored in index_cache_dir (where product XML files are also stored). If
+                rtree_cache_dir is set, then the rtree is stored here instead (so
+                index_cache_dir is only used to cache product XML files).
+            use_bigquery: whether to use the BigQuery index over the scenes in the
+                bucket. This must be enabled if use_rtree_index is enabled, since we
+                only support populating the rtree index from BigQuery. Note that
+                BigQuery requires GCP credentials to be setup; to avoid the need for
+                credentials, set use_bigquery=False and use_rtree_index=False. The
+                default value is None which enables BigQuery when use_rtree_index=True
+                and disables when use_rtree_index=False.
+            bands: the bands to download, or None to download all bands. This is only
+                used if the layer config is not in the context.
+            context: the data source context.
         """
-        self.config = config
-        self.index_cache_dir = index_cache_dir
-        self.max_time_delta = max_time_delta
+        if use_bigquery is None:
+            use_bigquery = use_rtree_index
+        if not use_bigquery and use_rtree_index:
+            raise ValueError(
+                "use_bigquery must be enabled if use_rtree_index is enabled"
+            )
+
+        # Resolve index_cache_dir and rtree_cache_dir depending on dataset context.
+        if context.ds_path is not None:
+            self.index_cache_dir = join_upath(context.ds_path, index_cache_dir)
+        else:
+            self.index_cache_dir = UPath(index_cache_dir)
+
+        if rtree_cache_dir is None:
+            self.rtree_cache_dir = self.index_cache_dir
+        elif context.ds_path is not None:
+            self.rtree_cache_dir = join_upath(context.ds_path, rtree_cache_dir)
+        else:
+            self.rtree_cache_dir = UPath(rtree_cache_dir)
+
         self.sort_by = sort_by
         self.harmonize = harmonize
+        self.use_bigquery = use_bigquery
 
         self.index_cache_dir.mkdir(parents=True, exist_ok=True)
 
-        self.bucket = storage.Client.create_anonymous_client().bucket(self.bucket_name)
+        # Determine the subset of bands that are needed based on the layer config.
+        self.needed_bands: list[tuple[str, list[str]]]
+        if context.layer_config is not None:
+            self.needed_bands = []
+            for fname, cur_bands in self.BANDS:
+                # See if the bands provided by this file intersect with the bands in at
+                # least one configured band set.
+                for band_set in context.layer_config.band_sets:
+                    if not set(band_set.bands).intersection(cur_bands):
+                        continue
+                    self.needed_bands.append((fname, cur_bands))
+                    break
+        elif bands is not None:
+            self.needed_bands = []
+            for fname, cur_bands in self.BANDS:
+                if not set(bands).intersection(cur_bands):
+                    continue
+                self.needed_bands.append((fname, cur_bands))
+        else:
+            self.needed_bands = list(self.BANDS)
 
+        self.bucket = storage.Client.create_anonymous_client().bucket(self.BUCKET_NAME)
+        self.rtree_index: Any | None = None
         if use_rtree_index:
             from rslearn.utils.rtree_index import RtreeIndex, get_cached_rtree
 
-            def build_fn(index: RtreeIndex):
+            self.rtree_cache_dir.mkdir(parents=True, exist_ok=True)
+
+            def build_fn(index: RtreeIndex) -> None:
                 """Build the RtreeIndex from items in the data source."""
-                for item in self._read_index(
+                for item in self._read_bigquery(
                     desc="Building rtree index", time_range=rtree_time_range
                 ):
-                    index.insert(item.geometry.shp.bounds, json.dumps(item.serialize()))
-
-            self.rtree_tmp_dir = tempfile.TemporaryDirectory()
-            self.rtree_index = get_cached_rtree(
-                self.index_cache_dir, self.rtree_tmp_dir.name, build_fn
-            )
-        else:
-            self.rtree_index = None
-
-    @staticmethod
-    def from_config(config: LayerConfig, ds_path: UPath) -> "Sentinel2":
-        """Creates a new Sentinel2 instance from a configuration dictionary."""
-        assert isinstance(config, RasterLayerConfig)
-        d = config.data_source.config_dict
-        kwargs = dict(
-            config=config,
-            index_cache_dir=join_upath(ds_path, d["index_cache_dir"]),
-        )
-
-        if "max_time_delta" in d:
-            kwargs["max_time_delta"] = timedelta(
-                seconds=pytimeparse.parse(d["max_time_delta"])
-            )
-        simple_optionals = ["sort_by", "use_rtree_index", "harmonize"]
-        for k in simple_optionals:
-            if k in d:
-                kwargs[k] = d[k]
+                    for shp in flatten_shape(item.geometry.shp):
+                        index.insert(shp.bounds, json.dumps(item.serialize()))
 
-        return Sentinel2(**kwargs)
+            self.rtree_index = get_cached_rtree(self.rtree_cache_dir, build_fn)
 
-    def _read_index(
-        self, desc: str, time_range: tuple[datetime, datetime] | None = None
-    ) -> Generator[dict[str, str], None, None]:
-        """Read the index.csv.gz in the Cloud Storage bucket.
+    def _read_bigquery(
+        self,
+        desc: str | None = None,
+        time_range: tuple[datetime, datetime] | None = None,
+        wgs84_bbox: tuple[float, float, float, float] | None = None,
+    ) -> Generator[Sentinel2Item, None, None]:
+        """Read Sentinel-2 scenes from BigQuery table.
 
-        The CSV only contains the bounding box of each image and not the exact
+        The table only contains the bounding box of each image and not the exact
         geometry, which can be retrieved from individual product metadata
         (MTD_MSIL1C.xml) files.
 
         Args:
            desc: description to include with tqdm progress bar.
            time_range: optional time_range to restrict the reading.
+            wgs84_bbox: optional bounding box in WGS-84 coordinates to restrict the
+                reading.
         """
-        blob = self.bucket.blob(self.index_fname)
-        with blob.open("rb") as blob_f:
-            with gzip.open(blob_f, "rt") as gzip_f:
-                reader = csv.DictReader(gzip_f)
-                for row in tqdm.tqdm(reader, desc=desc):
-                    if not row["BASE_URL"]:
-                        continue
-                    product_id = row["PRODUCT_ID"]
-                    product_id_parts = product_id.split("_")
-                    if len(product_id_parts) < 7:
-                        continue
-                    product_type = product_id_parts[1]
-                    if product_type != "MSIL1C":
-                        continue
-                    time_str = product_id_parts[2]
-                    tile_id = product_id_parts[5]
-                    assert tile_id[0] == "T"
-
-                    sensing_time = dateutil.parser.isoparse(row["SENSING_TIME"])
-                    if time_range and (
-                        sensing_time < time_range[0] or sensing_time > time_range[1]
-                    ):
-                        continue
+        query_str = f"""
+        SELECT source_url, base_url, product_id, sensing_time, granule_id,
+            east_lon, south_lat, west_lon, north_lat, cloud_cover
+        FROM `{self.TABLE_NAME}`
+        """
+        clauses = []
+        if time_range is not None:
+            clauses.append(f"""(
+                sensing_time >= "{time_range[0]}" AND sensing_time <= "{time_range[1]}"
+            )""")
+        if wgs84_bbox is not None:
+            clauses.append(f"""(
+                west_lon < {wgs84_bbox[2]} AND
+                east_lon > {wgs84_bbox[0]} AND
+                south_lat < {wgs84_bbox[3]} AND
+                north_lat > {wgs84_bbox[1]}
+            )""")
+        if clauses:
+            query_str += " WHERE " + " AND ".join(clauses)
+
+        client = bigquery.Client()
+        result = client.query(query_str)
+        if desc is not None:
+            result = tqdm.tqdm(result, desc=desc)
+
+        for row in result:
+            # Validate product ID has correct number of sections and that it is MSIL1C.
+            # Example product IDs:
+            # - S2B_MSIL1C_20180210T200549_N0206_R128_T08VPK_20180210T215722
+            # - S2A_OPER_PRD_MSIL1C_PDMC_20160315T180002_R091_V20160315T060423_20160315T060423
+            # We must do this before checking source_url because we want to skip the
+            # products that say OPER instead of MSIL1C (occasionally the OPER products
+            # are missing other fields in the CSV).
+            # For example, the OPER product above has:
+            # - source_url = https://storage.googleapis.com/gcp-public-data-sentinel-2/index.csv.gz
+            # - base_url = None
+            product_id = row["product_id"]
+            product_id_parts = product_id.split("_")
+            if len(product_id_parts) < 7:
+                continue
+            product_type = product_id_parts[1]
+            if product_type != "MSIL1C":
+                continue
+            time_str = product_id_parts[2]
+            tile_id = product_id_parts[5]
+            assert tile_id[0] == "T"
+
+            # Figure out what the product folder is for this entry.
+            # Some entries have source_url correct and others have base_url correct.
+            # If base_url is correct, then it seems the source_url always ends in
+            # index.csv.gz.
+            # Example 1:
+            # - source_url = https://storage.googleapis.com/gcp-public-data-sentinel-2/index.csv.gz
+            # - base_url = gs://gcp-public-data-sentinel-2/tiles/54/U/VV/S2A_MSIL1C_20160219T015301_N0201_R017_T54UVV_20160222T152042.SAFE
+            # Example 2:
+            # - source_url = gs://gcp-public-data-sentinel-2/tiles/15/C/WM/S2B_MSIL1C_20250101T121229_N0511_R080_T15CWM_20250101T150509.SAFE
+            # - base_url = None
+            if row["source_url"] and not row["source_url"].endswith("index.csv.gz"):
+                product_folder = row["source_url"].split(f"gs://{self.BUCKET_NAME}/")[1]
+            elif row["base_url"] is not None and row["base_url"] != "":
+                product_folder = row["base_url"].split(f"gs://{self.BUCKET_NAME}/")[1]
+            else:
+                raise ValueError(
+                    f"Unexpected value '{row['source_url']}' in column 'source_url'"
+                    + f" and '{row['base_url']} in column 'base_url'"
+                    + f"for product {row['product_id']}"
+                )
 
-                    granule_id = row["GRANULE_ID"]
-                    base_url = row["BASE_URL"].split(
-                        "gs://gcp-public-data-sentinel-2/"
-                    )[1]
+            # Build the blob prefix based on the product ID and granule ID.
+            # The blob prefix is the prefix to the JP2 image files on GCS.
+            granule_id = row["granule_id"]
+            blob_prefix = (
+                f"{product_folder}/GRANULE/{granule_id}/IMG_DATA/{tile_id}_{time_str}_"
+            )
 
-                    blob_prefix = f"{base_url}/GRANULE/{granule_id}/IMG_DATA/{tile_id}_{time_str}_"
+            # Extract the spatial and temporal bounds of the image.
+            bounds = (
+                float(row["west_lon"]),
+                float(row["south_lat"]),
+                float(row["east_lon"]),
+                float(row["north_lat"]),
+            )
+            shp = shapely.box(*bounds)
+            sensing_time = row["sensing_time"]
+            geometry = STGeometry(WGS84_PROJECTION, shp, (sensing_time, sensing_time))
+            geometry = split_at_antimeridian(geometry)
 
-                    # Extract the spatial and temporal bounds of the image.
-                    bounds = (
-                        float(row["EAST_LON"]),
-                        float(row["SOUTH_LAT"]),
-                        float(row["WEST_LON"]),
-                        float(row["NORTH_LAT"]),
-                    )
-                    shp = shapely.box(*bounds)
-                    geometry = STGeometry(
-                        WGS84_PROJECTION, shp, (sensing_time, sensing_time)
-                    )
+            cloud_cover = float(row["cloud_cover"])
+
+            yield Sentinel2Item(product_id, geometry, blob_prefix, cloud_cover)
+
+    def _build_cell_folder_name(self, cell_id: str) -> str:
+        """Get the prefix on GCS containing the product files in the provided cell.
+
+        The Sentinel-2 cell ID is based on MGRS and is a way of splitting up the world
+        into large tiles.
 
-                    cloud_cover = float(row["CLOUD_COVER"])
+        Args:
+            cell_id: the 5-character cell ID. Note that the product name includes the
+                cell ID with a "T" prefix, the T should be removed.
+
+        Returns:
+            the path on GCS of the folder corresponding to this Sentinel-2 cell.
+        """
+        return f"tiles/{cell_id[0:2]}/{cell_id[2:3]}/{cell_id[3:5]}/"
+
+    def _build_product_folder_name(self, item_name: str) -> str:
+        """Get the folder containing the given Sentinel-2 scene ID on GCS.
 
-                    yield Sentinel2Item(product_id, geometry, blob_prefix, cloud_cover)
+        Args:
+            item_name: the item name (Sentinel-2 scene ID).
 
-    def _get_xml_by_name(self, name: str) -> ET.ElementTree:
+        Returns:
+            the path on GCS of the .SAFE folder corresponding to this item.
+        """
+        parts = item_name.split("_")
+        cell_id_with_prefix = parts[5]
+        if len(cell_id_with_prefix) != 6:
+            raise ValueError(
+                f"cell ID should be 6 characters but got {cell_id_with_prefix}"
+            )
+        if cell_id_with_prefix[0] != "T":
+            raise ValueError(
+                f"cell ID should start with T but got {cell_id_with_prefix}"
+            )
+        cell_id = cell_id_with_prefix[1:]
+        return self._build_cell_folder_name(cell_id) + f"{item_name}.SAFE/"
+
+    def _get_xml_by_name(self, name: str) -> "ET.ElementTree[ET.Element[str]]":
         """Gets the metadata XML of an item by its name.
 
         Args:
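
For orientation, the SQL that the new `_read_bigquery` helper assembles above can be previewed with a standalone sketch. The table name is the `TABLE_NAME` constant from this diff; the sample time range and bounding box are made-up illustrative values:

```python
from datetime import datetime, timezone

TABLE_NAME = "bigquery-public-data.cloud_storage_geo_index.sentinel_2_index"


def build_query(time_range=None, wgs84_bbox=None) -> str:
    # Mirrors the clause construction in Sentinel2._read_bigquery above.
    query_str = f"""
    SELECT source_url, base_url, product_id, sensing_time, granule_id,
        east_lon, south_lat, west_lon, north_lat, cloud_cover
    FROM `{TABLE_NAME}`
    """
    clauses = []
    if time_range is not None:
        clauses.append(
            f'(sensing_time >= "{time_range[0]}" AND sensing_time <= "{time_range[1]}")'
        )
    if wgs84_bbox is not None:
        clauses.append(
            f"(west_lon < {wgs84_bbox[2]} AND east_lon > {wgs84_bbox[0]} AND "
            f"south_lat < {wgs84_bbox[3]} AND north_lat > {wgs84_bbox[1]})"
        )
    if clauses:
        query_str += " WHERE " + " AND ".join(clauses)
    return query_str


# Example: Sentinel-2 L1C scenes over Seattle during June 2024 (illustrative values).
print(build_query(
    time_range=(
        datetime(2024, 6, 1, tzinfo=timezone.utc),
        datetime(2024, 7, 1, tzinfo=timezone.utc),
    ),
    wgs84_bbox=(-122.5, 47.4, -122.2, 47.7),
))
```
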
@@ -247,76 +415,224 @@ class Sentinel2(DataSource):
         Returns:
             the parsed XML ElementTree
         """
-        parts = name.split("_")
-        assert len(parts[5]) == 6
-        assert parts[5][0] == "T"
-        cell_id = parts[5][1:]
-        base_url = f"tiles/{cell_id[0:2]}/{cell_id[2:3]}/{cell_id[3:5]}/{name}.SAFE/"
-
         cache_xml_fname = self.index_cache_dir / (name + ".xml")
         if not cache_xml_fname.exists():
-            metadata_blob_path = base_url + "MTD_MSIL1C.xml"
+            product_folder = self._build_product_folder_name(name)
+            metadata_blob_path = product_folder + self.METADATA_FILENAME
+            logger.debug("reading metadata XML from %s", metadata_blob_path)
             blob = self.bucket.blob(metadata_blob_path)
+            if not blob.exists():
+                raise MissingXMLException(name)
             with open_atomic(cache_xml_fname, "wb") as f:
                 blob.download_to_file(f)
 
         with cache_xml_fname.open("rb") as f:
             return ET.parse(f)
 
-    def get_item_by_name(self, name: str) -> Item:
-        """Gets an item by name.
+    def _parse_xml(self, name: str) -> ParsedProductXML:
+        """Parse a Sentinel-2 product XML file.
 
-        Reads the individual product metadata file (MTD_MSIL1C.xml) to get both the
-        expected blob path where images are stored as well as the detailed geometry of
-        the product (not just the bounding box).
+        This extracts the blob prefix in the GCS bucket, the polygon extent, sensing
+        start time, and cloud cover.
 
         Args:
-            name: the name of the item to get
-
-        Returns:
-            the item object
+            name: the Sentinel-2 scene name.
         """
-        parts = name.split("_")
-        assert len(parts[5]) == 6
-        assert parts[5][0] == "T"
-        cell_id = parts[5][1:]
-        base_url = f"tiles/{cell_id[0:2]}/{cell_id[2:3]}/{cell_id[3:5]}/{name}.SAFE/"
-
+        # Get the XML. This helper function handles caching the XML file.
         tree = self._get_xml_by_name(name)
 
+        # Now parse the XML, starting with the detailed geometry of the image.
         # The EXT_POS_LIST tag has flat list of polygon coordinates.
         elements = list(tree.iter("EXT_POS_LIST"))
         assert len(elements) == 1
-        coords = elements[0].text.strip().split(" ")
+        if elements[0].text is None:
+            raise ValueError(f"EXT_POS_LIST is empty for {name}")
+        coords_text = elements[0].text.strip().split(" ")
         # Convert flat list of lat1 lon1 lat2 lon2 ...
         # into (lon1, lat1), (lon2, lat2), ...
         # Then we can get the shapely geometry.
         coords = [
-            [float(coords[i + 1]), float(coords[i])] for i in range(0, len(coords), 2)
+            [float(coords_text[i + 1]), float(coords_text[i])]
+            for i in range(0, len(coords_text), 2)
         ]
         shp = shapely.Polygon(coords)
 
-        # Get blob prefix which is a subfolder of the base_url
+        # Get blob prefix which is a subfolder of the product folder.
+        # The blob prefix is the prefix to the JP2 image files on GCS.
+        product_folder = self._build_product_folder_name(name)
         elements = list(tree.iter("IMAGE_FILE"))
-        elements = [el for el in elements if el.text.endswith("_B01")]
+        elements = [
+            el for el in elements if el.text is not None and el.text.endswith("_B01")
+        ]
         assert len(elements) == 1
-        blob_prefix = base_url + elements[0].text.split("B01")[0]
+        if elements[0].text is None:
+            raise ValueError(f"IMAGE_FILE is empty for {name}")
+        blob_prefix = product_folder + elements[0].text.split("B01")[0]
 
+        # Get the sensing start time.
         elements = list(tree.iter("PRODUCT_START_TIME"))
         assert len(elements) == 1
+        if elements[0].text is None:
+            raise ValueError(f"PRODUCT_START_TIME is empty for {name}")
         start_time = dateutil.parser.isoparse(elements[0].text)
 
+        # Get the cloud cover.
         elements = list(tree.iter("Cloud_Coverage_Assessment"))
         assert len(elements) == 1
+        if elements[0].text is None:
+            raise ValueError(f"Cloud_Coverage_Assessment is empty for {name}")
         cloud_cover = float(elements[0].text)
 
+        return ParsedProductXML(
+            blob_prefix=blob_prefix,
+            shp=shp,
+            start_time=start_time,
+            cloud_cover=cloud_cover,
+        )
+
+    def _get_item_by_name(self, name: str) -> Sentinel2Item:
+        """Gets an item by name.
+
+        This implements the main logic of processing the product metadata file
+        without the caching logic in get_item_by_name, see that function for details.
+
+        Args:
+            name: the Sentinel-2 scene ID.
+        """
+        product_xml = self._parse_xml(name)
+
+        # Some Sentinel-2 scenes in the bucket are missing a subset of image files. So
+        # here we verify that all the bands we know about are intact.
+        expected_suffixes = {t[0] for t in self.BANDS}
+        for blob in self.bucket.list_blobs(prefix=product_xml.blob_prefix):
+            assert blob.name.startswith(product_xml.blob_prefix)
+            suffix = blob.name[len(product_xml.blob_prefix) :]
+            if suffix in expected_suffixes:
+                expected_suffixes.remove(suffix)
+        if len(expected_suffixes) > 0:
+            raise CorruptItemException(
+                f"item is missing image files: {expected_suffixes}"
+            )
+
+        time_range = (product_xml.start_time, product_xml.start_time)
+        geometry = STGeometry(WGS84_PROJECTION, product_xml.shp, time_range)
+        geometry = split_at_antimeridian(geometry)
+
+        # Sometimes the geometry is not valid.
+        # We just apply make_valid on it to correct issues.
+        if not geometry.shp.is_valid:
+            geometry.shp = shapely.make_valid(geometry.shp)
+
+        # Some rasters have zero-area geometry due to incorrect geometry. For example,
+        # S2B_MSIL1C_20190111T193659_N0207_R056_T08MLS_20190111T205033.SAFE.
+        # So here we add a check for that and mark it corrupt if so.
+        if geometry.shp.area == 0:
+            raise CorruptItemException(
+                f"XML for item {name} shows geometry with zero area"
+            )
+
         return Sentinel2Item(
-            name,
-            STGeometry(WGS84_PROJECTION, shp, (start_time, start_time)),
-            blob_prefix,
-            cloud_cover,
+            name=name,
+            geometry=geometry,
+            blob_prefix=product_xml.blob_prefix,
+            cloud_cover=product_xml.cloud_cover,
         )
 
+    def get_item_by_name(self, name: str) -> Sentinel2Item:
+        """Gets an item by name.
+
+        Reads the individual product metadata file (MTD_MSIL1C.xml) to get both the
+        expected blob path where images are stored as well as the detailed geometry of
+        the product (not just the bounding box).
+
+        Args:
+            name: the name of the item to get
+
+        Returns:
+            the item object
+        """
+        # The main logic for getting the item is implemented in _get_item_by_name.
+        # Here, we implement caching logic so that, if we have already seen this item
+        # before, then we can just deserialize it from a JSON file.
+        # We want to cache the item if it is successful, but also cache the
+        # CorruptItemException if it is raised.
+        cache_item_fname = self.index_cache_dir / (name + ".json")
+
+        if cache_item_fname.exists():
+            with cache_item_fname.open() as f:
+                d = json.load(f)
+
+            if "error" in d:
+                raise CorruptItemException(d["error"])
+
+            return Sentinel2Item.deserialize(d)
+
+        try:
+            item = self._get_item_by_name(name)
+        except CorruptItemException as e:
+            with open_atomic(cache_item_fname, "w") as f:
+                json.dump({"error": e.message}, f)
+            raise
+
+        with open_atomic(cache_item_fname, "w") as f:
+            json.dump(item.serialize(), f)
+        return item
+
+    def _read_products_for_cell_year(
+        self, cell_id: str, year: int
+    ) -> list[Sentinel2Item]:
+        """Read items for the given cell and year directly from the GCS bucket.
+
+        This helper function is used by self._read_products which then caches the
+        items together in one file.
+        """
+        items = []
+
+        for product_prefix in self.VALID_PRODUCT_PREFIXES:
+            cell_folder = self._build_cell_folder_name(cell_id)
+            blob_prefix = f"{cell_folder}{product_prefix}_{year}"
+            blobs = self.bucket.list_blobs(prefix=blob_prefix, delimiter="/")
+
+            # Need to consume the iterator to obtain folder names.
+            # See https://cloud.google.com/storage/docs/samples/storage-list-files-with-prefix#storage_list_files_with_prefix-python # noqa: E501
+            # Previously we checked for .SAFE_$folder$ blobs here, but those do
+            # not exist for some years like 2017.
+            for _ in blobs:
+                pass
+
+            logger.debug(
+                "under %s, found %d folders to scan",
+                blob_prefix,
+                len(blobs.prefixes),
+            )
+
+            for prefix in blobs.prefixes:
+                folder_name = prefix.split("/")[-2]
+                expected_suffix = ".SAFE"
+                assert folder_name.endswith(expected_suffix)
+                item_name = folder_name.split(expected_suffix)[0]
+
+                try:
+                    item = self.get_item_by_name(item_name)
+                except CorruptItemException as e:
+                    logger.warning("skipping corrupt item %s: %s", item_name, e.message)
+                    continue
+                except MissingXMLException:
+                    # Sometimes there is a .SAFE folder but some files like the
+                    # XML file are just missing for whatever reason. Since we
+                    # know this happens occasionally, we just ignore the error
+                    # here.
+                    logger.warning(
+                        "no metadata XML for Sentinel-2 folder %s/%s",
+                        blob_prefix,
+                        folder_name,
+                    )
+                    continue
+
+                items.append(item)
+
+        return items
+
     def _read_products(
         self, needed_cell_years: set[tuple[str, int]]
     ) -> Generator[Sentinel2Item, None, None]:
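
The per-scene caching added in `get_item_by_name` above writes one JSON file per scene under `index_cache_dir`, holding either the serialized item or a recorded error. A minimal sketch of reading such a cache entry back; only the top-level `"error"` key is taken from the diff, and plain Python types stand in for `CorruptItemException` and `Sentinel2Item`:

```python
import json
from pathlib import Path


def load_cached_scene(index_cache_dir: Path, scene_name: str) -> dict | None:
    """Mirror the cache check in Sentinel2.get_item_by_name.

    Returns the JSON-decoded item dict, returns None if the scene has not been
    cached yet, and raises ValueError (standing in for CorruptItemException) if
    a corrupt-item error was recorded on a previous attempt.
    """
    cache_item_fname = index_cache_dir / (scene_name + ".json")
    if not cache_item_fname.exists():
        return None
    with cache_item_fname.open() as f:
        d = json.load(f)
    if "error" in d:
        raise ValueError(d["error"])
    return d
```
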
@@ -326,39 +642,20 @@ class Sentinel2(DataSource):
             needed_cell_years: set of (mgrs grid cell, year) where we need to search
                 for images.
         """
-        for cell_id, year in tqdm.tqdm(needed_cell_years, desc="Reading product infos"):
+        # Read the product infos in random order so in case there are multiple jobs
+        # reading similar cells, they are more likely to work on different cells/years
+        # in parallel.
+        needed_cell_years_list = list(needed_cell_years)
+        random.shuffle(needed_cell_years_list)
+
+        for cell_id, year in tqdm.tqdm(
+            needed_cell_years_list, desc="Reading product infos"
+        ):
             assert len(cell_id) == 5
             cache_fname = self.index_cache_dir / f"{cell_id}_{year}.json"
 
             if not cache_fname.exists():
-                cell_part1 = cell_id[0:2]
-                cell_part2 = cell_id[2:3]
-                cell_part3 = cell_id[3:5]
-
-                items = []
-
-                for product_prefix in ["S2A_MSIL1C", "S2B_MSIL1C"]:
-                    blob_prefix = (
-                        f"tiles/{cell_part1}/{cell_part2}/{cell_part3}/"
-                        + f"{product_prefix}_{year}"
-                    )
-                    blobs = self.bucket.list_blobs(prefix=blob_prefix, delimiter="/")
-
-                    # Need to consume the iterator to obtain folder names.
-                    # See https://cloud.google.com/storage/docs/samples/storage-list-files-with-prefix#storage_list_files_with_prefix-python # noqa: E501
-                    # Previously we checked for .SAFE_$folder$ blobs here, but those do
-                    # not exist for some years like 2017.
-                    for _ in blobs:
-                        pass
-
-                    for prefix in blobs.prefixes:
-                        folder_name = prefix.split("/")[-2]
-                        expected_suffix = ".SAFE"
-                        assert folder_name.endswith(expected_suffix)
-                        item_name = folder_name.split(expected_suffix)[0]
-                        item = self.get_item_by_name(item_name)
-                        items.append(item)
-
+                items = self._read_products_for_cell_year(cell_id, year)
                 with open_atomic(cache_fname, "w") as f:
                     json.dump([item.serialize() for item in items], f)
 
@@ -366,22 +663,26 @@
             with cache_fname.open() as f:
                 items = [Sentinel2Item.deserialize(d) for d in json.load(f)]
 
-            for item in items:
-                yield item
+            yield from items
 
     def _get_candidate_items_index(
         self, wgs84_geometries: list[STGeometry]
-    ) -> list[list[list[Item]]]:
-        """List relevant items using rtree index."""
-        candidates = [[] for _ in wgs84_geometries]
+    ) -> list[list[Sentinel2Item]]:
+        """List relevant items using rtree index.
+
+        Args:
+            wgs84_geometries: the geometries to query.
+        """
+        candidates: list[list[Sentinel2Item]] = [[] for _ in wgs84_geometries]
         for idx, geometry in enumerate(wgs84_geometries):
             time_range = None
             if geometry.time_range:
                 time_range = (
-                    geometry.time_range[0] - self.max_time_delta,
-                    geometry.time_range[1] + self.max_time_delta,
+                    geometry.time_range[0],
+                    geometry.time_range[1],
                 )
-
+            if self.rtree_index is None:
+                raise ValueError("rtree_index is required")
             encoded_items = self.rtree_index.query(geometry.shp.bounds)
             for encoded_item in encoded_items:
                 item = Sentinel2Item.deserialize(json.loads(encoded_item))
@@ -389,7 +690,23 @@
                     continue
                 if not item.geometry.shp.intersects(geometry.shp):
                     continue
-                item = self.get_item_by_name(item.name)
+
+                # Get the item from XML to get its exact geometry (the index only
+                # knows the bounding box of the item).
+                try:
+                    item = self.get_item_by_name(item.name)
+                except CorruptItemException as e:
+                    logger.warning("skipping corrupt item %s: %s", item.name, e.message)
+                    continue
+                except MissingXMLException:
+                    # Sometimes a scene that appears in the BigQuery index does not
+                    # actually have an XML file on GCS. Since we know this happens
+                    # occasionally, we ignore the error here.
+                    logger.warning(
+                        "skipping item %s that is missing XML file", item.name
+                    )
+                    continue
+
                 if not item.geometry.shp.intersects(geometry.shp):
                     continue
                 candidates[idx].append(item)
@@ -397,22 +714,26 @@
 
     def _get_candidate_items_direct(
         self, wgs84_geometries: list[STGeometry]
-    ) -> list[list[list[Item]]]:
-        """Use _read_products to list relevant items."""
+    ) -> list[list[Sentinel2Item]]:
+        """Use _read_products to list matching items directly from the bucket.
+
+        Args:
+            wgs84_geometries: the geometries to query.
+        """
         needed_cell_years = set()
         for wgs84_geometry in wgs84_geometries:
             if wgs84_geometry.time_range is None:
                 raise ValueError(
                     "Sentinel2 on GCP requires geometry time ranges to be set"
                 )
-            for cell_id in rslearn.utils.mgrs.for_each_cell(wgs84_geometry.shp.bounds):
+            for cell_id in get_sentinel2_tiles(wgs84_geometry, self.index_cache_dir):
                 for year in range(
-                    (wgs84_geometry.time_range[0] - self.max_time_delta).year,
-                    (wgs84_geometry.time_range[1] + self.max_time_delta).year + 1,
+                    wgs84_geometry.time_range[0].year,
+                    wgs84_geometry.time_range[1].year + 1,
                 ):
                     needed_cell_years.add((cell_id, year))
 
-        items_by_cell = {}
+        items_by_cell: dict[str, list[Sentinel2Item]] = {}
        for item in self._read_products(needed_cell_years):
             cell_id = "".join(item.blob_prefix.split("/")[1:4])
             assert len(cell_id) == 5
@@ -420,9 +741,9 @@
                 items_by_cell[cell_id] = []
             items_by_cell[cell_id].append(item)
 
-        candidates = [[] for _ in wgs84_geometries]
+        candidates: list[list[Sentinel2Item]] = [[] for _ in wgs84_geometries]
         for idx, geometry in enumerate(wgs84_geometries):
-            for cell_id in rslearn.utils.mgrs.for_each_cell(geometry.shp.bounds):
+            for cell_id in get_sentinel2_tiles(geometry, self.index_cache_dir):
                 for item in items_by_cell.get(cell_id, []):
                     if not geometry.shp.intersects(item.geometry.shp):
                         continue
@@ -430,9 +751,27 @@
                     candidates[idx].append(item)
 
         return candidates
 
+    def _get_candidate_items_bigquery(
+        self, wgs84_geometries: list[STGeometry]
+    ) -> list[list[Sentinel2Item]]:
+        """Use _read_bigquery to list matching items by querying the BigQuery table.
+
+        Args:
+            wgs84_geometries: the geometries to query.
+        """
+        candidates: list[list[Sentinel2Item]] = [[] for _ in wgs84_geometries]
+        for idx, geometry in enumerate(wgs84_geometries):
+            wgs84_bbox = geometry.shp.bounds
+            for item in self._read_bigquery(
+                time_range=geometry.time_range, wgs84_bbox=wgs84_bbox
+            ):
+                candidates[idx].append(item)
+
+        return candidates
+
     def get_items(
         self, geometries: list[STGeometry], query_config: QueryConfig
-    ) -> list[list[list[Item]]]:
+    ) -> list[list[list[Sentinel2Item]]]:
         """Get a list of items in the data source intersecting the given geometries.
 
         Args:
@@ -448,6 +787,8 @@
 
         if self.rtree_index:
             candidates = self._get_candidate_items_index(wgs84_geometries)
+        elif self.use_bigquery:
+            candidates = self._get_candidate_items_bigquery(wgs84_geometries)
         else:
             candidates = self._get_candidate_items_direct(wgs84_geometries)
 
@@ -463,14 +804,16 @@
             groups.append(cur_groups)
         return groups
 
-    def deserialize_item(self, serialized_item: Any) -> Item:
+    def deserialize_item(self, serialized_item: Any) -> Sentinel2Item:
         """Deserializes an item from JSON-decoded data."""
         assert isinstance(serialized_item, dict)
         return Sentinel2Item.deserialize(serialized_item)
 
-    def retrieve_item(self, item: Item) -> Generator[tuple[str, BinaryIO], None, None]:
+    def retrieve_item(
+        self, item: Sentinel2Item
+    ) -> Generator[tuple[str, BinaryIO], None, None]:
         """Retrieves the rasters corresponding to an item as file streams."""
-        for suffix, _ in self.bands:
+        for suffix, _ in self.BANDS:
             blob_path = item.blob_prefix + suffix
             fname = blob_path.split("/")[-1]
             buf = io.BytesIO()
@@ -483,8 +826,8 @@
 
     def ingest(
         self,
-        tile_store: TileStore,
-        items: list[Item],
+        tile_store: TileStoreWithLayer,
+        items: list[Sentinel2Item],
         geometries: list[list[STGeometry]],
     ) -> None:
         """Ingest items into the given tile store.
@@ -494,36 +837,49 @@
             items: the items to ingest
             geometries: a list of geometries needed for each item
         """
-        for item, cur_geometries in zip(items, geometries):
-            harmonize_callback = None
-            if self.harmonize:
-                harmonize_callback = get_harmonize_callback(
-                    self._get_xml_by_name(item.name)
-                )
-
-            for suffix, band_names in self.bands:
-                cur_tile_store = PrefixedTileStore(
-                    tile_store, (item.name, "_".join(band_names))
-                )
-                needed_projections = get_needed_projections(
-                    cur_tile_store, band_names, self.config.band_sets, cur_geometries
-                )
-                if not needed_projections:
+        for item in items:
+            for suffix, band_names in self.needed_bands:
+                if tile_store.is_raster_ready(item.name, band_names):
                     continue
 
-                buf = io.BytesIO()
-                blob = self.bucket.blob(item.blob_prefix + suffix)
-                if not blob.exists():
-                    continue
-                blob.download_to_file(buf)
-                buf.seek(0)
-                with rasterio.open(buf) as raster:
-                    for projection in needed_projections:
-                        ingest_raster(
-                            tile_store=cur_tile_store,
-                            raster=raster,
-                            projection=projection,
-                            time_range=item.geometry.time_range,
-                            layer_config=self.config,
-                            array_callback=harmonize_callback,
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    fname = os.path.join(tmp_dir, suffix)
+                    blob = self.bucket.blob(item.blob_prefix + suffix)
+                    logger.debug(
+                        "gcp_public_data downloading raster file %s",
+                        item.blob_prefix + suffix,
+                    )
+                    blob.download_to_filename(fname)
+                    logger.debug(
+                        "gcp_public_data ingesting raster file %s into tile store",
+                        item.blob_prefix + suffix,
+                    )
+
+                    # Harmonize values if needed.
+                    # TCI does not need harmonization.
+                    harmonize_callback = None
+                    if self.harmonize and suffix != "TCI.jp2":
+                        harmonize_callback = get_harmonize_callback(
+                            self._get_xml_by_name(item.name)
+                        )
+
+                    if harmonize_callback is not None:
+                        # In this case we need to read the array, convert the pixel
+                        # values, and pass modified array directly to the TileStore.
+                        with rasterio.open(fname) as src:
+                            array = src.read()
+                            projection, bounds = get_raster_projection_and_bounds(src)
+                        array = harmonize_callback(array)
+                        tile_store.write_raster(
+                            item.name, band_names, projection, bounds, array
                         )
+
+                    else:
+                        tile_store.write_raster_file(
+                            item.name, band_names, UPath(fname)
+                        )
+
+                    logger.debug(
+                        "gcp_public_data done ingesting raster file %s",
+                        item.blob_prefix + suffix,
+                    )
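
Taken together, the new constructor and ingest path suggest the following minimal usage sketch. It assumes the diffed file is rslearn/data_sources/gcp_public_data.py (as the file list and the gcp_public_data log messages indicate); the cache path and band list are illustrative, and BigQuery and the rtree index are disabled so that no GCP credentials are needed, per the use_bigquery docstring above:

```python
from rslearn.data_sources.gcp_public_data import Sentinel2

# Anonymous-access configuration: list products directly from the public bucket
# rather than through BigQuery or a prebuilt rtree index.
sentinel2 = Sentinel2(
    index_cache_dir="/tmp/rslearn_s2_cache",  # illustrative local cache path
    use_rtree_index=False,
    use_bigquery=False,
    harmonize=True,  # apply the processing-baseline offset correction
    bands=["B02", "B03", "B04", "B08"],  # only fetch the 10 m bands
)
```
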