tilebox-datasets 0.34.0__py3-none-any.whl → 0.36.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as published in their respective public registries.
- tilebox/datasets/__init__.py +4 -2
- tilebox/datasets/aio/__init__.py +4 -1
- tilebox/datasets/aio/client.py +6 -6
- tilebox/datasets/aio/dataset.py +613 -0
- tilebox/datasets/aio/pagination.py +1 -31
- tilebox/datasets/aio/timeseries.py +4 -570
- tilebox/datasets/data/datapoint.py +44 -1
- tilebox/datasets/datasetsv1/core_pb2.py +8 -8
- tilebox/datasets/datasetsv1/core_pb2.pyi +4 -2
- tilebox/datasets/protobuf_conversion/field_types.py +56 -0
- tilebox/datasets/protobuf_conversion/protobuf_xarray.py +10 -33
- tilebox/datasets/protobuf_conversion/to_protobuf.py +0 -3
- tilebox/datasets/sync/client.py +6 -6
- tilebox/datasets/sync/dataset.py +614 -0
- tilebox/datasets/sync/pagination.py +1 -31
- tilebox/datasets/sync/timeseries.py +4 -570
- {tilebox_datasets-0.34.0.dist-info → tilebox_datasets-0.36.0.dist-info}/METADATA +1 -1
- {tilebox_datasets-0.34.0.dist-info → tilebox_datasets-0.36.0.dist-info}/RECORD +19 -17
- {tilebox_datasets-0.34.0.dist-info → tilebox_datasets-0.36.0.dist-info}/WHEEL +0 -0
tilebox/datasets/__init__.py
CHANGED
@@ -3,10 +3,12 @@ import sys
 
 from loguru import logger
 
+# only here for backwards compatibility, to preserve backwards compatibility with older imports
+from tilebox.datasets.aio.timeseries import TimeseriesCollection, TimeseriesDataset
 from tilebox.datasets.sync.client import Client
-from tilebox.datasets.sync.
+from tilebox.datasets.sync.dataset import CollectionClient, DatasetClient
 
-__all__ = ["Client", "TimeseriesCollection", "TimeseriesDataset"]
+__all__ = ["Client", "CollectionClient", "DatasetClient", "TimeseriesCollection", "TimeseriesDataset"]
 
 
 def _init_logging(level: str = "INFO") -> None:
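In practice this change only widens the public export surface: the renamed CollectionClient and DatasetClient are exported next to the old timeseries names, which are kept purely for backwards compatibility. A small illustrative import block, listing nothing beyond what the new `__all__` above exposes:

```python
# All five names below are part of __all__ in tilebox/datasets/__init__.py as of 0.36.0.
from tilebox.datasets import (
    Client,
    CollectionClient,
    DatasetClient,
    TimeseriesCollection,  # kept only for backwards compatibility with older imports
    TimeseriesDataset,  # kept only for backwards compatibility with older imports
)
```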
tilebox/datasets/aio/__init__.py
CHANGED
@@ -1,4 +1,7 @@
 from tilebox.datasets.aio.client import Client
+from tilebox.datasets.aio.dataset import CollectionClient, DatasetClient
+
+# only here for backwards compatibility, to preserve backwards compatibility with older imports
 from tilebox.datasets.aio.timeseries import TimeseriesCollection, TimeseriesDataset
 
-__all__ = ["Client", "TimeseriesCollection", "TimeseriesDataset"]
+__all__ = ["Client", "CollectionClient", "DatasetClient", "TimeseriesCollection", "TimeseriesDataset"]
tilebox/datasets/aio/client.py
CHANGED
@@ -2,7 +2,7 @@ from uuid import UUID
 
 from _tilebox.grpc.aio.channel import open_channel
 from _tilebox.grpc.aio.error import with_pythonic_errors
-from tilebox.datasets.aio.
+from tilebox.datasets.aio.dataset import DatasetClient
 from tilebox.datasets.client import Client as BaseClient
 from tilebox.datasets.client import token_from_env
 from tilebox.datasets.datasetsv1.collections_pb2_grpc import CollectionServiceStub
@@ -33,10 +33,10 @@ class Client:
         self._client = BaseClient(service)
 
     async def datasets(self) -> Group:
-        return await self._client.datasets(
+        return await self._client.datasets(DatasetClient)
 
-    async def dataset(self, slug: str) ->
-        return await self._client.dataset(slug,
+    async def dataset(self, slug: str) -> DatasetClient:
+        return await self._client.dataset(slug, DatasetClient)
 
-    async def _dataset_by_id(self, dataset_id: str | UUID) ->
-        return await self._client._dataset_by_id(dataset_id,
+    async def _dataset_by_id(self, dataset_id: str | UUID) -> DatasetClient:
+        return await self._client._dataset_by_id(dataset_id, DatasetClient)  # noqa: SLF001
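The net effect of this client.py change is that datasets(), dataset() and _dataset_by_id() now hand back DatasetClient instances instead of the previous timeseries dataset type. A minimal async usage sketch, assuming Client() can be constructed without arguments and falls back to an environment token (the token_from_env import above suggests this, but the constructor itself is not part of this hunk); the dataset slug is a made-up placeholder:

```python
import asyncio

from tilebox.datasets.aio import Client, DatasetClient


async def main() -> None:
    client = Client()  # assumed: token resolved from the environment when not passed explicitly
    dataset = await client.dataset("my_org.my_dataset")  # placeholder slug; now typed as DatasetClient
    assert isinstance(dataset, DatasetClient)

    # List the collections of the dataset; per aio/dataset.py below this is a
    # mapping of collection names to CollectionClient instances.
    collections = await dataset.collections()
    print(sorted(collections))


asyncio.run(main())
```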
tilebox/datasets/aio/dataset.py
ADDED
@@ -0,0 +1,613 @@
+from collections.abc import AsyncIterator
+from functools import partial
+from typing import cast
+from uuid import UUID
+from warnings import warn
+
+import xarray as xr
+from tqdm.auto import tqdm
+
+from _tilebox.grpc.aio.pagination import Pagination as PaginationProtocol
+from _tilebox.grpc.aio.pagination import paginated_request
+from _tilebox.grpc.aio.producer_consumer import async_producer_consumer
+from _tilebox.grpc.error import ArgumentError, NotFoundError
+from tilebox.datasets.aio.pagination import (
+    with_progressbar,
+    with_time_progress_callback,
+    with_time_progressbar,
+)
+from tilebox.datasets.data.collection import CollectionInfo
+from tilebox.datasets.data.data_access import QueryFilters, SpatialFilter, SpatialFilterLike
+from tilebox.datasets.data.datapoint import DatapointInterval, DatapointIntervalLike, DatapointPage, QueryResultPage
+from tilebox.datasets.data.datasets import Dataset
+from tilebox.datasets.data.pagination import Pagination
+from tilebox.datasets.data.time_interval import TimeInterval, TimeIntervalLike
+from tilebox.datasets.data.uuid import as_uuid
+from tilebox.datasets.message_pool import get_message_type
+from tilebox.datasets.progress import ProgressCallback
+from tilebox.datasets.protobuf_conversion.protobuf_xarray import MessageToXarrayConverter, TimeseriesToXarrayConverter
+from tilebox.datasets.protobuf_conversion.to_protobuf import (
+    DatapointIDs,
+    IngestionData,
+    extract_datapoint_ids,
+    marshal_messages,
+    to_messages,
+)
+from tilebox.datasets.service import TileboxDatasetService
+
+# allow private member access: we allow it here because we want to make as much private as possible so that we can
+# minimize the publicly facing API (which allows us to change internals later, and also limits to auto-completion)
+# ruff: noqa: SLF001
+
+
+class DatasetClient:
+    """A client for a timeseries dataset."""
+
+    def __init__(
+        self,
+        service: TileboxDatasetService,
+        dataset: Dataset,
+    ) -> None:
+        self._service = service
+        self.name = dataset.name
+        self._dataset = dataset
+
+    async def collections(
+        self, availability: bool | None = None, count: bool | None = None
+    ) -> dict[str, "CollectionClient"]:
+        """
+        List the available collections in a dataset.
+
+        Args:
+            availability: Unused.
+            count: Unused.
+
+        Returns:
+            A mapping from collection names to collections.
+        """
+        if availability is not None:
+            warn(
+                "The availability arg has been deprecated, and will be removed in a future version. "
+                "Collection availability information is now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        if count is not None:
+            warn(
+                "The count arg has been deprecated, and will be removed in a future version. "
+                "Collection counts are now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+        collections = await self._service.get_collections(self._dataset.id, True, True)
+
+        return {collection.collection.name: CollectionClient(self, collection) for collection in collections}
+
+    async def get_or_create_collection(self, name: str) -> "CollectionClient":
+        """Get a collection by its name, or create it if it doesn't exist.
+
+        Args:
+            name: The name of the collection to get or create.
+
+        Returns:
+            The collection with the given name.
+        """
+        try:
+            collection = await self.collection(name)
+        except NotFoundError:
+            return await self.create_collection(name)
+        return collection
+
+    async def create_collection(self, name: str) -> "CollectionClient":
+        """Create a new collection in this dataset.
+
+        Args:
+            name: The name of the collection to create.
+
+        Returns:
+            The created collection.
+        """
+        info = await self._service.create_collection(self._dataset.id, name)
+        return CollectionClient(self, info)
+
+    async def collection(self, name: str) -> "CollectionClient":
+        """Get a collection by its name.
+
+        Args:
+            collection: The name of the collection to get.
+
+        Returns:
+            The collection with the given name.
+        """
+        try:
+            info = await self._service.get_collection_by_name(self._dataset.id, name, True, True)
+        except NotFoundError:
+            raise NotFoundError(f"No such collection {name}") from None
+
+        return CollectionClient(self, info)
+
+    def __repr__(self) -> str:
+        return f"{self.name} [Timeseries Dataset]: {self._dataset.summary}"
+
+
+# always ingest / delete in batches, to avoid timeout issues for very large datasets
+_INGEST_CHUNK_SIZE = 8192
+_DELETE_CHUNK_SIZE = 8192
+
+
+class CollectionClient:
+    """A client for a datapoint collection in a specific timeseries dataset."""
+
+    def __init__(
+        self,
+        dataset: DatasetClient,
+        info: CollectionInfo,
+    ) -> None:
+        self._dataset = dataset
+        self._use_legacy_api = dataset._dataset.is_legacy_type
+        self._collection = info.collection
+        self._info: CollectionInfo | None = info
+
+    def __repr__(self) -> str:
+        """Human readable representation of the collection."""
+        return repr(self._info)
+
+    @property
+    def name(self) -> str:
+        """The name of the collection."""
+        return self._collection.name
+
+    async def info(self, availability: bool | None = None, count: bool | None = None) -> CollectionInfo:
+        """
+        Return metadata about the datapoints in this collection.
+
+        Args:
+            availability: Unused.
+            count: Unused.
+
+        Returns:
+            collection info for the current collection
+        """
+        if availability is not None:
+            warn(
+                "The availability arg has been deprecated, and will be removed in a future version. "
+                "Collection availability information is now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        if count is not None:
+            warn(
+                "The count arg has been deprecated, and will be removed in a future version. "
+                "Collection counts are now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+        if self._info is None:  # only load collection info if it hasn't been loaded yet (or it has been invalidated)
+            try:
+                self._info = cast(
+                    CollectionInfo,
+                    await self._dataset._service.get_collection_by_name(
+                        self._dataset._dataset.id, self.name, True, True
+                    ),
+                )
+            except NotFoundError:
+                raise NotFoundError(f"No such collection {self.name}") from None
+
+        return self._info
+
+    async def find(self, datapoint_id: str | UUID, skip_data: bool = False) -> xr.Dataset:
+        """
+        Find a specific datapoint in this collection by its id.
+
+        Args:
+            datapoint_id: The id of the datapoint to find
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+
+        Returns:
+            The datapoint as an xarray dataset
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            return await self._find_legacy(str(datapoint_id), skip_data)
+
+        try:
+            datapoint = await self._dataset._service.query_by_id(
+                [self._collection.id], as_uuid(datapoint_id), skip_data
+            )
+        except ArgumentError:
+            raise ValueError(f"Invalid datapoint id: {datapoint_id} is not a valid UUID") from None
+        except NotFoundError:
+            raise NotFoundError(f"No such datapoint {datapoint_id}") from None
+
+        message_type = get_message_type(datapoint.type_url)
+        data = message_type.FromString(datapoint.value)
+
+        converter = MessageToXarrayConverter(initial_capacity=1)
+        converter.convert(data)
+        return converter.finalize("time").isel(time=0)
+
+    async def _find_legacy(self, datapoint_id: str, skip_data: bool = False) -> xr.Dataset:
+        try:
+            datapoint = await self._dataset._service.get_datapoint_by_id(
+                str(self._collection.id), datapoint_id, skip_data
+            )
+        except ArgumentError:
+            raise ValueError(f"Invalid datapoint id: {datapoint_id} is not a valid UUID") from None
+        except NotFoundError:
+            raise NotFoundError(f"No such datapoint {datapoint_id}") from None
+
+        converter = TimeseriesToXarrayConverter(initial_capacity=1)
+        converter.convert(datapoint)
+        return converter.finalize().isel(time=0)
+
+    async def _find_interval(
+        self,
+        datapoint_id_interval: DatapointIntervalLike,
+        end_inclusive: bool = True,
+        *,
+        skip_data: bool = False,
+        show_progress: bool = False,
+    ) -> xr.Dataset:
+        """
+        Find a range of datapoints in this collection in an interval specified as datapoint ids.
+
+        Args:
+            datapoint_id_interval: tuple of two datapoint ids specifying the interval: [start_id, end_id]
+            end_inclusive: Flag indicating whether the datapoint with the given end_id should be included in the
+                result or not.
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+            show_progress: Whether to show a progress bar while loading the data.
+
+        Returns:
+            The datapoints in the given interval as an xarray dataset
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            return await self._find_interval_legacy(
+                datapoint_id_interval, end_inclusive, skip_data=skip_data, show_progress=show_progress
+            )
+
+        filters = QueryFilters(
+            temporal_extent=DatapointInterval.parse(datapoint_id_interval, end_inclusive=end_inclusive)
+        )
+
+        async def request(page: PaginationProtocol) -> QueryResultPage:
+            query_page = Pagination(page.limit, page.starting_after)
+            return await self._dataset._service.query([self._collection.id], filters, skip_data, query_page)
+
+        initial_page = Pagination()
+        pages = paginated_request(request, initial_page)
+        if show_progress:
+            pages = with_progressbar(pages, f"Fetching {self._dataset.name}")
+
+        return await _convert_to_dataset(pages)
+
+    async def _find_interval_legacy(
+        self,
+        datapoint_id_interval: DatapointIntervalLike,
+        end_inclusive: bool = True,
+        *,
+        skip_data: bool = False,
+        show_progress: bool = False,
+    ) -> xr.Dataset:
+        datapoint_interval = DatapointInterval.parse(datapoint_id_interval, end_inclusive=end_inclusive)
+
+        async def request(page: PaginationProtocol) -> DatapointPage:
+            query_page = Pagination(page.limit, page.starting_after)
+            return await self._dataset._service.get_dataset_for_datapoint_interval(
+                str(self._collection.id), datapoint_interval, skip_data, False, query_page
+            )
+
+        initial_page = Pagination()
+        pages = paginated_request(request, initial_page)
+        if show_progress:
+            pages = with_progressbar(pages, f"Fetching {self._dataset.name}")
+
+        return await _convert_to_dataset_legacy(pages)
+
+    async def load(
+        self,
+        temporal_extent: TimeIntervalLike,
+        *,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+    ) -> xr.Dataset:
+        """
+        Load a range of datapoints in this collection for a specified temporal_extent.
+
+        An alias for query() without a spatial extent.
+
+        Args:
+            temporal_extent: The temporal extent to load data for.
+                Can be specified in a number of ways:
+                - TimeInterval: interval -> Use the time interval as its given
+                - DatetimeScalar: [time, time] -> Construct a TimeInterval with start and end time set to the given
+                    value and the end time inclusive
+                - tuple of two DatetimeScalar: [start, end) -> Construct a TimeInterval with the given start and
+                    end time
+                - xr.DataArray: [arr[0], arr[-1]] -> Construct a TimeInterval with start and end time set to the
+                    first and last value in the array and the end time inclusive
+                - xr.Dataset: [ds.time[0], ds.time[-1]] -> Construct a TimeInterval with start and end time set to
+                    the first and last value in the time coordinate of the dataset and the end time inclusive
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+            show_progress: Whether to show a progress bar while loading the data.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            Matching datapoints in the given temporal extent as an xarray dataset
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            return await self._load_legacy(temporal_extent, skip_data=skip_data, show_progress=show_progress)
+
+        return await self.query(temporal_extent=temporal_extent, skip_data=skip_data, show_progress=show_progress)
+
+    async def query(
+        self,
+        *,
+        temporal_extent: TimeIntervalLike,
+        spatial_extent: SpatialFilterLike | None = None,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+    ) -> xr.Dataset:
+        """
+        Query datapoints in this collection in a specified temporal extent and an optional spatial extent.
+
+        Args:
+            temporal_extent: The temporal extent to query data for. (Required)
+                Can be specified in a number of ways:
+                - TimeInterval: interval -> Use the time interval as its given
+                - DatetimeScalar: [time, time] -> Construct a TimeInterval with start and end time set to the given
+                    value and the end time inclusive
+                - tuple of two DatetimeScalar: [start, end) -> Construct a TimeInterval with the given start and
+                    end time
+                - xr.DataArray: [arr[0], arr[-1]] -> Construct a TimeInterval with start and end time set to the
+                    first and last value in the array and the end time inclusive
+                - xr.Dataset: [ds.time[0], ds.time[-1]] -> Construct a TimeInterval with start and end time set to
+                    the first and last value in the time coordinate of the dataset and the end time inclusive
+            spatial_extent: The spatial extent to query data in. (Optional)
+                Expected to be either a shapely geometry, or a dict with the following keys:
+                - geometry: The geometry to query by. Must be a shapely.Polygon, shapely.MultiPolygon or shapely.Point.
+                - mode: The spatial filter mode to use. Can be one of "intersects" or "contains".
+                    Defaults to "intersects".
+                - coordinate_system: The coordinate system to use for performing geometry calculations. Can be one
+                    of "cartesian" or "spherical".
+                Only supported for spatiotemporal datasets. Will raise an error if used for other dataset types.
+                All datapoints whose geometry intersects the given spatial extent will be returned.
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+            show_progress: Whether to show a progress bar while loading the data.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            Matching datapoints in the given temporal and spatial extent as an xarray dataset
+        """
+        if self._use_legacy_api:
+            raise ValueError("Querying is not supported for this dataset. Please use load() instead.")
+
+        if temporal_extent is None:
+            raise ValueError("A temporal_extent for your query must be specified")
+
+        pages = self._iter_pages(temporal_extent, spatial_extent, skip_data, show_progress=show_progress)
+        return await _convert_to_dataset(pages)
+
+    async def _iter_pages(
+        self,
+        temporal_extent: TimeIntervalLike,
+        spatial_extent: SpatialFilterLike | None = None,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+        page_size: int | None = None,
+    ) -> AsyncIterator[QueryResultPage]:
+        time_interval = TimeInterval.parse(temporal_extent)
+        filters = QueryFilters(time_interval, SpatialFilter.parse(spatial_extent) if spatial_extent else None)
+
+        request = partial(self._load_page, filters, skip_data)
+
+        initial_page = Pagination(limit=page_size)
+        pages = paginated_request(request, initial_page)
+
+        if callable(show_progress):
+            pages = with_time_progress_callback(pages, time_interval, show_progress)
+        elif show_progress:
+            message = f"Fetching {self._dataset.name}"
+            pages = with_time_progressbar(pages, time_interval, message)
+
+        async for page in pages:
+            yield page
+
+    async def _load_page(
+        self, filters: QueryFilters, skip_data: bool, page: PaginationProtocol | None = None
+    ) -> QueryResultPage:
+        query_page = Pagination(page.limit, page.starting_after) if page else Pagination()
+        return await self._dataset._service.query([self._collection.id], filters, skip_data, query_page)
+
+    async def _load_legacy(
+        self,
+        time_or_interval: TimeIntervalLike,
+        *,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+    ) -> xr.Dataset:
+        pages = self._iter_pages_legacy(time_or_interval, skip_data, show_progress=show_progress)
+        return await _convert_to_dataset_legacy(pages)
+
+    async def _iter_pages_legacy(
+        self,
+        time_or_interval: TimeIntervalLike,
+        skip_data: bool = False,
+        skip_meta: bool = False,
+        show_progress: bool | ProgressCallback = False,
+        page_size: int | None = None,
+    ) -> AsyncIterator[DatapointPage]:
+        time_interval = TimeInterval.parse(time_or_interval)
+
+        request = partial(self._load_page_legacy, time_interval, skip_data, skip_meta)
+
+        initial_page = Pagination(limit=page_size)
+        pages = paginated_request(request, initial_page)
+
+        if callable(show_progress):
+            if skip_meta:
+                raise ValueError("Progress callback requires datapoint metadata, but skip_meta is True")
+            else:
+                pages = with_time_progress_callback(pages, time_interval, show_progress)
+        elif show_progress:
+            message = f"Fetching {self._dataset.name}"
+            if skip_meta:  # without metadata we can't estimate progress based on event time (since it is not returned)
+                pages = with_progressbar(pages, message)
+            else:
+                pages = with_time_progressbar(pages, time_interval, message)
+
+        async for page in pages:
+            yield page
+
+    async def _load_page_legacy(
+        self, time_interval: TimeInterval, skip_data: bool, skip_meta: bool, page: PaginationProtocol | None = None
+    ) -> DatapointPage:
+        query_page = Pagination(page.limit, page.starting_after) if page else Pagination()
+        return await self._dataset._service.get_dataset_for_time_interval(
+            str(self._collection.id), time_interval, skip_data, skip_meta, query_page
+        )
+
+    async def ingest(
+        self,
+        data: IngestionData,
+        allow_existing: bool = True,
+        *,
+        show_progress: bool | ProgressCallback = False,
+    ) -> list[UUID]:
+        """Ingest data into the collection.
+
+        Args:
+            data: The data to ingest. Supported data types are:
+                - xr.Dataset: Ingest a dataset such as it is returned by the output of `collection.load()`
+                - pd.DataFrame: Ingest a pandas DataFrame, mapping the column names to the dataset fields
+                - Iterable, dict or nd-array: Ingest any object that can be converted to a pandas DataFrame,
+                    equivalent to `ingest(pd.DataFrame(data))`
+            allow_existing: Whether to allow existing datapoints. Datapoints will only be overwritten if
+                all of their fields are exactly equal to already existing datapoints. Tilebox will never create
+                duplicate datapoints, but will raise an error if the datapoint already exists. Setting this to
+                `True` will not raise an error and skip the duplicate datapoints instead.
+            show_progress: Whether to show a progress bar while ingestion a large number of datapoints.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            List of datapoint ids that were ingested.
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            raise ValueError("Ingestion is not supported for this dataset. Please create a new dataset.")
+
+        message_type = get_message_type(self._dataset._dataset.type.type_url)
+        messages = marshal_messages(
+            to_messages(data, message_type, required_fields=["time"], ignore_fields=["id", "ingestion_time"])
+        )
+
+        disable_progress_bar = callable(show_progress) or (not show_progress)
+
+        ingested_ids = []
+        with tqdm(
+            total=len(messages),
+            desc=f"Ingesting into {self._dataset.name}",
+            unit="datapoints",
+            disable=disable_progress_bar,
+        ) as progress_bar:
+            for chunk_start in range(0, len(messages), _INGEST_CHUNK_SIZE):
+                chunk = messages[chunk_start : chunk_start + _INGEST_CHUNK_SIZE]
+                response = await self._dataset._service.ingest(self._collection.id, chunk, allow_existing)
+                ingested_ids.extend(response.datapoint_ids)
+
+                progress_bar.update(len(chunk))
+                if callable(show_progress):
+                    show_progress(len(ingested_ids) / len(messages))
+        self._info = None  # invalidate collection info, since we just ingested some data into it
+        return ingested_ids
+
+    async def delete(self, datapoints: DatapointIDs, *, show_progress: bool | ProgressCallback = False) -> int:
+        """Delete datapoints from the collection.
+
+        Datapoints are identified and deleted by their ids.
+
+        Args:
+            datapoints: The datapoints to delete. Supported types are:
+                - xr.Dataset: An xarray.Dataset containing an "id" variable/coord consisting of datapoint IDs to delete.
+                - pd.DataFrame: A pandas DataFrame containing a "id" column consisting of datapoint IDs to delete.
+                - xr.DataArray, np.ndarray, pd.Series, list[UUID]: Array of UUIDs to delete
+                - list[str], list[UUID]: List of datapoint IDs to delete
+            show_progress: Whether to show a progress bar when deleting a large number of datapoints.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            The number of datapoints that were deleted.
+
+        Raises:
+            NotFoundError: If one or more of the datapoints to delete doesn't exist - no datapoints
+                will be deleted if any of the requested deletions doesn't exist.
+        """
+        datapoint_ids = extract_datapoint_ids(datapoints)
+        num_deleted = 0
+
+        disable_progress_bar = callable(show_progress) or (not show_progress)
+
+        with tqdm(
+            total=len(datapoint_ids),
+            desc=f"Deleting from {self._dataset.name}",
+            unit="datapoints",
+            disable=disable_progress_bar,
+        ) as progress_bar:
+            for chunk_start in range(0, len(datapoint_ids), _DELETE_CHUNK_SIZE):
+                chunk = datapoint_ids[chunk_start : chunk_start + _DELETE_CHUNK_SIZE]
+                num_deleted += await self._dataset._service.delete(self._collection.id, chunk)
+
+                progress_bar.update(len(chunk))
+                if callable(show_progress):
+                    show_progress(num_deleted / len(datapoint_ids))
+        self._info = None  # invalidate collection info, since we just deleted some data from it
+        return num_deleted
+
+
+async def _convert_to_dataset(pages: AsyncIterator[QueryResultPage]) -> xr.Dataset:
+    """
+    Convert an async iterator of QueryResultPages into a single xarray Dataset
+
+    Parses each incoming page while in parallel already requesting and waiting for the next page from the server.
+
+    Args:
+        pages: Async iterator of QueryResultPages to convert
+
+    Returns:
+        The datapoints from the individual pages converted and combined into a single xarray dataset
+    """
+    converter = MessageToXarrayConverter()
+
+    def convert_page(page: QueryResultPage) -> None:
+        message_type = get_message_type(page.data.type_url)
+        messages = [message_type.FromString(v) for v in page.data.value]
+        converter.convert_all(messages)
+
+    # lets parse the incoming pages already while we wait for the next page from the server
+    # we solve this using a classic producer/consumer with a queue of pages for communication
+    # this would also account for the case where the server sends pages faster than we are converting
+    # them to xarray
+    await async_producer_consumer(pages, convert_page)
+    return converter.finalize("time")
+
+
+async def _convert_to_dataset_legacy(pages: AsyncIterator[DatapointPage]) -> xr.Dataset:
+    """
+    Convert an async iterator of DatasetIntervals (pages) into a single xarray Dataset
+
+    Parses each incoming page while in parallel already requesting and waiting for the next page from the server.
+
+    Args:
+        pages: Async iterator of DatasetIntervals (pages) to convert
+
+    Returns:
+        The datapoints from the individual pages converted and combined into a single xarray dataset
+    """
+
+    converter = TimeseriesToXarrayConverter()
+    # lets parse the incoming pages already while we wait for the next page from the server
+    # we solve this using a classic producer/consumer with a queue of pages for communication
+    # this would also account for the case where the server sends pages faster than we are converting
+    # them to xarray
+    await async_producer_consumer(pages, lambda page: converter.convert_all(page))
+    return converter.finalize()
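Taken together, the DatasetClient and CollectionClient added above define the query/ingest/delete surface of the new API. A minimal end-to-end sketch of how that surface could be used, assuming a custom dataset whose schema has a `value` field, that string dates are accepted as the DatetimeScalar forms documented in the query() docstring, and that Client() picks up its token from the environment; the slug, collection name, and values are made-up placeholders:

```python
import asyncio

import pandas as pd

from tilebox.datasets.aio import Client


async def main() -> None:
    client = Client()  # assumed: token resolved from the environment
    dataset = await client.dataset("my_org.my_dataset")  # placeholder slug
    collection = await dataset.get_or_create_collection("my-collection")  # placeholder name

    # Ingest a small pandas DataFrame; per the ingest() docstring, column names map to
    # dataset fields and a "time" column is required. "value" is an assumed schema field.
    data = pd.DataFrame(
        {
            "time": pd.to_datetime(["2024-01-01T00:00:00", "2024-01-02T00:00:00"]),
            "value": [1.0, 2.0],
        }
    )
    datapoint_ids = await collection.ingest(data)

    # Query the same time range back as an xarray.Dataset, using a (start, end) tuple
    # as one of the documented TimeIntervalLike forms.
    result = await collection.query(temporal_extent=("2024-01-01", "2024-01-03"), show_progress=True)
    print(result)

    # Delete the ingested datapoints again by id.
    deleted = await collection.delete(datapoint_ids)
    print(f"deleted {deleted} datapoints")


asyncio.run(main())
```

Note that ingest() and delete() chunk their requests (8192 datapoints per call, via _INGEST_CHUNK_SIZE and _DELETE_CHUNK_SIZE above), so a sketch like this works unchanged for much larger inputs.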