tilebox-datasets 0.34.0__py3-none-any.whl → 0.36.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,12 @@ import sys
 
 from loguru import logger
 
+# only here for backwards compatibility, to preserve backwards compatibility with older imports
+from tilebox.datasets.aio.timeseries import TimeseriesCollection, TimeseriesDataset
 from tilebox.datasets.sync.client import Client
-from tilebox.datasets.sync.timeseries import TimeseriesCollection, TimeseriesDataset
+from tilebox.datasets.sync.dataset import CollectionClient, DatasetClient
 
-__all__ = ["Client", "TimeseriesCollection", "TimeseriesDataset"]
+__all__ = ["Client", "CollectionClient", "DatasetClient", "TimeseriesCollection", "TimeseriesDataset"]
 
 
 def _init_logging(level: str = "INFO") -> None:
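
Example (not part of the diff): this hunk looks like the top-level package `__init__`, which now exports the renamed `CollectionClient` / `DatasetClient` while keeping the old timeseries names importable. A minimal sketch under that assumption:

```python
# Sketch only - assumes the hunk above is the top-level tilebox/datasets/__init__.py.
# Both the new names and the backwards-compatible aliases are listed in __all__.
from tilebox.datasets import Client, CollectionClient, DatasetClient
from tilebox.datasets import TimeseriesCollection, TimeseriesDataset  # kept for older imports
```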
@@ -1,4 +1,7 @@
 from tilebox.datasets.aio.client import Client
+from tilebox.datasets.aio.dataset import CollectionClient, DatasetClient
+
+# only here for backwards compatibility, to preserve backwards compatibility with older imports
 from tilebox.datasets.aio.timeseries import TimeseriesCollection, TimeseriesDataset
 
-__all__ = ["Client", "TimeseriesCollection", "TimeseriesDataset"]
+__all__ = ["Client", "CollectionClient", "DatasetClient", "TimeseriesCollection", "TimeseriesDataset"]
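
Example (not part of the diff): the async package `__init__` mirrors the same re-export change, so async code can switch to the new names at its own pace:

```python
# Sketch only: the aio namespace exposes the renamed clients alongside the legacy aliases.
from tilebox.datasets.aio import Client, CollectionClient, DatasetClient
from tilebox.datasets.aio import TimeseriesCollection, TimeseriesDataset  # legacy aliases
```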
@@ -2,7 +2,7 @@ from uuid import UUID
 
 from _tilebox.grpc.aio.channel import open_channel
 from _tilebox.grpc.aio.error import with_pythonic_errors
-from tilebox.datasets.aio.timeseries import TimeseriesDataset
+from tilebox.datasets.aio.dataset import DatasetClient
 from tilebox.datasets.client import Client as BaseClient
 from tilebox.datasets.client import token_from_env
 from tilebox.datasets.datasetsv1.collections_pb2_grpc import CollectionServiceStub
@@ -33,10 +33,10 @@ class Client:
         self._client = BaseClient(service)
 
     async def datasets(self) -> Group:
-        return await self._client.datasets(TimeseriesDataset)
+        return await self._client.datasets(DatasetClient)
 
-    async def dataset(self, slug: str) -> TimeseriesDataset:
-        return await self._client.dataset(slug, TimeseriesDataset)
+    async def dataset(self, slug: str) -> DatasetClient:
+        return await self._client.dataset(slug, DatasetClient)
 
-    async def _dataset_by_id(self, dataset_id: str | UUID) -> TimeseriesDataset:
-        return await self._client._dataset_by_id(dataset_id, TimeseriesDataset)  # noqa: SLF001
+    async def _dataset_by_id(self, dataset_id: str | UUID) -> DatasetClient:
+        return await self._client._dataset_by_id(dataset_id, DatasetClient)  # noqa: SLF001
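
Example (not part of the diff): only the return types of `datasets()` and `dataset()` change here, from `TimeseriesDataset` to the new `DatasetClient`, so existing call sites keep working. A hedged sketch of the async flow; the no-argument constructor and the slug are assumptions, the two awaited calls are taken from the hunk above:

```python
import asyncio

from tilebox.datasets.aio import Client


async def main() -> None:
    client = Client()  # assumption: the token is resolved from the environment (token_from_env is imported above)
    dataset = await client.dataset("my_org.my_dataset")  # placeholder slug; returns a DatasetClient as of 0.36.0
    print(await client.datasets())  # a Group of all accessible datasets, each wrapped in a DatasetClient


asyncio.run(main())
```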
@@ -0,0 +1,613 @@
+from collections.abc import AsyncIterator
+from functools import partial
+from typing import cast
+from uuid import UUID
+from warnings import warn
+
+import xarray as xr
+from tqdm.auto import tqdm
+
+from _tilebox.grpc.aio.pagination import Pagination as PaginationProtocol
+from _tilebox.grpc.aio.pagination import paginated_request
+from _tilebox.grpc.aio.producer_consumer import async_producer_consumer
+from _tilebox.grpc.error import ArgumentError, NotFoundError
+from tilebox.datasets.aio.pagination import (
+    with_progressbar,
+    with_time_progress_callback,
+    with_time_progressbar,
+)
+from tilebox.datasets.data.collection import CollectionInfo
+from tilebox.datasets.data.data_access import QueryFilters, SpatialFilter, SpatialFilterLike
+from tilebox.datasets.data.datapoint import DatapointInterval, DatapointIntervalLike, DatapointPage, QueryResultPage
+from tilebox.datasets.data.datasets import Dataset
+from tilebox.datasets.data.pagination import Pagination
+from tilebox.datasets.data.time_interval import TimeInterval, TimeIntervalLike
+from tilebox.datasets.data.uuid import as_uuid
+from tilebox.datasets.message_pool import get_message_type
+from tilebox.datasets.progress import ProgressCallback
+from tilebox.datasets.protobuf_conversion.protobuf_xarray import MessageToXarrayConverter, TimeseriesToXarrayConverter
+from tilebox.datasets.protobuf_conversion.to_protobuf import (
+    DatapointIDs,
+    IngestionData,
+    extract_datapoint_ids,
+    marshal_messages,
+    to_messages,
+)
+from tilebox.datasets.service import TileboxDatasetService
+
+# allow private member access: we allow it here because we want to make as much private as possible so that we can
+# minimize the publicly facing API (which allows us to change internals later, and also limits to auto-completion)
+# ruff: noqa: SLF001
+
+
+class DatasetClient:
+    """A client for a timeseries dataset."""
+
+    def __init__(
+        self,
+        service: TileboxDatasetService,
+        dataset: Dataset,
+    ) -> None:
+        self._service = service
+        self.name = dataset.name
+        self._dataset = dataset
+
+    async def collections(
+        self, availability: bool | None = None, count: bool | None = None
+    ) -> dict[str, "CollectionClient"]:
+        """
+        List the available collections in a dataset.
+
+        Args:
+            availability: Unused.
+            count: Unused.
+
+        Returns:
+            A mapping from collection names to collections.
+        """
+        if availability is not None:
+            warn(
+                "The availability arg has been deprecated, and will be removed in a future version. "
+                "Collection availability information is now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        if count is not None:
+            warn(
+                "The count arg has been deprecated, and will be removed in a future version. "
+                "Collection counts are now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+        collections = await self._service.get_collections(self._dataset.id, True, True)
+
+        return {collection.collection.name: CollectionClient(self, collection) for collection in collections}
+
+    async def get_or_create_collection(self, name: str) -> "CollectionClient":
+        """Get a collection by its name, or create it if it doesn't exist.
+
+        Args:
+            name: The name of the collection to get or create.
+
+        Returns:
+            The collection with the given name.
+        """
+        try:
+            collection = await self.collection(name)
+        except NotFoundError:
+            return await self.create_collection(name)
+        return collection
+
+    async def create_collection(self, name: str) -> "CollectionClient":
+        """Create a new collection in this dataset.
+
+        Args:
+            name: The name of the collection to create.
+
+        Returns:
+            The created collection.
+        """
+        info = await self._service.create_collection(self._dataset.id, name)
+        return CollectionClient(self, info)
+
+    async def collection(self, name: str) -> "CollectionClient":
+        """Get a collection by its name.
+
+        Args:
+            collection: The name of the collection to get.
+
+        Returns:
+            The collection with the given name.
+        """
+        try:
+            info = await self._service.get_collection_by_name(self._dataset.id, name, True, True)
+        except NotFoundError:
+            raise NotFoundError(f"No such collection {name}") from None
+
+        return CollectionClient(self, info)
+
+    def __repr__(self) -> str:
+        return f"{self.name} [Timeseries Dataset]: {self._dataset.summary}"
+
+
+# always ingest / delete in batches, to avoid timeout issues for very large datasets
+_INGEST_CHUNK_SIZE = 8192
+_DELETE_CHUNK_SIZE = 8192
+
+
+class CollectionClient:
+    """A client for a datapoint collection in a specific timeseries dataset."""
+
+    def __init__(
+        self,
+        dataset: DatasetClient,
+        info: CollectionInfo,
+    ) -> None:
+        self._dataset = dataset
+        self._use_legacy_api = dataset._dataset.is_legacy_type
+        self._collection = info.collection
+        self._info: CollectionInfo | None = info
+
+    def __repr__(self) -> str:
+        """Human readable representation of the collection."""
+        return repr(self._info)
+
+    @property
+    def name(self) -> str:
+        """The name of the collection."""
+        return self._collection.name
+
+    async def info(self, availability: bool | None = None, count: bool | None = None) -> CollectionInfo:
+        """
+        Return metadata about the datapoints in this collection.
+
+        Args:
+            availability: Unused.
+            count: Unused.
+
+        Returns:
+            collection info for the current collection
+        """
+        if availability is not None:
+            warn(
+                "The availability arg has been deprecated, and will be removed in a future version. "
+                "Collection availability information is now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        if count is not None:
+            warn(
+                "The count arg has been deprecated, and will be removed in a future version. "
+                "Collection counts are now always returned instead",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+        if self._info is None:  # only load collection info if it hasn't been loaded yet (or it has been invalidated)
+            try:
+                self._info = cast(
+                    CollectionInfo,
+                    await self._dataset._service.get_collection_by_name(
+                        self._dataset._dataset.id, self.name, True, True
+                    ),
+                )
+            except NotFoundError:
+                raise NotFoundError(f"No such collection {self.name}") from None
+
+        return self._info
+
+    async def find(self, datapoint_id: str | UUID, skip_data: bool = False) -> xr.Dataset:
+        """
+        Find a specific datapoint in this collection by its id.
+
+        Args:
+            datapoint_id: The id of the datapoint to find
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+
+        Returns:
+            The datapoint as an xarray dataset
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            return await self._find_legacy(str(datapoint_id), skip_data)
+
+        try:
+            datapoint = await self._dataset._service.query_by_id(
+                [self._collection.id], as_uuid(datapoint_id), skip_data
+            )
+        except ArgumentError:
+            raise ValueError(f"Invalid datapoint id: {datapoint_id} is not a valid UUID") from None
+        except NotFoundError:
+            raise NotFoundError(f"No such datapoint {datapoint_id}") from None
+
+        message_type = get_message_type(datapoint.type_url)
+        data = message_type.FromString(datapoint.value)
+
+        converter = MessageToXarrayConverter(initial_capacity=1)
+        converter.convert(data)
+        return converter.finalize("time").isel(time=0)
+
+    async def _find_legacy(self, datapoint_id: str, skip_data: bool = False) -> xr.Dataset:
+        try:
+            datapoint = await self._dataset._service.get_datapoint_by_id(
+                str(self._collection.id), datapoint_id, skip_data
+            )
+        except ArgumentError:
+            raise ValueError(f"Invalid datapoint id: {datapoint_id} is not a valid UUID") from None
+        except NotFoundError:
+            raise NotFoundError(f"No such datapoint {datapoint_id}") from None
+
+        converter = TimeseriesToXarrayConverter(initial_capacity=1)
+        converter.convert(datapoint)
+        return converter.finalize().isel(time=0)
+
+    async def _find_interval(
+        self,
+        datapoint_id_interval: DatapointIntervalLike,
+        end_inclusive: bool = True,
+        *,
+        skip_data: bool = False,
+        show_progress: bool = False,
+    ) -> xr.Dataset:
+        """
+        Find a range of datapoints in this collection in an interval specified as datapoint ids.
+
+        Args:
+            datapoint_id_interval: tuple of two datapoint ids specifying the interval: [start_id, end_id]
+            end_inclusive: Flag indicating whether the datapoint with the given end_id should be included in the
+                result or not.
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+            show_progress: Whether to show a progress bar while loading the data.
+
+        Returns:
+            The datapoints in the given interval as an xarray dataset
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            return await self._find_interval_legacy(
+                datapoint_id_interval, end_inclusive, skip_data=skip_data, show_progress=show_progress
+            )
+
+        filters = QueryFilters(
+            temporal_extent=DatapointInterval.parse(datapoint_id_interval, end_inclusive=end_inclusive)
+        )
+
+        async def request(page: PaginationProtocol) -> QueryResultPage:
+            query_page = Pagination(page.limit, page.starting_after)
+            return await self._dataset._service.query([self._collection.id], filters, skip_data, query_page)
+
+        initial_page = Pagination()
+        pages = paginated_request(request, initial_page)
+        if show_progress:
+            pages = with_progressbar(pages, f"Fetching {self._dataset.name}")
+
+        return await _convert_to_dataset(pages)
+
+    async def _find_interval_legacy(
+        self,
+        datapoint_id_interval: DatapointIntervalLike,
+        end_inclusive: bool = True,
+        *,
+        skip_data: bool = False,
+        show_progress: bool = False,
+    ) -> xr.Dataset:
+        datapoint_interval = DatapointInterval.parse(datapoint_id_interval, end_inclusive=end_inclusive)
+
+        async def request(page: PaginationProtocol) -> DatapointPage:
+            query_page = Pagination(page.limit, page.starting_after)
+            return await self._dataset._service.get_dataset_for_datapoint_interval(
+                str(self._collection.id), datapoint_interval, skip_data, False, query_page
+            )
+
+        initial_page = Pagination()
+        pages = paginated_request(request, initial_page)
+        if show_progress:
+            pages = with_progressbar(pages, f"Fetching {self._dataset.name}")
+
+        return await _convert_to_dataset_legacy(pages)
+
+    async def load(
+        self,
+        temporal_extent: TimeIntervalLike,
+        *,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+    ) -> xr.Dataset:
+        """
+        Load a range of datapoints in this collection for a specified temporal_extent.
+
+        An alias for query() without a spatial extent.
+
+        Args:
+            temporal_extent: The temporal extent to load data for.
+                Can be specified in a number of ways:
+                - TimeInterval: interval -> Use the time interval as its given
+                - DatetimeScalar: [time, time] -> Construct a TimeInterval with start and end time set to the given
+                    value and the end time inclusive
+                - tuple of two DatetimeScalar: [start, end) -> Construct a TimeInterval with the given start and
+                    end time
+                - xr.DataArray: [arr[0], arr[-1]] -> Construct a TimeInterval with start and end time set to the
+                    first and last value in the array and the end time inclusive
+                - xr.Dataset: [ds.time[0], ds.time[-1]] -> Construct a TimeInterval with start and end time set to
+                    the first and last value in the time coordinate of the dataset and the end time inclusive
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+            show_progress: Whether to show a progress bar while loading the data.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            Matching datapoints in the given temporal extent as an xarray dataset
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            return await self._load_legacy(temporal_extent, skip_data=skip_data, show_progress=show_progress)
+
+        return await self.query(temporal_extent=temporal_extent, skip_data=skip_data, show_progress=show_progress)
+
+    async def query(
+        self,
+        *,
+        temporal_extent: TimeIntervalLike,
+        spatial_extent: SpatialFilterLike | None = None,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+    ) -> xr.Dataset:
+        """
+        Query datapoints in this collection in a specified temporal extent and an optional spatial extent.
+
+        Args:
+            temporal_extent: The temporal extent to query data for. (Required)
+                Can be specified in a number of ways:
+                - TimeInterval: interval -> Use the time interval as its given
+                - DatetimeScalar: [time, time] -> Construct a TimeInterval with start and end time set to the given
+                    value and the end time inclusive
+                - tuple of two DatetimeScalar: [start, end) -> Construct a TimeInterval with the given start and
+                    end time
+                - xr.DataArray: [arr[0], arr[-1]] -> Construct a TimeInterval with start and end time set to the
+                    first and last value in the array and the end time inclusive
+                - xr.Dataset: [ds.time[0], ds.time[-1]] -> Construct a TimeInterval with start and end time set to
+                    the first and last value in the time coordinate of the dataset and the end time inclusive
+            spatial_extent: The spatial extent to query data in. (Optional)
+                Expected to be either a shapely geometry, or a dict with the following keys:
+                - geometry: The geometry to query by. Must be a shapely.Polygon, shapely.MultiPolygon or shapely.Point.
+                - mode: The spatial filter mode to use. Can be one of "intersects" or "contains".
+                    Defaults to "intersects".
+                - coordinate_system: The coordinate system to use for performing geometry calculations. Can be one
+                    of "cartesian" or "spherical".
+                Only supported for spatiotemporal datasets. Will raise an error if used for other dataset types.
+                All datapoints whose geometry intersects the given spatial extent will be returned.
+            skip_data: Whether to skip the actual data of the datapoint. If True, only datapoint metadata is returned.
+            show_progress: Whether to show a progress bar while loading the data.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            Matching datapoints in the given temporal and spatial extent as an xarray dataset
+        """
+        if self._use_legacy_api:
+            raise ValueError("Querying is not supported for this dataset. Please use load() instead.")
+
+        if temporal_extent is None:
+            raise ValueError("A temporal_extent for your query must be specified")
+
+        pages = self._iter_pages(temporal_extent, spatial_extent, skip_data, show_progress=show_progress)
+        return await _convert_to_dataset(pages)
+
+    async def _iter_pages(
+        self,
+        temporal_extent: TimeIntervalLike,
+        spatial_extent: SpatialFilterLike | None = None,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+        page_size: int | None = None,
+    ) -> AsyncIterator[QueryResultPage]:
+        time_interval = TimeInterval.parse(temporal_extent)
+        filters = QueryFilters(time_interval, SpatialFilter.parse(spatial_extent) if spatial_extent else None)
+
+        request = partial(self._load_page, filters, skip_data)
+
+        initial_page = Pagination(limit=page_size)
+        pages = paginated_request(request, initial_page)
+
+        if callable(show_progress):
+            pages = with_time_progress_callback(pages, time_interval, show_progress)
+        elif show_progress:
+            message = f"Fetching {self._dataset.name}"
+            pages = with_time_progressbar(pages, time_interval, message)
+
+        async for page in pages:
+            yield page
+
+    async def _load_page(
+        self, filters: QueryFilters, skip_data: bool, page: PaginationProtocol | None = None
+    ) -> QueryResultPage:
+        query_page = Pagination(page.limit, page.starting_after) if page else Pagination()
+        return await self._dataset._service.query([self._collection.id], filters, skip_data, query_page)
+
+    async def _load_legacy(
+        self,
+        time_or_interval: TimeIntervalLike,
+        *,
+        skip_data: bool = False,
+        show_progress: bool | ProgressCallback = False,
+    ) -> xr.Dataset:
+        pages = self._iter_pages_legacy(time_or_interval, skip_data, show_progress=show_progress)
+        return await _convert_to_dataset_legacy(pages)
+
+    async def _iter_pages_legacy(
+        self,
+        time_or_interval: TimeIntervalLike,
+        skip_data: bool = False,
+        skip_meta: bool = False,
+        show_progress: bool | ProgressCallback = False,
+        page_size: int | None = None,
+    ) -> AsyncIterator[DatapointPage]:
+        time_interval = TimeInterval.parse(time_or_interval)
+
+        request = partial(self._load_page_legacy, time_interval, skip_data, skip_meta)
+
+        initial_page = Pagination(limit=page_size)
+        pages = paginated_request(request, initial_page)
+
+        if callable(show_progress):
+            if skip_meta:
+                raise ValueError("Progress callback requires datapoint metadata, but skip_meta is True")
+            else:
+                pages = with_time_progress_callback(pages, time_interval, show_progress)
+        elif show_progress:
+            message = f"Fetching {self._dataset.name}"
+            if skip_meta:  # without metadata we can't estimate progress based on event time (since it is not returned)
+                pages = with_progressbar(pages, message)
+            else:
+                pages = with_time_progressbar(pages, time_interval, message)
+
+        async for page in pages:
+            yield page
+
+    async def _load_page_legacy(
+        self, time_interval: TimeInterval, skip_data: bool, skip_meta: bool, page: PaginationProtocol | None = None
+    ) -> DatapointPage:
+        query_page = Pagination(page.limit, page.starting_after) if page else Pagination()
+        return await self._dataset._service.get_dataset_for_time_interval(
+            str(self._collection.id), time_interval, skip_data, skip_meta, query_page
+        )
+
+    async def ingest(
+        self,
+        data: IngestionData,
+        allow_existing: bool = True,
+        *,
+        show_progress: bool | ProgressCallback = False,
+    ) -> list[UUID]:
+        """Ingest data into the collection.
+
+        Args:
+            data: The data to ingest. Supported data types are:
+                - xr.Dataset: Ingest a dataset such as it is returned by the output of `collection.load()`
+                - pd.DataFrame: Ingest a pandas DataFrame, mapping the column names to the dataset fields
+                - Iterable, dict or nd-array: Ingest any object that can be converted to a pandas DataFrame,
+                    equivalent to `ingest(pd.DataFrame(data))`
+            allow_existing: Whether to allow existing datapoints. Datapoints will only be overwritten if
+                all of their fields are exactly equal to already existing datapoints. Tilebox will never create
+                duplicate datapoints, but will raise an error if the datapoint already exists. Setting this to
+                `True` will not raise an error and skip the duplicate datapoints instead.
+            show_progress: Whether to show a progress bar while ingestion a large number of datapoints.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            List of datapoint ids that were ingested.
+        """
+        if self._use_legacy_api:  # remove this once all datasets are fully migrated to the new endpoints
+            raise ValueError("Ingestion is not supported for this dataset. Please create a new dataset.")
+
+        message_type = get_message_type(self._dataset._dataset.type.type_url)
+        messages = marshal_messages(
+            to_messages(data, message_type, required_fields=["time"], ignore_fields=["id", "ingestion_time"])
+        )
+
+        disable_progress_bar = callable(show_progress) or (not show_progress)
+
+        ingested_ids = []
+        with tqdm(
+            total=len(messages),
+            desc=f"Ingesting into {self._dataset.name}",
+            unit="datapoints",
+            disable=disable_progress_bar,
+        ) as progress_bar:
+            for chunk_start in range(0, len(messages), _INGEST_CHUNK_SIZE):
+                chunk = messages[chunk_start : chunk_start + _INGEST_CHUNK_SIZE]
+                response = await self._dataset._service.ingest(self._collection.id, chunk, allow_existing)
+                ingested_ids.extend(response.datapoint_ids)
+
+                progress_bar.update(len(chunk))
+                if callable(show_progress):
+                    show_progress(len(ingested_ids) / len(messages))
+        self._info = None  # invalidate collection info, since we just ingested some data into it
+        return ingested_ids
+
+    async def delete(self, datapoints: DatapointIDs, *, show_progress: bool | ProgressCallback = False) -> int:
+        """Delete datapoints from the collection.
+
+        Datapoints are identified and deleted by their ids.
+
+        Args:
+            datapoints: The datapoints to delete. Supported types are:
+                - xr.Dataset: An xarray.Dataset containing an "id" variable/coord consisting of datapoint IDs to delete.
+                - pd.DataFrame: A pandas DataFrame containing a "id" column consisting of datapoint IDs to delete.
+                - xr.DataArray, np.ndarray, pd.Series, list[UUID]: Array of UUIDs to delete
+                - list[str], list[UUID]: List of datapoint IDs to delete
+            show_progress: Whether to show a progress bar when deleting a large number of datapoints.
+                If a callable is specified it is used as callback to report progress percentages.
+
+        Returns:
+            The number of datapoints that were deleted.
+
+        Raises:
+            NotFoundError: If one or more of the datapoints to delete doesn't exist - no datapoints
+                will be deleted if any of the requested deletions doesn't exist.
+        """
+        datapoint_ids = extract_datapoint_ids(datapoints)
+        num_deleted = 0
+
+        disable_progress_bar = callable(show_progress) or (not show_progress)
+
+        with tqdm(
+            total=len(datapoint_ids),
+            desc=f"Deleting from {self._dataset.name}",
+            unit="datapoints",
+            disable=disable_progress_bar,
+        ) as progress_bar:
+            for chunk_start in range(0, len(datapoint_ids), _DELETE_CHUNK_SIZE):
+                chunk = datapoint_ids[chunk_start : chunk_start + _DELETE_CHUNK_SIZE]
+                num_deleted += await self._dataset._service.delete(self._collection.id, chunk)
+
+                progress_bar.update(len(chunk))
+                if callable(show_progress):
+                    show_progress(num_deleted / len(datapoint_ids))
+        self._info = None  # invalidate collection info, since we just deleted some data from it
+        return num_deleted
+
+
+async def _convert_to_dataset(pages: AsyncIterator[QueryResultPage]) -> xr.Dataset:
+    """
+    Convert an async iterator of QueryResultPages into a single xarray Dataset
+
+    Parses each incoming page while in parallel already requesting and waiting for the next page from the server.
+
+    Args:
+        pages: Async iterator of QueryResultPages to convert
+
+    Returns:
+        The datapoints from the individual pages converted and combined into a single xarray dataset
+    """
+    converter = MessageToXarrayConverter()
+
+    def convert_page(page: QueryResultPage) -> None:
+        message_type = get_message_type(page.data.type_url)
+        messages = [message_type.FromString(v) for v in page.data.value]
+        converter.convert_all(messages)
+
+    # lets parse the incoming pages already while we wait for the next page from the server
+    # we solve this using a classic producer/consumer with a queue of pages for communication
+    # this would also account for the case where the server sends pages faster than we are converting
+    # them to xarray
+    await async_producer_consumer(pages, convert_page)
+    return converter.finalize("time")
+
+
+async def _convert_to_dataset_legacy(pages: AsyncIterator[DatapointPage]) -> xr.Dataset:
+    """
+    Convert an async iterator of DatasetIntervals (pages) into a single xarray Dataset
+
+    Parses each incoming page while in parallel already requesting and waiting for the next page from the server.
+
+    Args:
+        pages: Async iterator of DatasetIntervals (pages) to convert
+
+    Returns:
+        The datapoints from the individual pages converted and combined into a single xarray dataset
+    """
+
+    converter = TimeseriesToXarrayConverter()
+    # lets parse the incoming pages already while we wait for the next page from the server
+    # we solve this using a classic producer/consumer with a queue of pages for communication
+    # this would also account for the case where the server sends pages faster than we are converting
+    # them to xarray
+    await async_producer_consumer(pages, lambda page: converter.convert_all(page))
+    return converter.finalize()
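
Example (not part of the diff): a usage sketch that ties the new `DatasetClient` / `CollectionClient` methods together, based only on the signatures and docstrings above. The dataset slug, collection name, the `value` field and the no-argument `Client()` constructor are placeholders and assumptions:

```python
import asyncio

import pandas as pd

from tilebox.datasets.aio import Client


async def main() -> None:
    client = Client()  # assumption: credentials are picked up from the environment
    dataset = await client.dataset("my_org.my_dataset")  # placeholder slug

    collection = await dataset.get_or_create_collection("My-Collection")  # placeholder name

    # ingest a small pandas DataFrame: "time" is required, other columns map to dataset fields
    ids = await collection.ingest(pd.DataFrame({"time": ["2024-01-01T00:00:00Z"], "value": [42.0]}))

    # query a time range as an xarray.Dataset; spatial_extent stays optional
    data = await collection.query(temporal_extent=("2024-01-01", "2024-02-01"), show_progress=True)
    print(data)

    # delete the datapoints ingested above; per the docstring, nothing is deleted if any id is missing
    await collection.delete(ids)


asyncio.run(main())
```

The synchronous `tilebox.datasets.sync.dataset` module imported in the first hunk presumably exposes the same surface without `await`, but that file is not part of this diff.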