freva-client 2404.0.1__py3-none-any.whl → 2408.0.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of freva-client might be problematic. Click here for more details.

@@ -5,14 +5,26 @@ Search quickly and intuitively for many different climate datasets.
5
5
 
6
6
  import json
7
7
  from enum import Enum
8
+ from pathlib import Path
9
+ from tempfile import NamedTemporaryFile
8
10
  from typing import Dict, List, Literal, Optional, Union, cast
9
11
 
10
12
  import typer
11
13
  from freva_client import databrowser
14
+ from freva_client.auth import Auth
12
15
  from freva_client.utils import exception_handler, logger
13
16
 
14
- from .cli_app import app, version_callback
15
- from .cli_utils import parse_cli_args
17
+ from .cli_utils import parse_cli_args, version_callback
18
+
19
+
20
+ def _auth(url: str, token: Optional[str]) -> None:
21
+ if token:
22
+ auth = Auth()
23
+ auth.set_token(
24
+ access_token=token, expires=auth.token_expiration_time.timestamp()
25
+ )
26
+ else:
27
+ raise ValueError("`--access-token` is required for authentication.")
16
28
 
17
29
 
18
30
  class UniqKeys(str, Enum):
@@ -55,7 +67,12 @@ class TimeSelect(str, Enum):
55
67
  )
56
68
 
57
69
 
58
- @app.command(
70
+ databrowser_app = typer.Typer(
71
+ help="Data search related commands", callback=logger.set_cli
72
+ )
73
+
74
+
75
+ @databrowser_app.command(
59
76
  name="data-overview",
60
77
  help="Get an overview over what is available in the databrowser.",
61
78
  )
@@ -74,7 +91,7 @@ def overview(
74
91
  print(databrowser.overview(host=host))
75
92
 
76
93
 
77
- @app.command(
94
+ @databrowser_app.command(
78
95
  name="metadata-search", help="Search databrowser for metadata (facets)."
79
96
  )
80
97
  @exception_handler
@@ -190,7 +207,9 @@ def metadata_search(
190
207
  print(f"{key}: {', '.join(values)}")
191
208
 
192
209
 
193
- @app.command(name="data-search", help="Search the databrowser for datasets.")
210
+ @databrowser_app.command(
211
+ name="data-search", help="Search the databrowser for datasets."
212
+ )
194
213
  @exception_handler
195
214
  def data_search(
196
215
  search_keys: Optional[List[str]] = typer.Argument(
@@ -234,6 +253,17 @@ def data_search(
234
253
  "--time-select",
235
254
  help=TimeSelect.get_help(),
236
255
  ),
256
+ zarr: bool = typer.Option(
257
+ False, "--zarr", help="Create zarr stream files."
258
+ ),
259
+ access_token: Optional[str] = typer.Option(
260
+ None,
261
+ "--access-token",
262
+ help=(
263
+ "Use this access token for authentication"
264
+ " when creating a zarr stream files."
265
+ ),
266
+ ),
237
267
  time: Optional[str] = typer.Option(
238
268
  None,
239
269
  "-t",
@@ -264,14 +294,14 @@ def data_search(
264
294
  ),
265
295
  multiversion: bool = typer.Option(
266
296
  False,
267
- "--mulit-version",
297
+ "--multi-version",
268
298
  help="Select all versions and not just the latest version (default).",
269
299
  ),
270
300
  version: Optional[bool] = typer.Option(
271
301
  False,
272
302
  "-V",
273
303
  "--version",
274
- help="Show verion an exit",
304
+ help="Show version an exit",
275
305
  callback=version_callback,
276
306
  ),
277
307
  ) -> None:
@@ -295,8 +325,11 @@ def data_search(
295
325
  host=host,
296
326
  fail_on_error=False,
297
327
  multiversion=multiversion,
328
+ stream_zarr=zarr,
298
329
  **(parse_cli_args(search_keys or [])),
299
330
  )
331
+ if zarr:
332
+ _auth(result._cfg.auth_url, access_token)
300
333
  if parse_json:
301
334
  print(json.dumps(sorted(result)))
302
335
  else:
@@ -304,7 +337,141 @@ def data_search(
304
337
  print(res)
305
338
 
306
339
 
307
- @app.command(name="data-count", help="Count the databrowser search results")
340
+ @databrowser_app.command(
341
+ name="intake-catalogue", help="Create an intake catalogue from the search."
342
+ )
343
+ @exception_handler
344
+ def intake_catalogue(
345
+ search_keys: Optional[List[str]] = typer.Argument(
346
+ default=None,
347
+ help="Refine your data search with this `key=value` pair search "
348
+ "parameters. The parameters could be, depending on the DRS standard, "
349
+ "flavour product, project model etc.",
350
+ ),
351
+ facets: Optional[List[str]] = typer.Option(
352
+ None,
353
+ "--facet",
354
+ help=(
355
+ "If you are not sure about the correct search key's you can use"
356
+ " the ``--facet`` flag to search of any matching entries. For "
357
+ "example --facet 'era5' would allow you to search for any entries"
358
+ " containing era5, regardless of project, product etc."
359
+ ),
360
+ ),
361
+ uniq_key: UniqKeys = typer.Option(
362
+ "file",
363
+ "--uniq-key",
364
+ "-u",
365
+ help=(
366
+ "The type of search result, which can be either “file” "
367
+ "or “uri”. This parameter determines whether the search will be "
368
+ "based on file paths or Uniform Resource Identifiers"
369
+ ),
370
+ ),
371
+ flavour: Flavours = typer.Option(
372
+ "freva",
373
+ "--flavour",
374
+ "-f",
375
+ help=(
376
+ "The Data Reference Syntax (DRS) standard specifying the type "
377
+ "of climate datasets to query."
378
+ ),
379
+ ),
380
+ time_select: TimeSelect = typer.Option(
381
+ "flexible",
382
+ "-ts",
383
+ "--time-select",
384
+ help=TimeSelect.get_help(),
385
+ ),
386
+ time: Optional[str] = typer.Option(
387
+ None,
388
+ "-t",
389
+ "--time",
390
+ help=(
391
+ "Special search facet to refine/subset search results by time. "
392
+ "This can be a string representation of a time range or a single "
393
+ "time step. The time steps have to follow ISO-8601. Valid strings "
394
+ "are ``%Y-%m-%dT%H:%M`` to ``%Y-%m-%dT%H:%M`` for time ranges and "
395
+ "``%Y-%m-%dT%H:%M``. **Note**: You don't have to give the full "
396
+ "string format to subset time steps ``%Y``, ``%Y-%m`` etc are also"
397
+ " valid."
398
+ ),
399
+ ),
400
+ zarr: bool = typer.Option(
401
+ False, "--zarr", help="Create zarr stream files, as catalogue targets."
402
+ ),
403
+ access_token: Optional[str] = typer.Option(
404
+ None,
405
+ "--access-token",
406
+ help=(
407
+ "Use this access token for authentication"
408
+ " when creating a zarr based intake catalogue."
409
+ ),
410
+ ),
411
+ filename: Optional[Path] = typer.Option(
412
+ None,
413
+ "-f",
414
+ "--filename",
415
+ help=(
416
+ "Path to the file where the catalogue, should be written to. "
417
+ "if None given (default) the catalogue is parsed to stdout."
418
+ ),
419
+ ),
420
+ host: Optional[str] = typer.Option(
421
+ None,
422
+ "--host",
423
+ help=(
424
+ "Set the hostname of the databrowser, if not set (default) "
425
+ "the hostname is read from a config file"
426
+ ),
427
+ ),
428
+ verbose: int = typer.Option(
429
+ 0, "-v", help="Increase verbosity", count=True
430
+ ),
431
+ multiversion: bool = typer.Option(
432
+ False,
433
+ "--multi-version",
434
+ help="Select all versions and not just the latest version (default).",
435
+ ),
436
+ version: Optional[bool] = typer.Option(
437
+ False,
438
+ "-V",
439
+ "--version",
440
+ help="Show version an exit",
441
+ callback=version_callback,
442
+ ),
443
+ ) -> None:
444
+ """Create an intake catalogue for climate datasets based on the specified "
445
+ "Data Reference Syntax (DRS) standard (flavour) and the type of search "
446
+ result (uniq_key), which can be either “file” or “uri”."""
447
+ logger.set_verbosity(verbose)
448
+ logger.debug("Search the databrowser")
449
+ result = databrowser(
450
+ *(facets or []),
451
+ time=time or "",
452
+ time_select=cast(Literal["file", "flexible", "strict"], time_select),
453
+ flavour=cast(
454
+ Literal["freva", "cmip6", "cmip5", "cordex", "nextgems"],
455
+ flavour.value,
456
+ ),
457
+ uniq_key=cast(Literal["uri", "file"], uniq_key.value),
458
+ host=host,
459
+ fail_on_error=False,
460
+ multiversion=multiversion,
461
+ stream_zarr=zarr,
462
+ **(parse_cli_args(search_keys or [])),
463
+ )
464
+ if zarr:
465
+ _auth(result._cfg.auth_url, access_token)
466
+ with NamedTemporaryFile(suffix=".json") as temp_f:
467
+ result._create_intake_catalogue_file(str(filename or temp_f.name))
468
+ if not filename:
469
+ print(Path(temp_f.name).read_text())
470
+
471
+
472
+ @databrowser_app.command(
473
+ name="data-count", help="Count the databrowser search results"
474
+ )
308
475
  @exception_handler
309
476
  def count_values(
310
477
  search_keys: Optional[List[str]] = typer.Argument(
@@ -387,7 +554,7 @@ def count_values(
387
554
  False,
388
555
  "-V",
389
556
  "--version",
390
- help="Show verion an exit",
557
+ help="Show version an exit",
391
558
  callback=version_callback,
392
559
  ),
393
560
  ) -> None:
@@ -438,6 +605,7 @@ def count_values(
438
605
  multiversion=multiversion,
439
606
  fail_on_error=False,
440
607
  uniq_key="file",
608
+ stream_zarr=False,
441
609
  **search_kws,
442
610
  )
443
611
  )
freva_client/query.py CHANGED
@@ -4,12 +4,27 @@ import sys
4
4
  from collections import defaultdict
5
5
  from fnmatch import fnmatch
6
6
  from functools import cached_property
7
- from typing import Dict, Iterator, List, Literal, Optional, Tuple, Union, cast
8
-
7
+ from pathlib import Path
8
+ from tempfile import NamedTemporaryFile
9
+ from typing import (
10
+ Any,
11
+ Dict,
12
+ Iterator,
13
+ List,
14
+ Literal,
15
+ Optional,
16
+ Tuple,
17
+ Union,
18
+ cast,
19
+ )
20
+
21
+ import intake
22
+ import intake_esm
9
23
  import requests
10
24
  import yaml
11
25
  from rich import print as pprint
12
26
 
27
+ from .auth import Auth
13
28
  from .utils import logger
14
29
  from .utils.databrowser_utils import Config
15
30
 
@@ -67,6 +82,9 @@ class databrowser:
67
82
  url where the freva web site can be found. Such as www.freva.dkrz.de.
68
83
  By default no host name is given and the host name will be taken from
69
84
  the freva config file.
85
+ stream_zarr: bool, default: False
86
+ Create a zarr stream for all search results. When set to true the
87
+ files are served in zarr format and can be opened from anywhere.
70
88
  multiversion: bool, default: False
71
89
  Select all versions and not just the latest version (default).
72
90
  fail_on_error: bool, default: False
@@ -98,7 +116,7 @@ class databrowser:
98
116
  db = databrowser(experiment="cmorph", uniq_key="uri")
99
117
  print(db)
100
118
 
101
- After having created the search object you can aquire differnt kinds of
119
+ After having created the search object you can acquire different kinds of
102
120
  information like the number of found objects:
103
121
 
104
122
  .. execute_code::
@@ -149,24 +167,61 @@ class databrowser:
149
167
  db = databrowser("reana*", realm="ocean", flavour="cmip6")
150
168
  for file in db:
151
169
  print(file)
170
+
171
+ If you don't have direct access to the data, for example because you are
172
+ not directly logged in to the computer where the data is stored you can
173
+ set ``stream_zarr=True``. The data will then be
174
+ provisioned in zarr format and can be opened from anywhere. But bear in
175
+ mind that zarr streams if not accessed in time will expire. Since the
176
+ data can be accessed from anywhere you will also have to authenticate
177
+ before you are able to access the data. Refer also to the
178
+ :py:meth:`freva_client.authenticate` method.
179
+
180
+ .. execute_code::
181
+
182
+ from freva_client import authenticate, databrowser
183
+ token_info = authenticate(username="janedoe")
184
+ db = databrowser(dataset="cmip6-fs", stream_zarr=True)
185
+ zarr_files = list(db)
186
+ print(zarr_files)
187
+
188
+ After you have created the paths to the zarr files you can open them
189
+
190
+ ::
191
+
192
+ import xarray as xr
193
+ dset = xr.open_dataset(
194
+ zarr_files[0],
195
+ chunks="auto",
196
+ engine="zarr",
197
+ storage_options={"header":
198
+ {"Authorization": f"Bearer {token_info['access_token']}"}
199
+ }
200
+ )
201
+
202
+
152
203
  """
153
204
 
154
205
  def __init__(
155
206
  self,
156
207
  *facets: str,
157
208
  uniq_key: Literal["file", "uri"] = "file",
158
- flavour: Literal["freva", "cmip6", "cmip5", "cordex", "nextgems"] = "freva",
209
+ flavour: Literal[
210
+ "freva", "cmip6", "cmip5", "cordex", "nextgems"
211
+ ] = "freva",
159
212
  time: Optional[str] = None,
160
213
  host: Optional[str] = None,
161
214
  time_select: Literal["flexible", "strict", "file"] = "flexible",
215
+ stream_zarr: bool = False,
162
216
  multiversion: bool = False,
163
217
  fail_on_error: bool = False,
164
218
  **search_keys: Union[str, List[str]],
165
219
  ) -> None:
166
-
220
+ self._auth = Auth()
167
221
  self._fail_on_error = fail_on_error
168
222
  self._cfg = Config(host, uniq_key=uniq_key, flavour=flavour)
169
223
  self._flavour = flavour
224
+ self._stream_zarr = stream_zarr
170
225
  facet_search: Dict[str, List[str]] = defaultdict(list)
171
226
  for key, value in search_keys.items():
172
227
  if isinstance(value, str):
@@ -188,7 +243,8 @@ class databrowser:
188
243
  self, facets: Tuple[str, ...], search_kw: Dict[str, List[str]]
189
244
  ) -> None:
190
245
  metadata = {
191
- k: v[::2] for (k, v) in self._facet_search(extended_search=True).items()
246
+ k: v[::2]
247
+ for (k, v) in self._facet_search(extended_search=True).items()
192
248
  }
193
249
  primary_key = list(metadata.keys() or ["project"])[0]
194
250
  num_facets = 0
@@ -201,19 +257,29 @@ class databrowser:
201
257
 
202
258
  if facets and num_facets == 0:
203
259
  # TODO: This isn't pretty, but if a user requested a search
204
- # string doesn't exist than we have to somehow make the search
260
+ # string that doesn't exist than we have to somehow make the search
205
261
  # return nothing.
206
262
  search_kw = {primary_key: ["NotAvailable"]}
207
263
  self._params.update(search_kw)
208
264
 
209
265
  def __iter__(self) -> Iterator[str]:
210
- result = self._get(self._cfg.search_url)
266
+ query_url = self._cfg.search_url
267
+ headers = {}
268
+ if self._stream_zarr:
269
+ query_url = self._cfg.zarr_loader_url
270
+ token = self._auth.check_authentication(
271
+ auth_url=self._cfg.auth_url
272
+ )
273
+ headers = {"Authorization": f"Bearer {token['access_token']}"}
274
+ result = self._get(query_url, headers=headers, stream=True)
211
275
  if result is not None:
212
276
  try:
213
277
  for res in result.iter_lines():
214
278
  yield res.decode("utf-8")
215
279
  except KeyboardInterrupt:
216
- pprint("[red][b]User interrupt: Exit[/red][/b]", file=sys.stderr)
280
+ pprint(
281
+ "[red][b]User interrupt: Exit[/red][/b]", file=sys.stderr
282
+ )
217
283
 
218
284
  def __repr__(self) -> str:
219
285
  params = ", ".join(
@@ -240,7 +306,9 @@ class databrowser:
240
306
 
241
307
  # Create a table-like structure for available flavors and search facets
242
308
  style = 'style="text-align: left"'
243
- facet_heading = f"Available search facets for <em>{self._flavour}</em> flavour"
309
+ facet_heading = (
310
+ f"Available search facets for <em>{self._flavour}</em> flavour"
311
+ )
244
312
  html_repr = (
245
313
  "<table>"
246
314
  f"<tr><th colspan='2' {style}>{self.__class__.__name__}"
@@ -274,11 +342,71 @@ class databrowser:
274
342
  return cast(int, result.json().get("total_count", 0))
275
343
  return 0
276
344
 
345
+ def _create_intake_catalogue_file(self, filename: str) -> None:
346
+ """Create an intake catalogue file."""
347
+ kwargs: Dict[str, Any] = {"stream": True}
348
+ url = self._cfg.intake_url
349
+ if self._stream_zarr:
350
+ token = self._auth.check_authentication(
351
+ auth_url=self._cfg.auth_url
352
+ )
353
+ url = self._cfg.zarr_loader_url
354
+ kwargs["headers"] = {
355
+ "Authorization": f"Bearer {token['access_token']}"
356
+ }
357
+ kwargs["params"] = {"catalogue-type": "intake"}
358
+ result = self._get(url, **kwargs)
359
+ if result is None:
360
+ raise ValueError("No results found")
361
+
362
+ try:
363
+ Path(filename).parent.mkdir(exist_ok=True, parents=True)
364
+ with open(filename, "bw") as stream:
365
+ for content in result.iter_content(decode_unicode=False):
366
+ stream.write(content)
367
+ except Exception as error:
368
+ raise ValueError(
369
+ f"Couldn't write catalogue content: {error}"
370
+ ) from None
371
+
372
+ def intake_catalogue(self) -> intake_esm.core.esm_datastore:
373
+ """Create an intake esm catalogue object from the search.
374
+
375
+ This method creates a intake-esm catalogue from the current object
376
+ search. Instead of having the original files as target objects you can
377
+ also choose to stream the files via zarr.
378
+
379
+ Returns
380
+ ~~~~~~~
381
+ intake_esm.core.esm_datastore: intake-esm catalogue.
382
+
383
+ Raises
384
+ ~~~~~~
385
+ ValueError: If user is not authenticated or catalogue creation failed.
386
+
387
+ Example
388
+ ~~~~~~~
389
+ Let's create an intake-esm catalogue that points points allows for
390
+ streaming the target data as zarr:
391
+
392
+ .. execute_code::
393
+
394
+ from freva_client import databrowser
395
+ db = databrowser(dataset="cmip6-fs", stream_zarr=True)
396
+ cat = db.intake_catalogue()
397
+ print(cat.df)
398
+ """
399
+ with NamedTemporaryFile(suffix=".json") as temp_f:
400
+ self._create_intake_catalogue_file(temp_f.name)
401
+ return intake.open_esm_datastore(temp_f.name)
402
+
277
403
  @classmethod
278
404
  def count_values(
279
405
  cls,
280
406
  *facets: str,
281
- flavour: Literal["freva", "cmip6", "cmip5", "cordex", "nextgems"] = "freva",
407
+ flavour: Literal[
408
+ "freva", "cmip6", "cmip5", "cordex", "nextgems"
409
+ ] = "freva",
282
410
  time: Optional[str] = None,
283
411
  host: Optional[str] = None,
284
412
  time_select: Literal["flexible", "strict", "file"] = "flexible",
@@ -328,7 +456,7 @@ class databrowser:
328
456
  fail_on_error: bool, default: False
329
457
  Make the call fail if the connection to the databrowser could not
330
458
  **search_keys: str
331
- The search contraints to be applied in the data search. If not given
459
+ The search constraints to be applied in the data search. If not given
332
460
  the whole dataset will be queried.
333
461
 
334
462
  Returns
@@ -370,12 +498,15 @@ class databrowser:
370
498
  multiversion=multiversion,
371
499
  fail_on_error=fail_on_error,
372
500
  uniq_key="file",
501
+ stream_zarr=False,
373
502
  **search_keys,
374
503
  )
375
504
  result = this._facet_search(extended_search=extended_search)
376
505
  counts = {}
377
506
  for facet, value_counts in result.items():
378
- counts[facet] = dict(zip(value_counts[::2], map(int, value_counts[1::2])))
507
+ counts[facet] = dict(
508
+ zip(value_counts[::2], map(int, value_counts[1::2]))
509
+ )
379
510
  return counts
380
511
 
381
512
  @cached_property
@@ -384,7 +515,7 @@ class databrowser:
384
515
 
385
516
  You can retrieve all information that is associated with your current
386
517
  databrowser search. This can be useful for reverse searches for example
387
- for retrieving metadata of object sotres or file/directory names.
518
+ for retrieving metadata of object stores or file/directory names.
388
519
 
389
520
  Example
390
521
  ~~~~~~~
@@ -400,14 +531,17 @@ class databrowser:
400
531
 
401
532
  """
402
533
  return {
403
- k: v[::2] for (k, v) in self._facet_search(extended_search=True).items()
534
+ k: v[::2]
535
+ for (k, v) in self._facet_search(extended_search=True).items()
404
536
  }
405
537
 
406
538
  @classmethod
407
539
  def metadata_search(
408
540
  cls,
409
541
  *facets: str,
410
- flavour: Literal["freva", "cmip6", "cmip5", "cordex", "nextgems"] = "freva",
542
+ flavour: Literal[
543
+ "freva", "cmip6", "cmip5", "cordex", "nextgems"
544
+ ] = "freva",
411
545
  time: Optional[str] = None,
412
546
  host: Optional[str] = None,
413
547
  time_select: Literal["flexible", "strict", "file"] = "flexible",
@@ -432,7 +566,7 @@ class databrowser:
432
566
  flavour: str, default: freva
433
567
  The Data Reference Syntax (DRS) standard specifying the type of climate
434
568
  datasets to query.
435
- time: str, defautl: ""
569
+ time: str, default: ""
436
570
  Special search facet to refine/subset search results by time.
437
571
  This can be a string representation of a time range or a single
438
572
  timestamp. The timestamp has to follow ISO-8601. Valid strings are
@@ -525,11 +659,14 @@ class databrowser:
525
659
  multiversion=multiversion,
526
660
  fail_on_error=fail_on_error,
527
661
  uniq_key="file",
662
+ stream_zarr=False,
528
663
  **search_keys,
529
664
  )
530
665
  return {
531
666
  k: v[::2]
532
- for (k, v) in this._facet_search(extended_search=extended_search).items()
667
+ for (k, v) in this._facet_search(
668
+ extended_search=extended_search
669
+ ).items()
533
670
  }
534
671
 
535
672
  @classmethod
@@ -591,16 +728,22 @@ class databrowser:
591
728
  return {}
592
729
  data = result.json()
593
730
  if extended_search:
594
- contraints = data["facets"].keys()
731
+ constraints = data["facets"].keys()
595
732
  else:
596
- contraints = data["primary_facets"]
597
- return {f: v for f, v in data["facets"].items() if f in contraints}
733
+ constraints = data["primary_facets"]
734
+ return {f: v for f, v in data["facets"].items() if f in constraints}
598
735
 
599
- def _get(self, url: str) -> Optional[requests.models.Response]:
736
+ def _get(
737
+ self, url: str, **kwargs: Any
738
+ ) -> Optional[requests.models.Response]:
600
739
  """Apply the get method to the databrowser."""
601
740
  logger.debug("Searching %s with parameters: %s", url, self._params)
741
+ params = kwargs.pop("params", {})
742
+ kwargs.setdefault("timeout", 30)
602
743
  try:
603
- res = requests.get(url, params=self._params, timeout=30)
744
+ res = requests.get(
745
+ url, params={**self._params, **params}, **kwargs
746
+ )
604
747
  res.raise_for_status()
605
748
  return res
606
749
  except KeyboardInterrupt:
@@ -17,7 +17,7 @@ def exception_handler(func: Callable[..., Any]) -> Callable[..., Any]:
17
17
 
18
18
  @wraps(func)
19
19
  def wrapper(*args: Any, **kwargs: Any) -> Any:
20
- """Wrapper function that handles the exeption."""
20
+ """Wrapper function that handles the exception."""
21
21
  try:
22
22
  return func(*args, **kwargs)
23
23
  except KeyboardInterrupt: