metadata-crawler 2510.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (35) hide show
  1. metadata_crawler/__init__.py +263 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +831 -0
  7. metadata_crawler/api/drs_config.toml +440 -0
  8. metadata_crawler/api/index.py +151 -0
  9. metadata_crawler/api/metadata_stores.py +755 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +140 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +547 -0
  22. metadata_crawler/data_collector.py +278 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +206 -0
  25. metadata_crawler/ingester/solr.py +282 -0
  26. metadata_crawler/logger.py +153 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +419 -0
  29. metadata_crawler/utils/__init__.py +482 -0
  30. metadata_crawler/utils/cftime_utils.py +207 -0
  31. metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
  32. metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
  33. metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
  34. metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
  35. metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,440 @@
1
+ # common_drs.toml
2
+ # ----------------
3
+ # Default DRS settings
4
+
5
+ [drs_settings]
6
+
7
+ # 1) Allowed file extensions
8
+ suffixes = [".zarr", ".zar", ".nc4", ".nc", ".tar", ".hdf5", ".h5", ".grib", ".grb"]
9
+
10
+ # 2) Canonical index facets → raw keys in `data`
11
+ [drs_settings.index_schema]
12
+ # facet -> raw-key map
13
+
14
+ [drs_settings.index_schema.file]
15
+ key = "file"
16
+ type = "path"
17
+ multi_valued = false
18
+ required = true
19
+ unique = true
20
+
21
+ [drs_settings.index_schema.uri]
22
+ key = "uri"
23
+ type = "uri"
24
+ multi_valued = false
25
+ required = true
26
+
27
+ [drs_settings.index_schema.project]
28
+ key = "project"
29
+ type = "string"
30
+ multi_valued = true
31
+
32
+ [drs_settings.index_schema.product]
33
+ key = "product"
34
+ type = "string"
35
+ multi_valued = true
36
+
37
+ [drs_settings.index_schema.institute]
38
+ key = "institute"
39
+ type = "string"
40
+ multi_valued = true
41
+
42
+ [drs_settings.index_schema.model]
43
+ key = "model"
44
+ type = "string"
45
+ multi_valued = true
46
+
47
+ [drs_settings.index_schema.experiment]
48
+ key = "experiment"
49
+ type = "string"
50
+ multi_valued = true
51
+
52
+ [drs_settings.index_schema.time_frequency]
53
+ key = "time_frequency"
54
+ type = "string"
55
+ multi_valued = true
56
+
57
+ [drs_settings.index_schema.realm]
58
+ key = "realm"
59
+ type = "string"
60
+ multi_valued = true
61
+
62
+ [drs_settings.index_schema.cmor_table]
63
+ key = "cmor_table"
64
+ type = "string"
65
+ multi_valued = true
66
+
67
+ [drs_settings.index_schema.ensemble]
68
+ key = "ensemble"
69
+ type = "string"
70
+ multi_valued = true
71
+
72
+ [drs_settings.index_schema.variable]
73
+ key = "variable"
74
+ type = "string"
75
+ multi_valued = true
76
+
77
+ [drs_settings.index_schema.time]
78
+ key = "time"
79
+ type = "daterange"
80
+ multi_valued = false
81
+ default = "fx"
82
+
83
+ [drs_settings.index_schema.grid_label]
84
+ key = "grid_label"
85
+ type = "string"
86
+ multi_valued = true
87
+ default = "gn"
88
+
89
+ [drs_settings.index_schema.version]
90
+ key = "version"
91
+ type = "string"
92
+ multi_valued = false
93
+ default = "-1"
94
+
95
+ [drs_settings.index_schema.driving_model]
96
+ key = "driving_model"
97
+ type = "string"
98
+ multi_valued = true
99
+
100
+ [drs_settings.index_schema.rcm_name]
101
+ key = "rcm_name"
102
+ type = "string"
103
+ multi_valued = true
104
+
105
+ [drs_settings.index_schema.rcm_version]
106
+ key = "rcm_version"
107
+ type = "string"
108
+ multi_valued = true
109
+
110
+ [drs_settings.index_schema.dataset]
111
+ key = "dataset"
112
+ type = "dataset"
113
+ multi_valued = false
114
+ default = ""
115
+
116
+ [drs_settings.index_schema.format]
117
+ key = "format"
118
+ type = "fmt"
119
+ multi_valued = false
120
+
121
+ [drs_settings.index_schema.grid_id]
122
+ key = "grid_id"
123
+ type = "string"
124
+ multi_valued = true
125
+
126
+ [drs_settings.index_schema.level_type]
127
+ key = "level_type"
128
+ type = "string"
129
+ multi_valued = true
130
+ default = "2d"
131
+
132
+ [drs_settings.index_schema.user]
133
+ key = "user"
134
+ type = "string"
135
+ multi_valued = false
136
+
137
+ [drs_settings.index_schema.fs_type]
138
+ key = "fs_type"
139
+ type = "storage"
140
+ multi_valued = false
141
+ default = "posix"
142
+
143
+ [drs_settings.index_schema.bbox]
144
+ key = "bbox"
145
+ type = "bbox"
146
+ multi_valued = true
147
+ default = [0,360,-90,90]
148
+
149
+ [drs_settings.index_schema.time_aggregation]
150
+ key = "time_aggregation"
151
+ type = "string"
152
+ multi_valued = true
153
+ default = "mean"
154
+
155
+
156
+ # 3) Global special rules
157
+ [drs_settings.special.time_aggregation]
158
+ type = "conditional"
159
+ condition = "'pt' in '{{ time_frequency | lower}}'"
160
+ true = "inst"
161
+ false = "mean"
162
+
163
+ # 4) Global storage options
164
+ [drs_settings.storage_options]
165
+ # 5) Common dialect specs + per-dialect overrides
166
+
167
+ # -- FREVA -------------------------------------------------------
168
+ [drs_settings.dialect.freva]
169
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
170
+ sources = ["path"]
171
+
172
+ [drs_settings.dialect.freva.facets]
173
+ project = "project"
174
+ product = "product"
175
+ institute = "institute"
176
+ model = "model"
177
+ experiment = "experiment"
178
+ time_frequency = "time_frequency"
179
+ realm = "realm"
180
+ cmor_table = "cmor_table"
181
+ ensemble = "member"
182
+ variable = "variable"
183
+ time = "time"
184
+ grid_label = "grid_label"
185
+ version = "version"
186
+ dataset = "dataset"
187
+ format = "format"
188
+ grid_id = "grid_id"
189
+ level_type = "level_type"
190
+ user = "user"
191
+ fs_type = "__fstype__"
192
+ bbox = "bbox"
193
+ time_aggregation = "time_aggregation"
194
+
195
+ [drs_settings.dialect.freva.path_specs]
196
+ dir_parts = [
197
+ "project",
198
+ "product",
199
+ "institution",
200
+ "model",
201
+ "experiment",
202
+ "time_frequency",
203
+ "realm",
204
+ "cmor_table",
205
+ "ensemble",
206
+ "version",
207
+ "variable"
208
+ ]
209
+ file_parts = [
210
+ "variable",
211
+ "cmor_table",
212
+ "model",
213
+ "experiment",
214
+ "ensemble",
215
+ "time"
216
+ ]
217
+ file_sep = "_"
218
+
219
+ # -- CMIP6 --------------------------------------------------------
220
+ [drs_settings.dialect.cmip6]
221
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
222
+ sources = ["path"]
223
+ [drs_settings.dialect.cmip6.facets]
224
+ project = "mip_era"
225
+ product = "activity_id"
226
+ model = "source_id"
227
+ experiment = "experiment_id"
228
+ cmor_table = "table_id"
229
+ ensemble = "member_id"
230
+ variable = "variable_id"
231
+ grid_label = "grid_label"
232
+ version = "version"
233
+ member = "member"
234
+ time_frequency = "time_frequency"
235
+ realm = "realm"
236
+ time_aggregation = "time_aggregation"
237
+ time = "time"
238
+ bbox = "bbox"
239
+ institute = "institution"
240
+ [drs_settings.dialect.cmip6.path_specs]
241
+ dir_parts = [
242
+ "mip_era",
243
+ "activity_id",
244
+ "institution",
245
+ "source_id",
246
+ "experiment_id",
247
+ "member_id",
248
+ "table_id",
249
+ "variable_id",
250
+ "grid_label",
251
+ "version"
252
+ ]
253
+ file_parts = [
254
+ "variable_id",
255
+ "table_id",
256
+ "source_id",
257
+ "experiment_id",
258
+ "member_id",
259
+ "grid_label",
260
+ "time"
261
+ ]
262
+ file_sep = "_"
263
+
264
+ # override realm and time_frequency for cmip6 via lookup rules
265
+ [drs_settings.dialect.cmip6.special.realm]
266
+ type = "lookup"
267
+ tree = ["{{ table_id }}", "{{ variable_id }}", "realm"]
268
+ attribute = "realm"
269
+ standard = "cmip6"
270
+ [drs_settings.dialect.cmip6.special.time_frequency]
271
+ type = "lookup"
272
+ tree = ["{{ table_id }}","{{ variable_id }}", "time-frequency"]
273
+ attribute = "frequency"
274
+ standard = "cmip6"
275
+ # -- CMIP5 --------------------------------------------------------
276
+ [drs_settings.dialect.cmip5]
277
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
278
+ sources = ["path"]
279
+ [drs_settings.dialect.cmip5.facets]
280
+ project = "project"
281
+ product = "product"
282
+ institute = "institution_id"
283
+ model = "model_id"
284
+ experiment = "experiment_id"
285
+ time_frequency = "time_frequency"
286
+ realm = "realm"
287
+ cmor_table = "cmor_table"
288
+ ensemble = "member_id"
289
+ variable = "variable_id"
290
+ time_aggregation = "time_aggregation"
291
+ bbox = "bbox"
292
+
293
+ [drs_settings.dialect.cmip5.path_specs]
294
+ dir_parts = [
295
+ "project",
296
+ "product",
297
+ "institution_id",
298
+ "model_id",
299
+ "experiment_id",
300
+ "time_frequency",
301
+ "realm",
302
+ "cmor_table",
303
+ "member_id",
304
+ "version",
305
+ "variable_id"
306
+ ]
307
+ file_parts = [
308
+ "variable_id",
309
+ "cmor_table",
310
+ "model_id",
311
+ "experiment_id",
312
+ "member_id",
313
+ "time"
314
+ ]
315
+ file_sep = "_"
316
+
317
+ # -- CORDEX --------------------------------------------------------
318
+ [drs_settings.dialect.cordex]
319
+ sources = ["path"]
320
+ defaults = { realm = "atmos" , level_type = "2d" }
321
+
322
+ [drs_settings.dialect.cordex.facets]
323
+ project = "project"
324
+ product = "domain"
325
+ institute = "institution"
326
+ model = "model"
327
+ driving_model = "driving_model"
328
+ rcm_name = "rcm_name"
329
+ rcm_version = "rcm_version"
330
+ time_aggregation = "time_aggregation"
331
+ bbox = "bbox"
332
+ time = "time"
333
+ variable = "variable"
334
+ ensemble = "ensemble"
335
+ realm = "realm"
336
+ time_frequency = "time_frequency"
337
+ experiment = "experiment"
338
+
339
+ [drs_settings.dialect.cordex.path_specs]
340
+ dir_parts = [
341
+ "project",
342
+ "product",
343
+ "domain",
344
+ "institution",
345
+ "driving_model",
346
+ "experiment",
347
+ "ensemble",
348
+ "rcm_name",
349
+ "rcm_version",
350
+ "time_frequency",
351
+ "variable",
352
+ "version"
353
+ ]
354
+ file_parts = [
355
+ "variable",
356
+ "domain",
357
+ "driving_model",
358
+ "experiment",
359
+ "ensemble",
360
+ "rcm_name",
361
+ "rcm_version",
362
+ "time_frequency",
363
+ "time"
364
+ ]
365
+ file_sep = "_"
366
+
367
+ # CORDEX‐specific domain‐to‐bbox map
368
+ [drs_settings.dialect.cordex.domains]
369
+ EAS-44 = [63.3574, 175.132, -18.22689, 58.59]
370
+ WAS-22 = [19.83779, 115.2829, -14.89043, 44.70015]
371
+ WAS-44 = [19.86429, 115.5316, -15.23168, 45.25018]
372
+ MED-11 = [-46.80627, 76.06158, 19.63504, 73.41]
373
+ EUR-11I = [-44.8125, 65.1875, 21.8125, 72.6875]
374
+ EUR-11 = [-44.594, 64.9646, 21.98791, 72.58528]
375
+ AFR-44I = [-25.25, 60.75, -46.25, 42.75]
376
+ CAS-22 = [10.7899, 140.1774, 18.00188, 69.51006]
377
+ NAM-22 = [-169.51, -24.72, 12.96031, 75.31002]
378
+ ARC-44I = [-179.75, 179.75, 48.75, 89.75]
379
+ SAM-44 = [-105.7188, -18.71515, -57.68022, 18.49518]
380
+ EUR-44I = [-44.75, 65.25, 21.75, 72.75]
381
+ AFR-22 = [-24.53, 59.9502, -47.63018, 43.89017]
382
+ EUR-44 = [-44.14069, 64.40398, 22.19937, 72.41994]
383
+ AFR-44 = [-24.64001, 60.27998, -45.76, 42.24]
384
+ SEA-22 = [88.16034, 148.0006, -16.06006, 27.94011]
385
+ ARC-44 = [-180.0, 179.17, 48.56, 90.0]
386
+ WAS-44I = [19.25, 116.25, -15.75, 45.75]
387
+ AUS-22 = [86.9056, -151.57, -52.46012, 12.51982]
388
+ AUS-44 = [89.24, -157.593, -52.57, 12.2103]
389
+ EAS-22 = [48.54336, -175.70, -1.345726, 62.34015]
390
+ EUR-22 = [-45.38, 66.1712, 21.88545, 71.8702]
391
+ SAM-22 = [-105.55, -17.32, -59.11018, 19.58714]
392
+ CAM-22 = [-124.12, -22.60, -19.54996, 34.8301]
393
+ CEU-3 = [1.62, 18.28, 44.85, 56.24]
394
+
395
+ # override bbox logic for CORDEX
396
+ [drs_settings.dialect.cordex.special.bbox]
397
+ type = "call"
398
+ call = 'dialect["cordex"]["domains"].get("{{ domain | upper }}", [0, 360, -90, 90])'
399
+
400
+ [drs_settings.dialect.cordex.special.model]
401
+ type = "call"
402
+ call = "'{{ driving_model }}-{{ rcm_name }}-{{ rcm_version }}'"
403
+
404
+ # -- NextGems -------------------------------------------------------
405
+ [drs_settings.dialect.nextgems]
406
+ sources = ["data"]
407
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
408
+
409
[drs_settings.dialect.nextgems.facets]
# Canonical facet name -> raw metadata key for the NextGems dialect.
# The left-hand names must match facets declared in
# [drs_settings.index_schema]; "porduct" and "gird_id" were misspelled and
# therefore did not map to the "product" and "grid_id" facets.
project = "project"
product = "experiment_id"
institute = "institution_id"
model = "source_id"
experiment = "simulation_id"
time_frequency = "frequency"
time_aggregation = "time_reduction"
variable = "variable_id"
realm = "realm"
cmor_table = "realm"
level_type = "level_type"
format = "format"
grid_id = "grid_id"
bbox = "bbox"
424
+
425
+ [drs_settings.dialect.nextgems.data_specs.globals]
426
+ experiment_id = "experiment_id"
427
+ source_id = "source_id"
428
+ institution_id = "institution_id"
429
+ simulation_id = "simulation_id"
430
+ frequency = "frequency"
431
+ time_reduction = "time_reduction"
432
+ variable_id = "__variables__"
433
+
434
+ [drs_settings.dialect.nextgems.data_specs.stats.time]
435
+ stat = "range"
436
+ coord = "time"
437
+
438
+ [drs_settings.dialect.nextgems.special.grid_label]
439
+ type = "call"
440
+ call = "'{{ file }}'.rpartition('.')[0].split('_')[-1]"
@@ -0,0 +1,151 @@
1
+ """API for adding new cataloging systems."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import abc
6
+ from pathlib import Path
7
+ from types import TracebackType
8
+ from typing import (
9
+ Any,
10
+ AsyncIterator,
11
+ Dict,
12
+ List,
13
+ Optional,
14
+ Self,
15
+ Tuple,
16
+ Type,
17
+ Union,
18
+ cast,
19
+ )
20
+
21
+ from ..logger import logger
22
+ from ..utils import Console, IndexProgress
23
+ from .config import SchemaField
24
+ from .metadata_stores import CatalogueReader, IndexStore
25
+
26
+
27
class BaseIndex:
    """Base class to index metadata in the indexing system.

    Any data ingestion class that implements metadata ingestion into
    cataloguing systems should inherit from this class.

    Only :py:func:`index` and :py:func:`delete` are abstract methods that
    need to be implemented for each cataloguing ingestion class. Opening the
    catalogue and reading it in batches (:py:func:`get_metadata`) is done
    by this base class.

    Parameters
    ^^^^^^^^^^
    catalogue_file:
        Path to the intake catalogue. If ``None`` no catalogue is opened and
        :py:func:`get_metadata` yields nothing.
    batch_size:
        The amount for metadata that should be gathered `before` ingesting
        it into the catalogue.
    storage_options:
        Optional storage options that are handed to the catalogue reader
        together with ``catalogue_file``.
    progress:
        Optional rich progress object that should display the progress of the
        tasks.
    **kwargs:
        Additional keyword arguments; accepted but not used by this base
        class.

    Attributes
    ^^^^^^^^^^
    progress:
        The progress object that is updated while reading the catalogue.
    """

    # NOTE(review): ``delete`` and ``index`` are decorated with
    # ``abc.abstractmethod`` but this class does not use ``abc.ABCMeta``, so
    # Python does not enforce that subclasses implement them. Confirm that
    # nothing instantiates ``BaseIndex`` directly before deriving from
    # ``abc.ABC``.

    def __init__(
        self,
        catalogue_file: Optional[Union[str, Path]] = None,
        batch_size: int = 2500,
        storage_options: Optional[Dict[str, Any]] = None,
        progress: Optional[IndexProgress] = None,
        **kwargs: Any,
    ) -> None:
        self._store: Optional[IndexStore] = None
        self.progress = progress or IndexProgress(total=-1)
        if catalogue_file is not None:
            _reader = CatalogueReader(
                catalogue_file=catalogue_file,
                batch_size=batch_size,
                storage_options=storage_options,
            )
            self._store = _reader.store
        # Hook for subclasses; called last so all attributes are available.
        self.__post_init__()

    def __post_init__(self) -> None:
        """Run subclass specific setup at the end of ``__init__``."""

    async def __aenter__(self) -> Self:
        return self

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None: ...

    @property
    def index_schema(self) -> Dict[str, SchemaField]:
        """Get the index schema (empty if no catalogue was opened)."""
        # The cast target is given as a string: it is only a hint for the
        # type checker and does not need to be evaluated at runtime.
        return cast(
            "Dict[str, SchemaField]", getattr(self._store, "schema", {})
        )

    @property
    def index_names(self) -> Tuple[str, str]:
        """Get the names of the indexes for latest and all data.

        A pair of empty strings is returned if no catalogue was opened.
        """
        return cast(
            "Tuple[str, str]", getattr(self._store, "index_names", ("", ""))
        )

    async def get_metadata(
        self, index_name: str
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Get the metadata of an index in batches.

        Each batch read from the store is yielded unchanged. After the
        consumer has processed a batch the progress object is advanced by
        the batch length. Once the store is exhausted a summary line with
        the total number of items is printed. Nothing is yielded or printed
        if no catalogue was opened.

        Parameters
        ^^^^^^^^^^
        index_name:
            Name of the index that should be read.
        """
        if not self._store:
            return
        num_items = 0
        logger.info("Indexing %s", index_name)
        async for batch in self._store.read(index_name):
            yield batch
            self.progress.update(len(batch))
            num_items += len(batch)
        msg = f"Indexed {num_items:10,.0f} items for index {index_name}"
        if Console.is_terminal:
            Console.print(msg)
        else:
            print(msg)

    @abc.abstractmethod
    async def delete(self, **kwargs: Any) -> None:
        """Delete data from the cataloguing system.

        Parameters
        ^^^^^^^^^^
        flush:
            Boolean indicating whether or not the data should be flushed after
            amending the catalogue (if implemented).
        search_keys:
            key-value based query for data that should be deleted.
        """

    @abc.abstractmethod
    async def index(
        self,
        metadata: Optional[dict[str, Any]] = None,
        core: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Add metadata into the cataloguing system.

        Parameters
        ^^^^^^^^^^
        metadata:
            The meta data itself, saved in a dictionary.
        core:
            Name of the catalog. This entry might have different meanings
            for different cataloguing systems. For example apache solr will
            receive the name of the ``core``.
        flush:
            Boolean indicating whether or not the data should be flushed after
            adding to the catalogue (if implemented)
        """