metadata-crawler 2509.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (34) hide show
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,439 @@
1
+ # drs_config.toml
2
+ # ----------------
3
+ # Default DRS settings
4
+
5
+ [drs_settings]
6
+
7
+ # 1) Allowed file extensions
8
+ suffixes = [".zarr", ".zar", ".nc4", ".nc", ".tar", ".hdf5", ".h5", ".grib", ".grb"]
9
+
10
+ # 2) Canonical index facets → raw keys in `data`
11
+ [drs_settings.index_schema]
12
+ # facet -> raw-key map
13
+
14
+ [drs_settings.index_schema.file]
15
+ key = "file"
16
+ type = "path"
17
+ multi_valued = false
18
+ required = true
19
+ unique = true
20
+
21
+ [drs_settings.index_schema.uri]
22
+ key = "uri"
23
+ type = "uri"
24
+ multi_valued = false
25
+ required = true
26
+
27
+ [drs_settings.index_schema.project]
28
+ key = "project"
29
+ type = "string"
30
+ multi_valued = true
31
+
32
+ [drs_settings.index_schema.product]
33
+ key = "product"
34
+ type = "string"
35
+ multi_valued = true
36
+
37
+ [drs_settings.index_schema.institute]
38
+ key = "institute"
39
+ type = "string"
40
+ multi_valued = true
41
+
42
+ [drs_settings.index_schema.model]
43
+ key = "model"
44
+ type = "string"
45
+ multi_valued = true
46
+
47
+ [drs_settings.index_schema.experiment]
48
+ key = "experiment"
49
+ type = "string"
50
+ multi_valued = true
51
+
52
+ [drs_settings.index_schema.time_frequency]
53
+ key = "time_frequency"
54
+ type = "string"
55
+ multi_valued = true
56
+
57
+ [drs_settings.index_schema.realm]
58
+ key = "realm"
59
+ type = "string"
60
+ multi_valued = true
61
+
62
+ [drs_settings.index_schema.cmor_table]
63
+ key = "cmor_table"
64
+ type = "string"
65
+ multi_valued = true
66
+
67
+ [drs_settings.index_schema.ensemble]
68
+ key = "ensemble"
69
+ type = "string"
70
+ multi_valued = true
71
+
72
+ [drs_settings.index_schema.variable]
73
+ key = "variable"
74
+ type = "string"
75
+ multi_valued = true
76
+
77
+ [drs_settings.index_schema.time]
78
+ key = "time"
79
+ type = "daterange"
80
+ multi_valued = false
81
+
82
+ [drs_settings.index_schema.grid_label]
83
+ key = "grid_label"
84
+ type = "string"
85
+ multi_valued = true
86
+ default = "gn"
87
+
88
+ [drs_settings.index_schema.version]
89
+ key = "version"
90
+ type = "string"
91
+ multi_valued = false
92
+ default = "-1"
93
+
94
+ [drs_settings.index_schema.driving_model]
95
+ key = "driving_model"
96
+ type = "string"
97
+ multi_valued = true
98
+
99
+ [drs_settings.index_schema.rcm_name]
100
+ key = "rcm_name"
101
+ type = "string"
102
+ multi_valued = true
103
+
104
+ [drs_settings.index_schema.rcm_version]
105
+ key = "rcm_version"
106
+ type = "string"
107
+ multi_valued = true
108
+
109
+ [drs_settings.index_schema.dataset]
110
+ key = "dataset"
111
+ type = "dataset"
112
+ multi_valued = false
113
+ default = ""
114
+
115
+ [drs_settings.index_schema.format]
116
+ key = "format"
117
+ type = "fmt"
118
+ multi_valued = false
119
+
120
+ [drs_settings.index_schema.grid_id]
121
+ key = "grid_id"
122
+ type = "string"
123
+ multi_valued = true
124
+
125
+ [drs_settings.index_schema.level_type]
126
+ key = "level_type"
127
+ type = "string"
128
+ multi_valued = true
129
+ default = "2d"
130
+
131
+ [drs_settings.index_schema.user]
132
+ key = "user"
133
+ type = "string"
134
+ multi_valued = false
135
+
136
+ [drs_settings.index_schema.fs_type]
137
+ key = "fs_type"
138
+ type = "storage"
139
+ multi_valued = false
140
+ default = "posix"
141
+
142
+ [drs_settings.index_schema.bbox]
143
+ key = "bbox"
144
+ type = "bbox"
145
+ multi_valued = true
146
+ default = [0,360,-90,90]
147
+
148
+ [drs_settings.index_schema.time_aggregation]
149
+ key = "time_aggregation"
150
+ type = "string"
151
+ multi_valued = true
152
+ default = "mean"
153
+
154
+
155
+ # 3) Global special rules
156
+ [drs_settings.special.time_aggregation]
157
+ type = "conditional"
158
+ condition = "'pt' in '{{ time_frequency | lower}}'"
159
+ true = "inst"
160
+ false = "mean"
161
+
162
+ # 4) Global storage options
163
+ [drs_settings.storage_options]
164
+ # 5) Common dialect specs + per-dialect overrides
165
+
166
+ # -- FREVA -------------------------------------------------------
167
+ [drs_settings.dialect.freva]
168
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
169
+ sources = ["path"]
170
+
171
+ [drs_settings.dialect.freva.facets]
172
+ project = "project"
173
+ product = "product"
174
+ institute = "institute"
175
+ model = "model"
176
+ experiment = "experiment"
177
+ time_frequency = "time_frequency"
178
+ realm = "realm"
179
+ cmor_table = "cmor_table"
180
+ ensemble = "member"
181
+ variable = "variable"
182
+ time = "time"
183
+ grid_label = "grid_label"
184
+ version = "version"
185
+ dataset = "dataset"
186
+ format = "format"
187
+ grid_id = "grid_id"
188
+ level_type = "level_type"
189
+ user = "user"
190
+ fs_type = "__fstype__"
191
+ bbox = "bbox"
192
+ time_aggregation = "time_aggregation"
193
+
194
+ [drs_settings.dialect.freva.path_specs]
195
+ dir_parts = [
196
+ "project",
197
+ "product",
198
+ "institution",
199
+ "model",
200
+ "experiment",
201
+ "time_frequency",
202
+ "realm",
203
+ "cmor_table",
204
+ "ensemble",
205
+ "version",
206
+ "variable"
207
+ ]
208
+ file_parts = [
209
+ "variable",
210
+ "cmor_table",
211
+ "model",
212
+ "experiment",
213
+ "ensemble",
214
+ "time"
215
+ ]
216
+ file_sep = "_"
217
+
218
+ # -- CMIP6 --------------------------------------------------------
219
+ [drs_settings.dialect.cmip6]
220
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
221
+ sources = ["path"]
222
+ [drs_settings.dialect.cmip6.facets]
223
+ project = "mip_era"
224
+ product = "activity_id"
225
+ model = "source_id"
226
+ experiment = "experiment_id"
227
+ cmor_table = "table_id"
228
+ ensemble = "member_id"
229
+ variable = "variable_id"
230
+ grid_label = "grid_label"
231
+ version = "version"
232
+ member = "member"
233
+ time_frequency = "time_frequency"
234
+ realm = "realm"
235
+ time_aggregation = "time_aggregation"
236
+ time = "time"
237
+ bbox = "bbox"
238
+ institute = "institution"
239
+ [drs_settings.dialect.cmip6.path_specs]
240
+ dir_parts = [
241
+ "mip_era",
242
+ "activity_id",
243
+ "institution",
244
+ "source_id",
245
+ "experiment_id",
246
+ "member_id",
247
+ "table_id",
248
+ "variable_id",
249
+ "grid_label",
250
+ "version"
251
+ ]
252
+ file_parts = [
253
+ "variable_id",
254
+ "table_id",
255
+ "source_id",
256
+ "experiment_id",
257
+ "member_id",
258
+ "grid_label",
259
+ "time"
260
+ ]
261
+ file_sep = "_"
262
+
263
+ # special rules for cmip6: derive realm and time_frequency from lookup tables
264
+ [drs_settings.dialect.cmip6.special.realm]
265
+ type = "lookup"
266
+ tree = ["{{ table_id }}", "{{ variable_id }}", "realm"]
267
+ attribute = "realm"
268
+ standard = "cmip6"
269
+ [drs_settings.dialect.cmip6.special.time_frequency]
270
+ type = "lookup"
271
+ tree = ["{{ table_id }}","{{ variable_id }}", "time-frequency"]
272
+ attribute = "frequency"
273
+ standard = "cmip6"
274
+ # -- CMIP5 --------------------------------------------------------
275
+ [drs_settings.dialect.cmip5]
276
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
277
+ sources = ["path"]
278
+ [drs_settings.dialect.cmip5.facets]
279
+ project = "project"
280
+ product = "product"
281
+ institute = "institution_id"
282
+ model = "model_id"
283
+ experiment = "experiment_id"
284
+ time_frequency = "time_frequency"
285
+ realm = "realm"
286
+ cmor_table = "cmor_table"
287
+ ensemble = "member_id"
288
+ variable = "variable_id"
289
+ time_aggregation = "time_aggregation"
290
+ bbox = "bbox"
291
+
292
+ [drs_settings.dialect.cmip5.path_specs]
293
+ dir_parts = [
294
+ "project",
295
+ "product",
296
+ "institution_id",
297
+ "model_id",
298
+ "experiment_id",
299
+ "time_frequency",
300
+ "realm",
301
+ "cmor_table",
302
+ "member_id",
303
+ "version",
304
+ "variable_id"
305
+ ]
306
+ file_parts = [
307
+ "variable_id",
308
+ "cmor_table",
309
+ "model_id",
310
+ "experiment_id",
311
+ "member_id",
312
+ "time"
313
+ ]
314
+ file_sep = "_"
315
+
316
+ # -- CORDEX --------------------------------------------------------
317
+ [drs_settings.dialect.cordex]
318
+ sources = ["path"]
319
+ defaults = { realm = "atmos" , level_type = "2d" }
320
+
321
+ [drs_settings.dialect.cordex.facets]
322
+ project = "project"
323
+ product = "domain"
324
+ institute = "institution"
325
+ model = "model"
326
+ driving_model = "driving_model"
327
+ rcm_name = "rcm_name"
328
+ rcm_version = "rcm_version"
329
+ time_aggregation = "time_aggregation"
330
+ bbox = "bbox"
331
+ time = "time"
332
+ variable = "variable"
333
+ ensemble = "ensemble"
334
+ realm = "realm"
335
+ time_frequency = "time_frequency"
336
+ experiment = "experiment"
337
+
338
+ [drs_settings.dialect.cordex.path_specs]
339
+ dir_parts = [
340
+ "project",
341
+ "product",
342
+ "domain",
343
+ "institution",
344
+ "driving_model",
345
+ "experiment",
346
+ "ensemble",
347
+ "rcm_name",
348
+ "rcm_version",
349
+ "time_frequency",
350
+ "variable",
351
+ "version"
352
+ ]
353
+ file_parts = [
354
+ "variable",
355
+ "domain",
356
+ "driving_model",
357
+ "experiment",
358
+ "ensemble",
359
+ "rcm_name",
360
+ "rcm_version",
361
+ "time_frequency",
362
+ "time"
363
+ ]
364
+ file_sep = "_"
365
+
366
+ # CORDEX‐specific domain‐to‐bbox map
367
+ [drs_settings.dialect.cordex.domains]
368
+ EAS-44 = [63.3574, 175.132, -18.22689, 58.59]
369
+ WAS-22 = [19.83779, 115.2829, -14.89043, 44.70015]
370
+ WAS-44 = [19.86429, 115.5316, -15.23168, 45.25018]
371
+ MED-11 = [-46.80627, 76.06158, 19.63504, 73.41]
372
+ EUR-11I = [-44.8125, 65.1875, 21.8125, 72.6875]
373
+ EUR-11 = [-44.594, 64.9646, 21.98791, 72.58528]
374
+ AFR-44I = [-25.25, 60.75, -46.25, 42.75]
375
+ CAS-22 = [10.7899, 140.1774, 18.00188, 69.51006]
376
+ NAM-22 = [-169.51, -24.72, 12.96031, 75.31002]
377
+ ARC-44I = [-179.75, 179.75, 48.75, 89.75]
378
+ SAM-44 = [-105.7188, -18.71515, -57.68022, 18.49518]
379
+ EUR-44I = [-44.75, 65.25, 21.75, 72.75]
380
+ AFR-22 = [-24.53, 59.9502, -47.63018, 43.89017]
381
+ EUR-44 = [-44.14069, 64.40398, 22.19937, 72.41994]
382
+ AFR-44 = [-24.64001, 60.27998, -45.76, 42.24]
383
+ SEA-22 = [88.16034, 148.0006, -16.06006, 27.94011]
384
+ ARC-44 = [-180.0, 179.17, 48.56, 90.0]
385
+ WAS-44I = [19.25, 116.25, -15.75, 45.75]
386
+ AUS-22 = [86.9056, -151.57, -52.46012, 12.51982]
387
+ AUS-44 = [89.24, -157.593, -52.57, 12.2103]
388
+ EAS-22 = [48.54336, -175.70, -1.345726, 62.34015]
389
+ EUR-22 = [-45.38, 66.1712, 21.88545, 71.8702]
390
+ SAM-22 = [-105.55, -17.32, -59.11018, 19.58714]
391
+ CAM-22 = [-124.12, -22.60, -19.54996, 34.8301]
392
+ CEU-3 = [1.62, 18.28, 44.85, 56.24]
393
+
394
+ # override bbox logic for CORDEX
395
+ [drs_settings.dialect.cordex.special.bbox]
396
+ type = "call"
397
+ call = 'dialect["cordex"]["domains"].get("{{ domain | upper }}", [0, 360, -90, 90])'
398
+
399
+ [drs_settings.dialect.cordex.special.model]
400
+ type = "call"
401
+ call = "'{{ driving_model }}-{{ rcm_name }}-{{ rcm_version }}'"
402
+
403
+ # -- NextGems -------------------------------------------------------
404
+ [drs_settings.dialect.nextgems]
405
+ sources = ["data"]
406
+ defaults = { level_type = "2d", bbox = [0, 360, -90, 90]}
407
+
408
[drs_settings.dialect.nextgems.facets]
# Keys are the canonical facet names declared in `drs_settings.index_schema`;
# values are the raw metadata keys the facets are read from.
project = "project"
product = "experiment_id"
institute = "institution_id"
model = "source_id"
experiment = "simulation_id"
time_frequency = "frequency"
time_aggregation = "time_reduction"
variable = "variable_id"
realm = "realm"
cmor_table = "realm"
level_type = "level_type"
format = "format"
grid_id = "grid_id"
bbox = "bbox"
423
+
424
+ [drs_settings.dialect.nextgems.data_specs.globals]
425
+ experiment_id = "experiment_id"
426
+ source_id = "source_id"
427
+ institution_id = "institution_id"
428
+ simulation_id = "simulation_id"
429
+ frequency = "frequency"
430
+ time_reduction = "time_reduction"
431
+ variable_id = "__variables__"
432
+
433
+ [drs_settings.dialect.nextgems.data_specs.stats.time]
434
+ stat = "range"
435
+ coord = "time"
436
+
437
+ [drs_settings.dialect.nextgems.special.grid_label]
438
+ type = "call"
439
+ call = "'{{ file }}'.rpartition('.')[0].split('_')[-1]"
@@ -0,0 +1,132 @@
1
+ """API for adding new cataloging systems."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import abc
6
+ from pathlib import Path
7
+ from typing import (
8
+ Any,
9
+ AsyncIterator,
10
+ Dict,
11
+ List,
12
+ Optional,
13
+ Tuple,
14
+ Union,
15
+ cast,
16
+ )
17
+
18
+ from ..logger import logger
19
+ from ..utils import Console
20
+ from .config import SchemaField
21
+ from .metadata_stores import CatalogueReader, IndexStore
22
+
23
+
24
class BaseIndex:
    """Base class to index metadata in a cataloguing system.

    Any data ingestion class that implements metadata ingestion into
    cataloguing systems should inherit from this class.

    The base class opens the (optional) catalogue file and exposes its
    content via :py:attr:`index_schema`, :py:attr:`index_names` and
    :py:meth:`get_metadata`. Only :py:meth:`index` and :py:meth:`delete`
    are abstract methods that need to be implemented for each cataloguing
    ingestion class.

    Parameters
    ^^^^^^^^^^
    catalogue_file:
        Path to the intake catalogue. If ``None``, no catalogue is opened
        and the instance has no metadata store.
    batch_size:
        The amount of metadata that should be gathered `before` ingesting
        it into the catalogue.
    storage_options:
        Options that are passed on to the catalogue reader.
    **kwargs:
        Additional keyword arguments; accepted but not used by this
        base class.

    Notes
    ^^^^^
    The class does not derive from :py:class:`abc.ABC`, hence the
    ``abc.abstractmethod`` markers are not enforced at instantiation time.
    """

    def __init__(
        self,
        catalogue_file: Optional[Union[str, Path]] = None,
        batch_size: int = 2500,
        storage_options: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        # Without a catalogue file there is nothing to read from; the
        # properties below then fall back to empty defaults.
        self._store: Optional[IndexStore] = None
        if catalogue_file is not None:
            reader = CatalogueReader(
                catalogue_file=catalogue_file,
                batch_size=batch_size,
                storage_options=storage_options,
            )
            self._store = reader.store
        self.__post_init__()

    def __post_init__(self) -> None:
        """Hook called at the end of ``__init__``; does nothing by default."""

    @property
    def index_schema(self) -> Dict[str, SchemaField]:
        """Get the index schema (empty if no store is available)."""
        # The string form keeps ``cast`` a pure type-checker hint without
        # building the generic alias on every access.
        return cast(
            "Dict[str, SchemaField]", getattr(self._store, "schema", {})
        )

    @property
    def index_names(self) -> Tuple[str, str]:
        """Get the names of the indexes for latest and all data."""
        return cast(
            "Tuple[str, str]", getattr(self._store, "index_names", ("", ""))
        )

    async def get_metadata(
        self, index_name: str
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Get the metadata of an index in batches.

        Nothing is yielded if no metadata store is available. After the
        last batch a summary with the number of items is printed.

        Parameters
        ^^^^^^^^^^
        index_name:
            Name of the index that should be read.
        """
        if not self._store:
            return
        num_items = 0
        logger.info("Indexing %s", index_name)
        async for batch in self._store.read(index_name):
            yield batch
            num_items += len(batch)
        msg = f"Indexed {num_items:10,.0f} items for index {index_name}"
        if Console.is_terminal:
            Console.print(msg)
        else:
            print(msg)

    @abc.abstractmethod
    async def delete(self, **kwargs: Any) -> None:
        """Delete data from the cataloguing system.

        Parameters
        ^^^^^^^^^^
        **kwargs:
            Implementation specific keyword arguments, for example:

            flush:
                Boolean indicating whether or not the data should be
                flushed after amending the catalogue (if implemented).
            search_keys:
                key-value based query for data that should be deleted.
        """

    @abc.abstractmethod
    async def index(
        self,
        metadata: Optional[Dict[str, Any]] = None,
        core: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Add metadata into the cataloguing system.

        Parameters
        ^^^^^^^^^^
        metadata:
            The meta data itself, saved in a dictionary.
        core:
            Name of the catalog. This entry might have different meanings
            for different cataloguing systems. For example apache solr
            will receive the name of the ``core``.
        **kwargs:
            Implementation specific keyword arguments, for example a
            ``flush`` flag indicating whether or not the data should be
            flushed after adding to the catalogue (if implemented).
        """