metadata-crawler 2509.0.1-py3-none-any.whl → 2510.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of metadata-crawler might be problematic.

@@ -53,6 +53,7 @@ def index(
  *catalogue_files: Union[Path, str, List[str], List[Path]],
  batch_size: int = 2500,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
  **kwargs: Any,
  ) -> None:
  """Index metadata in the indexing system.
@@ -68,6 +69,8 @@ def index(
  If the index system supports batch-sizes, the size of the batches.
  verbosity:
  Set the verbosity level.
+ log_suffix:
+ Add a suffix to the log file output.

  Other Parameters
  ^^^^^^^^^^^^^^^^
@@ -94,6 +97,7 @@ def index(
  *catalogue_files,
  batch_size=batch_size,
  verbosity=verbosity,
+ log_suffix=log_suffix,
  **kwargs,
  )
  )
@@ -103,6 +107,7 @@ def delete(
  index_system: str,
  batch_size: int = 2500,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
  **kwargs: Any,
  ) -> None:
  """Delete metadata from the indexing system.
@@ -116,6 +121,8 @@ def delete(
  If the index system supports batch-sizes, the size of the batches.
  verbosity:
  Set the verbosity of the system.
+ log_suffix:
+ Add a suffix to the log file output.

  Other Parameters
  ^^^^^^^^^^^^^^^^
@@ -135,7 +142,11 @@ def delete(
  facets=[("project", "CMIP6"), ("institute", "MPI-M")],
  )
  """
- uvloop.run(async_delete(index_system, batch_size=batch_size, **kwargs))
+ uvloop.run(
+ async_delete(
+ index_system, batch_size=batch_size, log_suffix=log_suffix, **kwargs
+ )
+ )


  def add(
@@ -155,6 +166,7 @@ def add(
  all_versions: str = IndexName().all,
  n_procs: Optional[int] = None,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
  password: bool = False,
  fail_under: int = -1,
  **kwargs: Any,
@@ -204,6 +216,8 @@ def add(
  Set the number of parallel processes for collecting.
  verbosity:
  Set the verbosity of the system.
+ log_suffix:
+ Add a suffix to the log file output.
  fail_under:
  Fail if less than X of the discovered files could be indexed.

@@ -242,6 +256,7 @@ def add(
  n_procs=n_procs,
  storage_options=storage_options,
  verbosity=verbosity,
+ log_suffix=log_suffix,
  fail_under=fail_under,
  **kwargs,
  )
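For orientation, a minimal sketch of how the new keyword is used from the Python API. This is a sketch only: the import location is assumed from the wrapper functions above, the index-system name is a placeholder, and the facet values are copied from the docstring example in the hunk above.

```python
# Assumed import location for the delete() wrapper shown above (hypothetical).
from metadata_crawler import delete

delete(
    "solr",                # placeholder index system name
    batch_size=2500,
    verbosity=1,
    log_suffix="nightly",  # new in 2510.0.0: tags the rotating log file name
    facets=[("project", "CMIP6"), ("institute", "MPI-M")],
)
```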
@@ -1 +1 @@
- __version__ = "2509.0.1"
+ __version__ = "2510.0.0"
@@ -17,6 +17,7 @@ from typing import (
  List,
  Literal,
  Optional,
+ Tuple,
  Union,
  cast,
  )
@@ -255,23 +256,31 @@ class CrawlerSettings(BaseModel):
  class PathSpecs(BaseModel):
  """Implementation of the Directory reference syntax."""

- dir_parts: List[str] = Field(default_factory=list)
- file_parts: List[str] = Field(default_factory=list)
+ dir_parts: Optional[List[str]] = None
+ file_parts: Optional[List[str]] = None
  file_sep: str = "_"

- def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
- """Read path encoded metadata from path specs."""
+ def _get_metadata_from_dir(
+ self, data: Dict[str, Any], rel_path: Path
+ ) -> None:
  dir_parts = rel_path.parent.parts
- file_parts = rel_path.name.split(self.file_sep)
- if len(dir_parts) == len(self.dir_parts):
- data: Dict[str, Any] = dict(zip(self.dir_parts, dir_parts))
- else:
+ if self.dir_parts and len(dir_parts) == len(self.dir_parts):
+ _parts = dict(zip(self.dir_parts, dir_parts))
+ elif self.dir_parts:
  raise MetadataCrawlerException(
  (
  f"Number of dir parts for {rel_path.parent} do not match "
  f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
  )
  ) from None
+ data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+ def _get_metadata_from_filename(
+ self, data: Dict[str, Any], rel_path: Path
+ ) -> None:
+ if self.file_parts is None:
+ return
+ file_parts = rel_path.name.split(self.file_sep)
  if len(file_parts) == len(self.file_parts):
  _parts = dict(zip(self.file_parts, file_parts))
  elif (
@@ -285,8 +294,13 @@ class PathSpecs(BaseModel):
  f"- needs: {len(self.file_parts)} has: {len(file_parts)})"
  )
  )
- _parts.setdefault("time", "fx")
  data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+ def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
+ """Read path encoded metadata from path specs."""
+ data: Dict[str, Any] = {}
+ self._get_metadata_from_dir(data, rel_path)
+ self._get_metadata_from_filename(data, rel_path)
  data.pop("_", None)
  return data

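A short sketch of what the reworked `PathSpecs` parsing does. The DRS layout, file name and module path are illustrative assumptions (the module path is inferred from the `from .api.config import ...` imports further down); `dir_parts`/`file_parts` may now also be left unset, in which case that component is simply skipped instead of raising. The hard-coded `time = "fx"` fallback moved into the TOML schema (next hunk).

```python
from pathlib import Path

from metadata_crawler.api.config import PathSpecs  # assumed module path

specs = PathSpecs(
    dir_parts=["project", "institute", "version"],   # illustrative DRS layout
    file_parts=["variable", "table", "time"],
    file_sep="_",
)
meta = specs.get_metadata_from_path(
    Path("CMIP6/MPI-M/v20190710/tas_Amon_2000-2010.nc")
)
# -> {'project': 'CMIP6', 'institute': 'MPI-M', 'version': 'v20190710',
#     'variable': 'tas', 'table': 'Amon', 'time': '2000-2010.nc'}
```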
@@ -609,7 +623,9 @@ class DRSConfig(BaseModel, TemplateMixin):
  case "conditional":
  _rule = textwrap.dedent(rule.condition or "").strip()
  s_cond = self.render_templates(_rule, data)
- cond = eval(s_cond, {}, getattr(self, "_model_dict", {}))
+ cond = eval(
+ s_cond, {}, getattr(self, "_model_dict", {})
+ ) # nosec
  result = rule.true if cond else rule.false
  case "lookup":
  args = cast(List[str], self.render_templates(rule.tree, data))
@@ -627,7 +643,7 @@ class DRSConfig(BaseModel, TemplateMixin):
  self.render_templates(_call, data),
  {},
  getattr(self, "_model_dict", {}),
- )
+ ) # nosec
  if result:
  inp.metadata[facet] = result

@@ -666,7 +682,7 @@ class DRSConfig(BaseModel, TemplateMixin):

  def max_directory_tree_level(
  self, search_dir: str | Path, drs_type: str
- ) -> int:
+ ) -> Tuple[int, bool]:
  """Get the maximum level for descending into directories.

  When searching for files in a directory we can only traverse the directory
@@ -686,18 +702,19 @@ class DRSConfig(BaseModel, TemplateMixin):
  version = cast(
  str, self.dialect[standard].facets.get("version", "version")
  )
+ is_versioned = True
+ dir_parts = self.dialect[standard].path_specs.dir_parts or []
  try:
- version_idx = self.dialect[standard].path_specs.dir_parts.index(
- version
- )
+ version_idx = dir_parts.index(version)
  except ValueError:
  # No version given
- version_idx = len(self.dialect[standard].path_specs.dir_parts)
+ version_idx = len(dir_parts)
+ is_versioned = False
  if root_path == search_dir:
  current_pos = 0
  else:
  current_pos = len(search_dir.relative_to(root_path).parts)
- return version_idx - current_pos
+ return version_idx - current_pos, is_versioned

  def is_complete(self, data: Dict[str, Any], standard: str) -> bool:
  """Check if all metadata that can be collected was collected."""
@@ -78,6 +78,7 @@ multi_valued = true
  key = "time"
  type = "daterange"
  multi_valued = false
+ default = "fx"

  [drs_settings.index_schema.grid_label]
  key = "grid_label"
@@ -4,19 +4,22 @@ from __future__ import annotations

  import abc
  from pathlib import Path
+ from types import TracebackType
  from typing import (
  Any,
  AsyncIterator,
  Dict,
  List,
  Optional,
+ Self,
  Tuple,
+ Type,
  Union,
  cast,
  )

  from ..logger import logger
- from ..utils import Console
+ from ..utils import Console, IndexProgress
  from .config import SchemaField
  from .metadata_stores import CatalogueReader, IndexStore

@@ -40,6 +43,9 @@ class BaseIndex:
  batch_size:
  The amount for metadata that should be gathered `before` ingesting
  it into the catalogue.
+ progress:
+ Optional rich progress object that should display the progress of the
+ tasks.

  Attributes
  ^^^^^^^^^^
@@ -50,9 +56,11 @@ class BaseIndex:
  catalogue_file: Optional[Union[str, Path]] = None,
  batch_size: int = 2500,
  storage_options: Optional[Dict[str, Any]] = None,
+ progress: Optional[IndexProgress] = None,
  **kwargs: Any,
  ) -> None:
  self._store: Optional[IndexStore] = None
+ self.progress = progress or IndexProgress(total=-1)
  if catalogue_file is not None:
  _reader = CatalogueReader(
  catalogue_file=catalogue_file or "",
@@ -64,6 +72,16 @@ class BaseIndex:

  def __post_init__(self) -> None: ...

+ async def __aenter__(self) -> Self:
+ return self
+
+ async def __aexit__(
+ self,
+ exc_type: Optional[Type[BaseException]],
+ exc_val: Optional[BaseException],
+ exc_tb: Optional[TracebackType],
+ ) -> None: ...
+
  @property
  def index_schema(self) -> Dict[str, SchemaField]:
  """Get the index schema."""
@@ -92,6 +110,7 @@ class BaseIndex:
  logger.info("Indexing %s", index_name)
  async for batch in self._store.read(index_name):
  yield batch
+ self.progress.update(len(batch))
  num_items += len(batch)
  msg = f"Indexed {num_items:10,.0f} items for index {index_name}"
  Console.print(msg) if Console.is_terminal else print(msg)
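With the new `__aenter__`/`__aexit__` pair, index backends are meant to be driven as async context managers, which is exactly what the rewritten `async_call` in run.py (further down) does. A minimal sketch under stated assumptions: `SomeIndex` is a hypothetical stand-in for a concrete backend, its `index()` body is elided, and the keyword-only construction mirrors the call in run.py.

```python
import asyncio

from metadata_crawler.api.index import BaseIndex
from metadata_crawler.utils import IndexProgress


class SomeIndex(BaseIndex):
    """Hypothetical backend used only for illustration."""

    async def index(self) -> None:
        ...  # e.g. drain self.get_metadata(...) into some index server


async def main() -> None:
    progress = IndexProgress(total=0)
    progress.start()
    try:
        # __aexit__ gives backends a hook to flush queues / close resources.
        async with SomeIndex(batch_size=2500, catalogue_file=None, progress=progress) as idx:
            await idx.index()
    finally:
        progress.stop()


asyncio.run(main())
```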
@@ -473,10 +473,7 @@ class CatalogueReader:
  ) -> None:
  catalogue_file = str(catalogue_file)
  storage_options = storage_options or {}
- fs, _ = IndexStore.get_fs(catalogue_file, **storage_options)
- path = fs.unstrip_protocol(catalogue_file)
- with fs.open(path) as stream:
- cat = yaml.safe_load(stream.read())
+ cat = self.load_catalogue(catalogue_file, **storage_options)
  _schema_json = cat["metadata"]["schema"]
  schema = {s["key"]: SchemaField(**s) for k, s in _schema_json.items()}
  index_name = IndexName(**cat["metadata"]["index_names"])
@@ -493,6 +490,14 @@ class CatalogueReader:
  storage_options=storage_options,
  )

+ @staticmethod
+ def load_catalogue(path: Union[str, Path], **storage_options: Any) -> Any:
+ """Load a intake yaml catalogue (remote or local)."""
+ fs, _ = IndexStore.get_fs(str(path), **storage_options)
+ cat_path = fs.unstrip_protocol(path)
+ with fs.open(cat_path) as stream:
+ return yaml.safe_load(stream.read())
+

  class QueueConsumer:
  """Class that consumes the file discovery queue."""
@@ -722,6 +727,7 @@ class CatalogueWriter:
  "latest": self.index_name.latest,
  "all": self.index_name.all,
  },
+ "indexed_objects": self.ingested_objects,
  "schema": {
  k: json.loads(s.model_dump_json())
  for k, s in self.store.schema.items()
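The new `indexed_objects` field written into the catalogue metadata above is what run.py later reads back to give the progress bar a total. A small sketch using the new `CatalogueReader.load_catalogue` helper; the catalogue path is a placeholder.

```python
from metadata_crawler.api.metadata_stores import CatalogueReader

cat = CatalogueReader.load_catalogue("catalogue.yaml")  # placeholder path
total = cat.get("metadata", {}).get("indexed_objects", 0)
print(f"catalogue advertises {total} indexed objects")
```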
@@ -6,7 +6,7 @@ from typing import Any, Dict, Mapping, Optional

  from jinja2 import Environment, Template, Undefined

- ENV = Environment(undefined=Undefined, autoescape=False)
+ ENV = Environment(undefined=Undefined, autoescape=True)


  @lru_cache(maxsize=1024)
@@ -71,11 +71,15 @@ class S3Path(PathTemplate):
  self, path: Union[str, Path, pathlib.Path]
  ) -> AsyncIterator[str]:
  """Retrieve sub directories of directory."""
- path = str(path)
  client = await self._get_client()
- for _content in await client._lsdir(path):
- if _content.get("type", "") == "directory":
- yield f'{_content.get("name", "")}'
+ path = str(path)
+ if await self.is_file(path):
+ yield path
+ else:
+ for _content in await client._lsdir(path):
+ size: int = _content.get("size") or 0
+ if _content.get("type", "") == "directory" or size > 0:
+ yield _content.get("name", "")

  async def rglob(
  self, path: str | Path | pathlib.Path, glob_pattern: str = "*"
metadata_crawler/cli.py CHANGED
@@ -34,7 +34,6 @@ from .api.metadata_stores import CatalogueBackends, IndexName
  from .backends.intake import IntakePath
  from .logger import (
  THIS_NAME,
- add_file_handle,
  apply_verbosity,
  logger,
  )
@@ -48,7 +47,9 @@ KwargValue = Union[


  def walk_catalogue(
- path: str, storage_options: Optional[Dict[str, Any]] = None, **kwargs: Any
+ path: str,
+ storage_options: Optional[Dict[str, Any]] = None,
+ **kwargs: Any,
  ) -> int:
  """Recursively traverse an intake catalogue.

@@ -359,6 +360,13 @@ class ArgParse:
  action="append",
  nargs=2,
  )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ action="count",
+ default=self.verbose,
+ help="Increase the verbosity level.",
+ )
  parser.set_defaults(apply_func=walk_catalogue)

  def _index_submcommands(self) -> None:
@@ -391,8 +399,8 @@ class ArgParse:
  "-b",
  "--batch-size",
  type=int,
- default=25_000,
- help="Set the batch size for ingestion.",
+ default=5_000,
+ help="Set the batch size for indexing.",
  )
  parser.add_argument(
  "--storage_option",
@@ -494,7 +502,6 @@ class ArgParse:
  "apply_func",
  "verbose",
  "version",
- "log_suffix",
  "storage_option",
  "shadow",
  )
@@ -509,7 +516,6 @@ class ArgParse:
  self.kwargs["shadow"] = _flatten(args.shadow)
  self.kwargs["storage_options"] = so
  self.verbose = args.verbose
- add_file_handle(args.log_suffix)
  self.kwargs["verbosity"] = self.verbose
  return args

@@ -519,7 +525,9 @@ def _run(
  **kwargs: KwargValue,
  ) -> None:
  """Apply the parsed method."""
- old_level = apply_verbosity(getattr(parser, "verbose", 0))
+ old_level = apply_verbosity(
+ getattr(parser, "verbose", 0), suffix=getattr(parser, "log_suffix", None)
+ )
  try:
  parser.apply_func(**kwargs)
  except Exception as error:
@@ -15,6 +15,7 @@ from typing import (
  Dict,
  Iterator,
  Optional,
+ Tuple,
  Type,
  Union,
  cast,
@@ -33,7 +34,7 @@ from .utils import (
  print_performance,
  )

- ScanItem = tuple[str, str, bool]
+ ScanItem = Tuple[str, str, bool, bool]


  class DataCollector:
@@ -138,6 +139,7 @@ class DataCollector:
  drs_type: str,
  search_dir: str,
  iterable: bool = True,
+ is_versioned: bool = True,
  ) -> None:
  if iterable:
  try:
@@ -161,7 +163,7 @@ class DataCollector:
  await self.ingest_queue.put(
  _inp, drs_type, name=self.index_name.all
  )
- if rank == 0:
+ if rank == 0 or is_versioned is False:
  await self.ingest_queue.put(
  _inp, drs_type, name=self.index_name.latest
  )
@@ -176,16 +178,22 @@ class DataCollector:
  if item is None: # sentinel -> exit
  # do not task_done() for sentinel
  break
- drs_type, path, iterable = item
+ drs_type, path, iterable, is_versioned = item
  try:
- await self._ingest_dir(drs_type, path, iterable=iterable)
+ await self._ingest_dir(
+ drs_type, path, iterable=iterable, is_versioned=is_versioned
+ )
  except Exception as error:
  logger.error(error)
  finally:
  self._scan_queue.task_done()

  async def _iter_content(
- self, drs_type: str, inp_dir: str, pos: int = 0
+ self,
+ drs_type: str,
+ inp_dir: str,
+ pos: int = 0,
+ is_versioned: bool = True,
  ) -> None:
  """Walk recursively until files or the version level is reached."""
  store = self.config.datasets[drs_type].backend
@@ -203,7 +211,6 @@ class DataCollector:

  iterable = False if suffix == ".zarr" else iterable
  op: Optional[Callable[..., Coroutine[Any, Any, None]]] = None
-
  if is_file and suffix in self.config.suffixes:
  op = self._ingest_dir
  elif pos <= 0 or suffix == ".zarr":
@@ -211,13 +218,17 @@ class DataCollector:

  if op is not None:
  # enqueue the heavy scan; workers will run _ingest_dir concurrently
- await self._scan_queue.put((drs_type, inp_dir, iterable))
+ await self._scan_queue.put(
+ (drs_type, inp_dir, iterable, is_versioned)
+ )
  return

  # otherwise, recurse sequentially (cheap) — no task per directory
  try:
  async for sub in store.iterdir(inp_dir):
- await self._iter_content(drs_type, sub, pos - 1)
+ await self._iter_content(
+ drs_type, sub, pos - 1, is_versioned=is_versioned
+ )
  except Exception as error:
  logger.error(error)

@@ -239,10 +250,19 @@ class DataCollector:

  # produce scan items by walking roots sequentially
  for drs_type, path in self.search_objects: # <- property is sync
- pos = self.config.max_directory_tree_level(
+ pos, is_versioned = self.config.max_directory_tree_level(
  path, drs_type=drs_type
  )
- await self._iter_content(drs_type, path, pos)
+ if pos < 0:
+ logger.warning(
+ "Can't define latest version of versioned dataset."
+ " This might lead to unexpected results. Try adjusting"
+ " your search path."
+ )
+
+ await self._iter_content(
+ drs_type, path, pos, is_versioned=is_versioned
+ )

  # wait until all queued scan items are processed
  await self._scan_queue.join()
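For reference, the widened `ScanItem` queue entry now carries four fields; the layout below is read off the unpacking shown in the worker loop above, with illustrative values only.

```python
from typing import Tuple

# (drs_type, path, iterable, is_versioned) matches
# "drs_type, path, iterable, is_versioned = item" in the worker loop above.
ScanItem = Tuple[str, str, bool, bool]

item: ScanItem = ("cmip6", "/data/CMIP6/MPI-M", True, True)  # illustrative values
```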
@@ -80,12 +80,13 @@ class MongoIndex(BaseIndex):
  await collection.bulk_write(ops, ordered=False)

  async def _index_collection(
- self, db: AsyncIOMotorDatabase[Any], collection: str
+ self, db: AsyncIOMotorDatabase[Any], collection: str, suffix: str = ""
  ) -> None:
  """Index a collection."""
- await db[collection].create_index(self.unique_index, unique=True)
+ col = collection + suffix
+ await db[col].create_index(self.unique_index, unique=True)
  async for chunk in self.get_metadata(collection):
- await self._bulk_upsert(chunk, db[collection])
+ await self._bulk_upsert(chunk, db[col])

  async def _prep_db_connection(
  self, database: str, url: str
@@ -119,12 +120,24 @@ class MongoIndex(BaseIndex):
  default="metadata",
  ),
  ] = "metadata",
+ index_suffix: Annotated[
+ Optional[str],
+ cli_parameter(
+ "--index-suffix",
+ help="Suffix for the latest and all version collections.",
+ type=str,
+ ),
+ ] = None,
  ) -> None:
  """Add metadata to the mongoDB metadata server."""
  db = await self._prep_db_connection(database, url or "")
  async with asyncio.TaskGroup() as tg:
  for collection in self.index_names:
- tg.create_task(self._index_collection(db, collection))
+ tg.create_task(
+ self._index_collection(
+ db, collection, suffix=index_suffix or ""
+ )
+ )

  async def close(self) -> None:
  """Close the mongoDB connection."""
@@ -5,9 +5,13 @@ from __future__ import annotations
  import asyncio
  import logging
  import os
- from typing import Annotated, Any, Dict, List, Optional
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from types import TracebackType
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, cast

  import aiohttp
+ import orjson

  from ..api.cli import cli_function, cli_parameter
  from ..api.index import BaseIndex
@@ -18,9 +22,26 @@ from ..logger import logger
  class SolrIndex(BaseIndex):
  """Ingest metadata into an apache solr server."""

+ senteniel: Optional[bytes] = None
+
  def __post_init__(self) -> None:
- self.timeout = aiohttp.ClientTimeout(total=50)
+ self.timeout = aiohttp.ClientTimeout(
+ connect=10, sock_connect=10, sock_read=180, total=None
+ )
+ self.semaphore = asyncio.Event()
+ self.max_http_workers: int = 0
+ queue_max: int = 128
+ encode_workers: int = 4
  self._uri: str = ""
+ self.cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
+ self.producer_queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = (
+ asyncio.Queue(maxsize=queue_max)
+ )
+ self.connector = aiohttp.TCPConnector(
+ ttl_dns_cache=300,
+ use_dns_cache=True,
+ enable_cleanup_closed=True,
+ )

  async def solr_url(self, server: str, core: str) -> str:
  """Construct the solr url from a given solr core."""
@@ -112,23 +133,98 @@ class SolrIndex(BaseIndex):

  return metadata

- async def _index_core(self, server: str, core: str) -> None:
- """Index data to a solr core."""
- url = await self.solr_url(server, core)
- async for chunk in self.get_metadata(core):
- async with aiohttp.ClientSession(
- timeout=self.timeout, raise_for_status=True
- ) as session:
- try:
- payload = list(map(self._convert, chunk))
- async with session.post(url, json=payload) as resp:
- logger.debug(await resp.text())
- except Exception as error:
- logger.log(
- logging.WARNING,
- error,
- exc_info=logger.level < logging.INFO,
- )
+ def _encode_payload(self, chunk: List[Dict[str, Any]]) -> bytes:
+ """CPU-bound: convert docs and JSON-encode off the event loop."""
+ return orjson.dumps([self._convert(x) for x in chunk])
+
+ async def _post_chunk(
+ self,
+ session: aiohttp.ClientSession,
+ url: str,
+ body: bytes,
+ ) -> None:
+ """POST one batch with minimal overhead and simple retries."""
+ status = 500
+ t0 = time.perf_counter()
+ try:
+ async with session.post(
+ url, data=body, headers={"Content-Type": "application/json"}
+ ) as resp:
+ status = resp.status
+ await resp.read()
+
+ except Exception as error:
+ logger.log(
+ logging.WARNING,
+ error,
+ exc_info=logger.level < logging.INFO,
+ )
+ return
+ logger.debug(
+ "POST %s -> %i (index time: %.3f)",
+ url,
+ status,
+ time.perf_counter() - t0,
+ )
+
+ async def consumer(self, session: aiohttp.ClientSession) -> None:
+ """Consume the metadata read by the porducers."""
+ while True:
+ update_url, body = await self.producer_queue.get()
+ if body is self.senteniel:
+ self.producer_queue.task_done()
+ break
+ try:
+ await self._post_chunk(session, update_url, cast(bytes, body))
+ finally:
+ self.producer_queue.task_done()
+
+ async def _index_core(
+ self,
+ session: aiohttp.ClientSession,
+ server: str,
+ core: str,
+ suffix: str,
+ http_workers: int = 8,
+ ) -> None:
+ """Zero-copy-ish, backpressured, bounded-concurrency indexer.
+
+ - No per-batch commit.
+ - Bounded queue so tasks don't pile up.
+ - Constant number of worker tasks (not O(batches)).
+ """
+ base_url = await self.solr_url(server, core + suffix)
+ update_url = base_url.split("?", 1)[0] # guard
+ loop = asyncio.get_running_loop()
+ async for batch in self.get_metadata(core):
+ body = await loop.run_in_executor(
+ self.cpu_pool, self._encode_payload, batch
+ )
+ await self.producer_queue.put((update_url, body))
+ commit_url = f"{update_url}?commit=true"
+ async with session.post(
+ commit_url,
+ data=b"[]",
+ headers={"Content-Type": "application/json"},
+ ) as resp:
+ if resp.status >= 400:
+ text = await resp.text()
+ logger.warning(
+ "COMMIT %s -> %i: %s", commit_url, resp.status, text
+ )
+
+ async def __aexit__(
+ self,
+ exc_type: Optional[Type[BaseException]],
+ exc_val: Optional[BaseException],
+ exc_tb: Optional[TracebackType],
+ ) -> None:
+
+ try:
+ self.producer_queue.shutdown()
+ except AttributeError: # pragma: no cover
+ pass # prgama: no cover
+ self.cpu_pool.shutdown()

  @cli_function(
  help="Add metadata to the apache solr metadata server.",
@@ -145,8 +241,41 @@ class SolrIndex(BaseIndex):
  type=str,
  ),
  ] = None,
+ index_suffix: Annotated[
+ Optional[str],
+ cli_parameter(
+ "--index-suffix",
+ help="Suffix for the latest and all version collections.",
+ type=str,
+ ),
+ ] = None,
+ http_workers: Annotated[
+ int,
+ cli_parameter(
+ "--http-workers", help="Number of ingestion threads.", type=int
+ ),
+ ] = 8,
  ) -> None:
  """Add metadata to the apache solr metadata server."""
- async with asyncio.TaskGroup() as tg:
- for core in self.index_names:
- tg.create_task(self._index_core(server or "", core))
+ async with aiohttp.ClientSession(
+ timeout=self.timeout, connector=self.connector, raise_for_status=True
+ ) as session:
+ consumers = [
+ asyncio.create_task(self.consumer(session))
+ for _ in range(http_workers)
+ ]
+ async with asyncio.TaskGroup() as tg:
+ for core in self.index_names:
+ tg.create_task(
+ self._index_core(
+ session,
+ server or "",
+ core,
+ suffix=index_suffix or "",
+ http_workers=http_workers,
+ )
+ )
+ for _ in range(http_workers):
+ await self.producer_queue.put(("", self.senteniel))
+ await self.producer_queue.join()
+ await asyncio.gather(*consumers)
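The reworked Solr ingester is a bounded producer/consumer pipeline: batches are JSON-encoded in a thread pool, pushed onto a size-limited queue for backpressure, and a fixed number of HTTP workers drain it, with a `None` sentinel per worker signalling shutdown. The following is a generic, self-contained sketch of that pattern only (no Solr, aiohttp, or metadata-crawler APIs involved); all names in it are illustrative.

```python
import asyncio
from typing import Optional

SENTINEL: Optional[bytes] = None  # one sentinel per worker signals shutdown


async def consumer(queue: asyncio.Queue, name: str) -> None:
    while True:
        body = await queue.get()
        try:
            if body is SENTINEL:
                break
            await asyncio.sleep(0.01)  # stand-in for the HTTP POST
            print(f"{name} posted {len(body)} bytes")
        finally:
            queue.task_done()


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue(maxsize=8)  # bounded -> backpressure
    workers = [asyncio.create_task(consumer(queue, f"worker-{i}")) for i in range(3)]
    for batch in (b"[1]", b"[2]", b"[3]"):
        await queue.put(batch)        # producers block once the queue is full
    for _ in workers:
        await queue.put(SENTINEL)
    await queue.join()                # wait until every item was task_done()'d
    await asyncio.gather(*workers)


asyncio.run(main())
```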
@@ -11,7 +11,7 @@ import appdirs
  from rich.console import Console
  from rich.logging import RichHandler

- THIS_NAME = "data-crawler"
+ THIS_NAME = "metadata-crawler"

  logging.basicConfig(
  level=logging.WARNING,
@@ -24,7 +24,7 @@ logging.config.dictConfig(
  # keep existing handlers
  "disable_existing_loggers": False,
  "root": {
- "level": "WARNING",
+ "level": "CRITICAL",
  "handlers": ["default"],
  },
  "formatters": {
@@ -36,16 +36,12 @@ logging.config.dictConfig(
  "default": {
  "class": "logging.StreamHandler",
  "formatter": "standard",
- "level": "WARNING",
+ "level": "CRITICAL",
  },
  },
  }
  )

- logging.getLogger("sqlalchemy").setLevel(logging.WARNING)
- logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
- logging.getLogger("sqlalchemy.pool").setLevel(logging.WARNING)
-


  class Logger(logging.Logger):
@@ -56,11 +52,14 @@ class Logger(logging.Logger):
  no_debug: list[str] = ["watchfiles", "httpcore", "pymongo", "pika"]

  def __init__(
- self, name: Optional[str] = None, level: Optional[int] = None
+ self,
+ name: Optional[str] = None,
+ level: Optional[int] = None,
+ suffix: Optional[str] = None,
  ) -> None:
  """Instantiate this logger only once and for all."""
- level = level or int(
- cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.WARNING)))
+ self.level = level or int(
+ cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.CRITICAL)))
  )
  name = name or THIS_NAME
  logger_format = logging.Formatter(self.logfmt, self.datefmt)
@@ -78,11 +77,16 @@ class Logger(logging.Logger):
  ),
  )
  self._logger_stream_handle.setFormatter(logger_format)
- self._logger_stream_handle.setLevel(level)
- super().__init__(name, level)
+ self._logger_stream_handle.setLevel(self.level)
+ super().__init__(name, self.level)

  self.propagate = False
  self.handlers = [self._logger_stream_handle]
+ (
+ self.add_file_handle(suffix=suffix)
+ if os.getenv("MDC_LOG_INIT", "0") == "1"
+ else None
+ )

  def set_level(self, level: int) -> None:
  """Set the logger level to level."""
@@ -92,7 +96,7 @@ class Logger(logging.Logger):
  log_level = min(level, logging.CRITICAL)
  handler.setLevel(log_level)
  self.setLevel(level)
- logger.level = level
+ self.level = level

  def error(
  self,
@@ -105,28 +109,30 @@ class Logger(logging.Logger):
  kwargs.setdefault("exc_info", True)
  self._log(logging.ERROR, msg, args, **kwargs)

-
- logger = Logger()
+ def add_file_handle(
+ self,
+ suffix: Optional[str] = None,
+ level: int = logging.CRITICAL,
+ ) -> None:
+ """Add a file log handle to the logger."""
+ suffix = suffix or os.getenv("MDC_LOG_SUFFIX", "")
+ base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
+ log_dir = Path(os.getenv("MDC_LOG_DIR", appdirs.user_log_dir(THIS_NAME)))
+ log_dir.mkdir(exist_ok=True, parents=True)
+ logger_file_handle = RotatingFileHandler(
+ log_dir / f"{base_name}.log",
+ mode="a",
+ maxBytes=5 * 1024**2,
+ backupCount=5,
+ encoding="utf-8",
+ delay=False,
+ )
+ logger_file_handle.setFormatter(self.file_format)
+ logger_file_handle.setLevel(self.level)
+ self.addHandler(logger_file_handle)


- def add_file_handle(
- suffix: Optional[str], log_level: int = logging.CRITICAL
- ) -> None:
- """Add a file log handle to the logger."""
- base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
- log_dir = Path(appdirs.user_log_dir(THIS_NAME))
- log_dir.mkdir(exist_ok=True, parents=True)
- logger_file_handle = RotatingFileHandler(
- log_dir / f"{base_name}.log",
- mode="a",
- maxBytes=5 * 1024**2,
- backupCount=5,
- encoding="utf-8",
- delay=False,
- )
- logger_file_handle.setFormatter(logger.file_format)
- logger_file_handle.setLevel(min(log_level, logging.CRITICAL))
- logger.addHandler(logger_file_handle)
+ logger = Logger()


  def get_level_from_verbosity(verbosity: int) -> int:
@@ -134,9 +140,14 @@ def get_level_from_verbosity(verbosity: int) -> int:
  return max(logging.CRITICAL - 10 * verbosity, -1)


- def apply_verbosity(level: int) -> int:
+ def apply_verbosity(
+ level: Optional[int] = None, suffix: Optional[str] = None
+ ) -> int:
  """Set the logging level of the handlers to a certain level."""
+ level = logger.level if level is None else level
  old_level = logger.level
  level = get_level_from_verbosity(level)
  logger.set_level(level)
+ logger.add_file_handle(suffix, level)
+
  return old_level
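Log-file handling has moved from the module-level `add_file_handle()` into the `Logger` class, and `apply_verbosity()` now attaches the file handler itself. A brief sketch of the resulting call pattern; the environment variables (`MDC_LOG_DIR`, `MDC_LOG_SUFFIX`, `MDC_LOG_INIT`, `MDC_LOG_LEVEL`) all appear in the hunks above, while the directory and suffix values are placeholders.

```python
import os

from metadata_crawler.logger import apply_verbosity, logger

os.environ["MDC_LOG_DIR"] = "/tmp/mdc-logs"        # override the appdirs default log directory
old_level = apply_verbosity(2, suffix="nightly")   # file handler writes metadata-crawler-nightly.log
logger.warning("crawl finished with warnings")
logger.set_level(old_level)                        # restore the previous level afterwards
```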
metadata_crawler/run.py CHANGED
@@ -1,6 +1,5 @@
  """Apply the metadata collector."""

- import asyncio
  import os
  import time
  from fnmatch import fnmatch
@@ -9,15 +8,21 @@ from types import NoneType
  from typing import Any, Collection, Dict, List, Optional, Sequence, Union, cast

  import tomlkit
+ import yaml
  from rich.prompt import Prompt

  from .api.config import CrawlerSettings, DRSConfig, strip_protocol
- from .api.metadata_stores import CatalogueBackendType, IndexName
+ from .api.metadata_stores import (
+ CatalogueBackendType,
+ CatalogueReader,
+ IndexName,
+ )
  from .data_collector import DataCollector
  from .logger import apply_verbosity, get_level_from_verbosity, logger
  from .utils import (
  Console,
  EmptyCrawl,
+ IndexProgress,
  MetadataCrawlerException,
  find_closest,
  load_plugins,
@@ -49,6 +54,20 @@ def _match(match: str, items: Collection[str]) -> List[str]:
  return out


+ def _get_num_of_indexed_objects(
+ catalogue_files: FilesArg, storage_options: Optional[Dict[str, Any]] = None
+ ) -> int:
+ num_objects = 0
+ storage_options = storage_options or {}
+ for cat_file in _norm_files(catalogue_files):
+ try:
+ cat = CatalogueReader.load_catalogue(cat_file, **storage_options)
+ num_objects += cat.get("metadata", {}).get("indexed_objects", 0)
+ except (FileNotFoundError, IsADirectoryError, yaml.parser.ParserError):
+ pass
+ return num_objects
+
+
  def _get_search(
  config_file: Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument],
  search_dirs: Optional[List[str]] = None,
@@ -86,14 +105,22 @@ async def async_call(
  batch_size: int = 2500,
  catalogue_files: Optional[Sequence[Union[Path, str]]] = None,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
+ num_objects: int = 0,
  *args: Any,
  **kwargs: Any,
  ) -> None:
- """Index metadata."""
+ """Add / Delete metadata from index."""
  env = cast(os._Environ[str], os.environ.copy())
- old_level = apply_verbosity(verbosity)
+ old_level = apply_verbosity(verbosity, suffix=log_suffix)
+
  try:
+ progress = IndexProgress(total=num_objects)
+ os.environ["MDC_LOG_INIT"] = "1"
  os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+ os.environ["MDC_LOG_SUFFIX"] = (
+ log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
+ )
  backends = load_plugins("metadata_crawler.ingester")
  try:
  cls = backends[index_system]
@@ -103,22 +130,22 @@ async def async_call(
  )
  raise ValueError(msg) from None
  flat_files = _norm_files(catalogue_files)
- _event_loop = asyncio.get_event_loop()
  flat_files = flat_files or [""]
- futures = []
  storage_options = kwargs.pop("storage_options", {})
+ progress.start()
  for cf in flat_files:
- obj = cls(
+ async with cls(
  batch_size=batch_size,
  catalogue_file=cf or None,
  storage_options=storage_options,
- )
- func = getattr(obj, method)
- future = _event_loop.create_task(func(**kwargs))
- futures.append(future)
- await asyncio.gather(*futures)
+ progress=progress,
+ ) as obj:
+ func = getattr(obj, method)
+ await func(**kwargs)
+

  finally:
  os.environ = env
+ progress.stop()
  logger.set_level(old_level)

@@ -127,6 +154,7 @@ async def async_index(
  *catalogue_files: Union[Path, str, List[str], List[Path]],
  batch_size: int = 2500,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
  **kwargs: Any,
  ) -> None:
  """Index metadata in the indexing system.
@@ -142,6 +170,8 @@ async def async_index(
  If the index system supports batch-sizes, the size of the batches.
  verbosity:
  Set the verbosity of the system.
+ log_suffix:
+ Add a suffix to the log file output.

  Other Parameters
  ^^^^^^^^^^^^^^^^
@@ -168,6 +198,11 @@ async def async_index(
  "index",
  batch_size=batch_size,
  verbosity=verbosity,
+ log_suffix=log_suffix,
+ num_objects=_get_num_of_indexed_objects(
+ kwargs["catalogue_files"],
+ storage_options=kwargs.get("storage_options"),
+ ),
  **kwargs,
  )

@@ -176,6 +211,7 @@ async def async_delete(
  index_system: str,
  batch_size: int = 2500,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
  **kwargs: Any,
  ) -> None:
  """Delete metadata from the indexing system.
@@ -188,6 +224,8 @@ async def async_delete(
  If the index system supports batch-sizes, the size of the batches.
  verbosity:
  Set the verbosity of the system.
+ log_suffix:
+ Add a suffix to the log file output.

  Other Parameters
  ^^^^^^^^^^^^^^^^^
@@ -212,6 +250,7 @@ async def async_delete(
  "delete",
  batch_size=batch_size,
  verbosity=verbosity,
+ log_suffix=log_suffix,
  **kwargs,
  )

@@ -236,6 +275,7 @@ async def async_add(
  password: bool = False,
  n_procs: Optional[int] = None,
  verbosity: int = 0,
+ log_suffix: Optional[str] = None,
  fail_under: int = -1,
  **kwargs: Any,
  ) -> None:
@@ -282,6 +322,8 @@ async def async_add(
  Set the number of parallel processes for collecting.
  verbosity:
  Set the verbosity of the system.
+ log_suffix:
+ Add a suffix to the log file output.
  fail_under:
  Fail if less than X of the discovered files could be indexed.

@@ -305,9 +347,13 @@ async def async_add(

  """
  env = cast(os._Environ[str], os.environ.copy())
- old_level = apply_verbosity(verbosity)
+ old_level = apply_verbosity(verbosity, suffix=log_suffix)
  try:
+ os.environ["MDC_LOG_INIT"] = "1"
  os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+ os.environ["MDC_LOG_SUFFIX"] = (
+ log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
+ )
  config_file = config_file or os.environ.get(
  "EVALUATION_SYSTEM_CONFIG_DIR"
  )
@@ -316,7 +362,7 @@ async def async_add(
  "You must give a config file/directory"
  )
  st = time.time()
- passwd = ""
+ passwd: Optional[str] = None
  if password: # pragma: no cover
  passwd = Prompt.ask(
  "[b]Enter the password", password=True
metadata_crawler/utils.py CHANGED
@@ -32,6 +32,7 @@ import rich.console
  import rich.spinner
  from dateutil.parser import isoparse
  from rich.live import Live
+ from rich.progress import Progress, TaskID

  from .logger import logger

@@ -330,6 +331,76 @@ def timedelta_to_str(seconds: Union[int, float]) -> str:
  return " ".join(out[::-1])


+ class IndexProgress:
+ """A helper that displays the progress of index Tasks."""
+
+ def __init__(
+ self,
+ total: int = 0,
+ interactive: Optional[bool] = None,
+ text: str = "Indexing: ",
+ ) -> None:
+ if interactive is None:
+ self._interactive = bool(
+ int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
+ )
+ else:
+ self._interactive = interactive
+ self._log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
+ self.text = text
+ self._done = 0
+ self._task: TaskID = TaskID(0)
+ self._total = total
+ self._start = self._last_log = time.time()
+ self._progress = Progress()
+ self._last_printed_percent: float = -1.0
+
+ def start(self) -> None:
+ """Start the progress bar."""
+ self._start = self._last_log = time.time()
+
+ if self._interactive:
+ self._task = self._progress.add_task(
+ f"[green] {self.text}", total=self._total or None
+ )
+ self._progress.start()
+
+ def stop(self) -> None:
+ """Stop the progress bar."""
+ if self._interactive:
+ self._progress.stop()
+ else:
+ self._text_update()
+
+ def _text_update(self, bar_width: int = 40) -> None:
+ elapsed = timedelta(seconds=int(time.time() - self._start))
+ log_interval = timedelta(seconds=int(time.time() - self._last_log))
+ if self._total > 0:
+ filled = int((self._last_printed_percent / 100) * bar_width)
+ bar = "#" * filled + "-" * (bar_width - filled)
+ text = f"{self.text} [{bar}] {self._last_printed_percent:>6,.02f}%"
+ else:
+ text = f"{self.text} [{self._done:>12,}]"
+ if log_interval.total_seconds() >= self._log_interval:
+ print(f"{text} ({elapsed})", flush=True)
+ self._last_log = time.time()
+
+ def update(self, inc: int) -> None:
+ """Update the status progress bar by an increment."""
+ self._done += inc
+
+ if self._interactive is True:
+ desc = f"{self.text} [{self._done:>10d}]" if self._done == 0 else None
+ self._progress.update(self._task, advance=inc, description=desc)
+ return
+
+ frac = self._done / max(self._total, 1)
+ pct = frac * 100
+ if pct > self._last_printed_percent or self._total == 0:
+ self._last_printed_percent = pct
+ self._text_update()
+
+
  @daemon
  def print_performance(
  print_status: EventLike,
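A small, self-contained sketch of the new `IndexProgress` helper; the totals and batch size are made up. With `interactive=False` it falls back to the plain-text updates that are throttled by `MDC_LOG_INTERVAL`.

```python
import time

from metadata_crawler.utils import IndexProgress

progress = IndexProgress(total=10_000, interactive=False, text="Indexing: ")
progress.start()
for _ in range(4):
    time.sleep(0.1)
    progress.update(2_500)  # advance by one ingested batch
progress.stop()
```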
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: metadata-crawler
- Version: 2509.0.1
+ Version: 2510.0.0
  Summary: Crawl, extract and push climate metadata for indexing.
  Author-email: "DKRZ, Clint" <freva@dkrz.de>
  Requires-Python: >=3.11
@@ -34,7 +34,7 @@ Requires-Dist: numpy
  Requires-Dist: orjson
  Requires-Dist: pyarrow
  Requires-Dist: h5netcdf
- Requires-Dist: pydantic
+ Requires-Dist: pydantic<2.12
  Requires-Dist: pyarrow
  Requires-Dist: rich
  Requires-Dist: rich-argparse
@@ -96,10 +96,12 @@ Provides-Extra: tests

  [![License](https://img.shields.io/badge/License-BSD-purple.svg)](LICENSE)
  [![PyPI](https://img.shields.io/pypi/pyversions/metadata-crawler.svg)](https://pypi.org/project/metadata-crawler/)
+ [![Conda Version](https://img.shields.io/conda/vn/conda-forge/metadata-crawler.svg)](https://anaconda.org/conda-forge/metadata-crawler)
  [![Docs](https://readthedocs.org/projects/metadata-crawler/badge/?version=latest)](https://metadata-crawler.readthedocs.io/en/latest/?badge=latest)
  [![Tests](https://github.com/freva-org/metadata-crawler/actions/workflows/ci_job.yml/badge.svg)](https://github.com/freva-org/metadata-crawler/actions)
  [![Test-Coverage](https://codecov.io/gh/freva-org/metadata-crawler/graph/badge.svg?token=W2YziDnh2N)](https://codecov.io/gh/freva-org/metadata-crawler)

+
  Harvest, normalise, and index climate / earth-system metadata from **POSIX**,
  **S3/MinIO**, and **OpenStack Swift** using configurable **DRS dialects**
  (CMIP6, CMIP5, CORDEX, …). Output to a temporary **catalogue** (JSONLines)
@@ -0,0 +1,34 @@
+ metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
+ metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
+ metadata_crawler/_version.py,sha256=oJIpBtzsOuKTbnMbTB3ZHAqVHS0O9r3O0d2lf9lUGfE,25
+ metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
+ metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
+ metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
+ metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ metadata_crawler/run.py,sha256=_6mx29Co1HwfPNFWtzTR65CNlopqubj-McmavRM7i80,12869
+ metadata_crawler/utils.py,sha256=Nm1DkyBD8PyBOP-EUf-Vqs-mLQUPu-6gWPgvNkGDmq8,14124
+ metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
+ metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
+ metadata_crawler/api/config.py,sha256=4c9O0xmVwduEEGlNjQcIh1nV5HzXNjXNqPi3tEQkpGw,28814
+ metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
+ metadata_crawler/api/index.py,sha256=0yqtXYOyWJJKKkCkIJbUUVG1w2Wt_icYJjXJPZZjSvU,4715
+ metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
+ metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
+ metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
+ metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
+ metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
+ metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
+ metadata_crawler/api/mixin/template_mixin.py,sha256=hxQXiP_JND3fuxBNcs1pZ7cvP-k-lTm5MQg40t0kF54,5105
+ metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
+ metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
+ metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
+ metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqeeE,4602
+ metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
+ metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
+ metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
+ metadata_crawler/ingester/solr.py,sha256=kpUAnI5iSsvNGagM_gqbTJZr8HNpYSFZFvNOcbHXB9o,9528
+ metadata_crawler-2510.0.0.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
+ metadata_crawler-2510.0.0.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
+ metadata_crawler-2510.0.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ metadata_crawler-2510.0.0.dist-info/METADATA,sha256=EdZwF0Y_U8NFQFTUcy6WbI8l2WYq59Ynp_L6S3ys1v4,13006
+ metadata_crawler-2510.0.0.dist-info/RECORD,,
@@ -1,34 +0,0 @@
- metadata_crawler/__init__.py,sha256=7gEpJjS9FpR6MHRY_Ztk8ORJ8JQ7WZUTV2TfLkaYgqs,6741
- metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
- metadata_crawler/_version.py,sha256=lJ4gM5yptFnF64LPHdDVhj6Mppmsw36i5KAr1dVXO5Y,25
- metadata_crawler/cli.py,sha256=meY5ZfR5VEW5ZorOPWO_b4MyIIQy0wTTPs9OkJ1WnfA,17180
- metadata_crawler/data_collector.py,sha256=9CVr4arKJspyLNLuF2MfkmY_r8x74Mw8hAaDSMouQUA,8372
- metadata_crawler/logger.py,sha256=5Lc0KdzH2HdWkidW-MASW8Pfy7vTMnzPv1-e2V3Any0,4407
- metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- metadata_crawler/run.py,sha256=w1kV4D63dS3mdgDTQj2ngzeSCjZPphWg1HwIJeJ6ATE,11345
- metadata_crawler/utils.py,sha256=QNr_9jZkuuQOrkuO46PrFhUfwLmfCJCq9gWUwwARfyM,11580
- metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
- metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
- metadata_crawler/api/config.py,sha256=j__JDKYTOR8kYC--HaHlYXfz38rzEhtUvHdO5Bh_j2E,28250
- metadata_crawler/api/drs_config.toml,sha256=90lQaSC2VdJ8OUoc6j27kg6d2OnfxR5a_KZH3W-FZV4,10603
- metadata_crawler/api/index.py,sha256=8g5HdSxluKtCwU45P0w_7LDIaSf200JbB-ekGJiI18c,4130
- metadata_crawler/api/metadata_stores.py,sha256=oWewL6XRmNZ6i5WxYI8Lm2jfpwLqBCGP2p4j3wLLNpQ,23735
- metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
- metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
- metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
- metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
- metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
- metadata_crawler/api/mixin/template_mixin.py,sha256=_qDp5n_CPnSYPMBsTia44b1ybBqrJEi-M1NaRkQ0z3U,5106
- metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
- metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
- metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
- metadata_crawler/backends/s3.py,sha256=DPz_bOyOlUveCwkSLVatwU_mcxUbFvygU_Id1AZVIMA,4455
- metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
- metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
- metadata_crawler/ingester/mongo.py,sha256=lpWIZ8mo6S8oY887uz2l6Y9pir0sUVEkfgOdDxrjIMM,6142
- metadata_crawler/ingester/solr.py,sha256=EoKS3kFeDTLf9zP22s2DhQGP81T6rTXVWDNT2wWKFkk,5242
- metadata_crawler-2509.0.1.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
- metadata_crawler-2509.0.1.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
- metadata_crawler-2509.0.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
- metadata_crawler-2509.0.1.dist-info/METADATA,sha256=dT5Kd5_sBAccA_Qj9O64zksuF7u2iaj-DXqqEDtUDqc,12864
- metadata_crawler-2509.0.1.dist-info/RECORD,,