metadata-crawler 2509.0.0__py3-none-any.whl → 2509.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -53,6 +53,7 @@ def index(
     *catalogue_files: Union[Path, str, List[str], List[Path]],
     batch_size: int = 2500,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
     **kwargs: Any,
 ) -> None:
     """Index metadata in the indexing system.
@@ -68,6 +69,8 @@ def index(
         If the index system supports batch-sizes, the size of the batches.
     verbosity:
         Set the verbosity level.
+    log_suffix:
+        Add a suffix to the log file output.
 
     Other Parameters
     ^^^^^^^^^^^^^^^^
@@ -94,6 +97,7 @@ def index(
             *catalogue_files,
             batch_size=batch_size,
             verbosity=verbosity,
+            log_suffix=log_suffix,
             **kwargs,
         )
     )
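Taken together, the three `def index(` hunks thread a new optional `log_suffix` argument from the public signature down to `async_index`. A minimal usage sketch, assuming `index` is re-exported from the top-level `metadata_crawler` package (as the wheel's `__init__.py` suggests) and that the indexing backend is passed ahead of the catalogue files; the argument order is an assumption, not verified against the released wheel:

```python
# Hypothetical call; only the keyword arguments appear verbatim in the hunks above.
from metadata_crawler import index

index(
    "solr",                # index system / backend (assumed positional)
    "cat.yaml",            # catalogue file written by `mdc add`
    batch_size=2500,
    verbosity=1,
    log_suffix="nightly",  # new in 2509.0.2: suffix for the rotating log file
)
```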
@@ -103,6 +107,7 @@ def delete(
     index_system: str,
     batch_size: int = 2500,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
     **kwargs: Any,
 ) -> None:
     """Delete metadata from the indexing system.
@@ -116,6 +121,8 @@ def delete(
         If the index system supports batch-sizes, the size of the batches.
     verbosity:
         Set the verbosity of the system.
+    log_suffix:
+        Add a suffix to the log file output.
 
     Other Parameters
     ^^^^^^^^^^^^^^^^
@@ -135,7 +142,11 @@ def delete(
         facets=[("project", "CMIP6"), ("institute", "MPI-M")],
     )
     """
-    uvloop.run(async_delete(index_system, batch_size=batch_size, **kwargs))
+    uvloop.run(
+        async_delete(
+            index_system, batch_size=batch_size, log_suffix=log_suffix, **kwargs
+        )
+    )
 
 
 def add(
@@ -155,6 +166,7 @@ def add(
     all_versions: str = IndexName().all,
     n_procs: Optional[int] = None,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
     password: bool = False,
     fail_under: int = -1,
     **kwargs: Any,
@@ -204,6 +216,8 @@ def add(
         Set the number of parallel processes for collecting.
     verbosity:
         Set the verbosity of the system.
+    log_suffix:
+        Add a suffix to the log file output.
     fail_under:
         Fail if less than X of the discovered files could be indexed.
 
@@ -242,6 +256,7 @@ def add(
             n_procs=n_procs,
             storage_options=storage_options,
             verbosity=verbosity,
+            log_suffix=log_suffix,
             fail_under=fail_under,
             **kwargs,
         )
@@ -1 +1 @@
-__version__ = "2509.0.0"
+__version__ = "2509.0.2"
@@ -17,6 +17,7 @@ from typing import (
     List,
     Literal,
     Optional,
+    Tuple,
     Union,
     cast,
 )
@@ -285,7 +286,6 @@ class PathSpecs(BaseModel):
                     f"- needs: {len(self.file_parts)} has: {len(file_parts)})"
                 )
             )
-        _parts.setdefault("time", "fx")
         data.update({k: v for (k, v) in _parts.items() if k not in data})
         data.pop("_", None)
         return data
@@ -609,7 +609,9 @@ class DRSConfig(BaseModel, TemplateMixin):
             case "conditional":
                 _rule = textwrap.dedent(rule.condition or "").strip()
                 s_cond = self.render_templates(_rule, data)
-                cond = eval(s_cond, {}, getattr(self, "_model_dict", {}))
+                cond = eval(
+                    s_cond, {}, getattr(self, "_model_dict", {})
+                )  # nosec
                 result = rule.true if cond else rule.false
             case "lookup":
                 args = cast(List[str], self.render_templates(rule.tree, data))
@@ -627,7 +629,7 @@ class DRSConfig(BaseModel, TemplateMixin):
                     self.render_templates(_call, data),
                     {},
                     getattr(self, "_model_dict", {}),
-                )
+                )  # nosec
                 if result:
                     inp.metadata[facet] = result
 
@@ -666,7 +668,7 @@ class DRSConfig(BaseModel, TemplateMixin):
 
     def max_directory_tree_level(
         self, search_dir: str | Path, drs_type: str
-    ) -> int:
+    ) -> Tuple[int, bool]:
         """Get the maximum level for descending into directories.
 
         When searching for files in a directory we can only traverse the directory
@@ -686,6 +688,7 @@ class DRSConfig(BaseModel, TemplateMixin):
         version = cast(
             str, self.dialect[standard].facets.get("version", "version")
         )
+        is_versioned = True
         try:
             version_idx = self.dialect[standard].path_specs.dir_parts.index(
                 version
@@ -693,11 +696,12 @@
         except ValueError:
             # No version given
             version_idx = len(self.dialect[standard].path_specs.dir_parts)
+            is_versioned = False
         if root_path == search_dir:
             current_pos = 0
         else:
             current_pos = len(search_dir.relative_to(root_path).parts)
-        return version_idx - current_pos
+        return version_idx - current_pos, is_versioned
 
     def is_complete(self, data: Dict[str, Any], standard: str) -> bool:
         """Check if all metadata that can be collected was collected."""
@@ -78,6 +78,7 @@ multi_valued = true
 key = "time"
 type = "daterange"
 multi_valued = false
+default = "fx"
 
 [drs_settings.index_schema.grid_label]
 key = "grid_label"
@@ -16,7 +16,7 @@ from typing import (
 )
 
 from ..logger import logger
-from ..utils import Console
+from ..utils import Console, IndexProgress
 from .config import SchemaField
 from .metadata_stores import CatalogueReader, IndexStore
 
@@ -40,6 +40,9 @@ class BaseIndex:
     batch_size:
         The amount for metadata that should be gathered `before` ingesting
         it into the catalogue.
+    progress:
+        Optional rich progress object that should display the progress of the
+        tasks.
 
     Attributes
     ^^^^^^^^^^
@@ -50,9 +53,11 @@ class BaseIndex:
         catalogue_file: Optional[Union[str, Path]] = None,
         batch_size: int = 2500,
         storage_options: Optional[Dict[str, Any]] = None,
+        progress: Optional[IndexProgress] = None,
         **kwargs: Any,
     ) -> None:
         self._store: Optional[IndexStore] = None
+        self.progress = progress or IndexProgress(total=-1)
         if catalogue_file is not None:
             _reader = CatalogueReader(
                 catalogue_file=catalogue_file or "",
@@ -92,6 +97,7 @@ class BaseIndex:
         logger.info("Indexing %s", index_name)
         async for batch in self._store.read(index_name):
             yield batch
+            self.progress.update(len(batch))
             num_items += len(batch)
         msg = f"Indexed {num_items:10,.0f} items for index {index_name}"
         Console.print(msg) if Console.is_terminal else print(msg)
@@ -473,10 +473,7 @@ class CatalogueReader:
     ) -> None:
         catalogue_file = str(catalogue_file)
         storage_options = storage_options or {}
-        fs, _ = IndexStore.get_fs(catalogue_file, **storage_options)
-        path = fs.unstrip_protocol(catalogue_file)
-        with fs.open(path) as stream:
-            cat = yaml.safe_load(stream.read())
+        cat = self.load_catalogue(catalogue_file, **storage_options)
         _schema_json = cat["metadata"]["schema"]
         schema = {s["key"]: SchemaField(**s) for k, s in _schema_json.items()}
         index_name = IndexName(**cat["metadata"]["index_names"])
@@ -493,6 +490,14 @@ class CatalogueReader:
             storage_options=storage_options,
         )
 
+    @staticmethod
+    def load_catalogue(path: Union[str, Path], **storage_options: Any) -> Any:
+        """Load a intake yaml catalogue (remote or local)."""
+        fs, _ = IndexStore.get_fs(str(path), **storage_options)
+        cat_path = fs.unstrip_protocol(path)
+        with fs.open(cat_path) as stream:
+            return yaml.safe_load(stream.read())
+
 
 class QueueConsumer:
     """Class that consumes the file discovery queue."""
@@ -722,6 +727,7 @@ class CatalogueWriter:
                 "latest": self.index_name.latest,
                 "all": self.index_name.all,
             },
+            "indexed_objects": self.ingested_objects,
             "schema": {
                 k: json.loads(s.model_dump_json())
                 for k, s in self.store.schema.items()
@@ -6,7 +6,7 @@ from typing import Any, Dict, Mapping, Optional
 
 from jinja2 import Environment, Template, Undefined
 
-ENV = Environment(undefined=Undefined, autoescape=False)
+ENV = Environment(undefined=Undefined, autoescape=True)
 
 
 @lru_cache(maxsize=1024)
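Switching the module-wide Jinja2 environment to `autoescape=True` changes how rendered values behave: anything containing HTML-special characters now comes back escaped. A plain-Jinja2 illustration of the difference (not code from the package):

```python
from jinja2 import Environment

template = "{{ value }}"
value = "MPI-M & <institute>"

print(Environment(autoescape=False).from_string(template).render(value=value))
# MPI-M & <institute>
print(Environment(autoescape=True).from_string(template).render(value=value))
# MPI-M &amp; &lt;institute&gt;
```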
@@ -71,11 +71,15 @@ class S3Path(PathTemplate):
         self, path: Union[str, Path, pathlib.Path]
     ) -> AsyncIterator[str]:
         """Retrieve sub directories of directory."""
-        path = str(path)
         client = await self._get_client()
-        for _content in await client._lsdir(path):
-            if _content.get("type", "") == "directory":
-                yield f'{_content.get("name", "")}'
+        path = str(path)
+        if await self.is_file(path):
+            yield path
+        else:
+            for _content in await client._lsdir(path):
+                size: int = _content.get("size") or 0
+                if _content.get("type", "") == "directory" or size > 0:
+                    yield _content.get("name", "")
 
     async def rglob(
         self, path: str | Path | pathlib.Path, glob_pattern: str = "*"
metadata_crawler/cli.py CHANGED
@@ -34,7 +34,6 @@ from .api.metadata_stores import CatalogueBackends, IndexName
 from .backends.intake import IntakePath
 from .logger import (
     THIS_NAME,
-    add_file_handle,
     apply_verbosity,
     logger,
 )
@@ -48,7 +47,9 @@ KwargValue = Union[
 
 
 def walk_catalogue(
-    path: str, storage_options: Optional[Dict[str, Any]] = None, **kwargs: Any
+    path: str,
+    storage_options: Optional[Dict[str, Any]] = None,
+    **kwargs: Any,
 ) -> int:
     """Recursively traverse an intake catalogue.
 
@@ -359,6 +360,13 @@ class ArgParse:
             action="append",
             nargs=2,
         )
+        parser.add_argument(
+            "-v",
+            "--verbose",
+            action="count",
+            default=self.verbose,
+            help="Increase the verbosity level.",
+        )
         parser.set_defaults(apply_func=walk_catalogue)
 
     def _index_submcommands(self) -> None:
@@ -391,8 +399,8 @@ class ArgParse:
             "-b",
             "--batch-size",
             type=int,
-            default=25_000,
-            help="Set the batch size for ingestion.",
+            default=5_000,
+            help="Set the batch size for indexing.",
         )
         parser.add_argument(
             "--storage_option",
@@ -494,7 +502,6 @@ class ArgParse:
             "apply_func",
             "verbose",
             "version",
-            "log_suffix",
             "storage_option",
             "shadow",
         )
@@ -509,7 +516,6 @@ class ArgParse:
         self.kwargs["shadow"] = _flatten(args.shadow)
         self.kwargs["storage_options"] = so
         self.verbose = args.verbose
-        add_file_handle(args.log_suffix)
         self.kwargs["verbosity"] = self.verbose
         return args
 
@@ -519,7 +525,9 @@ def _run(
     **kwargs: KwargValue,
 ) -> None:
     """Apply the parsed method."""
-    old_level = apply_verbosity(getattr(parser, "verbose", 0))
+    old_level = apply_verbosity(
+        getattr(parser, "verbose", 0), suffix=getattr(parser, "log_suffix", None)
+    )
     try:
         parser.apply_func(**kwargs)
     except Exception as error:
@@ -15,6 +15,7 @@ from typing import (
     Dict,
     Iterator,
     Optional,
+    Tuple,
     Type,
     Union,
     cast,
@@ -33,7 +34,7 @@ from .utils import (
     print_performance,
 )
 
-ScanItem = tuple[str, str, bool]
+ScanItem = Tuple[str, str, bool, bool]
 
 
 class DataCollector:
@@ -138,6 +139,7 @@ class DataCollector:
         drs_type: str,
         search_dir: str,
         iterable: bool = True,
+        is_versioned: bool = True,
     ) -> None:
         if iterable:
             try:
@@ -161,7 +163,7 @@ class DataCollector:
                 await self.ingest_queue.put(
                     _inp, drs_type, name=self.index_name.all
                 )
-                if rank == 0:
+                if rank == 0 or is_versioned is False:
                     await self.ingest_queue.put(
                         _inp, drs_type, name=self.index_name.latest
                     )
@@ -176,16 +178,22 @@ class DataCollector:
             if item is None:  # sentinel -> exit
                 # do not task_done() for sentinel
                 break
-            drs_type, path, iterable = item
+            drs_type, path, iterable, is_versioned = item
             try:
-                await self._ingest_dir(drs_type, path, iterable=iterable)
+                await self._ingest_dir(
+                    drs_type, path, iterable=iterable, is_versioned=is_versioned
+                )
             except Exception as error:
                 logger.error(error)
             finally:
                 self._scan_queue.task_done()
 
     async def _iter_content(
-        self, drs_type: str, inp_dir: str, pos: int = 0
+        self,
+        drs_type: str,
+        inp_dir: str,
+        pos: int = 0,
+        is_versioned: bool = True,
     ) -> None:
         """Walk recursively until files or the version level is reached."""
         store = self.config.datasets[drs_type].backend
@@ -203,7 +211,6 @@
 
         iterable = False if suffix == ".zarr" else iterable
         op: Optional[Callable[..., Coroutine[Any, Any, None]]] = None
-
         if is_file and suffix in self.config.suffixes:
             op = self._ingest_dir
         elif pos <= 0 or suffix == ".zarr":
@@ -211,13 +218,17 @@ class DataCollector:
 
         if op is not None:
             # enqueue the heavy scan; workers will run _ingest_dir concurrently
-            await self._scan_queue.put((drs_type, inp_dir, iterable))
+            await self._scan_queue.put(
+                (drs_type, inp_dir, iterable, is_versioned)
+            )
             return
 
         # otherwise, recurse sequentially (cheap) — no task per directory
         try:
             async for sub in store.iterdir(inp_dir):
-                await self._iter_content(drs_type, sub, pos - 1)
+                await self._iter_content(
+                    drs_type, sub, pos - 1, is_versioned=is_versioned
+                )
         except Exception as error:
             logger.error(error)
 
@@ -239,10 +250,19 @@
 
         # produce scan items by walking roots sequentially
         for drs_type, path in self.search_objects:  # <- property is sync
-            pos = self.config.max_directory_tree_level(
+            pos, is_versioned = self.config.max_directory_tree_level(
                 path, drs_type=drs_type
            )
-            await self._iter_content(drs_type, path, pos)
+            if pos < 0:
+                logger.warning(
+                    "Can't define latest version of versioned dataset."
+                    " This might lead to unexpected results. Try adjusting"
+                    " your search path."
+                )
+
+            await self._iter_content(
+                drs_type, path, pos, is_versioned=is_versioned
+            )
 
         # wait until all queued scan items are processed
         await self._scan_queue.join()
@@ -80,12 +80,13 @@ class MongoIndex(BaseIndex):
         await collection.bulk_write(ops, ordered=False)
 
     async def _index_collection(
-        self, db: AsyncIOMotorDatabase[Any], collection: str
+        self, db: AsyncIOMotorDatabase[Any], collection: str, suffix: str = ""
     ) -> None:
         """Index a collection."""
-        await db[collection].create_index(self.unique_index, unique=True)
+        col = collection + suffix
+        await db[col].create_index(self.unique_index, unique=True)
         async for chunk in self.get_metadata(collection):
-            await self._bulk_upsert(chunk, db[collection])
+            await self._bulk_upsert(chunk, db[col])
 
     async def _prep_db_connection(
         self, database: str, url: str
@@ -119,12 +120,24 @@ class MongoIndex(BaseIndex):
                 default="metadata",
             ),
         ] = "metadata",
+        index_suffix: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--index-suffix",
+                help="Suffix for the latest and all version collections.",
+                type=str,
+            ),
+        ] = None,
     ) -> None:
         """Add metadata to the mongoDB metadata server."""
         db = await self._prep_db_connection(database, url or "")
         async with asyncio.TaskGroup() as tg:
             for collection in self.index_names:
-                tg.create_task(self._index_collection(db, collection))
+                tg.create_task(
+                    self._index_collection(
+                        db, collection, suffix=index_suffix or ""
+                    )
+                )
 
     async def close(self) -> None:
         """Close the mongoDB connection."""
@@ -5,9 +5,12 @@ from __future__ import annotations
 import asyncio
 import logging
 import os
-from typing import Annotated, Any, Dict, List, Optional
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Annotated, Any, Dict, List, Optional, cast
 
 import aiohttp
+import orjson
 
 from ..api.cli import cli_function, cli_parameter
 from ..api.index import BaseIndex
@@ -112,22 +115,114 @@ class SolrIndex(BaseIndex):
 
         return metadata
 
-    async def _index_core(self, server: str, core: str) -> None:
-        """Index data to a solr core."""
-        url = await self.solr_url(server, core)
-        async for chunk in self.get_metadata(core):
-            async with aiohttp.ClientSession(
-                timeout=self.timeout, raise_for_status=True
-            ) as session:
+    def _encode_payload(self, chunk: List[Dict[str, Any]]) -> bytes:
+        """CPU-bound: convert docs and JSON-encode off the event loop."""
+        return orjson.dumps([self._convert(x) for x in chunk])
+
+    async def _post_chunk(
+        self,
+        session: aiohttp.ClientSession,
+        url: str,
+        body: bytes,
+    ) -> None:
+        """POST one batch with minimal overhead and simple retries."""
+        status = 500
+        t0 = time.perf_counter()
+        try:
+            async with session.post(
+                url, data=body, headers={"Content-Type": "application/json"}
+            ) as resp:
+                status = resp.status
+                await resp.read()
+
+        except Exception as error:
+            logger.log(
+                logging.WARNING,
+                error,
+                exc_info=logger.level < logging.INFO,
+            )
+            return
+        logger.debug(
+            "POST %s -> %i (index time: %.3f)",
+            url,
+            status,
+            time.perf_counter() - t0,
+        )
+
+    async def _index_core(
+        self, server: str, core: str, suffix: str, http_workers: int = 8
+    ) -> None:
+        """Zero-copy-ish, backpressured, bounded-concurrency indexer.
+
+        - No per-batch commit.
+        - Bounded queue so tasks don't pile up.
+        - Constant number of worker tasks (not O(batches)).
+        """
+        base_url = await self.solr_url(server, core + suffix)
+        update_url = base_url.split("?", 1)[0]  # guard
+
+        queue_max: int = 128
+        encode_workers: int = 4
+
+        timeout = aiohttp.ClientTimeout(
+            connect=10, sock_connect=10, sock_read=180, total=None
+        )
+        connector = aiohttp.TCPConnector(
+            limit_per_host=http_workers,
+            ttl_dns_cache=300,
+            enable_cleanup_closed=True,
+        )
+
+        loop = asyncio.get_running_loop()
+        cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
+        q: asyncio.Queue[Optional[bytes]] = asyncio.Queue(maxsize=queue_max)
+        SENTINEL: Optional[bytes] = None
+
+        async def producer() -> None:
+            async for batch in self.get_metadata(core):
+                body = await loop.run_in_executor(
+                    cpu_pool, self._encode_payload, batch
+                )
+                await q.put(body)
+            for _ in range(http_workers):
+                await q.put(SENTINEL)
+
+        async def consumer(
+            worker_id: int, session: aiohttp.ClientSession
+        ) -> None:
+            while True:
+                body = await q.get()
+                if body is SENTINEL:
+                    q.task_done()
+                    break
                 try:
-                    payload = list(map(self._convert, chunk))
-                    async with session.post(url, json=payload) as resp:
-                        logger.debug(await resp.text())
-                except Exception as error:
-                    logger.log(
-                        logging.WARNING,
-                        error,
-                        exc_info=logger.level < logging.INFO,
+                    await self._post_chunk(session, update_url, cast(bytes, body))
+                finally:
+                    q.task_done()
+
+        async with aiohttp.ClientSession(
+            timeout=timeout, connector=connector, raise_for_status=True
+        ) as session:
+            consumers = [
+                asyncio.create_task(consumer(i, session))
+                for i in range(http_workers)
+            ]
+            prod_task = asyncio.create_task(producer())
+            await prod_task
+            await q.join()
+            await asyncio.gather(*consumers)
+
+        commit_url = f"{update_url}?commit=true"
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.post(
+                commit_url,
+                data=b"[]",
+                headers={"Content-Type": "application/json"},
+            ) as resp:
+                if resp.status >= 400:
+                    text = await resp.text()
+                    logger.warning(
+                        "COMMIT %s -> %i: %s", commit_url, resp.status, text
                    )
 
    @cli_function(
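The new `_index_core` replaces the per-batch session and per-batch POST of 2509.0.0 with one persistent `aiohttp` session, off-loop `orjson` encoding, a bounded queue for backpressure, a fixed pool of HTTP workers, and a single explicit `?commit=true` at the end. The shape of that pattern, reduced to a generic, self-contained asyncio sketch (not the package's code):

```python
import asyncio

async def run(batches, post, workers: int = 4, queue_max: int = 16) -> None:
    """Bounded producer/consumer: one producer feeds a fixed pool of workers."""
    q: asyncio.Queue = asyncio.Queue(maxsize=queue_max)  # bounded -> backpressure
    SENTINEL = None

    async def producer() -> None:
        async for batch in batches():       # batches() yields pre-encoded payloads
            await q.put(batch)              # blocks while the queue is full
        for _ in range(workers):
            await q.put(SENTINEL)           # one stop signal per consumer

    async def consumer() -> None:
        while True:
            item = await q.get()
            if item is SENTINEL:
                q.task_done()
                break
            try:
                await post(item)            # the I/O-bound work (e.g. an HTTP POST)
            finally:
                q.task_done()

    consumers = [asyncio.create_task(consumer()) for _ in range(workers)]
    await asyncio.create_task(producer())
    await q.join()
    await asyncio.gather(*consumers)
```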
@@ -145,8 +240,29 @@ class SolrIndex(BaseIndex):
                 type=str,
             ),
         ] = None,
+        index_suffix: Annotated[
+            Optional[str],
+            cli_parameter(
+                "--index-suffix",
+                help="Suffix for the latest and all version collections.",
+                type=str,
+            ),
+        ] = None,
+        http_workers: Annotated[
+            int,
+            cli_parameter(
+                "--http-workers", help="Number of ingestion threads.", type=int
+            ),
+        ] = 8,
     ) -> None:
         """Add metadata to the apache solr metadata server."""
         async with asyncio.TaskGroup() as tg:
             for core in self.index_names:
-                tg.create_task(self._index_core(server or "", core))
+                tg.create_task(
+                    self._index_core(
+                        server or "",
+                        core,
+                        suffix=index_suffix or "",
+                        http_workers=http_workers,
+                    )
+                )
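Both ingesters gain an `--index-suffix` flag, and Solr additionally gets `--http-workers`. An illustrative invocation, assuming the `mdc <backend> index` wiring shown in the README below; the values are examples only:

```console
mdc solr index cat.yaml \
    --server localhost:8983 \
    --index-suffix _staging \
    --http-workers 16 \
    --batch-size 5000
```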
@@ -11,7 +11,7 @@ import appdirs
 from rich.console import Console
 from rich.logging import RichHandler
 
-THIS_NAME = "data-crawler"
+THIS_NAME = "metadata-crawler"
 
 logging.basicConfig(
     level=logging.WARNING,
@@ -24,7 +24,7 @@ logging.config.dictConfig(
         # keep existing handlers
         "disable_existing_loggers": False,
         "root": {
-            "level": "WARNING",
+            "level": "CRITICAL",
             "handlers": ["default"],
         },
         "formatters": {
@@ -36,16 +36,12 @@ logging.config.dictConfig(
             "default": {
                 "class": "logging.StreamHandler",
                 "formatter": "standard",
-                "level": "WARNING",
+                "level": "CRITICAL",
             },
         },
     }
 )
 
-logging.getLogger("sqlalchemy").setLevel(logging.WARNING)
-logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
-logging.getLogger("sqlalchemy.pool").setLevel(logging.WARNING)
-
 
 class Logger(logging.Logger):
     """Custom Logger defining the logging behaviour."""
@@ -56,11 +52,14 @@ class Logger(logging.Logger):
     no_debug: list[str] = ["watchfiles", "httpcore", "pymongo", "pika"]
 
     def __init__(
-        self, name: Optional[str] = None, level: Optional[int] = None
+        self,
+        name: Optional[str] = None,
+        level: Optional[int] = None,
+        suffix: Optional[str] = None,
     ) -> None:
         """Instantiate this logger only once and for all."""
-        level = level or int(
-            cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.WARNING)))
+        self.level = level or int(
+            cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.CRITICAL)))
         )
         name = name or THIS_NAME
         logger_format = logging.Formatter(self.logfmt, self.datefmt)
@@ -78,11 +77,16 @@ class Logger(logging.Logger):
             ),
         )
         self._logger_stream_handle.setFormatter(logger_format)
-        self._logger_stream_handle.setLevel(level)
-        super().__init__(name, level)
+        self._logger_stream_handle.setLevel(self.level)
+        super().__init__(name, self.level)
 
         self.propagate = False
         self.handlers = [self._logger_stream_handle]
+        (
+            self.add_file_handle(suffix=suffix)
+            if os.getenv("MDC_LOG_INIT", "0") == "1"
+            else None
+        )
 
     def set_level(self, level: int) -> None:
         """Set the logger level to level."""
@@ -92,7 +96,7 @@ class Logger(logging.Logger):
             log_level = min(level, logging.CRITICAL)
             handler.setLevel(log_level)
         self.setLevel(level)
-        logger.level = level
+        self.level = level
 
     def error(
         self,
@@ -105,28 +109,30 @@
         kwargs.setdefault("exc_info", True)
         self._log(logging.ERROR, msg, args, **kwargs)
 
-
-logger = Logger()
+    def add_file_handle(
+        self,
+        suffix: Optional[str] = None,
+        level: int = logging.CRITICAL,
+    ) -> None:
+        """Add a file log handle to the logger."""
+        suffix = suffix or os.getenv("MDC_LOG_SUFFIX", "")
+        base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
+        log_dir = Path(os.getenv("MDC_LOG_DIR", appdirs.user_log_dir(THIS_NAME)))
+        log_dir.mkdir(exist_ok=True, parents=True)
+        logger_file_handle = RotatingFileHandler(
+            log_dir / f"{base_name}.log",
+            mode="a",
+            maxBytes=5 * 1024**2,
+            backupCount=5,
+            encoding="utf-8",
+            delay=False,
+        )
+        logger_file_handle.setFormatter(self.file_format)
+        logger_file_handle.setLevel(self.level)
+        self.addHandler(logger_file_handle)
 
 
-def add_file_handle(
-    suffix: Optional[str], log_level: int = logging.CRITICAL
-) -> None:
-    """Add a file log handle to the logger."""
-    base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
-    log_dir = Path(appdirs.user_log_dir(THIS_NAME))
-    log_dir.mkdir(exist_ok=True, parents=True)
-    logger_file_handle = RotatingFileHandler(
-        log_dir / f"{base_name}.log",
-        mode="a",
-        maxBytes=5 * 1024**2,
-        backupCount=5,
-        encoding="utf-8",
-        delay=False,
-    )
-    logger_file_handle.setFormatter(logger.file_format)
-    logger_file_handle.setLevel(min(log_level, logging.CRITICAL))
-    logger.addHandler(logger_file_handle)
+logger = Logger()
 
 
 def get_level_from_verbosity(verbosity: int) -> int:
@@ -134,9 +140,14 @@ def get_level_from_verbosity(verbosity: int) -> int:
     return max(logging.CRITICAL - 10 * verbosity, -1)
 
 
-def apply_verbosity(level: int) -> int:
+def apply_verbosity(
+    level: Optional[int] = None, suffix: Optional[str] = None
+) -> int:
     """Set the logging level of the handlers to a certain level."""
+    level = logger.level if level is None else level
     old_level = logger.level
     level = get_level_from_verbosity(level)
     logger.set_level(level)
+    logger.add_file_handle(suffix, level)
+
     return old_level
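File logging is now driven by the `Logger` itself plus a handful of environment variables that appear in these hunks and in `run.py` below. Illustrative values (the variable names come from the diff, the values are examples):

```console
export MDC_LOG_DIR=/var/log/metadata-crawler   # overrides appdirs.user_log_dir(THIS_NAME)
export MDC_LOG_SUFFIX=nightly                  # rotating file becomes metadata-crawler-nightly.log
export MDC_LOG_LEVEL=20                        # numeric logging level picked up by Logger()
export MDC_LOG_INIT=1                          # attach the file handler when Logger() is created
```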
metadata_crawler/run.py CHANGED
@@ -9,15 +9,21 @@ from types import NoneType
 from typing import Any, Collection, Dict, List, Optional, Sequence, Union, cast
 
 import tomlkit
+import yaml
 from rich.prompt import Prompt
 
 from .api.config import CrawlerSettings, DRSConfig, strip_protocol
-from .api.metadata_stores import CatalogueBackendType, IndexName
+from .api.metadata_stores import (
+    CatalogueBackendType,
+    CatalogueReader,
+    IndexName,
+)
 from .data_collector import DataCollector
 from .logger import apply_verbosity, get_level_from_verbosity, logger
 from .utils import (
     Console,
     EmptyCrawl,
+    IndexProgress,
     MetadataCrawlerException,
     find_closest,
     load_plugins,
@@ -49,6 +55,20 @@ def _match(match: str, items: Collection[str]) -> List[str]:
     return out
 
 
+def _get_num_of_indexed_objects(
+    catalogue_files: FilesArg, storage_options: Optional[Dict[str, Any]] = None
+) -> int:
+    num_objects = 0
+    storage_options = storage_options or {}
+    for cat_file in _norm_files(catalogue_files):
+        try:
+            cat = CatalogueReader.load_catalogue(cat_file, **storage_options)
+            num_objects += cat.get("metadata", {}).get("indexed_objects", 0)
+        except (FileNotFoundError, IsADirectoryError, yaml.parser.ParserError):
+            pass
+    return num_objects
+
+
 def _get_search(
     config_file: Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument],
     search_dirs: Optional[List[str]] = None,
@@ -86,14 +106,22 @@ async def async_call(
     batch_size: int = 2500,
     catalogue_files: Optional[Sequence[Union[Path, str]]] = None,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
+    num_objects: int = 0,
     *args: Any,
     **kwargs: Any,
 ) -> None:
-    """Index metadata."""
+    """Add / Delete metadata from index."""
     env = cast(os._Environ[str], os.environ.copy())
-    old_level = apply_verbosity(verbosity)
+    old_level = apply_verbosity(verbosity, suffix=log_suffix)
+
     try:
+        progress = IndexProgress(total=num_objects)
+        os.environ["MDC_LOG_INIT"] = "1"
         os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+        os.environ["MDC_LOG_SUFFIX"] = (
+            log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
+        )
         backends = load_plugins("metadata_crawler.ingester")
         try:
             cls = backends[index_system]
@@ -107,18 +135,22 @@ async def async_call(
         flat_files = flat_files or [""]
         futures = []
         storage_options = kwargs.pop("storage_options", {})
+        progress.start()
         for cf in flat_files:
             obj = cls(
                 batch_size=batch_size,
                 catalogue_file=cf or None,
                 storage_options=storage_options,
+                progress=progress,
             )
             func = getattr(obj, method)
             future = _event_loop.create_task(func(**kwargs))
             futures.append(future)
         await asyncio.gather(*futures)
+
     finally:
         os.environ = env
+        progress.stop()
         logger.set_level(old_level)
 
 
@@ -127,6 +159,7 @@ async def async_index(
     *catalogue_files: Union[Path, str, List[str], List[Path]],
     batch_size: int = 2500,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
     **kwargs: Any,
 ) -> None:
     """Index metadata in the indexing system.
@@ -142,6 +175,8 @@ async def async_index(
         If the index system supports batch-sizes, the size of the batches.
     verbosity:
         Set the verbosity of the system.
+    log_suffix:
+        Add a suffix to the log file output.
 
     Other Parameters
     ^^^^^^^^^^^^^^^^
@@ -168,6 +203,11 @@ async def async_index(
         "index",
         batch_size=batch_size,
         verbosity=verbosity,
+        log_suffix=log_suffix,
+        num_objects=_get_num_of_indexed_objects(
+            kwargs["catalogue_files"],
+            storage_options=kwargs.get("storage_options"),
+        ),
         **kwargs,
     )
 
@@ -176,6 +216,7 @@ async def async_delete(
     index_system: str,
     batch_size: int = 2500,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
     **kwargs: Any,
 ) -> None:
     """Delete metadata from the indexing system.
@@ -188,6 +229,8 @@ async def async_delete(
         If the index system supports batch-sizes, the size of the batches.
     verbosity:
         Set the verbosity of the system.
+    log_suffix:
+        Add a suffix to the log file output.
 
     Other Parameters
     ^^^^^^^^^^^^^^^^^
@@ -212,6 +255,7 @@ async def async_delete(
         "delete",
         batch_size=batch_size,
         verbosity=verbosity,
+        log_suffix=log_suffix,
         **kwargs,
     )
 
@@ -236,6 +280,7 @@ async def async_add(
     password: bool = False,
     n_procs: Optional[int] = None,
     verbosity: int = 0,
+    log_suffix: Optional[str] = None,
     fail_under: int = -1,
     **kwargs: Any,
 ) -> None:
@@ -282,6 +327,8 @@ async def async_add(
         Set the number of parallel processes for collecting.
     verbosity:
         Set the verbosity of the system.
+    log_suffix:
+        Add a suffix to the log file output.
     fail_under:
         Fail if less than X of the discovered files could be indexed.
 
@@ -305,9 +352,13 @@ async def async_add(
 
     """
     env = cast(os._Environ[str], os.environ.copy())
-    old_level = apply_verbosity(verbosity)
+    old_level = apply_verbosity(verbosity, suffix=log_suffix)
     try:
+        os.environ["MDC_LOG_INIT"] = "1"
         os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
+        os.environ["MDC_LOG_SUFFIX"] = (
+            log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
+        )
         config_file = config_file or os.environ.get(
             "EVALUATION_SYSTEM_CONFIG_DIR"
         )
@@ -316,7 +367,7 @@ async def async_add(
                 "You must give a config file/directory"
             )
         st = time.time()
-        passwd = ""
+        passwd: Optional[str] = None
         if password:  # pragma: no cover
             passwd = Prompt.ask(
                 "[b]Enter the password", password=True
metadata_crawler/utils.py CHANGED
@@ -32,6 +32,7 @@ import rich.console
 import rich.spinner
 from dateutil.parser import isoparse
 from rich.live import Live
+from rich.progress import Progress, TaskID
 
 from .logger import logger
 
@@ -330,6 +331,76 @@ def timedelta_to_str(seconds: Union[int, float]) -> str:
     return " ".join(out[::-1])
 
 
+class IndexProgress:
+    """A helper that displays the progress of index Tasks."""
+
+    def __init__(
+        self,
+        total: int = 0,
+        interactive: Optional[bool] = None,
+        text: str = "Indexing: ",
+    ) -> None:
+        if interactive is None:
+            self._interactive = bool(
+                int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
+            )
+        else:
+            self._interactive = interactive
+        self._log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
+        self.text = text
+        self._done = 0
+        self._task: TaskID = TaskID(0)
+        self._total = total
+        self._start = self._last_log = time.time()
+        self._progress = Progress()
+        self._last_printed_percent: float = -1.0
+
+    def start(self) -> None:
+        """Start the progress bar."""
+        self._start = self._last_log = time.time()
+
+        if self._interactive:
+            self._task = self._progress.add_task(
+                f"[green] {self.text}", total=self._total or None
+            )
+            self._progress.start()
+
+    def stop(self) -> None:
+        """Stop the progress bar."""
+        if self._interactive:
+            self._progress.stop()
+        else:
+            self._text_update()
+
+    def _text_update(self, bar_width: int = 40) -> None:
+        elapsed = timedelta(seconds=int(time.time() - self._start))
+        log_interval = timedelta(seconds=int(time.time() - self._last_log))
+        if self._total > 0:
+            filled = int((self._last_printed_percent / 100) * bar_width)
+            bar = "#" * filled + "-" * (bar_width - filled)
+            text = f"{self.text} [{bar}] {self._last_printed_percent:>6,.02f}%"
+        else:
+            text = f"{self.text} [{self._done:>12,}]"
+        if log_interval.total_seconds() >= self._log_interval:
+            print(f"{text} ({elapsed})", flush=True)
+            self._last_log = time.time()
+
+    def update(self, inc: int) -> None:
+        """Update the status progress bar by an increment."""
+        self._done += inc
+
+        if self._interactive is True:
+            desc = f"{self.text} [{self._done:>10d}]" if self._done == 0 else None
+            self._progress.update(self._task, advance=inc, description=desc)
+            return
+
+        frac = self._done / max(self._total, 1)
+        pct = frac * 100
+        if pct > self._last_printed_percent or self._total == 0:
+            self._last_printed_percent = pct
+            self._text_update()
+
+
 @daemon
 def print_performance(
     print_status: EventLike,
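How `run.py` and `BaseIndex` wire this class in is visible in the hunks above. For reference, a standalone usage sketch under the same assumptions; in real use the total comes from the catalogue's new `indexed_objects` metadata:

```python
# Sketch only; IndexProgress itself is shown in full in the hunk above.
from metadata_crawler.utils import IndexProgress

progress = IndexProgress(total=10_000, interactive=False, text="Indexing: ")
progress.start()
for batch in ([{"file": f"f{i}.nc"}] * 500 for i in range(20)):
    # ... push the batch to the index backend ...
    progress.update(len(batch))
progress.stop()
```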
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: metadata-crawler
-Version: 2509.0.0
+Version: 2509.0.2
 Summary: Crawl, extract and push climate metadata for indexing.
 Author-email: "DKRZ, Clint" <freva@dkrz.de>
 Requires-Python: >=3.11
@@ -34,7 +34,7 @@ Requires-Dist: numpy
 Requires-Dist: orjson
 Requires-Dist: pyarrow
 Requires-Dist: h5netcdf
-Requires-Dist: pydantic
+Requires-Dist: pydantic<2.12
 Requires-Dist: pyarrow
 Requires-Dist: rich
 Requires-Dist: rich-argparse
@@ -83,10 +83,10 @@ Requires-Dist: pytest-env ; extra == "tests"
 Requires-Dist: requests ; extra == "tests"
 Requires-Dist: pre-commit ; extra == "tests"
 Requires-Dist: toml ; extra == "tests"
-Project-URL: Documentation, https://github.com/freva-org/freva-admin
-Project-URL: Home, https://github.com/freva-org/freva-admin
-Project-URL: Issues, https://github.com/freva-org/freva-admin/issues
-Project-URL: Source, https://github.com/freva-org/freva-admin
+Project-URL: Documentation, https://metadata-crawler.readthedocs.io
+Project-URL: Home, https://github.com/freva-org/metadata-crawler
+Project-URL: Issues, https://github.com/freva-org/metadata-crawler/issues
+Project-URL: Source, https://github.com/freva-org/metadata-crawler
 Provides-Extra: dev
 Provides-Extra: doc
 Provides-Extra: mkdoc
@@ -95,25 +95,27 @@ Provides-Extra: tests
 # metadata-crawler
 
 [![License](https://img.shields.io/badge/License-BSD-purple.svg)](LICENSE)
-[![PyPI](https://img.shields.io/pypi/pyversions/freva-client.svg)](https://pypi.org/project/metadata-crawler/)
+[![PyPI](https://img.shields.io/pypi/pyversions/metadata-crawler.svg)](https://pypi.org/project/metadata-crawler/)
+[![Conda Version](https://img.shields.io/conda/vn/conda-forge/metadata-crawler.svg)](https://anaconda.org/conda-forge/metadata-crawler)
 [![Docs](https://readthedocs.org/projects/metadata-crawler/badge/?version=latest)](https://metadata-crawler.readthedocs.io/en/latest/?badge=latest)
 [![Tests](https://github.com/freva-org/metadata-crawler/actions/workflows/ci_job.yml/badge.svg)](https://github.com/freva-org/metadata-crawler/actions)
 [![Test-Coverage](https://codecov.io/gh/freva-org/metadata-crawler/graph/badge.svg?token=W2YziDnh2N)](https://codecov.io/gh/freva-org/metadata-crawler)
 
+
 Harvest, normalise, and index climate / earth-system metadata from **POSIX**,
 **S3/MinIO**, and **OpenStack Swift** using configurable **DRS dialects**
-(CMIP6, CMIP5, CORDEX, …). Output to a temporary **catalogue** (DuckDB or
-JSONLines) and then **index** into systems such as **Solr** or **MongoDB**.
+(CMIP6, CMIP5, CORDEX, …). Output to a temporary **catalogue** (JSONLines)
+and then **index** into systems such as **Solr** or **MongoDB**.
 Configuration is **TOML** with inheritance, templating, and computed rules.
 
-> [!TIP]
-> **TL;DR**
-> - Define datasets + dialects in ``drs_config.toml``
-> - ``mdc crawl`` → write a temporary catalogue (``jsonl.gz`` or **DuckDB**)
-> - ``mdc config`` → inspect a the (merged) crawler config.
-> - ``mdc walk-intake`` → inspect the content of an intake catalogue.
-> - ``mdc <backend> index`` → push records from catalogue into your index backend
-> - ``mdc <backend> delete`` → remove records by facet match
+## TL;DR
+
+- Define datasets + dialects in ``drs_config.toml``
+- ``mdc add`` → write a temporary catalogue (``jsonl.gz``)
+- ``mdc config`` → inspect a the (merged) crawler config.
+- ``mdc walk-intake`` → inspect the content of an intake catalogue.
+- ``mdc <backend> index`` → push records from catalogue into your index backend
+- ``mdc <backend> delete`` → remove records by facet match
 
 ## Features
 
@@ -126,7 +128,7 @@ Configuration is **TOML** with inheritance, templating, and computed rules.
   dataset attributes/vars
 - **Special rules**: conditionals, cache lookups and function calls (e.g. CMIP6 realm,
   time aggregation)
-- **Index backends**: DuckDB (local/S3), MongoDB (Motor), Solr
+- **Index backends**: MongoDB (Motor), Solr
 - **Sync + Async APIs** and a clean CLI
 - **Docs**: Sphinx with ``pydata_sphinx_theme``
 
@@ -143,14 +145,14 @@ Configuration is **TOML** with inheritance, templating, and computed rules.
 ```console
 
 # 1) Crawl → write catalogue
-mdc crawl \
+mdc add \
   cat.yaml \
   --config-file drs_config.toml \
   --dataset cmip6-fs,obs-fs \
   --threads 4 --batch-size 100
 
-# 2) Index from catalogue → Solr (or Mongo/DuckDB)
-mdc soler index \
+# 2) Index from catalogue → Solr (or Mongo)
+mdc solr index \
   cat.yaml \
   --server localhot:8983
 
@@ -0,0 +1,34 @@
+metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
+metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
+metadata_crawler/_version.py,sha256=9-K5oUNmfiY2VyddRsxyD-fcZp54m4x8eeX3XbXHEV0,25
+metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
+metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
+metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
+metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+metadata_crawler/run.py,sha256=ytkYZQGWQ1jAvm8_ZbVPfTydGoHTEAhKWbajlkt6oU4,13033
+metadata_crawler/utils.py,sha256=Nm1DkyBD8PyBOP-EUf-Vqs-mLQUPu-6gWPgvNkGDmq8,14124
+metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
+metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
+metadata_crawler/api/config.py,sha256=MxxAN1y2FtHlUU42nBfQds5_8R_OSDdnHXsZANx6IFY,28373
+metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
+metadata_crawler/api/index.py,sha256=9hafNfNEbmw2tIVYq7jPagz7RaDtxXjs_L-YtFVvNJk,4411
+metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
+metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
+metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
+metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
+metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
+metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
+metadata_crawler/api/mixin/template_mixin.py,sha256=hxQXiP_JND3fuxBNcs1pZ7cvP-k-lTm5MQg40t0kF54,5105
+metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
+metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
+metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
+metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqeeE,4602
+metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
+metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
+metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
+metadata_crawler/ingester/solr.py,sha256=cRHe47l3WFZEFLZkHD1q-aPVjimi8H03xgL994XO1Lg,8988
+metadata_crawler-2509.0.2.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
+metadata_crawler-2509.0.2.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
+metadata_crawler-2509.0.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+metadata_crawler-2509.0.2.dist-info/METADATA,sha256=b32DEUfPeWaSKbhdZYw_1qi57-yIyS0Z2PhaaH4EDK8,13006
+metadata_crawler-2509.0.2.dist-info/RECORD,,
@@ -1,34 +0,0 @@
-metadata_crawler/__init__.py,sha256=7gEpJjS9FpR6MHRY_Ztk8ORJ8JQ7WZUTV2TfLkaYgqs,6741
-metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
-metadata_crawler/_version.py,sha256=Z6_4SgU9Dpc127xJlyvGKjeWd_Q1ONlOHQO123XGv30,25
-metadata_crawler/cli.py,sha256=meY5ZfR5VEW5ZorOPWO_b4MyIIQy0wTTPs9OkJ1WnfA,17180
-metadata_crawler/data_collector.py,sha256=9CVr4arKJspyLNLuF2MfkmY_r8x74Mw8hAaDSMouQUA,8372
-metadata_crawler/logger.py,sha256=5Lc0KdzH2HdWkidW-MASW8Pfy7vTMnzPv1-e2V3Any0,4407
-metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-metadata_crawler/run.py,sha256=w1kV4D63dS3mdgDTQj2ngzeSCjZPphWg1HwIJeJ6ATE,11345
-metadata_crawler/utils.py,sha256=QNr_9jZkuuQOrkuO46PrFhUfwLmfCJCq9gWUwwARfyM,11580
-metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
-metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
-metadata_crawler/api/config.py,sha256=j__JDKYTOR8kYC--HaHlYXfz38rzEhtUvHdO5Bh_j2E,28250
-metadata_crawler/api/drs_config.toml,sha256=90lQaSC2VdJ8OUoc6j27kg6d2OnfxR5a_KZH3W-FZV4,10603
-metadata_crawler/api/index.py,sha256=8g5HdSxluKtCwU45P0w_7LDIaSf200JbB-ekGJiI18c,4130
-metadata_crawler/api/metadata_stores.py,sha256=oWewL6XRmNZ6i5WxYI8Lm2jfpwLqBCGP2p4j3wLLNpQ,23735
-metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
-metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
-metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
-metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
-metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
-metadata_crawler/api/mixin/template_mixin.py,sha256=_qDp5n_CPnSYPMBsTia44b1ybBqrJEi-M1NaRkQ0z3U,5106
-metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
-metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
-metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
-metadata_crawler/backends/s3.py,sha256=DPz_bOyOlUveCwkSLVatwU_mcxUbFvygU_Id1AZVIMA,4455
-metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
-metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
-metadata_crawler/ingester/mongo.py,sha256=lpWIZ8mo6S8oY887uz2l6Y9pir0sUVEkfgOdDxrjIMM,6142
-metadata_crawler/ingester/solr.py,sha256=EoKS3kFeDTLf9zP22s2DhQGP81T6rTXVWDNT2wWKFkk,5242
-metadata_crawler-2509.0.0.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
-metadata_crawler-2509.0.0.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
-metadata_crawler-2509.0.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-metadata_crawler-2509.0.0.dist-info/METADATA,sha256=Dk0trqXYleepz1L8HXwKF-vAdSQww1zBm4Q014G4aOU,12938
-metadata_crawler-2509.0.0.dist-info/RECORD,,