metadata-crawler 2509.0.2__py3-none-any.whl → 2510.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata_crawler/_version.py CHANGED
@@ -1 +1 @@
-__version__ = "2509.0.2"
+__version__ = "2510.0.1"
metadata_crawler/api/config.py CHANGED
@@ -256,23 +256,38 @@ class CrawlerSettings(BaseModel):
 class PathSpecs(BaseModel):
     """Implementation of the Directory reference syntax."""

-    dir_parts: List[str] = Field(default_factory=list)
-    file_parts: List[str] = Field(default_factory=list)
+    dir_parts: Optional[List[str]] = None
+    file_parts: Optional[List[str]] = None
     file_sep: str = "_"

-    def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
-        """Read path encoded metadata from path specs."""
+    def _get_metadata_from_dir(
+        self, data: Dict[str, Any], rel_path: Path
+    ) -> None:
         dir_parts = rel_path.parent.parts
-        file_parts = rel_path.name.split(self.file_sep)
-        if len(dir_parts) == len(self.dir_parts):
-            data: Dict[str, Any] = dict(zip(self.dir_parts, dir_parts))
-        else:
+
+        if self.dir_parts and len(dir_parts) == len(self.dir_parts):
+            data.update(
+                {
+                    k: v
+                    for (k, v) in zip(self.dir_parts, dir_parts)
+                    if k not in data
+                }
+            )
+        elif self.dir_parts:
             raise MetadataCrawlerException(
                 (
                     f"Number of dir parts for {rel_path.parent} do not match "
                     f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
                 )
             ) from None
+
+    def _get_metadata_from_filename(
+        self, data: Dict[str, Any], rel_path: Path
+    ) -> None:
+        if self.file_parts is None:
+            return
+        file_parts = rel_path.name.split(self.file_sep)
+        _parts: Dict[str, str] = {}
         if len(file_parts) == len(self.file_parts):
             _parts = dict(zip(self.file_parts, file_parts))
         elif (
@@ -287,6 +302,12 @@ class PathSpecs(BaseModel):
                 )
             )
         data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+    def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
+        """Read path encoded metadata from path specs."""
+        data: Dict[str, Any] = {}
+        self._get_metadata_from_dir(data, rel_path)
+        self._get_metadata_from_filename(data, rel_path)
         data.pop("_", None)
         return data

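Editor's note: the two PathSpecs hunks above make both part lists optional and split path parsing into `_get_metadata_from_dir` and `_get_metadata_from_filename`, with `get_metadata_from_path` reduced to orchestration. A minimal sketch of the resulting behaviour, assuming `PathSpecs` is exported from `metadata_crawler.api.config` (inferred from the RECORD section below) and using made-up facet names:

    from pathlib import Path

    from metadata_crawler.api.config import PathSpecs  # module path inferred, not confirmed

    specs = PathSpecs(
        dir_parts=["project", "institute", "version"],
        file_parts=["variable", "table", "_"],  # "_" entries are dropped from the result
    )
    # Directory facets win; filename facets only fill keys that are still missing.
    meta = specs.get_metadata_from_path(Path("obs/DKRZ/v1/tas_day_19900101.nc"))
    # -> {"project": "obs", "institute": "DKRZ", "version": "v1",
    #     "variable": "tas", "table": "day"}
    # With dir_parts and file_parts left at None, the same call now simply returns {}.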
@@ -689,13 +710,12 @@ class DRSConfig(BaseModel, TemplateMixin):
             str, self.dialect[standard].facets.get("version", "version")
         )
         is_versioned = True
+        dir_parts = self.dialect[standard].path_specs.dir_parts or []
         try:
-            version_idx = self.dialect[standard].path_specs.dir_parts.index(
-                version
-            )
+            version_idx = dir_parts.index(version)
         except ValueError:
             # No version given
-            version_idx = len(self.dialect[standard].path_specs.dir_parts)
+            version_idx = len(dir_parts)
             is_versioned = False
         if root_path == search_dir:
             current_pos = 0
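Editor's note: because `dir_parts` can now be `None`, the DRSConfig hunk normalises it with `or []` before the version lookup. A standalone sketch of the guarded behaviour (names shortened for illustration, not the package API):

    def version_index(dir_parts, version_facet="version"):
        # Mirrors the guarded lookup above: an unset spec behaves like an empty one.
        dir_parts = dir_parts or []
        try:
            return dir_parts.index(version_facet), True    # position of the version facet
        except ValueError:
            return len(dir_parts), False                    # no version facet -> not versioned

    print(version_index(["project", "institute", "version"]))  # (2, True)
    print(version_index(None))                                  # (0, False)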
metadata_crawler/api/index.py CHANGED
@@ -4,13 +4,16 @@ from __future__ import annotations

 import abc
 from pathlib import Path
+from types import TracebackType
 from typing import (
     Any,
     AsyncIterator,
     Dict,
     List,
     Optional,
+    Self,
     Tuple,
+    Type,
     Union,
     cast,
 )
@@ -69,6 +72,16 @@ class BaseIndex:

     def __post_init__(self) -> None: ...

+    async def __aenter__(self) -> Self:
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None: ...
+
     @property
     def index_schema(self) -> Dict[str, SchemaField]:
         """Get the index schema."""
metadata_crawler/ingester/solr.py CHANGED
@@ -7,7 +7,8 @@ import logging
 import os
 import time
 from concurrent.futures import ThreadPoolExecutor
-from typing import Annotated, Any, Dict, List, Optional, cast
+from types import TracebackType
+from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, cast

 import aiohttp
 import orjson
@@ -21,9 +22,26 @@ from ..logger import logger
 class SolrIndex(BaseIndex):
     """Ingest metadata into an apache solr server."""

+    senteniel: Optional[bytes] = None
+
     def __post_init__(self) -> None:
-        self.timeout = aiohttp.ClientTimeout(total=50)
+        self.timeout = aiohttp.ClientTimeout(
+            connect=10, sock_connect=10, sock_read=180, total=None
+        )
+        self.semaphore = asyncio.Event()
+        self.max_http_workers: int = 0
+        queue_max: int = 128
+        encode_workers: int = 4
         self._uri: str = ""
+        self.cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
+        self.producer_queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = (
+            asyncio.Queue(maxsize=queue_max)
+        )
+        self.connector = aiohttp.TCPConnector(
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            enable_cleanup_closed=True,
+        )

     async def solr_url(self, server: str, core: str) -> str:
         """Construct the solr url from a given solr core."""
@@ -149,8 +167,25 @@ class SolrIndex(BaseIndex):
             time.perf_counter() - t0,
         )

+    async def consumer(self, session: aiohttp.ClientSession) -> None:
+        """Consume the metadata read by the porducers."""
+        while True:
+            update_url, body = await self.producer_queue.get()
+            if body is self.senteniel:
+                self.producer_queue.task_done()
+                break
+            try:
+                await self._post_chunk(session, update_url, cast(bytes, body))
+            finally:
+                self.producer_queue.task_done()
+
     async def _index_core(
-        self, server: str, core: str, suffix: str, http_workers: int = 8
+        self,
+        session: aiohttp.ClientSession,
+        server: str,
+        core: str,
+        suffix: str,
+        http_workers: int = 8,
     ) -> None:
         """Zero-copy-ish, backpressured, bounded-concurrency indexer.

@@ -160,70 +195,36 @@ class SolrIndex(BaseIndex):
         """
         base_url = await self.solr_url(server, core + suffix)
         update_url = base_url.split("?", 1)[0]  # guard
-
-        queue_max: int = 128
-        encode_workers: int = 4
-
-        timeout = aiohttp.ClientTimeout(
-            connect=10, sock_connect=10, sock_read=180, total=None
-        )
-        connector = aiohttp.TCPConnector(
-            limit_per_host=http_workers,
-            ttl_dns_cache=300,
-            enable_cleanup_closed=True,
-        )
-
         loop = asyncio.get_running_loop()
-        cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
-        q: asyncio.Queue[Optional[bytes]] = asyncio.Queue(maxsize=queue_max)
-        SENTINEL: Optional[bytes] = None
-
-        async def producer() -> None:
-            async for batch in self.get_metadata(core):
-                body = await loop.run_in_executor(
-                    cpu_pool, self._encode_payload, batch
+        async for batch in self.get_metadata(core):
+            body = await loop.run_in_executor(
+                self.cpu_pool, self._encode_payload, batch
+            )
+            await self.producer_queue.put((update_url, body))
+        commit_url = f"{update_url}?commit=true"
+        async with session.post(
+            commit_url,
+            data=b"[]",
+            headers={"Content-Type": "application/json"},
+        ) as resp:
+            if resp.status >= 400:
+                text = await resp.text()
+                logger.warning(
+                    "COMMIT %s -> %i: %s", commit_url, resp.status, text
                 )
-                await q.put(body)
-            for _ in range(http_workers):
-                await q.put(SENTINEL)

-        async def consumer(
-            worker_id: int, session: aiohttp.ClientSession
-        ) -> None:
-            while True:
-                body = await q.get()
-                if body is SENTINEL:
-                    q.task_done()
-                    break
-                try:
-                    await self._post_chunk(session, update_url, cast(bytes, body))
-                finally:
-                    q.task_done()
-
-        async with aiohttp.ClientSession(
-            timeout=timeout, connector=connector, raise_for_status=True
-        ) as session:
-            consumers = [
-                asyncio.create_task(consumer(i, session))
-                for i in range(http_workers)
-            ]
-            prod_task = asyncio.create_task(producer())
-            await prod_task
-            await q.join()
-            await asyncio.gather(*consumers)
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:

-        commit_url = f"{update_url}?commit=true"
-        async with aiohttp.ClientSession(timeout=timeout) as session:
-            async with session.post(
-                commit_url,
-                data=b"[]",
-                headers={"Content-Type": "application/json"},
-            ) as resp:
-                if resp.status >= 400:
-                    text = await resp.text()
-                    logger.warning(
-                        "COMMIT %s -> %i: %s", commit_url, resp.status, text
-                    )
+        try:
+            self.producer_queue.shutdown()
+        except AttributeError:  # pragma: no cover
+            pass  # prgama: no cover
+        self.cpu_pool.shutdown()

     @cli_function(
         help="Add metadata to the apache solr metadata server.",
@@ -256,13 +257,25 @@ class SolrIndex(BaseIndex):
         ] = 8,
     ) -> None:
         """Add metadata to the apache solr metadata server."""
-        async with asyncio.TaskGroup() as tg:
-            for core in self.index_names:
-                tg.create_task(
-                    self._index_core(
-                        server or "",
-                        core,
-                        suffix=index_suffix or "",
-                        http_workers=http_workers,
+        async with aiohttp.ClientSession(
+            timeout=self.timeout, connector=self.connector, raise_for_status=True
+        ) as session:
+            consumers = [
+                asyncio.create_task(self.consumer(session))
+                for _ in range(http_workers)
+            ]
+            async with asyncio.TaskGroup() as tg:
+                for core in self.index_names:
+                    tg.create_task(
+                        self._index_core(
+                            session,
+                            server or "",
+                            core,
+                            suffix=index_suffix or "",
+                            http_workers=http_workers,
+                        )
                     )
-                )
+            for _ in range(http_workers):
+                await self.producer_queue.put(("", self.senteniel))
+            await self.producer_queue.join()
+            await asyncio.gather(*consumers)
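Editor's note: the ingest method now owns the whole pipeline: one shared `ClientSession`, `http_workers` consumer tasks reading from the bounded `producer_queue`, a `TaskGroup` of per-core producers, then one sentinel per consumer followed by `join()` and `gather()`. A self-contained sketch of that shutdown-by-sentinel pattern, with the HTTP posts replaced by prints and made-up core names:

    import asyncio
    from typing import Optional, Tuple

    SENTINEL: Optional[bytes] = None  # plays the role of SolrIndex.senteniel


    async def produce(queue: asyncio.Queue, core: str) -> None:
        for i in range(3):
            await queue.put((f"/solr/{core}/update", f"{core}-batch-{i}".encode()))


    async def consume(queue: asyncio.Queue) -> None:
        while True:
            url, body = await queue.get()
            if body is SENTINEL:
                queue.task_done()
                break
            try:
                print("POST", url, body.decode())  # stands in for _post_chunk()
            finally:
                queue.task_done()


    async def main(workers: int = 2) -> None:
        queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = asyncio.Queue(maxsize=8)
        consumers = [asyncio.create_task(consume(queue)) for _ in range(workers)]
        async with asyncio.TaskGroup() as tg:        # producers, one per core
            for core in ("files", "latest"):
                tg.create_task(produce(queue, core))
        for _ in range(workers):                     # one sentinel per consumer
            await queue.put(("", SENTINEL))
        await queue.join()                           # wait until every item was processed
        await asyncio.gather(*consumers)


    asyncio.run(main())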
metadata_crawler/run.py CHANGED
@@ -1,6 +1,5 @@
 """Apply the metadata collector."""

-import asyncio
 import os
 import time
 from fnmatch import fnmatch
@@ -131,22 +130,18 @@ async def async_call(
             )
             raise ValueError(msg) from None
         flat_files = _norm_files(catalogue_files)
-        _event_loop = asyncio.get_event_loop()
         flat_files = flat_files or [""]
-        futures = []
         storage_options = kwargs.pop("storage_options", {})
         progress.start()
         for cf in flat_files:
-            obj = cls(
+            async with cls(
                 batch_size=batch_size,
                 catalogue_file=cf or None,
                 storage_options=storage_options,
                 progress=progress,
-            )
-            func = getattr(obj, method)
-            future = _event_loop.create_task(func(**kwargs))
-            futures.append(future)
-            await asyncio.gather(*futures)
+            ) as obj:
+                func = getattr(obj, method)
+                await func(**kwargs)

     finally:
         os.environ = env
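Editor's note: with the crawler classes acting as async context managers, `async_call` no longer builds its own task list: each catalogue file is handled inside `async with cls(...)`, so the index's `__aexit__` (queue and thread-pool shutdown) runs before the next file starts. A minimal sketch of that dispatch pattern with a hypothetical `DummyIndex` standing in for the real classes:

    import asyncio


    class DummyIndex:
        """Hypothetical stand-in for the crawler classes used by async_call."""

        def __init__(self, catalogue_file: str | None = None) -> None:
            self.catalogue_file = catalogue_file

        async def __aenter__(self) -> "DummyIndex":
            return self

        async def __aexit__(self, *exc: object) -> None:
            print("cleaned up", self.catalogue_file)

        async def ingest(self) -> None:
            print("ingesting", self.catalogue_file)


    async def async_call_sketch(method: str = "ingest") -> None:
        for cf in ("a.yaml", "b.yaml"):       # files are handled one after another
            async with DummyIndex(catalogue_file=cf) as obj:
                await getattr(obj, method)()  # same getattr dispatch as in run.py


    asyncio.run(async_call_sketch())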
metadata_crawler-2509.0.2.dist-info/METADATA → metadata_crawler-2510.0.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: metadata-crawler
-Version: 2509.0.2
+Version: 2510.0.1
 Summary: Crawl, extract and push climate metadata for indexing.
 Author-email: "DKRZ, Clint" <freva@dkrz.de>
 Requires-Python: >=3.11
metadata_crawler-2509.0.2.dist-info/RECORD → metadata_crawler-2510.0.1.dist-info/RECORD
@@ -1,17 +1,17 @@
 metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
 metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
-metadata_crawler/_version.py,sha256=9-K5oUNmfiY2VyddRsxyD-fcZp54m4x8eeX3XbXHEV0,25
+metadata_crawler/_version.py,sha256=_1Aa0y1WK02pe4d0jqbbU4gNu6ua_fbVc34XEhA0Nso,25
 metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
 metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
 metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
 metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-metadata_crawler/run.py,sha256=ytkYZQGWQ1jAvm8_ZbVPfTydGoHTEAhKWbajlkt6oU4,13033
+metadata_crawler/run.py,sha256=_6mx29Co1HwfPNFWtzTR65CNlopqubj-McmavRM7i80,12869
 metadata_crawler/utils.py,sha256=Nm1DkyBD8PyBOP-EUf-Vqs-mLQUPu-6gWPgvNkGDmq8,14124
 metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
 metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
-metadata_crawler/api/config.py,sha256=MxxAN1y2FtHlUU42nBfQds5_8R_OSDdnHXsZANx6IFY,28373
+metadata_crawler/api/config.py,sha256=wYcVCsppOCWtdP1GIhrGNGfvfWTKOC3rb50cw5hfrG4,28921
 metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
-metadata_crawler/api/index.py,sha256=9hafNfNEbmw2tIVYq7jPagz7RaDtxXjs_L-YtFVvNJk,4411
+metadata_crawler/api/index.py,sha256=0yqtXYOyWJJKKkCkIJbUUVG1w2Wt_icYJjXJPZZjSvU,4715
 metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
 metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
 metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
@@ -26,9 +26,9 @@ metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqe
 metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
 metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
 metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
-metadata_crawler/ingester/solr.py,sha256=cRHe47l3WFZEFLZkHD1q-aPVjimi8H03xgL994XO1Lg,8988
-metadata_crawler-2509.0.2.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
-metadata_crawler-2509.0.2.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
-metadata_crawler-2509.0.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-metadata_crawler-2509.0.2.dist-info/METADATA,sha256=b32DEUfPeWaSKbhdZYw_1qi57-yIyS0Z2PhaaH4EDK8,13006
-metadata_crawler-2509.0.2.dist-info/RECORD,,
+metadata_crawler/ingester/solr.py,sha256=kpUAnI5iSsvNGagM_gqbTJZr8HNpYSFZFvNOcbHXB9o,9528
+metadata_crawler-2510.0.1.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
+metadata_crawler-2510.0.1.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
+metadata_crawler-2510.0.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+metadata_crawler-2510.0.1.dist-info/METADATA,sha256=RNL72fSYduIhQJoZuzhMompb9Fvz5qEd3DyYSZIfdq0,13006
+metadata_crawler-2510.0.1.dist-info/RECORD,,