metadata-crawler 2509.0.2__py3-none-any.whl → 2510.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of metadata-crawler might be problematic.

metadata_crawler/_version.py CHANGED

@@ -1 +1 @@
- __version__ = "2509.0.2"
+ __version__ = "2510.0.0"
metadata_crawler/api/config.py CHANGED

@@ -256,23 +256,31 @@ class CrawlerSettings(BaseModel):
  class PathSpecs(BaseModel):
      """Implementation of the Directory reference syntax."""

-     dir_parts: List[str] = Field(default_factory=list)
-     file_parts: List[str] = Field(default_factory=list)
+     dir_parts: Optional[List[str]] = None
+     file_parts: Optional[List[str]] = None
      file_sep: str = "_"

-     def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
-         """Read path encoded metadata from path specs."""
+     def _get_metadata_from_dir(
+         self, data: Dict[str, Any], rel_path: Path
+     ) -> None:
          dir_parts = rel_path.parent.parts
-         file_parts = rel_path.name.split(self.file_sep)
-         if len(dir_parts) == len(self.dir_parts):
-             data: Dict[str, Any] = dict(zip(self.dir_parts, dir_parts))
-         else:
+         if self.dir_parts and len(dir_parts) == len(self.dir_parts):
+             _parts = dict(zip(self.dir_parts, dir_parts))
+         elif self.dir_parts:
              raise MetadataCrawlerException(
                  (
                      f"Number of dir parts for {rel_path.parent} do not match "
                      f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
                  )
              ) from None
+         data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+     def _get_metadata_from_filename(
+         self, data: Dict[str, Any], rel_path: Path
+     ) -> None:
+         if self.file_parts is None:
+             return
+         file_parts = rel_path.name.split(self.file_sep)
          if len(file_parts) == len(self.file_parts):
              _parts = dict(zip(self.file_parts, file_parts))
          elif (

@@ -287,6 +295,12 @@ class PathSpecs(BaseModel):
                  )
              )
          data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+     def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
+         """Read path encoded metadata from path specs."""
+         data: Dict[str, Any] = {}
+         self._get_metadata_from_dir(data, rel_path)
+         self._get_metadata_from_filename(data, rel_path)
          data.pop("_", None)
          return data

@@ -689,13 +703,12 @@ class DRSConfig(BaseModel, TemplateMixin):
              str, self.dialect[standard].facets.get("version", "version")
          )
          is_versioned = True
+         dir_parts = self.dialect[standard].path_specs.dir_parts or []
          try:
-             version_idx = self.dialect[standard].path_specs.dir_parts.index(
-                 version
-             )
+             version_idx = dir_parts.index(version)
          except ValueError:
              # No version given
-             version_idx = len(self.dialect[standard].path_specs.dir_parts)
+             version_idx = len(dir_parts)
              is_versioned = False
          if root_path == search_dir:
              current_pos = 0
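
The net effect of the config.py changes: `dir_parts` and `file_parts` are now optional, directory- and filename-derived facets are merged by two private helpers, and `get_metadata_from_path` simply composes them; the `DRSConfig` hunk only adds the matching `or []` guard so `.index()` and `len()` keep working when `dir_parts` is unset. A standalone, simplified sketch of the new behaviour (the class name, facet names and example path are made up, pydantic is assumed available as in the package, and the length checks plus `MetadataCrawlerException` of the real code are omitted):

    # Simplified sketch only: the real PathSpecs raises MetadataCrawlerException
    # on part-count mismatches; here those checks are dropped to keep it short.
    from pathlib import Path
    from typing import Any, Dict, List, Optional

    from pydantic import BaseModel


    class PathSpecsSketch(BaseModel):
        dir_parts: Optional[List[str]] = None
        file_parts: Optional[List[str]] = None
        file_sep: str = "_"

        def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
            data: Dict[str, Any] = {}
            if self.dir_parts:  # facets encoded in the directory levels
                data.update(zip(self.dir_parts, rel_path.parent.parts))
            if self.file_parts:  # facets encoded in the file name, "_" keys are dropped
                data.update(
                    (k, v)
                    for k, v in zip(self.file_parts, rel_path.name.split(self.file_sep))
                    if k not in data
                )
            data.pop("_", None)
            return data


    specs = PathSpecsSketch(dir_parts=["project", "model"], file_parts=["variable", "_"])
    print(specs.get_metadata_from_path(Path("cmip6/mpi-esm/tas_day.nc")))
    # {'project': 'cmip6', 'model': 'mpi-esm', 'variable': 'tas'}

Leaving either list unset now simply means that level of the path contributes no metadata, instead of failing the length comparison as the 2509.0.2 code did.
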
metadata_crawler/api/index.py CHANGED

@@ -4,13 +4,16 @@ from __future__ import annotations

  import abc
  from pathlib import Path
+ from types import TracebackType
  from typing import (
      Any,
      AsyncIterator,
      Dict,
      List,
      Optional,
+     Self,
      Tuple,
+     Type,
      Union,
      cast,
  )

@@ -69,6 +72,16 @@ class BaseIndex:

      def __post_init__(self) -> None: ...

+     async def __aenter__(self) -> Self:
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_val: Optional[BaseException],
+         exc_tb: Optional[TracebackType],
+     ) -> None: ...
+
      @property
      def index_schema(self) -> Dict[str, SchemaField]:
          """Get the index schema."""
metadata_crawler/ingester/solr.py CHANGED

@@ -7,7 +7,8 @@ import logging
  import os
  import time
  from concurrent.futures import ThreadPoolExecutor
- from typing import Annotated, Any, Dict, List, Optional, cast
+ from types import TracebackType
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, cast

  import aiohttp
  import orjson

@@ -21,9 +22,26 @@ from ..logger import logger
  class SolrIndex(BaseIndex):
      """Ingest metadata into an apache solr server."""

+     senteniel: Optional[bytes] = None
+
      def __post_init__(self) -> None:
-         self.timeout = aiohttp.ClientTimeout(total=50)
+         self.timeout = aiohttp.ClientTimeout(
+             connect=10, sock_connect=10, sock_read=180, total=None
+         )
+         self.semaphore = asyncio.Event()
+         self.max_http_workers: int = 0
+         queue_max: int = 128
+         encode_workers: int = 4
          self._uri: str = ""
+         self.cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
+         self.producer_queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = (
+             asyncio.Queue(maxsize=queue_max)
+         )
+         self.connector = aiohttp.TCPConnector(
+             ttl_dns_cache=300,
+             use_dns_cache=True,
+             enable_cleanup_closed=True,
+         )

      async def solr_url(self, server: str, core: str) -> str:
          """Construct the solr url from a given solr core."""
@@ -149,8 +167,25 @@ class SolrIndex(BaseIndex):
              time.perf_counter() - t0,
          )

+     async def consumer(self, session: aiohttp.ClientSession) -> None:
+         """Consume the metadata read by the porducers."""
+         while True:
+             update_url, body = await self.producer_queue.get()
+             if body is self.senteniel:
+                 self.producer_queue.task_done()
+                 break
+             try:
+                 await self._post_chunk(session, update_url, cast(bytes, body))
+             finally:
+                 self.producer_queue.task_done()
+
      async def _index_core(
-         self, server: str, core: str, suffix: str, http_workers: int = 8
+         self,
+         session: aiohttp.ClientSession,
+         server: str,
+         core: str,
+         suffix: str,
+         http_workers: int = 8,
      ) -> None:
          """Zero-copy-ish, backpressured, bounded-concurrency indexer.

@@ -160,70 +195,36 @@ class SolrIndex(BaseIndex):
          """
          base_url = await self.solr_url(server, core + suffix)
          update_url = base_url.split("?", 1)[0] # guard
-
-         queue_max: int = 128
-         encode_workers: int = 4
-
-         timeout = aiohttp.ClientTimeout(
-             connect=10, sock_connect=10, sock_read=180, total=None
-         )
-         connector = aiohttp.TCPConnector(
-             limit_per_host=http_workers,
-             ttl_dns_cache=300,
-             enable_cleanup_closed=True,
-         )
-
          loop = asyncio.get_running_loop()
-         cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
-         q: asyncio.Queue[Optional[bytes]] = asyncio.Queue(maxsize=queue_max)
-         SENTINEL: Optional[bytes] = None
-
-         async def producer() -> None:
-             async for batch in self.get_metadata(core):
-                 body = await loop.run_in_executor(
-                     cpu_pool, self._encode_payload, batch
+         async for batch in self.get_metadata(core):
+             body = await loop.run_in_executor(
+                 self.cpu_pool, self._encode_payload, batch
+             )
+             await self.producer_queue.put((update_url, body))
+         commit_url = f"{update_url}?commit=true"
+         async with session.post(
+             commit_url,
+             data=b"[]",
+             headers={"Content-Type": "application/json"},
+         ) as resp:
+             if resp.status >= 400:
+                 text = await resp.text()
+                 logger.warning(
+                     "COMMIT %s -> %i: %s", commit_url, resp.status, text
                  )
-                 await q.put(body)
-             for _ in range(http_workers):
-                 await q.put(SENTINEL)

-         async def consumer(
-             worker_id: int, session: aiohttp.ClientSession
-         ) -> None:
-             while True:
-                 body = await q.get()
-                 if body is SENTINEL:
-                     q.task_done()
-                     break
-                 try:
-                     await self._post_chunk(session, update_url, cast(bytes, body))
-                 finally:
-                     q.task_done()
-
-         async with aiohttp.ClientSession(
-             timeout=timeout, connector=connector, raise_for_status=True
-         ) as session:
-             consumers = [
-                 asyncio.create_task(consumer(i, session))
-                 for i in range(http_workers)
-             ]
-             prod_task = asyncio.create_task(producer())
-             await prod_task
-             await q.join()
-             await asyncio.gather(*consumers)
+     async def __aexit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_val: Optional[BaseException],
+         exc_tb: Optional[TracebackType],
+     ) -> None:

-         commit_url = f"{update_url}?commit=true"
-         async with aiohttp.ClientSession(timeout=timeout) as session:
-             async with session.post(
-                 commit_url,
-                 data=b"[]",
-                 headers={"Content-Type": "application/json"},
-             ) as resp:
-                 if resp.status >= 400:
-                     text = await resp.text()
-                     logger.warning(
-                         "COMMIT %s -> %i: %s", commit_url, resp.status, text
-                     )
+         try:
+             self.producer_queue.shutdown()
+         except AttributeError: # pragma: no cover
+             pass # prgama: no cover
+         self.cpu_pool.shutdown()

      @cli_function(
          help="Add metadata to the apache solr metadata server.",
@@ -256,13 +257,25 @@ class SolrIndex(BaseIndex):
          ] = 8,
      ) -> None:
          """Add metadata to the apache solr metadata server."""
-         async with asyncio.TaskGroup() as tg:
-             for core in self.index_names:
-                 tg.create_task(
-                     self._index_core(
-                         server or "",
-                         core,
-                         suffix=index_suffix or "",
-                         http_workers=http_workers,
+         async with aiohttp.ClientSession(
+             timeout=self.timeout, connector=self.connector, raise_for_status=True
+         ) as session:
+             consumers = [
+                 asyncio.create_task(self.consumer(session))
+                 for _ in range(http_workers)
+             ]
+             async with asyncio.TaskGroup() as tg:
+                 for core in self.index_names:
+                     tg.create_task(
+                         self._index_core(
+                             session,
+                             server or "",
+                             core,
+                             suffix=index_suffix or "",
+                             http_workers=http_workers,
+                         )
                      )
-                 )
+             for _ in range(http_workers):
+                 await self.producer_queue.put(("", self.senteniel))
+             await self.producer_queue.join()
+             await asyncio.gather(*consumers)
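
Taken together, `index_metadata` now opens one shared `aiohttp.ClientSession`, starts `http_workers` consumer tasks, fans the per-core `_index_core` producers out in a `TaskGroup`, then enqueues one sentinel per worker, joins the queue, and gathers the consumers. A network-free sketch of that orchestration under stated assumptions: the `post` stub stands in for `_post_chunk`, the core names and URLs are invented, and the queue lives in `main()` here, whereas the real class keeps it on the instance (created in `__post_init__`, torn down in `__aexit__`):

    import asyncio
    from typing import Optional, Tuple

    SENTINEL: Optional[bytes] = None  # mirrors the class-level sentinel value


    async def post(url: str, body: bytes) -> None:
        await asyncio.sleep(0)  # stand-in for the HTTP POST
        print(f"POST {url}: {len(body)} bytes")


    async def consumer(q: "asyncio.Queue[Tuple[str, Optional[bytes]]]") -> None:
        while True:
            url, body = await q.get()
            if body is SENTINEL:
                q.task_done()
                break
            try:
                await post(url, body)
            finally:
                q.task_done()


    async def produce(q: "asyncio.Queue[Tuple[str, Optional[bytes]]]", core: str) -> None:
        for i in range(3):  # stand-in for iterating get_metadata(core)
            await q.put((f"http://solr/{core}/update", f"batch-{i}".encode()))


    async def main(http_workers: int = 2) -> None:
        q: asyncio.Queue[Tuple[str, Optional[bytes]]] = asyncio.Queue(maxsize=128)
        consumers = [asyncio.create_task(consumer(q)) for _ in range(http_workers)]
        async with asyncio.TaskGroup() as tg:  # one producer per core
            for core in ("files", "latest"):
                tg.create_task(produce(q, core))
        for _ in range(http_workers):  # one sentinel per consumer
            await q.put(("", SENTINEL))
        await q.join()
        await asyncio.gather(*consumers)


    asyncio.run(main())

Starting the consumers before the producers matters: the queue is bounded, so producers block once it is full and only make progress while consumers drain it, which is what gives the indexer its backpressure.
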
metadata_crawler/run.py CHANGED
@@ -1,6 +1,5 @@
  """Apply the metadata collector."""

- import asyncio
  import os
  import time
  from fnmatch import fnmatch

@@ -131,22 +130,18 @@ async def async_call(
          )
          raise ValueError(msg) from None
      flat_files = _norm_files(catalogue_files)
-     _event_loop = asyncio.get_event_loop()
      flat_files = flat_files or [""]
-     futures = []
      storage_options = kwargs.pop("storage_options", {})
      progress.start()
      for cf in flat_files:
-         obj = cls(
+         async with cls(
              batch_size=batch_size,
              catalogue_file=cf or None,
              storage_options=storage_options,
              progress=progress,
-         )
-         func = getattr(obj, method)
-         future = _event_loop.create_task(func(**kwargs))
-         futures.append(future)
-         await asyncio.gather(*futures)
+         ) as obj:
+             func = getattr(obj, method)
+             await func(**kwargs)

      finally:
          os.environ = env
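
On the caller side, `async_call` no longer creates one task per catalogue file and gathers them; each file is now processed sequentially inside `async with`, so the collector's `__aexit__` runs per file even when the dispatched method raises. A hypothetical illustration (`FakeCollector` and the file names are made up, and the real `async_call` does not swallow errors; the `try`/`except` here only keeps the demo running to show that cleanup still happens):

    import asyncio


    class FakeCollector:
        def __init__(self, catalogue_file: str) -> None:
            self.catalogue_file = catalogue_file

        async def __aenter__(self) -> "FakeCollector":
            return self

        async def __aexit__(self, *exc: object) -> None:
            print("cleaned up", self.catalogue_file)

        async def ingest(self) -> None:
            if self.catalogue_file.endswith("bad.json"):
                raise RuntimeError("broken catalogue")
            print("ingested", self.catalogue_file)


    async def main() -> None:
        for cf in ["a.json", "bad.json"]:
            try:
                async with FakeCollector(cf) as obj:
                    await getattr(obj, "ingest")()  # mirrors the ``method`` dispatch
            except RuntimeError as exc:
                print("skipped:", exc)


    asyncio.run(main())
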
metadata_crawler-2510.0.0.dist-info/METADATA CHANGED

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: metadata-crawler
- Version: 2509.0.2
+ Version: 2510.0.0
  Summary: Crawl, extract and push climate metadata for indexing.
  Author-email: "DKRZ, Clint" <freva@dkrz.de>
  Requires-Python: >=3.11
metadata_crawler-2510.0.0.dist-info/RECORD CHANGED

@@ -1,17 +1,17 @@
  metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
  metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
- metadata_crawler/_version.py,sha256=9-K5oUNmfiY2VyddRsxyD-fcZp54m4x8eeX3XbXHEV0,25
+ metadata_crawler/_version.py,sha256=oJIpBtzsOuKTbnMbTB3ZHAqVHS0O9r3O0d2lf9lUGfE,25
  metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
  metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
  metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
  metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- metadata_crawler/run.py,sha256=ytkYZQGWQ1jAvm8_ZbVPfTydGoHTEAhKWbajlkt6oU4,13033
+ metadata_crawler/run.py,sha256=_6mx29Co1HwfPNFWtzTR65CNlopqubj-McmavRM7i80,12869
  metadata_crawler/utils.py,sha256=Nm1DkyBD8PyBOP-EUf-Vqs-mLQUPu-6gWPgvNkGDmq8,14124
  metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
  metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
- metadata_crawler/api/config.py,sha256=MxxAN1y2FtHlUU42nBfQds5_8R_OSDdnHXsZANx6IFY,28373
+ metadata_crawler/api/config.py,sha256=4c9O0xmVwduEEGlNjQcIh1nV5HzXNjXNqPi3tEQkpGw,28814
  metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
- metadata_crawler/api/index.py,sha256=9hafNfNEbmw2tIVYq7jPagz7RaDtxXjs_L-YtFVvNJk,4411
+ metadata_crawler/api/index.py,sha256=0yqtXYOyWJJKKkCkIJbUUVG1w2Wt_icYJjXJPZZjSvU,4715
  metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
  metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
  metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197

@@ -26,9 +26,9 @@ metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqe
  metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
  metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
  metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
- metadata_crawler/ingester/solr.py,sha256=cRHe47l3WFZEFLZkHD1q-aPVjimi8H03xgL994XO1Lg,8988
- metadata_crawler-2509.0.2.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
- metadata_crawler-2509.0.2.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
- metadata_crawler-2509.0.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
- metadata_crawler-2509.0.2.dist-info/METADATA,sha256=b32DEUfPeWaSKbhdZYw_1qi57-yIyS0Z2PhaaH4EDK8,13006
- metadata_crawler-2509.0.2.dist-info/RECORD,,
+ metadata_crawler/ingester/solr.py,sha256=kpUAnI5iSsvNGagM_gqbTJZr8HNpYSFZFvNOcbHXB9o,9528
+ metadata_crawler-2510.0.0.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
+ metadata_crawler-2510.0.0.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
+ metadata_crawler-2510.0.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ metadata_crawler-2510.0.0.dist-info/METADATA,sha256=EdZwF0Y_U8NFQFTUcy6WbI8l2WYq59Ynp_L6S3ys1v4,13006
+ metadata_crawler-2510.0.0.dist-info/RECORD,,