metadata-crawler 2509.0.2__py3-none-any.whl → 2510.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metadata_crawler/_version.py +1 -1
- metadata_crawler/api/config.py +32 -12
- metadata_crawler/api/index.py +13 -0
- metadata_crawler/ingester/solr.py +85 -72
- metadata_crawler/run.py +4 -9
- {metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/METADATA +1 -1
- {metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/RECORD +10 -10
- {metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/WHEEL +0 -0
- {metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/entry_points.txt +0 -0
- {metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/licenses/LICENSE +0 -0
metadata_crawler/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "2509.0.2"
+__version__ = "2510.0.1"
metadata_crawler/api/config.py
CHANGED
@@ -256,23 +256,38 @@ class CrawlerSettings(BaseModel):
 class PathSpecs(BaseModel):
     """Implementation of the Directory reference syntax."""

-    dir_parts: List[str] =
-    file_parts: List[str] =
+    dir_parts: Optional[List[str]] = None
+    file_parts: Optional[List[str]] = None
     file_sep: str = "_"

-    def
-
+    def _get_metadata_from_dir(
+        self, data: Dict[str, Any], rel_path: Path
+    ) -> None:
         dir_parts = rel_path.parent.parts
-
-        if len(dir_parts) == len(self.dir_parts):
-            data
-
+
+        if self.dir_parts and len(dir_parts) == len(self.dir_parts):
+            data.update(
+                {
+                    k: v
+                    for (k, v) in zip(self.dir_parts, dir_parts)
+                    if k not in data
+                }
+            )
+        elif self.dir_parts:
             raise MetadataCrawlerException(
                 (
                     f"Number of dir parts for {rel_path.parent} do not match "
                     f"- needs: {len(self.dir_parts)} has: {len(dir_parts)}"
                 )
             ) from None
+
+    def _get_metadata_from_filename(
+        self, data: Dict[str, Any], rel_path: Path
+    ) -> None:
+        if self.file_parts is None:
+            return
+        file_parts = rel_path.name.split(self.file_sep)
+        _parts: Dict[str, str] = {}
         if len(file_parts) == len(self.file_parts):
             _parts = dict(zip(self.file_parts, file_parts))
         elif (
@@ -287,6 +302,12 @@ class PathSpecs(BaseModel):
             )
         )
         data.update({k: v for (k, v) in _parts.items() if k not in data})
+
+    def get_metadata_from_path(self, rel_path: Path) -> Dict[str, Any]:
+        """Read path encoded metadata from path specs."""
+        data: Dict[str, Any] = {}
+        self._get_metadata_from_dir(data, rel_path)
+        self._get_metadata_from_filename(data, rel_path)
         data.pop("_", None)
         return data

@@ -689,13 +710,12 @@ class DRSConfig(BaseModel, TemplateMixin):
             str, self.dialect[standard].facets.get("version", "version")
         )
         is_versioned = True
+        dir_parts = self.dialect[standard].path_specs.dir_parts or []
         try:
-            version_idx =
-                version
-            )
+            version_idx = dir_parts.index(version)
         except ValueError:
             # No version given
-            version_idx = len(
+            version_idx = len(dir_parts)
             is_versioned = False
         if root_path == search_dir:
             current_pos = 0
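
The config.py change makes dir_parts and file_parts optional and splits the path parsing into _get_metadata_from_dir()/_get_metadata_from_filename() behind a new get_metadata_from_path() entry point. A minimal usage sketch, assuming PathSpecs can be imported from metadata_crawler.api.config and using hypothetical facet names ("project", "model", "variable", "time") that are not taken from the package's own DRS config:

    from pathlib import Path

    from metadata_crawler.api.config import PathSpecs

    # Hypothetical facet layout; a real DRS dialect defines its own parts.
    spec = PathSpecs(
        dir_parts=["project", "model"],   # one facet per directory level
        file_parts=["variable", "time"],  # facets encoded in the file name
        file_sep="_",
    )
    meta = spec.get_metadata_from_path(Path("cmip/model-a/tas_2000.nc"))
    # -> {"project": "cmip", "model": "model-a", "variable": "tas", "time": "2000.nc"}
    # A facet named "_" acts as a throw-away slot: data.pop("_", None) drops it at the end.
    # With dir_parts=None or file_parts=None the corresponding step is now skipped instead
    # of raising, which is what the new Optional[...] defaults enable.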
metadata_crawler/api/index.py
CHANGED
@@ -4,13 +4,16 @@ from __future__ import annotations

 import abc
 from pathlib import Path
+from types import TracebackType
 from typing import (
     Any,
     AsyncIterator,
     Dict,
     List,
     Optional,
+    Self,
     Tuple,
+    Type,
     Union,
     cast,
 )
@@ -69,6 +72,16 @@ class BaseIndex:

     def __post_init__(self) -> None: ...

+    async def __aenter__(self) -> Self:
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None: ...
+
     @property
     def index_schema(self) -> Dict[str, SchemaField]:
         """Get the index schema."""
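
The index.py change gives BaseIndex a default async context-manager protocol: __aenter__() returns the instance and __aexit__() is a no-op hook that subclasses (such as SolrIndex below) can override for cleanup. A minimal sketch of what that enables for any concrete subclass; IndexImpl and its ingest() method are hypothetical stand-ins, not names from the package:

    import asyncio

    async def main() -> None:
        # Roughly equivalent to: obj = IndexImpl(...); await obj.__aenter__()
        # try: ...  finally: await obj.__aexit__(exc_type, exc_val, exc_tb)
        async with IndexImpl(batch_size=100) as idx:
            await idx.ingest()

    asyncio.run(main())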
metadata_crawler/ingester/solr.py
CHANGED
@@ -7,7 +7,8 @@ import logging
 import os
 import time
 from concurrent.futures import ThreadPoolExecutor
-from
+from types import TracebackType
+from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, cast

 import aiohttp
 import orjson
@@ -21,9 +22,26 @@ from ..logger import logger
 class SolrIndex(BaseIndex):
     """Ingest metadata into an apache solr server."""

+    senteniel: Optional[bytes] = None
+
     def __post_init__(self) -> None:
-        self.timeout = aiohttp.ClientTimeout(
+        self.timeout = aiohttp.ClientTimeout(
+            connect=10, sock_connect=10, sock_read=180, total=None
+        )
+        self.semaphore = asyncio.Event()
+        self.max_http_workers: int = 0
+        queue_max: int = 128
+        encode_workers: int = 4
         self._uri: str = ""
+        self.cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
+        self.producer_queue: asyncio.Queue[Tuple[str, Optional[bytes]]] = (
+            asyncio.Queue(maxsize=queue_max)
+        )
+        self.connector = aiohttp.TCPConnector(
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            enable_cleanup_closed=True,
+        )

     async def solr_url(self, server: str, core: str) -> str:
         """Construct the solr url from a given solr core."""
@@ -149,8 +167,25 @@ class SolrIndex(BaseIndex):
             time.perf_counter() - t0,
         )

+    async def consumer(self, session: aiohttp.ClientSession) -> None:
+        """Consume the metadata read by the porducers."""
+        while True:
+            update_url, body = await self.producer_queue.get()
+            if body is self.senteniel:
+                self.producer_queue.task_done()
+                break
+            try:
+                await self._post_chunk(session, update_url, cast(bytes, body))
+            finally:
+                self.producer_queue.task_done()
+
     async def _index_core(
-        self,
+        self,
+        session: aiohttp.ClientSession,
+        server: str,
+        core: str,
+        suffix: str,
+        http_workers: int = 8,
     ) -> None:
         """Zero-copy-ish, backpressured, bounded-concurrency indexer.

@@ -160,70 +195,36 @@ class SolrIndex(BaseIndex):
         """
         base_url = await self.solr_url(server, core + suffix)
         update_url = base_url.split("?", 1)[0]  # guard
-
-        queue_max: int = 128
-        encode_workers: int = 4
-
-        timeout = aiohttp.ClientTimeout(
-            connect=10, sock_connect=10, sock_read=180, total=None
-        )
-        connector = aiohttp.TCPConnector(
-            limit_per_host=http_workers,
-            ttl_dns_cache=300,
-            enable_cleanup_closed=True,
-        )
-
         loop = asyncio.get_running_loop()
-
-
-
-
-
-
-
-
+        async for batch in self.get_metadata(core):
+            body = await loop.run_in_executor(
+                self.cpu_pool, self._encode_payload, batch
+            )
+            await self.producer_queue.put((update_url, body))
+        commit_url = f"{update_url}?commit=true"
+        async with session.post(
+            commit_url,
+            data=b"[]",
+            headers={"Content-Type": "application/json"},
+        ) as resp:
+            if resp.status >= 400:
+                text = await resp.text()
+                logger.warning(
+                    "COMMIT %s -> %i: %s", commit_url, resp.status, text
                 )
-            await q.put(body)
-        for _ in range(http_workers):
-            await q.put(SENTINEL)

-
-
-
-
-
-            q.task_done()
-                break
-            try:
-                await self._post_chunk(session, update_url, cast(bytes, body))
-            finally:
-                q.task_done()
-
-        async with aiohttp.ClientSession(
-            timeout=timeout, connector=connector, raise_for_status=True
-        ) as session:
-            consumers = [
-                asyncio.create_task(consumer(i, session))
-                for i in range(http_workers)
-            ]
-            prod_task = asyncio.create_task(producer())
-            await prod_task
-            await q.join()
-            await asyncio.gather(*consumers)
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:

-
-
-
-
-            headers={"Content-Type": "application/json"},
-        ) as resp:
-            if resp.status >= 400:
-                text = await resp.text()
-                logger.warning(
-                    "COMMIT %s -> %i: %s", commit_url, resp.status, text
-                )
+        try:
+            self.producer_queue.shutdown()
+        except AttributeError:  # pragma: no cover
+            pass  # prgama: no cover
+        self.cpu_pool.shutdown()

     @cli_function(
         help="Add metadata to the apache solr metadata server.",
@@ -256,13 +257,25 @@ class SolrIndex(BaseIndex):
         ] = 8,
     ) -> None:
         """Add metadata to the apache solr metadata server."""
-        async with
-
-
-
-
-
-
-
+        async with aiohttp.ClientSession(
+            timeout=self.timeout, connector=self.connector, raise_for_status=True
+        ) as session:
+            consumers = [
+                asyncio.create_task(self.consumer(session))
+                for _ in range(http_workers)
+            ]
+            async with asyncio.TaskGroup() as tg:
+                for core in self.index_names:
+                    tg.create_task(
+                        self._index_core(
+                            session,
+                            server or "",
+                            core,
+                            suffix=index_suffix or "",
+                            http_workers=http_workers,
+                        )
                     )
-
+            for _ in range(http_workers):
+                await self.producer_queue.put(("", self.senteniel))
+            await self.producer_queue.join()
+            await asyncio.gather(*consumers)
metadata_crawler/run.py
CHANGED
@@ -1,6 +1,5 @@
 """Apply the metadata collector."""

-import asyncio
 import os
 import time
 from fnmatch import fnmatch
@@ -131,22 +130,18 @@ async def async_call(
             )
             raise ValueError(msg) from None
         flat_files = _norm_files(catalogue_files)
-        _event_loop = asyncio.get_event_loop()
         flat_files = flat_files or [""]
-        futures = []
         storage_options = kwargs.pop("storage_options", {})
         progress.start()
         for cf in flat_files:
-
+            async with cls(
                 batch_size=batch_size,
                 catalogue_file=cf or None,
                 storage_options=storage_options,
                 progress=progress,
-            )
-
-
-            futures.append(future)
-        await asyncio.gather(*futures)
+            ) as obj:
+                func = getattr(obj, method)
+                await func(**kwargs)

     finally:
         os.environ = env
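
The run.py change removes the manual event-loop and futures bookkeeping: each catalogue file is now processed through the store object's new async context manager, and the requested method is resolved by name with getattr. A minimal sketch of that dispatch pattern; DummyStore and its ingest() method are hypothetical stand-ins for whatever class run.py instantiates via cls:

    import asyncio
    from typing import Any

    class DummyStore:
        async def __aenter__(self) -> "DummyStore":
            return self

        async def __aexit__(self, *exc: Any) -> None:
            pass  # flush buffers / close connections here

        async def ingest(self, **kwargs: Any) -> None:
            print("ingesting with", kwargs)

    async def dispatch(method: str, **kwargs: Any) -> None:
        async with DummyStore() as obj:
            func = getattr(obj, method)  # resolve the coroutine method by name
            await func(**kwargs)

    asyncio.run(dispatch("ingest", batch_size=100))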
{metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/RECORD
RENAMED
@@ -1,17 +1,17 @@
 metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
 metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
-metadata_crawler/_version.py,sha256=
+metadata_crawler/_version.py,sha256=_1Aa0y1WK02pe4d0jqbbU4gNu6ua_fbVc34XEhA0Nso,25
 metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
 metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
 metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
 metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-metadata_crawler/run.py,sha256=
+metadata_crawler/run.py,sha256=_6mx29Co1HwfPNFWtzTR65CNlopqubj-McmavRM7i80,12869
 metadata_crawler/utils.py,sha256=Nm1DkyBD8PyBOP-EUf-Vqs-mLQUPu-6gWPgvNkGDmq8,14124
 metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
 metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
-metadata_crawler/api/config.py,sha256=
+metadata_crawler/api/config.py,sha256=wYcVCsppOCWtdP1GIhrGNGfvfWTKOC3rb50cw5hfrG4,28921
 metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
-metadata_crawler/api/index.py,sha256=
+metadata_crawler/api/index.py,sha256=0yqtXYOyWJJKKkCkIJbUUVG1w2Wt_icYJjXJPZZjSvU,4715
 metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
 metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
 metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
@@ -26,9 +26,9 @@ metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqe
 metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
 metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
 metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
-metadata_crawler/ingester/solr.py,sha256=
-metadata_crawler-
-metadata_crawler-
-metadata_crawler-
-metadata_crawler-
-metadata_crawler-
+metadata_crawler/ingester/solr.py,sha256=kpUAnI5iSsvNGagM_gqbTJZr8HNpYSFZFvNOcbHXB9o,9528
+metadata_crawler-2510.0.1.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
+metadata_crawler-2510.0.1.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
+metadata_crawler-2510.0.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+metadata_crawler-2510.0.1.dist-info/METADATA,sha256=RNL72fSYduIhQJoZuzhMompb9Fvz5qEd3DyYSZIfdq0,13006
+metadata_crawler-2510.0.1.dist-info/RECORD,,
{metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/WHEEL
RENAMED
File without changes
{metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/entry_points.txt
RENAMED
File without changes
{metadata_crawler-2509.0.2.dist-info → metadata_crawler-2510.0.1.dist-info}/licenses/LICENSE
RENAMED
File without changes