metadata-crawler 2509.0.0__py3-none-any.whl → 2509.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metadata_crawler/__init__.py +16 -1
- metadata_crawler/_version.py +1 -1
- metadata_crawler/api/config.py +9 -5
- metadata_crawler/api/drs_config.toml +1 -0
- metadata_crawler/api/index.py +7 -1
- metadata_crawler/api/metadata_stores.py +10 -4
- metadata_crawler/api/mixin/template_mixin.py +1 -1
- metadata_crawler/backends/s3.py +8 -4
- metadata_crawler/cli.py +15 -7
- metadata_crawler/data_collector.py +30 -10
- metadata_crawler/ingester/mongo.py +17 -4
- metadata_crawler/ingester/solr.py +133 -17
- metadata_crawler/logger.py +45 -34
- metadata_crawler/run.py +56 -5
- metadata_crawler/utils.py +71 -0
- {metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/METADATA +23 -21
- metadata_crawler-2509.0.2.dist-info/RECORD +34 -0
- metadata_crawler-2509.0.0.dist-info/RECORD +0 -34
- {metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/WHEEL +0 -0
- {metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/entry_points.txt +0 -0
- {metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/licenses/LICENSE +0 -0
metadata_crawler/__init__.py
CHANGED
|
@@ -53,6 +53,7 @@ def index(
|
|
|
53
53
|
*catalogue_files: Union[Path, str, List[str], List[Path]],
|
|
54
54
|
batch_size: int = 2500,
|
|
55
55
|
verbosity: int = 0,
|
|
56
|
+
log_suffix: Optional[str] = None,
|
|
56
57
|
**kwargs: Any,
|
|
57
58
|
) -> None:
|
|
58
59
|
"""Index metadata in the indexing system.
|
|
@@ -68,6 +69,8 @@ def index(
|
|
|
68
69
|
If the index system supports batch-sizes, the size of the batches.
|
|
69
70
|
verbosity:
|
|
70
71
|
Set the verbosity level.
|
|
72
|
+
log_suffix:
|
|
73
|
+
Add a suffix to the log file output.
|
|
71
74
|
|
|
72
75
|
Other Parameters
|
|
73
76
|
^^^^^^^^^^^^^^^^
|
|
@@ -94,6 +97,7 @@ def index(
|
|
|
94
97
|
*catalogue_files,
|
|
95
98
|
batch_size=batch_size,
|
|
96
99
|
verbosity=verbosity,
|
|
100
|
+
log_suffix=log_suffix,
|
|
97
101
|
**kwargs,
|
|
98
102
|
)
|
|
99
103
|
)
|
|
@@ -103,6 +107,7 @@ def delete(
|
|
|
103
107
|
index_system: str,
|
|
104
108
|
batch_size: int = 2500,
|
|
105
109
|
verbosity: int = 0,
|
|
110
|
+
log_suffix: Optional[str] = None,
|
|
106
111
|
**kwargs: Any,
|
|
107
112
|
) -> None:
|
|
108
113
|
"""Delete metadata from the indexing system.
|
|
@@ -116,6 +121,8 @@ def delete(
|
|
|
116
121
|
If the index system supports batch-sizes, the size of the batches.
|
|
117
122
|
verbosity:
|
|
118
123
|
Set the verbosity of the system.
|
|
124
|
+
log_suffix:
|
|
125
|
+
Add a suffix to the log file output.
|
|
119
126
|
|
|
120
127
|
Other Parameters
|
|
121
128
|
^^^^^^^^^^^^^^^^
|
|
@@ -135,7 +142,11 @@ def delete(
|
|
|
135
142
|
facets=[("project", "CMIP6"), ("institute", "MPI-M")],
|
|
136
143
|
)
|
|
137
144
|
"""
|
|
138
|
-
uvloop.run(
|
|
145
|
+
uvloop.run(
|
|
146
|
+
async_delete(
|
|
147
|
+
index_system, batch_size=batch_size, log_suffix=log_suffix, **kwargs
|
|
148
|
+
)
|
|
149
|
+
)
|
|
139
150
|
|
|
140
151
|
|
|
141
152
|
def add(
|
|
@@ -155,6 +166,7 @@ def add(
|
|
|
155
166
|
all_versions: str = IndexName().all,
|
|
156
167
|
n_procs: Optional[int] = None,
|
|
157
168
|
verbosity: int = 0,
|
|
169
|
+
log_suffix: Optional[str] = None,
|
|
158
170
|
password: bool = False,
|
|
159
171
|
fail_under: int = -1,
|
|
160
172
|
**kwargs: Any,
|
|
@@ -204,6 +216,8 @@ def add(
|
|
|
204
216
|
Set the number of parallel processes for collecting.
|
|
205
217
|
verbosity:
|
|
206
218
|
Set the verbosity of the system.
|
|
219
|
+
log_suffix:
|
|
220
|
+
Add a suffix to the log file output.
|
|
207
221
|
fail_under:
|
|
208
222
|
Fail if less than X of the discovered files could be indexed.
|
|
209
223
|
|
|
@@ -242,6 +256,7 @@ def add(
|
|
|
242
256
|
n_procs=n_procs,
|
|
243
257
|
storage_options=storage_options,
|
|
244
258
|
verbosity=verbosity,
|
|
259
|
+
log_suffix=log_suffix,
|
|
245
260
|
fail_under=fail_under,
|
|
246
261
|
**kwargs,
|
|
247
262
|
)
|
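Taken together, this hunk threads a new `log_suffix` keyword through the public `index`, `delete` and `add` entry points and forwards it to the async layer. A minimal sketch of a call that uses it, assuming the `solr` ingester plugin is installed; the facet values are the ones from the docstring example above:

```python
import metadata_crawler as mdc

# Hedged sketch: "solr" must match an installed metadata_crawler.ingester
# plugin name; extra keyword arguments are passed through to the backend.
mdc.delete(
    "solr",
    batch_size=2500,
    verbosity=1,
    log_suffix="cleanup",   # new in 2509.0.2: appended to the log file name
    facets=[("project", "CMIP6"), ("institute", "MPI-M")],
)
```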
metadata_crawler/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2509.0.
|
|
1
|
+
__version__ = "2509.0.2"
|
metadata_crawler/api/config.py
CHANGED
|
@@ -17,6 +17,7 @@ from typing import (
|
|
|
17
17
|
List,
|
|
18
18
|
Literal,
|
|
19
19
|
Optional,
|
|
20
|
+
Tuple,
|
|
20
21
|
Union,
|
|
21
22
|
cast,
|
|
22
23
|
)
|
|
@@ -285,7 +286,6 @@ class PathSpecs(BaseModel):
|
|
|
285
286
|
f"- needs: {len(self.file_parts)} has: {len(file_parts)})"
|
|
286
287
|
)
|
|
287
288
|
)
|
|
288
|
-
_parts.setdefault("time", "fx")
|
|
289
289
|
data.update({k: v for (k, v) in _parts.items() if k not in data})
|
|
290
290
|
data.pop("_", None)
|
|
291
291
|
return data
|
|
@@ -609,7 +609,9 @@ class DRSConfig(BaseModel, TemplateMixin):
|
|
|
609
609
|
case "conditional":
|
|
610
610
|
_rule = textwrap.dedent(rule.condition or "").strip()
|
|
611
611
|
s_cond = self.render_templates(_rule, data)
|
|
612
|
-
cond = eval(
|
|
612
|
+
cond = eval(
|
|
613
|
+
s_cond, {}, getattr(self, "_model_dict", {})
|
|
614
|
+
) # nosec
|
|
613
615
|
result = rule.true if cond else rule.false
|
|
614
616
|
case "lookup":
|
|
615
617
|
args = cast(List[str], self.render_templates(rule.tree, data))
|
|
@@ -627,7 +629,7 @@ class DRSConfig(BaseModel, TemplateMixin):
|
|
|
627
629
|
self.render_templates(_call, data),
|
|
628
630
|
{},
|
|
629
631
|
getattr(self, "_model_dict", {}),
|
|
630
|
-
)
|
|
632
|
+
) # nosec
|
|
631
633
|
if result:
|
|
632
634
|
inp.metadata[facet] = result
|
|
633
635
|
|
|
@@ -666,7 +668,7 @@ class DRSConfig(BaseModel, TemplateMixin):
|
|
|
666
668
|
|
|
667
669
|
def max_directory_tree_level(
|
|
668
670
|
self, search_dir: str | Path, drs_type: str
|
|
669
|
-
) -> int:
|
|
671
|
+
) -> Tuple[int, bool]:
|
|
670
672
|
"""Get the maximum level for descending into directories.
|
|
671
673
|
|
|
672
674
|
When searching for files in a directory we can only traverse the directory
|
|
@@ -686,6 +688,7 @@ class DRSConfig(BaseModel, TemplateMixin):
|
|
|
686
688
|
version = cast(
|
|
687
689
|
str, self.dialect[standard].facets.get("version", "version")
|
|
688
690
|
)
|
|
691
|
+
is_versioned = True
|
|
689
692
|
try:
|
|
690
693
|
version_idx = self.dialect[standard].path_specs.dir_parts.index(
|
|
691
694
|
version
|
|
@@ -693,11 +696,12 @@ class DRSConfig(BaseModel, TemplateMixin):
|
|
|
693
696
|
except ValueError:
|
|
694
697
|
# No version given
|
|
695
698
|
version_idx = len(self.dialect[standard].path_specs.dir_parts)
|
|
699
|
+
is_versioned = False
|
|
696
700
|
if root_path == search_dir:
|
|
697
701
|
current_pos = 0
|
|
698
702
|
else:
|
|
699
703
|
current_pos = len(search_dir.relative_to(root_path).parts)
|
|
700
|
-
return version_idx - current_pos
|
|
704
|
+
return version_idx - current_pos, is_versioned
|
|
701
705
|
|
|
702
706
|
def is_complete(self, data: Dict[str, Any], standard: str) -> bool:
|
|
703
707
|
"""Check if all metadata that can be collected was collected."""
|
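The return type of `max_directory_tree_level` changes from a bare `int` to `Tuple[int, bool]`, so callers also learn whether the dialect defines a version facet at all. A hedged sketch of the updated call-site pattern; `config` stands for any loaded `DRSConfig` instance and the paths are placeholders:

```python
from pathlib import Path

def plan_descent(config, search_dir: Path, drs_type: str) -> None:
    # Since 2509.0.2 the method returns both the number of directory levels
    # left until the version level and an is_versioned flag.
    pos, is_versioned = config.max_directory_tree_level(search_dir, drs_type=drs_type)
    if pos < 0:
        # The data collector logs a warning in this case: the search path is
        # already below the version level, so "latest" cannot be determined.
        print("search path is deeper than the version level; adjust it")
    if not is_versioned:
        print("dataset has no version facet; every file counts as latest")
```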
metadata_crawler/api/index.py
CHANGED
|
@@ -16,7 +16,7 @@ from typing import (
|
|
|
16
16
|
)
|
|
17
17
|
|
|
18
18
|
from ..logger import logger
|
|
19
|
-
from ..utils import Console
|
|
19
|
+
from ..utils import Console, IndexProgress
|
|
20
20
|
from .config import SchemaField
|
|
21
21
|
from .metadata_stores import CatalogueReader, IndexStore
|
|
22
22
|
|
|
@@ -40,6 +40,9 @@ class BaseIndex:
|
|
|
40
40
|
batch_size:
|
|
41
41
|
The amount for metadata that should be gathered `before` ingesting
|
|
42
42
|
it into the catalogue.
|
|
43
|
+
progress:
|
|
44
|
+
Optional rich progress object that should display the progress of the
|
|
45
|
+
tasks.
|
|
43
46
|
|
|
44
47
|
Attributes
|
|
45
48
|
^^^^^^^^^^
|
|
@@ -50,9 +53,11 @@ class BaseIndex:
|
|
|
50
53
|
catalogue_file: Optional[Union[str, Path]] = None,
|
|
51
54
|
batch_size: int = 2500,
|
|
52
55
|
storage_options: Optional[Dict[str, Any]] = None,
|
|
56
|
+
progress: Optional[IndexProgress] = None,
|
|
53
57
|
**kwargs: Any,
|
|
54
58
|
) -> None:
|
|
55
59
|
self._store: Optional[IndexStore] = None
|
|
60
|
+
self.progress = progress or IndexProgress(total=-1)
|
|
56
61
|
if catalogue_file is not None:
|
|
57
62
|
_reader = CatalogueReader(
|
|
58
63
|
catalogue_file=catalogue_file or "",
|
|
@@ -92,6 +97,7 @@ class BaseIndex:
|
|
|
92
97
|
logger.info("Indexing %s", index_name)
|
|
93
98
|
async for batch in self._store.read(index_name):
|
|
94
99
|
yield batch
|
|
100
|
+
self.progress.update(len(batch))
|
|
95
101
|
num_items += len(batch)
|
|
96
102
|
msg = f"Indexed {num_items:10,.0f} items for index {index_name}"
|
|
97
103
|
Console.print(msg) if Console.is_terminal else print(msg)
|
|
metadata_crawler/api/metadata_stores.py
CHANGED
@@ -473,10 +473,7 @@ class CatalogueReader:
|
|
|
473
473
|
) -> None:
|
|
474
474
|
catalogue_file = str(catalogue_file)
|
|
475
475
|
storage_options = storage_options or {}
|
|
476
|
-
|
|
477
|
-
path = fs.unstrip_protocol(catalogue_file)
|
|
478
|
-
with fs.open(path) as stream:
|
|
479
|
-
cat = yaml.safe_load(stream.read())
|
|
476
|
+
cat = self.load_catalogue(catalogue_file, **storage_options)
|
|
480
477
|
_schema_json = cat["metadata"]["schema"]
|
|
481
478
|
schema = {s["key"]: SchemaField(**s) for k, s in _schema_json.items()}
|
|
482
479
|
index_name = IndexName(**cat["metadata"]["index_names"])
|
|
@@ -493,6 +490,14 @@ class CatalogueReader:
|
|
|
493
490
|
storage_options=storage_options,
|
|
494
491
|
)
|
|
495
492
|
|
|
493
|
+
@staticmethod
|
|
494
|
+
def load_catalogue(path: Union[str, Path], **storage_options: Any) -> Any:
|
|
495
|
+
"""Load a intake yaml catalogue (remote or local)."""
|
|
496
|
+
fs, _ = IndexStore.get_fs(str(path), **storage_options)
|
|
497
|
+
cat_path = fs.unstrip_protocol(path)
|
|
498
|
+
with fs.open(cat_path) as stream:
|
|
499
|
+
return yaml.safe_load(stream.read())
|
|
500
|
+
|
|
496
501
|
|
|
497
502
|
class QueueConsumer:
|
|
498
503
|
"""Class that consumes the file discovery queue."""
|
|
@@ -722,6 +727,7 @@ class CatalogueWriter:
|
|
|
722
727
|
"latest": self.index_name.latest,
|
|
723
728
|
"all": self.index_name.all,
|
|
724
729
|
},
|
|
730
|
+
"indexed_objects": self.ingested_objects,
|
|
725
731
|
"schema": {
|
|
726
732
|
k: json.loads(s.model_dump_json())
|
|
727
733
|
for k, s in self.store.schema.items()
|
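Catalogue loading is factored out into the static `CatalogueReader.load_catalogue`, which resolves the filesystem via `IndexStore.get_fs` and parses the intake YAML, so other modules can reuse it. A hedged usage sketch; the paths and fsspec storage options are illustrative:

```python
from metadata_crawler.api.metadata_stores import CatalogueReader

# Local catalogue: no storage options required.
cat = CatalogueReader.load_catalogue("cat.yaml")

# Remote catalogue, e.g. on S3 (keys are example fsspec storage options).
remote = CatalogueReader.load_catalogue(
    "s3://my-bucket/cat.yaml", key="ACCESS", secret="SECRET"
)

# The writer now records how many objects were ingested; the indexer uses
# this to size its progress bar.
print(cat["metadata"].get("indexed_objects", 0))
```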
metadata_crawler/backends/s3.py
CHANGED
|
@@ -71,11 +71,15 @@ class S3Path(PathTemplate):
|
|
|
71
71
|
self, path: Union[str, Path, pathlib.Path]
|
|
72
72
|
) -> AsyncIterator[str]:
|
|
73
73
|
"""Retrieve sub directories of directory."""
|
|
74
|
-
path = str(path)
|
|
75
74
|
client = await self._get_client()
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
75
|
+
path = str(path)
|
|
76
|
+
if await self.is_file(path):
|
|
77
|
+
yield path
|
|
78
|
+
else:
|
|
79
|
+
for _content in await client._lsdir(path):
|
|
80
|
+
size: int = _content.get("size") or 0
|
|
81
|
+
if _content.get("type", "") == "directory" or size > 0:
|
|
82
|
+
yield _content.get("name", "")
|
|
79
83
|
|
|
80
84
|
async def rglob(
|
|
81
85
|
self, path: str | Path | pathlib.Path, glob_pattern: str = "*"
|
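`S3Path.iterdir` now yields the path itself when it points at an object and otherwise filters the `_lsdir` listing, keeping directories and non-empty objects only. A standalone sketch of that filtering rule over plain dicts shaped like s3fs listing entries (the listing below is fabricated):

```python
from typing import Any, Dict, Iterator, List

def visible_entries(listing: List[Dict[str, Any]]) -> Iterator[str]:
    """Keep directories and objects with a non-zero size, as the reworked
    iterdir does; zero-byte placeholder keys are skipped."""
    for content in listing:
        size: int = content.get("size") or 0
        if content.get("type", "") == "directory" or size > 0:
            yield content.get("name", "")

listing = [
    {"name": "bucket/exp/", "type": "directory", "size": 0},
    {"name": "bucket/exp/tas_day.nc", "type": "file", "size": 2048},
    {"name": "bucket/exp/.marker", "type": "file", "size": 0},  # dropped
]
print(list(visible_entries(listing)))  # ['bucket/exp/', 'bucket/exp/tas_day.nc']
```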
metadata_crawler/cli.py
CHANGED
|
@@ -34,7 +34,6 @@ from .api.metadata_stores import CatalogueBackends, IndexName
|
|
|
34
34
|
from .backends.intake import IntakePath
|
|
35
35
|
from .logger import (
|
|
36
36
|
THIS_NAME,
|
|
37
|
-
add_file_handle,
|
|
38
37
|
apply_verbosity,
|
|
39
38
|
logger,
|
|
40
39
|
)
|
|
@@ -48,7 +47,9 @@ KwargValue = Union[
|
|
|
48
47
|
|
|
49
48
|
|
|
50
49
|
def walk_catalogue(
|
|
51
|
-
path: str,
|
|
50
|
+
path: str,
|
|
51
|
+
storage_options: Optional[Dict[str, Any]] = None,
|
|
52
|
+
**kwargs: Any,
|
|
52
53
|
) -> int:
|
|
53
54
|
"""Recursively traverse an intake catalogue.
|
|
54
55
|
|
|
@@ -359,6 +360,13 @@ class ArgParse:
|
|
|
359
360
|
action="append",
|
|
360
361
|
nargs=2,
|
|
361
362
|
)
|
|
363
|
+
parser.add_argument(
|
|
364
|
+
"-v",
|
|
365
|
+
"--verbose",
|
|
366
|
+
action="count",
|
|
367
|
+
default=self.verbose,
|
|
368
|
+
help="Increase the verbosity level.",
|
|
369
|
+
)
|
|
362
370
|
parser.set_defaults(apply_func=walk_catalogue)
|
|
363
371
|
|
|
364
372
|
def _index_submcommands(self) -> None:
|
|
@@ -391,8 +399,8 @@ class ArgParse:
|
|
|
391
399
|
"-b",
|
|
392
400
|
"--batch-size",
|
|
393
401
|
type=int,
|
|
394
|
-
default=
|
|
395
|
-
help="Set the batch size for
|
|
402
|
+
default=5_000,
|
|
403
|
+
help="Set the batch size for indexing.",
|
|
396
404
|
)
|
|
397
405
|
parser.add_argument(
|
|
398
406
|
"--storage_option",
|
|
@@ -494,7 +502,6 @@ class ArgParse:
|
|
|
494
502
|
"apply_func",
|
|
495
503
|
"verbose",
|
|
496
504
|
"version",
|
|
497
|
-
"log_suffix",
|
|
498
505
|
"storage_option",
|
|
499
506
|
"shadow",
|
|
500
507
|
)
|
|
@@ -509,7 +516,6 @@ class ArgParse:
|
|
|
509
516
|
self.kwargs["shadow"] = _flatten(args.shadow)
|
|
510
517
|
self.kwargs["storage_options"] = so
|
|
511
518
|
self.verbose = args.verbose
|
|
512
|
-
add_file_handle(args.log_suffix)
|
|
513
519
|
self.kwargs["verbosity"] = self.verbose
|
|
514
520
|
return args
|
|
515
521
|
|
|
@@ -519,7 +525,9 @@ def _run(
|
|
|
519
525
|
**kwargs: KwargValue,
|
|
520
526
|
) -> None:
|
|
521
527
|
"""Apply the parsed method."""
|
|
522
|
-
old_level = apply_verbosity(
|
|
528
|
+
old_level = apply_verbosity(
|
|
529
|
+
getattr(parser, "verbose", 0), suffix=getattr(parser, "log_suffix", None)
|
|
530
|
+
)
|
|
523
531
|
try:
|
|
524
532
|
parser.apply_func(**kwargs)
|
|
525
533
|
except Exception as error:
|
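Each sub-parser now defines its own `-v/--verbose` counter, and `_run` passes the count (plus an optional log suffix) into `apply_verbosity`. A self-contained argparse sketch of the counting behaviour; `get_level_from_verbosity` copies the formula from the logger module:

```python
import argparse
import logging

def get_level_from_verbosity(verbosity: int) -> int:
    # Same formula as metadata_crawler.logger: each -v lowers the level by 10.
    return max(logging.CRITICAL - 10 * verbosity, -1)

parser = argparse.ArgumentParser()
parser.add_argument(
    "-v", "--verbose", action="count", default=0,
    help="Increase the verbosity level.",
)
args = parser.parse_args(["-vvv"])
print(get_level_from_verbosity(args.verbose))  # 50 - 30 = 20 -> logging.INFO
```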
|
metadata_crawler/data_collector.py
CHANGED
@@ -15,6 +15,7 @@ from typing import (
|
|
|
15
15
|
Dict,
|
|
16
16
|
Iterator,
|
|
17
17
|
Optional,
|
|
18
|
+
Tuple,
|
|
18
19
|
Type,
|
|
19
20
|
Union,
|
|
20
21
|
cast,
|
|
@@ -33,7 +34,7 @@ from .utils import (
|
|
|
33
34
|
print_performance,
|
|
34
35
|
)
|
|
35
36
|
|
|
36
|
-
ScanItem = Tuple[str, str, bool]
|
|
37
|
+
ScanItem = Tuple[str, str, bool, bool]
|
|
37
38
|
|
|
38
39
|
|
|
39
40
|
class DataCollector:
|
|
@@ -138,6 +139,7 @@ class DataCollector:
|
|
|
138
139
|
drs_type: str,
|
|
139
140
|
search_dir: str,
|
|
140
141
|
iterable: bool = True,
|
|
142
|
+
is_versioned: bool = True,
|
|
141
143
|
) -> None:
|
|
142
144
|
if iterable:
|
|
143
145
|
try:
|
|
@@ -161,7 +163,7 @@ class DataCollector:
|
|
|
161
163
|
await self.ingest_queue.put(
|
|
162
164
|
_inp, drs_type, name=self.index_name.all
|
|
163
165
|
)
|
|
164
|
-
if rank == 0:
|
|
166
|
+
if rank == 0 or is_versioned is False:
|
|
165
167
|
await self.ingest_queue.put(
|
|
166
168
|
_inp, drs_type, name=self.index_name.latest
|
|
167
169
|
)
|
|
@@ -176,16 +178,22 @@ class DataCollector:
|
|
|
176
178
|
if item is None: # sentinel -> exit
|
|
177
179
|
# do not task_done() for sentinel
|
|
178
180
|
break
|
|
179
|
-
drs_type, path, iterable = item
|
|
181
|
+
drs_type, path, iterable, is_versioned = item
|
|
180
182
|
try:
|
|
181
|
-
await self._ingest_dir(
|
|
183
|
+
await self._ingest_dir(
|
|
184
|
+
drs_type, path, iterable=iterable, is_versioned=is_versioned
|
|
185
|
+
)
|
|
182
186
|
except Exception as error:
|
|
183
187
|
logger.error(error)
|
|
184
188
|
finally:
|
|
185
189
|
self._scan_queue.task_done()
|
|
186
190
|
|
|
187
191
|
async def _iter_content(
|
|
188
|
-
self,
|
|
192
|
+
self,
|
|
193
|
+
drs_type: str,
|
|
194
|
+
inp_dir: str,
|
|
195
|
+
pos: int = 0,
|
|
196
|
+
is_versioned: bool = True,
|
|
189
197
|
) -> None:
|
|
190
198
|
"""Walk recursively until files or the version level is reached."""
|
|
191
199
|
store = self.config.datasets[drs_type].backend
|
|
@@ -203,7 +211,6 @@ class DataCollector:
|
|
|
203
211
|
|
|
204
212
|
iterable = False if suffix == ".zarr" else iterable
|
|
205
213
|
op: Optional[Callable[..., Coroutine[Any, Any, None]]] = None
|
|
206
|
-
|
|
207
214
|
if is_file and suffix in self.config.suffixes:
|
|
208
215
|
op = self._ingest_dir
|
|
209
216
|
elif pos <= 0 or suffix == ".zarr":
|
|
@@ -211,13 +218,17 @@ class DataCollector:
|
|
|
211
218
|
|
|
212
219
|
if op is not None:
|
|
213
220
|
# enqueue the heavy scan; workers will run _ingest_dir concurrently
|
|
214
|
-
await self._scan_queue.put(
|
|
221
|
+
await self._scan_queue.put(
|
|
222
|
+
(drs_type, inp_dir, iterable, is_versioned)
|
|
223
|
+
)
|
|
215
224
|
return
|
|
216
225
|
|
|
217
226
|
# otherwise, recurse sequentially (cheap) — no task per directory
|
|
218
227
|
try:
|
|
219
228
|
async for sub in store.iterdir(inp_dir):
|
|
220
|
-
await self._iter_content(
|
|
229
|
+
await self._iter_content(
|
|
230
|
+
drs_type, sub, pos - 1, is_versioned=is_versioned
|
|
231
|
+
)
|
|
221
232
|
except Exception as error:
|
|
222
233
|
logger.error(error)
|
|
223
234
|
|
|
@@ -239,10 +250,19 @@ class DataCollector:
|
|
|
239
250
|
|
|
240
251
|
# produce scan items by walking roots sequentially
|
|
241
252
|
for drs_type, path in self.search_objects: # <- property is sync
|
|
242
|
-
pos = self.config.max_directory_tree_level(
|
|
253
|
+
pos, is_versioned = self.config.max_directory_tree_level(
|
|
243
254
|
path, drs_type=drs_type
|
|
244
255
|
)
|
|
245
|
-
|
|
256
|
+
if pos < 0:
|
|
257
|
+
logger.warning(
|
|
258
|
+
"Can't define latest version of versioned dataset."
|
|
259
|
+
" This might lead to unexpected results. Try adjusting"
|
|
260
|
+
" your search path."
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
await self._iter_content(
|
|
264
|
+
drs_type, path, pos, is_versioned=is_versioned
|
|
265
|
+
)
|
|
246
266
|
|
|
247
267
|
# wait until all queued scan items are processed
|
|
248
268
|
await self._scan_queue.join()
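Scan-queue items now carry a fourth field: `(drs_type, path, iterable, is_versioned)`. Workers forward the flag to `_ingest_dir`, which publishes unversioned data to the `latest` index regardless of version rank. A small sketch of the new tuple shape; the queue, dataset name and path are illustrative:

```python
import asyncio
from typing import Optional, Tuple

ScanItem = Tuple[str, str, bool, bool]  # drs_type, path, iterable, is_versioned

async def demo() -> None:
    queue: asyncio.Queue[Optional[ScanItem]] = asyncio.Queue()
    await queue.put(("obs-fs", "/arch/obs/unversioned_dataset", True, False))
    item = await queue.get()
    if item is not None:
        drs_type, path, iterable, is_versioned = item
        # is_versioned=False means the entry also lands in the "latest" index,
        # mirroring the `rank == 0 or is_versioned is False` condition above.
        print(drs_type, path, iterable, is_versioned)

asyncio.run(demo())
```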
|
|
metadata_crawler/ingester/mongo.py
CHANGED
@@ -80,12 +80,13 @@ class MongoIndex(BaseIndex):
|
|
|
80
80
|
await collection.bulk_write(ops, ordered=False)
|
|
81
81
|
|
|
82
82
|
async def _index_collection(
|
|
83
|
-
self, db: AsyncIOMotorDatabase[Any], collection: str
|
|
83
|
+
self, db: AsyncIOMotorDatabase[Any], collection: str, suffix: str = ""
|
|
84
84
|
) -> None:
|
|
85
85
|
"""Index a collection."""
|
|
86
|
-
|
|
86
|
+
col = collection + suffix
|
|
87
|
+
await db[col].create_index(self.unique_index, unique=True)
|
|
87
88
|
async for chunk in self.get_metadata(collection):
|
|
88
|
-
await self._bulk_upsert(chunk, db[collection])
|
|
89
|
+
await self._bulk_upsert(chunk, db[col])
|
|
89
90
|
|
|
90
91
|
async def _prep_db_connection(
|
|
91
92
|
self, database: str, url: str
|
|
@@ -119,12 +120,24 @@ class MongoIndex(BaseIndex):
|
|
|
119
120
|
default="metadata",
|
|
120
121
|
),
|
|
121
122
|
] = "metadata",
|
|
123
|
+
index_suffix: Annotated[
|
|
124
|
+
Optional[str],
|
|
125
|
+
cli_parameter(
|
|
126
|
+
"--index-suffix",
|
|
127
|
+
help="Suffix for the latest and all version collections.",
|
|
128
|
+
type=str,
|
|
129
|
+
),
|
|
130
|
+
] = None,
|
|
122
131
|
) -> None:
|
|
123
132
|
"""Add metadata to the mongoDB metadata server."""
|
|
124
133
|
db = await self._prep_db_connection(database, url or "")
|
|
125
134
|
async with asyncio.TaskGroup() as tg:
|
|
126
135
|
for collection in self.index_names:
|
|
127
|
-
tg.create_task(
|
|
136
|
+
tg.create_task(
|
|
137
|
+
self._index_collection(
|
|
138
|
+
db, collection, suffix=index_suffix or ""
|
|
139
|
+
)
|
|
140
|
+
)
|
|
128
141
|
|
|
129
142
|
async def close(self) -> None:
|
|
130
143
|
"""Close the mongoDB connection."""
|
|
metadata_crawler/ingester/solr.py
CHANGED
@@ -5,9 +5,12 @@ from __future__ import annotations
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import logging
|
|
7
7
|
import os
|
|
8
|
-
|
|
8
|
+
import time
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
10
|
+
from typing import Annotated, Any, Dict, List, Optional, cast
|
|
9
11
|
|
|
10
12
|
import aiohttp
|
|
13
|
+
import orjson
|
|
11
14
|
|
|
12
15
|
from ..api.cli import cli_function, cli_parameter
|
|
13
16
|
from ..api.index import BaseIndex
|
|
@@ -112,22 +115,114 @@ class SolrIndex(BaseIndex):
|
|
|
112
115
|
|
|
113
116
|
return metadata
|
|
114
117
|
|
|
115
|
-
|
|
116
|
-
"""
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
118
|
+
def _encode_payload(self, chunk: List[Dict[str, Any]]) -> bytes:
|
|
119
|
+
"""CPU-bound: convert docs and JSON-encode off the event loop."""
|
|
120
|
+
return orjson.dumps([self._convert(x) for x in chunk])
|
|
121
|
+
|
|
122
|
+
async def _post_chunk(
|
|
123
|
+
self,
|
|
124
|
+
session: aiohttp.ClientSession,
|
|
125
|
+
url: str,
|
|
126
|
+
body: bytes,
|
|
127
|
+
) -> None:
|
|
128
|
+
"""POST one batch with minimal overhead and simple retries."""
|
|
129
|
+
status = 500
|
|
130
|
+
t0 = time.perf_counter()
|
|
131
|
+
try:
|
|
132
|
+
async with session.post(
|
|
133
|
+
url, data=body, headers={"Content-Type": "application/json"}
|
|
134
|
+
) as resp:
|
|
135
|
+
status = resp.status
|
|
136
|
+
await resp.read()
|
|
137
|
+
|
|
138
|
+
except Exception as error:
|
|
139
|
+
logger.log(
|
|
140
|
+
logging.WARNING,
|
|
141
|
+
error,
|
|
142
|
+
exc_info=logger.level < logging.INFO,
|
|
143
|
+
)
|
|
144
|
+
return
|
|
145
|
+
logger.debug(
|
|
146
|
+
"POST %s -> %i (index time: %.3f)",
|
|
147
|
+
url,
|
|
148
|
+
status,
|
|
149
|
+
time.perf_counter() - t0,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
async def _index_core(
|
|
153
|
+
self, server: str, core: str, suffix: str, http_workers: int = 8
|
|
154
|
+
) -> None:
|
|
155
|
+
"""Zero-copy-ish, backpressured, bounded-concurrency indexer.
|
|
156
|
+
|
|
157
|
+
- No per-batch commit.
|
|
158
|
+
- Bounded queue so tasks don't pile up.
|
|
159
|
+
- Constant number of worker tasks (not O(batches)).
|
|
160
|
+
"""
|
|
161
|
+
base_url = await self.solr_url(server, core + suffix)
|
|
162
|
+
update_url = base_url.split("?", 1)[0] # guard
|
|
163
|
+
|
|
164
|
+
queue_max: int = 128
|
|
165
|
+
encode_workers: int = 4
|
|
166
|
+
|
|
167
|
+
timeout = aiohttp.ClientTimeout(
|
|
168
|
+
connect=10, sock_connect=10, sock_read=180, total=None
|
|
169
|
+
)
|
|
170
|
+
connector = aiohttp.TCPConnector(
|
|
171
|
+
limit_per_host=http_workers,
|
|
172
|
+
ttl_dns_cache=300,
|
|
173
|
+
enable_cleanup_closed=True,
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
loop = asyncio.get_running_loop()
|
|
177
|
+
cpu_pool = ThreadPoolExecutor(max_workers=encode_workers)
|
|
178
|
+
q: asyncio.Queue[Optional[bytes]] = asyncio.Queue(maxsize=queue_max)
|
|
179
|
+
SENTINEL: Optional[bytes] = None
|
|
180
|
+
|
|
181
|
+
async def producer() -> None:
|
|
182
|
+
async for batch in self.get_metadata(core):
|
|
183
|
+
body = await loop.run_in_executor(
|
|
184
|
+
cpu_pool, self._encode_payload, batch
|
|
185
|
+
)
|
|
186
|
+
await q.put(body)
|
|
187
|
+
for _ in range(http_workers):
|
|
188
|
+
await q.put(SENTINEL)
|
|
189
|
+
|
|
190
|
+
async def consumer(
|
|
191
|
+
worker_id: int, session: aiohttp.ClientSession
|
|
192
|
+
) -> None:
|
|
193
|
+
while True:
|
|
194
|
+
body = await q.get()
|
|
195
|
+
if body is SENTINEL:
|
|
196
|
+
q.task_done()
|
|
197
|
+
break
|
|
122
198
|
try:
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
199
|
+
await self._post_chunk(session, update_url, cast(bytes, body))
|
|
200
|
+
finally:
|
|
201
|
+
q.task_done()
|
|
202
|
+
|
|
203
|
+
async with aiohttp.ClientSession(
|
|
204
|
+
timeout=timeout, connector=connector, raise_for_status=True
|
|
205
|
+
) as session:
|
|
206
|
+
consumers = [
|
|
207
|
+
asyncio.create_task(consumer(i, session))
|
|
208
|
+
for i in range(http_workers)
|
|
209
|
+
]
|
|
210
|
+
prod_task = asyncio.create_task(producer())
|
|
211
|
+
await prod_task
|
|
212
|
+
await q.join()
|
|
213
|
+
await asyncio.gather(*consumers)
|
|
214
|
+
|
|
215
|
+
commit_url = f"{update_url}?commit=true"
|
|
216
|
+
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
217
|
+
async with session.post(
|
|
218
|
+
commit_url,
|
|
219
|
+
data=b"[]",
|
|
220
|
+
headers={"Content-Type": "application/json"},
|
|
221
|
+
) as resp:
|
|
222
|
+
if resp.status >= 400:
|
|
223
|
+
text = await resp.text()
|
|
224
|
+
logger.warning(
|
|
225
|
+
"COMMIT %s -> %i: %s", commit_url, resp.status, text
|
|
131
226
|
)
|
|
132
227
|
|
|
133
228
|
@cli_function(
|
|
@@ -145,8 +240,29 @@ class SolrIndex(BaseIndex):
|
|
|
145
240
|
type=str,
|
|
146
241
|
),
|
|
147
242
|
] = None,
|
|
243
|
+
index_suffix: Annotated[
|
|
244
|
+
Optional[str],
|
|
245
|
+
cli_parameter(
|
|
246
|
+
"--index-suffix",
|
|
247
|
+
help="Suffix for the latest and all version collections.",
|
|
248
|
+
type=str,
|
|
249
|
+
),
|
|
250
|
+
] = None,
|
|
251
|
+
http_workers: Annotated[
|
|
252
|
+
int,
|
|
253
|
+
cli_parameter(
|
|
254
|
+
"--http-workers", help="Number of ingestion threads.", type=int
|
|
255
|
+
),
|
|
256
|
+
] = 8,
|
|
148
257
|
) -> None:
|
|
149
258
|
"""Add metadata to the apache solr metadata server."""
|
|
150
259
|
async with asyncio.TaskGroup() as tg:
|
|
151
260
|
for core in self.index_names:
|
|
152
|
-
tg.create_task(
|
|
261
|
+
tg.create_task(
|
|
262
|
+
self._index_core(
|
|
263
|
+
server or "",
|
|
264
|
+
core,
|
|
265
|
+
suffix=index_suffix or "",
|
|
266
|
+
http_workers=http_workers,
|
|
267
|
+
)
|
|
268
|
+
)
|
metadata_crawler/logger.py
CHANGED
|
@@ -11,7 +11,7 @@ import appdirs
|
|
|
11
11
|
from rich.console import Console
|
|
12
12
|
from rich.logging import RichHandler
|
|
13
13
|
|
|
14
|
-
THIS_NAME = "
|
|
14
|
+
THIS_NAME = "metadata-crawler"
|
|
15
15
|
|
|
16
16
|
logging.basicConfig(
|
|
17
17
|
level=logging.WARNING,
|
|
@@ -24,7 +24,7 @@ logging.config.dictConfig(
|
|
|
24
24
|
# keep existing handlers
|
|
25
25
|
"disable_existing_loggers": False,
|
|
26
26
|
"root": {
|
|
27
|
-
"level": "
|
|
27
|
+
"level": "CRITICAL",
|
|
28
28
|
"handlers": ["default"],
|
|
29
29
|
},
|
|
30
30
|
"formatters": {
|
|
@@ -36,16 +36,12 @@ logging.config.dictConfig(
|
|
|
36
36
|
"default": {
|
|
37
37
|
"class": "logging.StreamHandler",
|
|
38
38
|
"formatter": "standard",
|
|
39
|
-
"level": "
|
|
39
|
+
"level": "CRITICAL",
|
|
40
40
|
},
|
|
41
41
|
},
|
|
42
42
|
}
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
logging.getLogger("sqlalchemy").setLevel(logging.WARNING)
|
|
46
|
-
logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
|
|
47
|
-
logging.getLogger("sqlalchemy.pool").setLevel(logging.WARNING)
|
|
48
|
-
|
|
49
45
|
|
|
50
46
|
class Logger(logging.Logger):
|
|
51
47
|
"""Custom Logger defining the logging behaviour."""
|
|
@@ -56,11 +52,14 @@ class Logger(logging.Logger):
|
|
|
56
52
|
no_debug: list[str] = ["watchfiles", "httpcore", "pymongo", "pika"]
|
|
57
53
|
|
|
58
54
|
def __init__(
|
|
59
|
-
self,
|
|
55
|
+
self,
|
|
56
|
+
name: Optional[str] = None,
|
|
57
|
+
level: Optional[int] = None,
|
|
58
|
+
suffix: Optional[str] = None,
|
|
60
59
|
) -> None:
|
|
61
60
|
"""Instantiate this logger only once and for all."""
|
|
62
|
-
level = level or int(
|
|
63
|
-
cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.
|
|
61
|
+
self.level = level or int(
|
|
62
|
+
cast(str, os.getenv("MDC_LOG_LEVEL", str(logging.CRITICAL)))
|
|
64
63
|
)
|
|
65
64
|
name = name or THIS_NAME
|
|
66
65
|
logger_format = logging.Formatter(self.logfmt, self.datefmt)
|
|
@@ -78,11 +77,16 @@ class Logger(logging.Logger):
|
|
|
78
77
|
),
|
|
79
78
|
)
|
|
80
79
|
self._logger_stream_handle.setFormatter(logger_format)
|
|
81
|
-
self._logger_stream_handle.setLevel(level)
|
|
82
|
-
super().__init__(name, level)
|
|
80
|
+
self._logger_stream_handle.setLevel(self.level)
|
|
81
|
+
super().__init__(name, self.level)
|
|
83
82
|
|
|
84
83
|
self.propagate = False
|
|
85
84
|
self.handlers = [self._logger_stream_handle]
|
|
85
|
+
(
|
|
86
|
+
self.add_file_handle(suffix=suffix)
|
|
87
|
+
if os.getenv("MDC_LOG_INIT", "0") == "1"
|
|
88
|
+
else None
|
|
89
|
+
)
|
|
86
90
|
|
|
87
91
|
def set_level(self, level: int) -> None:
|
|
88
92
|
"""Set the logger level to level."""
|
|
@@ -92,7 +96,7 @@ class Logger(logging.Logger):
|
|
|
92
96
|
log_level = min(level, logging.CRITICAL)
|
|
93
97
|
handler.setLevel(log_level)
|
|
94
98
|
self.setLevel(level)
|
|
95
|
-
|
|
99
|
+
self.level = level
|
|
96
100
|
|
|
97
101
|
def error(
|
|
98
102
|
self,
|
|
@@ -105,28 +109,30 @@ class Logger(logging.Logger):
|
|
|
105
109
|
kwargs.setdefault("exc_info", True)
|
|
106
110
|
self._log(logging.ERROR, msg, args, **kwargs)
|
|
107
111
|
|
|
108
|
-
|
|
109
|
-
|
|
112
|
+
def add_file_handle(
|
|
113
|
+
self,
|
|
114
|
+
suffix: Optional[str] = None,
|
|
115
|
+
level: int = logging.CRITICAL,
|
|
116
|
+
) -> None:
|
|
117
|
+
"""Add a file log handle to the logger."""
|
|
118
|
+
suffix = suffix or os.getenv("MDC_LOG_SUFFIX", "")
|
|
119
|
+
base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
|
|
120
|
+
log_dir = Path(os.getenv("MDC_LOG_DIR", appdirs.user_log_dir(THIS_NAME)))
|
|
121
|
+
log_dir.mkdir(exist_ok=True, parents=True)
|
|
122
|
+
logger_file_handle = RotatingFileHandler(
|
|
123
|
+
log_dir / f"{base_name}.log",
|
|
124
|
+
mode="a",
|
|
125
|
+
maxBytes=5 * 1024**2,
|
|
126
|
+
backupCount=5,
|
|
127
|
+
encoding="utf-8",
|
|
128
|
+
delay=False,
|
|
129
|
+
)
|
|
130
|
+
logger_file_handle.setFormatter(self.file_format)
|
|
131
|
+
logger_file_handle.setLevel(self.level)
|
|
132
|
+
self.addHandler(logger_file_handle)
|
|
110
133
|
|
|
111
134
|
|
|
112
|
-
|
|
113
|
-
suffix: Optional[str], log_level: int = logging.CRITICAL
|
|
114
|
-
) -> None:
|
|
115
|
-
"""Add a file log handle to the logger."""
|
|
116
|
-
base_name = f"{THIS_NAME}-{suffix}" if suffix else THIS_NAME
|
|
117
|
-
log_dir = Path(appdirs.user_log_dir(THIS_NAME))
|
|
118
|
-
log_dir.mkdir(exist_ok=True, parents=True)
|
|
119
|
-
logger_file_handle = RotatingFileHandler(
|
|
120
|
-
log_dir / f"{base_name}.log",
|
|
121
|
-
mode="a",
|
|
122
|
-
maxBytes=5 * 1024**2,
|
|
123
|
-
backupCount=5,
|
|
124
|
-
encoding="utf-8",
|
|
125
|
-
delay=False,
|
|
126
|
-
)
|
|
127
|
-
logger_file_handle.setFormatter(logger.file_format)
|
|
128
|
-
logger_file_handle.setLevel(min(log_level, logging.CRITICAL))
|
|
129
|
-
logger.addHandler(logger_file_handle)
|
|
135
|
+
logger = Logger()
|
|
130
136
|
|
|
131
137
|
|
|
132
138
|
def get_level_from_verbosity(verbosity: int) -> int:
|
|
@@ -134,9 +140,14 @@ def get_level_from_verbosity(verbosity: int) -> int:
|
|
|
134
140
|
return max(logging.CRITICAL - 10 * verbosity, -1)
|
|
135
141
|
|
|
136
142
|
|
|
137
|
-
def apply_verbosity(
|
|
143
|
+
def apply_verbosity(
|
|
144
|
+
level: Optional[int] = None, suffix: Optional[str] = None
|
|
145
|
+
) -> int:
|
|
138
146
|
"""Set the logging level of the handlers to a certain level."""
|
|
147
|
+
level = logger.level if level is None else level
|
|
139
148
|
old_level = logger.level
|
|
140
149
|
level = get_level_from_verbosity(level)
|
|
141
150
|
logger.set_level(level)
|
|
151
|
+
logger.add_file_handle(suffix, level)
|
|
152
|
+
|
|
142
153
|
return old_level
|
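File logging now lives on the `Logger` class itself and is controlled by environment variables: `MDC_LOG_INIT` attaches the rotating file handler at construction, `MDC_LOG_SUFFIX` feeds the file-name suffix, and `MDC_LOG_DIR` overrides the appdirs location; `apply_verbosity` also accepts the suffix. A hedged sketch of the new knobs (directory and suffix values are examples):

```python
import os

# Set before importing so Logger() sees them; the values are illustrative.
os.environ["MDC_LOG_INIT"] = "1"              # attach the rotating file handler
os.environ["MDC_LOG_SUFFIX"] = "nightly"      # -> metadata-crawler-nightly.log
os.environ["MDC_LOG_DIR"] = "/tmp/mdc-logs"   # overrides appdirs.user_log_dir()

from metadata_crawler.logger import apply_verbosity, logger

old_level = apply_verbosity(4, suffix="nightly")  # four -v's lower the level to DEBUG range
logger.info("written to the console and to the rotating log file")
logger.set_level(old_level)
```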
metadata_crawler/run.py
CHANGED
|
@@ -9,15 +9,21 @@ from types import NoneType
|
|
|
9
9
|
from typing import Any, Collection, Dict, List, Optional, Sequence, Union, cast
|
|
10
10
|
|
|
11
11
|
import tomlkit
|
|
12
|
+
import yaml
|
|
12
13
|
from rich.prompt import Prompt
|
|
13
14
|
|
|
14
15
|
from .api.config import CrawlerSettings, DRSConfig, strip_protocol
|
|
15
|
-
from .api.metadata_stores import
|
|
16
|
+
from .api.metadata_stores import (
|
|
17
|
+
CatalogueBackendType,
|
|
18
|
+
CatalogueReader,
|
|
19
|
+
IndexName,
|
|
20
|
+
)
|
|
16
21
|
from .data_collector import DataCollector
|
|
17
22
|
from .logger import apply_verbosity, get_level_from_verbosity, logger
|
|
18
23
|
from .utils import (
|
|
19
24
|
Console,
|
|
20
25
|
EmptyCrawl,
|
|
26
|
+
IndexProgress,
|
|
21
27
|
MetadataCrawlerException,
|
|
22
28
|
find_closest,
|
|
23
29
|
load_plugins,
|
|
@@ -49,6 +55,20 @@ def _match(match: str, items: Collection[str]) -> List[str]:
|
|
|
49
55
|
return out
|
|
50
56
|
|
|
51
57
|
|
|
58
|
+
def _get_num_of_indexed_objects(
|
|
59
|
+
catalogue_files: FilesArg, storage_options: Optional[Dict[str, Any]] = None
|
|
60
|
+
) -> int:
|
|
61
|
+
num_objects = 0
|
|
62
|
+
storage_options = storage_options or {}
|
|
63
|
+
for cat_file in _norm_files(catalogue_files):
|
|
64
|
+
try:
|
|
65
|
+
cat = CatalogueReader.load_catalogue(cat_file, **storage_options)
|
|
66
|
+
num_objects += cat.get("metadata", {}).get("indexed_objects", 0)
|
|
67
|
+
except (FileNotFoundError, IsADirectoryError, yaml.parser.ParserError):
|
|
68
|
+
pass
|
|
69
|
+
return num_objects
|
|
70
|
+
|
|
71
|
+
|
|
52
72
|
def _get_search(
|
|
53
73
|
config_file: Union[str, Path, Dict[str, Any], tomlkit.TOMLDocument],
|
|
54
74
|
search_dirs: Optional[List[str]] = None,
|
|
@@ -86,14 +106,22 @@ async def async_call(
|
|
|
86
106
|
batch_size: int = 2500,
|
|
87
107
|
catalogue_files: Optional[Sequence[Union[Path, str]]] = None,
|
|
88
108
|
verbosity: int = 0,
|
|
109
|
+
log_suffix: Optional[str] = None,
|
|
110
|
+
num_objects: int = 0,
|
|
89
111
|
*args: Any,
|
|
90
112
|
**kwargs: Any,
|
|
91
113
|
) -> None:
|
|
92
|
-
"""
|
|
114
|
+
"""Add / Delete metadata from index."""
|
|
93
115
|
env = cast(os._Environ[str], os.environ.copy())
|
|
94
|
-
old_level = apply_verbosity(verbosity)
|
|
116
|
+
old_level = apply_verbosity(verbosity, suffix=log_suffix)
|
|
117
|
+
|
|
95
118
|
try:
|
|
119
|
+
progress = IndexProgress(total=num_objects)
|
|
120
|
+
os.environ["MDC_LOG_INIT"] = "1"
|
|
96
121
|
os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
|
|
122
|
+
os.environ["MDC_LOG_SUFFIX"] = (
|
|
123
|
+
log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
|
|
124
|
+
)
|
|
97
125
|
backends = load_plugins("metadata_crawler.ingester")
|
|
98
126
|
try:
|
|
99
127
|
cls = backends[index_system]
|
|
@@ -107,18 +135,22 @@ async def async_call(
|
|
|
107
135
|
flat_files = flat_files or [""]
|
|
108
136
|
futures = []
|
|
109
137
|
storage_options = kwargs.pop("storage_options", {})
|
|
138
|
+
progress.start()
|
|
110
139
|
for cf in flat_files:
|
|
111
140
|
obj = cls(
|
|
112
141
|
batch_size=batch_size,
|
|
113
142
|
catalogue_file=cf or None,
|
|
114
143
|
storage_options=storage_options,
|
|
144
|
+
progress=progress,
|
|
115
145
|
)
|
|
116
146
|
func = getattr(obj, method)
|
|
117
147
|
future = _event_loop.create_task(func(**kwargs))
|
|
118
148
|
futures.append(future)
|
|
119
149
|
await asyncio.gather(*futures)
|
|
150
|
+
|
|
120
151
|
finally:
|
|
121
152
|
os.environ = env
|
|
153
|
+
progress.stop()
|
|
122
154
|
logger.set_level(old_level)
|
|
123
155
|
|
|
124
156
|
|
|
@@ -127,6 +159,7 @@ async def async_index(
|
|
|
127
159
|
*catalogue_files: Union[Path, str, List[str], List[Path]],
|
|
128
160
|
batch_size: int = 2500,
|
|
129
161
|
verbosity: int = 0,
|
|
162
|
+
log_suffix: Optional[str] = None,
|
|
130
163
|
**kwargs: Any,
|
|
131
164
|
) -> None:
|
|
132
165
|
"""Index metadata in the indexing system.
|
|
@@ -142,6 +175,8 @@ async def async_index(
|
|
|
142
175
|
If the index system supports batch-sizes, the size of the batches.
|
|
143
176
|
verbosity:
|
|
144
177
|
Set the verbosity of the system.
|
|
178
|
+
log_suffix:
|
|
179
|
+
Add a suffix to the log file output.
|
|
145
180
|
|
|
146
181
|
Other Parameters
|
|
147
182
|
^^^^^^^^^^^^^^^^
|
|
@@ -168,6 +203,11 @@ async def async_index(
|
|
|
168
203
|
"index",
|
|
169
204
|
batch_size=batch_size,
|
|
170
205
|
verbosity=verbosity,
|
|
206
|
+
log_suffix=log_suffix,
|
|
207
|
+
num_objects=_get_num_of_indexed_objects(
|
|
208
|
+
kwargs["catalogue_files"],
|
|
209
|
+
storage_options=kwargs.get("storage_options"),
|
|
210
|
+
),
|
|
171
211
|
**kwargs,
|
|
172
212
|
)
|
|
173
213
|
|
|
@@ -176,6 +216,7 @@ async def async_delete(
|
|
|
176
216
|
index_system: str,
|
|
177
217
|
batch_size: int = 2500,
|
|
178
218
|
verbosity: int = 0,
|
|
219
|
+
log_suffix: Optional[str] = None,
|
|
179
220
|
**kwargs: Any,
|
|
180
221
|
) -> None:
|
|
181
222
|
"""Delete metadata from the indexing system.
|
|
@@ -188,6 +229,8 @@ async def async_delete(
|
|
|
188
229
|
If the index system supports batch-sizes, the size of the batches.
|
|
189
230
|
verbosity:
|
|
190
231
|
Set the verbosity of the system.
|
|
232
|
+
log_suffix:
|
|
233
|
+
Add a suffix to the log file output.
|
|
191
234
|
|
|
192
235
|
Other Parameters
|
|
193
236
|
^^^^^^^^^^^^^^^^^
|
|
@@ -212,6 +255,7 @@ async def async_delete(
|
|
|
212
255
|
"delete",
|
|
213
256
|
batch_size=batch_size,
|
|
214
257
|
verbosity=verbosity,
|
|
258
|
+
log_suffix=log_suffix,
|
|
215
259
|
**kwargs,
|
|
216
260
|
)
|
|
217
261
|
|
|
@@ -236,6 +280,7 @@ async def async_add(
|
|
|
236
280
|
password: bool = False,
|
|
237
281
|
n_procs: Optional[int] = None,
|
|
238
282
|
verbosity: int = 0,
|
|
283
|
+
log_suffix: Optional[str] = None,
|
|
239
284
|
fail_under: int = -1,
|
|
240
285
|
**kwargs: Any,
|
|
241
286
|
) -> None:
|
|
@@ -282,6 +327,8 @@ async def async_add(
|
|
|
282
327
|
Set the number of parallel processes for collecting.
|
|
283
328
|
verbosity:
|
|
284
329
|
Set the verbosity of the system.
|
|
330
|
+
log_suffix:
|
|
331
|
+
Add a suffix to the log file output.
|
|
285
332
|
fail_under:
|
|
286
333
|
Fail if less than X of the discovered files could be indexed.
|
|
287
334
|
|
|
@@ -305,9 +352,13 @@ async def async_add(
|
|
|
305
352
|
|
|
306
353
|
"""
|
|
307
354
|
env = cast(os._Environ[str], os.environ.copy())
|
|
308
|
-
old_level = apply_verbosity(verbosity)
|
|
355
|
+
old_level = apply_verbosity(verbosity, suffix=log_suffix)
|
|
309
356
|
try:
|
|
357
|
+
os.environ["MDC_LOG_INIT"] = "1"
|
|
310
358
|
os.environ["MDC_LOG_LEVEL"] = str(get_level_from_verbosity(verbosity))
|
|
359
|
+
os.environ["MDC_LOG_SUFFIX"] = (
|
|
360
|
+
log_suffix or os.getenv("MDC_LOG_SUFFIX") or ""
|
|
361
|
+
)
|
|
311
362
|
config_file = config_file or os.environ.get(
|
|
312
363
|
"EVALUATION_SYSTEM_CONFIG_DIR"
|
|
313
364
|
)
|
|
@@ -316,7 +367,7 @@ async def async_add(
|
|
|
316
367
|
"You must give a config file/directory"
|
|
317
368
|
)
|
|
318
369
|
st = time.time()
|
|
319
|
-
passwd =
|
|
370
|
+
passwd: Optional[str] = None
|
|
320
371
|
if password: # pragma: no cover
|
|
321
372
|
passwd = Prompt.ask(
|
|
322
373
|
"[b]Enter the password", password=True
|
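`async_index` now pre-computes the progress total by summing the `indexed_objects` field over all given catalogues; unreadable or malformed files simply contribute nothing. A simplified sketch of that behaviour using a plain `open` (the real helper, `_get_num_of_indexed_objects`, goes through `CatalogueReader.load_catalogue` and fsspec):

```python
from typing import Iterable

import yaml

def count_indexed_objects(catalogue_files: Iterable[str]) -> int:
    """Sum metadata['indexed_objects'] over the catalogues that can be read."""
    total = 0
    for cat_file in catalogue_files:
        try:
            with open(cat_file) as stream:
                cat = yaml.safe_load(stream) or {}
            total += cat.get("metadata", {}).get("indexed_objects", 0)
        except (FileNotFoundError, IsADirectoryError, yaml.parser.ParserError):
            pass  # broken or missing catalogues are silently skipped
    return total

print(count_indexed_objects(["cat.yaml", "does-not-exist.yaml"]))
```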
metadata_crawler/utils.py
CHANGED
|
@@ -32,6 +32,7 @@ import rich.console
|
|
|
32
32
|
import rich.spinner
|
|
33
33
|
from dateutil.parser import isoparse
|
|
34
34
|
from rich.live import Live
|
|
35
|
+
from rich.progress import Progress, TaskID
|
|
35
36
|
|
|
36
37
|
from .logger import logger
|
|
37
38
|
|
|
@@ -330,6 +331,76 @@ def timedelta_to_str(seconds: Union[int, float]) -> str:
|
|
|
330
331
|
return " ".join(out[::-1])
|
|
331
332
|
|
|
332
333
|
|
|
334
|
+
class IndexProgress:
|
|
335
|
+
"""A helper that displays the progress of index Tasks."""
|
|
336
|
+
|
|
337
|
+
def __init__(
|
|
338
|
+
self,
|
|
339
|
+
total: int = 0,
|
|
340
|
+
interactive: Optional[bool] = None,
|
|
341
|
+
text: str = "Indexing: ",
|
|
342
|
+
) -> None:
|
|
343
|
+
if interactive is None:
|
|
344
|
+
self._interactive = bool(
|
|
345
|
+
int(os.getenv("MDC_INTERACTIVE", str(int(Console.is_terminal))))
|
|
346
|
+
)
|
|
347
|
+
else:
|
|
348
|
+
self._interactive = interactive
|
|
349
|
+
self._log_interval = int(os.getenv("MDC_LOG_INTERVAL", "30"))
|
|
350
|
+
self.text = text
|
|
351
|
+
self._done = 0
|
|
352
|
+
self._task: TaskID = TaskID(0)
|
|
353
|
+
self._total = total
|
|
354
|
+
self._start = self._last_log = time.time()
|
|
355
|
+
self._progress = Progress()
|
|
356
|
+
self._last_printed_percent: float = -1.0
|
|
357
|
+
|
|
358
|
+
def start(self) -> None:
|
|
359
|
+
"""Start the progress bar."""
|
|
360
|
+
self._start = self._last_log = time.time()
|
|
361
|
+
|
|
362
|
+
if self._interactive:
|
|
363
|
+
self._task = self._progress.add_task(
|
|
364
|
+
f"[green] {self.text}", total=self._total or None
|
|
365
|
+
)
|
|
366
|
+
self._progress.start()
|
|
367
|
+
|
|
368
|
+
def stop(self) -> None:
|
|
369
|
+
"""Stop the progress bar."""
|
|
370
|
+
if self._interactive:
|
|
371
|
+
self._progress.stop()
|
|
372
|
+
else:
|
|
373
|
+
self._text_update()
|
|
374
|
+
|
|
375
|
+
def _text_update(self, bar_width: int = 40) -> None:
|
|
376
|
+
elapsed = timedelta(seconds=int(time.time() - self._start))
|
|
377
|
+
log_interval = timedelta(seconds=int(time.time() - self._last_log))
|
|
378
|
+
if self._total > 0:
|
|
379
|
+
filled = int((self._last_printed_percent / 100) * bar_width)
|
|
380
|
+
bar = "#" * filled + "-" * (bar_width - filled)
|
|
381
|
+
text = f"{self.text} [{bar}] {self._last_printed_percent:>6,.02f}%"
|
|
382
|
+
else:
|
|
383
|
+
text = f"{self.text} [{self._done:>12,}]"
|
|
384
|
+
if log_interval.total_seconds() >= self._log_interval:
|
|
385
|
+
print(f"{text} ({elapsed})", flush=True)
|
|
386
|
+
self._last_log = time.time()
|
|
387
|
+
|
|
388
|
+
def update(self, inc: int) -> None:
|
|
389
|
+
"""Update the status progress bar by an increment."""
|
|
390
|
+
self._done += inc
|
|
391
|
+
|
|
392
|
+
if self._interactive is True:
|
|
393
|
+
desc = f"{self.text} [{self._done:>10d}]" if self._done == 0 else None
|
|
394
|
+
self._progress.update(self._task, advance=inc, description=desc)
|
|
395
|
+
return
|
|
396
|
+
|
|
397
|
+
frac = self._done / max(self._total, 1)
|
|
398
|
+
pct = frac * 100
|
|
399
|
+
if pct > self._last_printed_percent or self._total == 0:
|
|
400
|
+
self._last_printed_percent = pct
|
|
401
|
+
self._text_update()
|
|
402
|
+
|
|
403
|
+
|
|
333
404
|
@daemon
|
|
334
405
|
def print_performance(
|
|
335
406
|
print_status: EventLike,
|
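`IndexProgress` wraps a rich progress bar when running interactively and falls back to throttled plain-text status lines otherwise; `MDC_INTERACTIVE` and `MDC_LOG_INTERVAL` tune that behaviour. A short usage sketch (total and increments are made up, and the log interval is forced to 0 so every update prints):

```python
import os

os.environ["MDC_LOG_INTERVAL"] = "0"  # print a status line on every update

from metadata_crawler.utils import IndexProgress

progress = IndexProgress(total=10_000, interactive=False, text="Indexing: ")
progress.start()
for _ in range(10):
    progress.update(1_000)  # BaseIndex.get_metadata advances this per batch
progress.stop()
```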
|
{metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: metadata-crawler
|
|
3
|
-
Version: 2509.0.0
|
|
3
|
+
Version: 2509.0.2
|
|
4
4
|
Summary: Crawl, extract and push climate metadata for indexing.
|
|
5
5
|
Author-email: "DKRZ, Clint" <freva@dkrz.de>
|
|
6
6
|
Requires-Python: >=3.11
|
|
@@ -34,7 +34,7 @@ Requires-Dist: numpy
|
|
|
34
34
|
Requires-Dist: orjson
|
|
35
35
|
Requires-Dist: pyarrow
|
|
36
36
|
Requires-Dist: h5netcdf
|
|
37
|
-
Requires-Dist: pydantic
|
|
37
|
+
Requires-Dist: pydantic<2.12
|
|
38
38
|
Requires-Dist: pyarrow
|
|
39
39
|
Requires-Dist: rich
|
|
40
40
|
Requires-Dist: rich-argparse
|
|
@@ -83,10 +83,10 @@ Requires-Dist: pytest-env ; extra == "tests"
|
|
|
83
83
|
Requires-Dist: requests ; extra == "tests"
|
|
84
84
|
Requires-Dist: pre-commit ; extra == "tests"
|
|
85
85
|
Requires-Dist: toml ; extra == "tests"
|
|
86
|
-
Project-URL: Documentation, https://
|
|
87
|
-
Project-URL: Home, https://github.com/freva-org/
|
|
88
|
-
Project-URL: Issues, https://github.com/freva-org/
|
|
89
|
-
Project-URL: Source, https://github.com/freva-org/
|
|
86
|
+
Project-URL: Documentation, https://metadata-crawler.readthedocs.io
|
|
87
|
+
Project-URL: Home, https://github.com/freva-org/metadata-crawler
|
|
88
|
+
Project-URL: Issues, https://github.com/freva-org/metadata-crawler/issues
|
|
89
|
+
Project-URL: Source, https://github.com/freva-org/metadata-crawler
|
|
90
90
|
Provides-Extra: dev
|
|
91
91
|
Provides-Extra: doc
|
|
92
92
|
Provides-Extra: mkdoc
|
|
@@ -95,25 +95,27 @@ Provides-Extra: tests
|
|
|
95
95
|
# metadata-crawler
|
|
96
96
|
|
|
97
97
|
[](LICENSE)
|
|
98
|
-
[](https://pypi.org/project/metadata-crawler/)
|
|
99
|
+
[](https://anaconda.org/conda-forge/metadata-crawler)
|
|
99
100
|
[](https://metadata-crawler.readthedocs.io/en/latest/?badge=latest)
|
|
100
101
|
[](https://github.com/freva-org/metadata-crawler/actions)
|
|
101
102
|
[](https://codecov.io/gh/freva-org/metadata-crawler)
|
|
102
103
|
|
|
104
|
+
|
|
103
105
|
Harvest, normalise, and index climate / earth-system metadata from **POSIX**,
|
|
104
106
|
**S3/MinIO**, and **OpenStack Swift** using configurable **DRS dialects**
|
|
105
|
-
(CMIP6, CMIP5, CORDEX, …). Output to a temporary **catalogue** (
|
|
106
|
-
|
|
107
|
+
(CMIP6, CMIP5, CORDEX, …). Output to a temporary **catalogue** (JSONLines)
|
|
108
|
+
and then **index** into systems such as **Solr** or **MongoDB**.
|
|
107
109
|
Configuration is **TOML** with inheritance, templating, and computed rules.
|
|
108
110
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
111
|
+
## TL;DR
|
|
112
|
+
|
|
113
|
+
- Define datasets + dialects in ``drs_config.toml``
|
|
114
|
+
- ``mdc add`` → write a temporary catalogue (``jsonl.gz``)
|
|
115
|
+
- ``mdc config`` → inspect a the (merged) crawler config.
|
|
116
|
+
- ``mdc walk-intake`` → inspect the content of an intake catalogue.
|
|
117
|
+
- ``mdc <backend> index`` → push records from catalogue into your index backend
|
|
118
|
+
- ``mdc <backend> delete`` → remove records by facet match
|
|
117
119
|
|
|
118
120
|
## Features
|
|
119
121
|
|
|
@@ -126,7 +128,7 @@ Configuration is **TOML** with inheritance, templating, and computed rules.
|
|
|
126
128
|
dataset attributes/vars
|
|
127
129
|
- **Special rules**: conditionals, cache lookups and function calls (e.g. CMIP6 realm,
|
|
128
130
|
time aggregation)
|
|
129
|
-
- **Index backends**:
|
|
131
|
+
- **Index backends**: MongoDB (Motor), Solr
|
|
130
132
|
- **Sync + Async APIs** and a clean CLI
|
|
131
133
|
- **Docs**: Sphinx with ``pydata_sphinx_theme``
|
|
132
134
|
|
|
@@ -143,14 +145,14 @@ Configuration is **TOML** with inheritance, templating, and computed rules.
|
|
|
143
145
|
```console
|
|
144
146
|
|
|
145
147
|
# 1) Crawl → write catalogue
|
|
146
|
-
mdc
|
|
148
|
+
mdc add \
|
|
147
149
|
cat.yaml \
|
|
148
150
|
--config-file drs_config.toml \
|
|
149
151
|
--dataset cmip6-fs,obs-fs \
|
|
150
152
|
--threads 4 --batch-size 100
|
|
151
153
|
|
|
152
|
-
# 2) Index from catalogue → Solr (or Mongo
|
|
153
|
-
mdc
|
|
154
|
+
# 2) Index from catalogue → Solr (or Mongo)
|
|
155
|
+
mdc solr index \
|
|
154
156
|
cat.yaml \
|
|
155
157
|
--server localhot:8983
|
|
156
158
|
|
|
metadata_crawler-2509.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,34 @@
|
|
|
1
|
+
metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
|
|
2
|
+
metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
|
|
3
|
+
metadata_crawler/_version.py,sha256=9-K5oUNmfiY2VyddRsxyD-fcZp54m4x8eeX3XbXHEV0,25
|
|
4
|
+
metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
|
|
5
|
+
metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
|
|
6
|
+
metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
|
|
7
|
+
metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
metadata_crawler/run.py,sha256=ytkYZQGWQ1jAvm8_ZbVPfTydGoHTEAhKWbajlkt6oU4,13033
|
|
9
|
+
metadata_crawler/utils.py,sha256=Nm1DkyBD8PyBOP-EUf-Vqs-mLQUPu-6gWPgvNkGDmq8,14124
|
|
10
|
+
metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
|
|
11
|
+
metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
|
|
12
|
+
metadata_crawler/api/config.py,sha256=MxxAN1y2FtHlUU42nBfQds5_8R_OSDdnHXsZANx6IFY,28373
|
|
13
|
+
metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
|
|
14
|
+
metadata_crawler/api/index.py,sha256=9hafNfNEbmw2tIVYq7jPagz7RaDtxXjs_L-YtFVvNJk,4411
|
|
15
|
+
metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
|
|
16
|
+
metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
|
|
17
|
+
metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
|
|
18
|
+
metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
|
|
19
|
+
metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
|
|
20
|
+
metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
|
|
21
|
+
metadata_crawler/api/mixin/template_mixin.py,sha256=hxQXiP_JND3fuxBNcs1pZ7cvP-k-lTm5MQg40t0kF54,5105
|
|
22
|
+
metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
|
|
23
|
+
metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
|
|
24
|
+
metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
|
|
25
|
+
metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqeeE,4602
|
|
26
|
+
metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
|
|
27
|
+
metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
|
|
28
|
+
metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
|
|
29
|
+
metadata_crawler/ingester/solr.py,sha256=cRHe47l3WFZEFLZkHD1q-aPVjimi8H03xgL994XO1Lg,8988
|
|
30
|
+
metadata_crawler-2509.0.2.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
|
|
31
|
+
metadata_crawler-2509.0.2.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
|
|
32
|
+
metadata_crawler-2509.0.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
33
|
+
metadata_crawler-2509.0.2.dist-info/METADATA,sha256=b32DEUfPeWaSKbhdZYw_1qi57-yIyS0Z2PhaaH4EDK8,13006
|
|
34
|
+
metadata_crawler-2509.0.2.dist-info/RECORD,,
|
|
metadata_crawler-2509.0.0.dist-info/RECORD
REMOVED
@@ -1,34 +0,0 @@
|
|
|
1
|
-
metadata_crawler/__init__.py,sha256=7gEpJjS9FpR6MHRY_Ztk8ORJ8JQ7WZUTV2TfLkaYgqs,6741
|
|
2
|
-
metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
|
|
3
|
-
metadata_crawler/_version.py,sha256=Z6_4SgU9Dpc127xJlyvGKjeWd_Q1ONlOHQO123XGv30,25
|
|
4
|
-
metadata_crawler/cli.py,sha256=meY5ZfR5VEW5ZorOPWO_b4MyIIQy0wTTPs9OkJ1WnfA,17180
|
|
5
|
-
metadata_crawler/data_collector.py,sha256=9CVr4arKJspyLNLuF2MfkmY_r8x74Mw8hAaDSMouQUA,8372
|
|
6
|
-
metadata_crawler/logger.py,sha256=5Lc0KdzH2HdWkidW-MASW8Pfy7vTMnzPv1-e2V3Any0,4407
|
|
7
|
-
metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
metadata_crawler/run.py,sha256=w1kV4D63dS3mdgDTQj2ngzeSCjZPphWg1HwIJeJ6ATE,11345
|
|
9
|
-
metadata_crawler/utils.py,sha256=QNr_9jZkuuQOrkuO46PrFhUfwLmfCJCq9gWUwwARfyM,11580
|
|
10
|
-
metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
|
|
11
|
-
metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
|
|
12
|
-
metadata_crawler/api/config.py,sha256=j__JDKYTOR8kYC--HaHlYXfz38rzEhtUvHdO5Bh_j2E,28250
|
|
13
|
-
metadata_crawler/api/drs_config.toml,sha256=90lQaSC2VdJ8OUoc6j27kg6d2OnfxR5a_KZH3W-FZV4,10603
|
|
14
|
-
metadata_crawler/api/index.py,sha256=8g5HdSxluKtCwU45P0w_7LDIaSf200JbB-ekGJiI18c,4130
|
|
15
|
-
metadata_crawler/api/metadata_stores.py,sha256=oWewL6XRmNZ6i5WxYI8Lm2jfpwLqBCGP2p4j3wLLNpQ,23735
|
|
16
|
-
metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
|
|
17
|
-
metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
|
|
18
|
-
metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
|
|
19
|
-
metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
|
|
20
|
-
metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
|
|
21
|
-
metadata_crawler/api/mixin/template_mixin.py,sha256=_qDp5n_CPnSYPMBsTia44b1ybBqrJEi-M1NaRkQ0z3U,5106
|
|
22
|
-
metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
|
|
23
|
-
metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
|
|
24
|
-
metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
|
|
25
|
-
metadata_crawler/backends/s3.py,sha256=DPz_bOyOlUveCwkSLVatwU_mcxUbFvygU_Id1AZVIMA,4455
|
|
26
|
-
metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
|
|
27
|
-
metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
|
|
28
|
-
metadata_crawler/ingester/mongo.py,sha256=lpWIZ8mo6S8oY887uz2l6Y9pir0sUVEkfgOdDxrjIMM,6142
|
|
29
|
-
metadata_crawler/ingester/solr.py,sha256=EoKS3kFeDTLf9zP22s2DhQGP81T6rTXVWDNT2wWKFkk,5242
|
|
30
|
-
metadata_crawler-2509.0.0.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
|
|
31
|
-
metadata_crawler-2509.0.0.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
|
|
32
|
-
metadata_crawler-2509.0.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
33
|
-
metadata_crawler-2509.0.0.dist-info/METADATA,sha256=Dk0trqXYleepz1L8HXwKF-vAdSQww1zBm4Q014G4aOU,12938
|
|
34
|
-
metadata_crawler-2509.0.0.dist-info/RECORD,,
|
|
{metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/WHEEL
RENAMED
File without changes
|
{metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{metadata_crawler-2509.0.0.dist-info → metadata_crawler-2509.0.2.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|