metadata-crawler 2510.1.0__py3-none-any.whl
- metadata_crawler/__init__.py +263 -0
- metadata_crawler/__main__.py +8 -0
- metadata_crawler/_version.py +1 -0
- metadata_crawler/api/__init__.py +1 -0
- metadata_crawler/api/cli.py +57 -0
- metadata_crawler/api/config.py +831 -0
- metadata_crawler/api/drs_config.toml +440 -0
- metadata_crawler/api/index.py +151 -0
- metadata_crawler/api/metadata_stores.py +755 -0
- metadata_crawler/api/mixin/__init__.py +7 -0
- metadata_crawler/api/mixin/lookup_mixin.py +112 -0
- metadata_crawler/api/mixin/lookup_tables.py +10010 -0
- metadata_crawler/api/mixin/path_mixin.py +46 -0
- metadata_crawler/api/mixin/template_mixin.py +145 -0
- metadata_crawler/api/storage_backend.py +277 -0
- metadata_crawler/backends/__init__.py +1 -0
- metadata_crawler/backends/intake.py +211 -0
- metadata_crawler/backends/posix.py +121 -0
- metadata_crawler/backends/s3.py +140 -0
- metadata_crawler/backends/swift.py +305 -0
- metadata_crawler/cli.py +547 -0
- metadata_crawler/data_collector.py +278 -0
- metadata_crawler/ingester/__init__.py +1 -0
- metadata_crawler/ingester/mongo.py +206 -0
- metadata_crawler/ingester/solr.py +282 -0
- metadata_crawler/logger.py +153 -0
- metadata_crawler/py.typed +0 -0
- metadata_crawler/run.py +419 -0
- metadata_crawler/utils/__init__.py +482 -0
- metadata_crawler/utils/cftime_utils.py +207 -0
- metadata_crawler-2510.1.0.dist-info/METADATA +401 -0
- metadata_crawler-2510.1.0.dist-info/RECORD +35 -0
- metadata_crawler-2510.1.0.dist-info/WHEEL +4 -0
- metadata_crawler-2510.1.0.dist-info/entry_points.txt +14 -0
- metadata_crawler-2510.1.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,401 @@
Metadata-Version: 2.4
Name: metadata-crawler
Version: 2510.1.0
Summary: Crawl, extract and push climate metadata for indexing.
Author-email: "DKRZ, Clint" <freva@dkrz.de>
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: POSIX :: Linux
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
License-File: LICENSE
Requires-Dist: aiohttp
Requires-Dist: appdirs
Requires-Dist: anyio
Requires-Dist: ciso8601
Requires-Dist: fsspec
Requires-Dist: diskcache
Requires-Dist: s3fs
Requires-Dist: jinja2
Requires-Dist: intake
Requires-Dist: intake-xarray
Requires-Dist: intake-esm
Requires-Dist: pandas
Requires-Dist: python-dateutil
Requires-Dist: numpy
Requires-Dist: orjson
Requires-Dist: pyarrow
Requires-Dist: h5netcdf
Requires-Dist: pydantic<2.12
Requires-Dist: pyarrow
Requires-Dist: rich
Requires-Dist: rich-argparse
Requires-Dist: tomli
Requires-Dist: tomlkit
Requires-Dist: typing_extensions
Requires-Dist: zarr
Requires-Dist: xarray
Requires-Dist: httpx
Requires-Dist: uvloop
Requires-Dist: motor
Requires-Dist: flit ; extra == "dev"
Requires-Dist: tox ; extra == "dev"
Requires-Dist: codespell ; extra == "doc"
Requires-Dist: blacken-docs ; extra == "doc"
Requires-Dist: numpydoc ; extra == "doc"
Requires-Dist: sphinx ; extra == "doc"
Requires-Dist: sphinxcontrib_github_alt ; extra == "doc"
Requires-Dist: sphinx-execute-code-python3 ; extra == "doc"
Requires-Dist: sphinx-copybutton ; extra == "doc"
Requires-Dist: sphinx-sitemap ; extra == "doc"
Requires-Dist: sphinx-togglebutton ; extra == "doc"
Requires-Dist: sphinxext-opengraph[social-cards] ; extra == "doc"
Requires-Dist: pydata-sphinx-theme ; extra == "doc"
Requires-Dist: myst-parser ; extra == "doc"
Requires-Dist: mkdocs ; extra == "mkdoc"
Requires-Dist: mkdocs-material ; extra == "mkdoc"
Requires-Dist: mkdocstrings[python] ; extra == "mkdoc"
Requires-Dist: mkdocs-macros-plugin ; extra == "mkdoc"
Requires-Dist: mkdocs-minify-plugin ; extra == "mkdoc"
Requires-Dist: mkdocs-redirects ; extra == "mkdoc"
Requires-Dist: mike ; extra == "mkdoc"
Requires-Dist: codespell ; extra == "tests"
Requires-Dist: pydocstyle ; extra == "tests"
Requires-Dist: types-appdirs ; extra == "tests"
Requires-Dist: black ; extra == "tests"
Requires-Dist: isort ; extra == "tests"
Requires-Dist: mock ; extra == "tests"
Requires-Dist: mypy ; extra == "tests"
Requires-Dist: netcdf4 ; extra == "tests"
Requires-Dist: pandas ; extra == "tests"
Requires-Dist: intake-parquet ; extra == "tests"
Requires-Dist: pytest-asyncio ; extra == "tests"
Requires-Dist: pytest-cov ; extra == "tests"
Requires-Dist: pytest-env ; extra == "tests"
Requires-Dist: requests ; extra == "tests"
Requires-Dist: pre-commit ; extra == "tests"
Requires-Dist: toml ; extra == "tests"
Project-URL: Documentation, https://metadata-crawler.readthedocs.io
Project-URL: Home, https://github.com/freva-org/metadata-crawler
Project-URL: Issues, https://github.com/freva-org/metadata-crawler/issues
Project-URL: Source, https://github.com/freva-org/metadata-crawler
Provides-Extra: dev
Provides-Extra: doc
Provides-Extra: mkdoc
Provides-Extra: tests

# metadata-crawler

[](LICENSE)
[](https://pypi.org/project/metadata-crawler/)
[](https://anaconda.org/conda-forge/metadata-crawler)
[](https://metadata-crawler.readthedocs.io/en/latest/?badge=latest)
[](https://github.com/freva-org/metadata-crawler/actions)
[](https://codecov.io/gh/freva-org/metadata-crawler)

Harvest, normalise, and index climate / earth-system metadata from **POSIX**,
**S3/MinIO**, and **OpenStack Swift** storage using configurable **DRS dialects**
(CMIP6, CMIP5, CORDEX, …). Write output to a temporary **catalogue** (JSONLines),
then **index** it into systems such as **Solr** or **MongoDB**.
Configuration is **TOML** with inheritance, templating, and computed rules.

## TL;DR

- Define datasets + dialects in ``drs_config.toml``
- ``mdc add`` → write a temporary catalogue (``jsonl.gz``)
- ``mdc config`` → inspect the (merged) crawler config
- ``mdc walk-intake`` → inspect the content of an intake catalogue
- ``mdc <backend> index`` → push records from the catalogue into your index backend
- ``mdc <backend> delete`` → remove records by facet match

## Features

- **Multi-backend discovery**: POSIX, S3/MinIO, Swift (async REST), Intake
- **Two-stage pipeline**: *crawl → catalogue* then *catalogue → index*
- **Schema driven**: strong types (e.g. ``string``, ``datetime[2]``,
  ``float[4]``, ``string[]``)
- **DRS dialects**: packaged CMIP6/CMIP5/CORDEX; build your own via inheritance
- **Path specs & data specs**: parse directory/filename parts and/or read
  dataset attributes/variables
- **Special rules**: conditionals, cache lookups and function calls (e.g. CMIP6
  realm, time aggregation)
- **Index backends**: MongoDB (Motor), Solr
- **Sync + Async APIs** and a clean CLI
- **Docs**: Sphinx with ``pydata_sphinx_theme``

## Install

```console
pip install metadata-crawler
# or, via conda-forge:
conda install -c conda-forge metadata-crawler
```

## Quickstart (CLI)

```console
# 1) Crawl → write catalogue
mdc add \
    cat.yaml \
    --config-file drs_config.toml \
    --dataset cmip6-fs,obs-fs \
    --threads 4 --batch-size 100

# 2) Index from catalogue → Solr (or Mongo)
mdc solr index \
    cat.yaml \
    --server localhost:8983

# 3) Delete by facets (supports globs on values)
mdc solr delete \
    --server localhost:8983 \
    --facets "file *.nc" --facets "project CMIP6"
```

> [!NOTE]
> The CLI is a **custom framework** inspired by Typer (not Typer itself).
> Use ``--help`` on any subcommand to see all options.

## Minimal config (``drs_config.toml``)

```toml
# === Canonical schema ===
[drs_settings.schema.file]
key = "file"
type = "path"
required = true
indexed = true
unique = true

[drs_settings.schema.uri]
key = "uri"
type = "uri"
required = true
indexed = true

[drs_settings.schema.variable]
key = "variable"
type = "string[]"
multi_valued = true
indexed = true

[drs_settings.schema.time]
key = "time"
type = "datetime[2]"  # [start, end]
indexed = true
default = []

[drs_settings.schema.bbox]
key = "bbox"
type = "float[4]"  # [W,E,S,N]
default = [0, 360, -90, 90]

# === Dialect: CMIP6 (example) ===
[drs_settings.dialect.cmip6]
sources = ["path", "data"]  # path | data | storage
defaults.grid_label = "gn"
specs_dir = ["mip_era", "activity_id", "institution_id", "source_id", "experiment_id", "member_id", "table_id", "variable_id", "grid_label", "version"]
specs_file = ["variable_id", "table_id", "source_id", "experiment_id", "member_id", "grid_label", "time"]

[drs_settings.dialect.cmip6.special.realm]
type = "method"
method = "_get_realm"
args = ["table_id", "variable_id", "__file_name__"]

[drs_settings.dialect.cmip6.special.time_aggregation]
type = "method"
method = "_get_aggregation"
args = ["table_id", "variable_id", "__file_name__"]

# === Dialect: CORDEX (bbox by domain) ===
[drs_settings.dialect.cordex]
sources = ["path", "data"]
specs_dir = ["project", "product", "domain", "institution", "driving_model", "experiment", "ensemble", "rcm_name", "rcm_version", "time_frequency", "variable", "version"]
specs_file = ["variable", "domain", "driving_model", "experiment", "ensemble", "rcm_name", "rcm_version", "time_frequency", "time"]

[drs_settings.dialect.cordex.special.bbox]
type = "call"
method = "dialect['cordex']['domains'].get('{{domain | upper }}', [0,360,-90,90])"

[drs_settings.dialect.cordex.domains]
EUR-11 = [-44.14, 64.40, 22.20, 72.42]
AFR-44 = [-24.64, 60.28, -45.76, 42.24]

# === Datasets ===
[cmip6-fs]
root_path = "/data/model/global/cmip6"
drs_format = "cmip6"  # dialect name
fs_type = "posix"

[cmip6-s3]
root_path = "s3://test-bucket/data/model/global/cmip6"
drs_format = "cmip6"
fs_type = "s3"
storage_options.endpoint_url = "http://127.0.0.1:9000"
storage_options.aws_access_key_id = "minioadmin"
storage_options.aws_secret_access_key = "minioadmin"
storage_options.region_name = "us-east-1"
storage_options.url_style = "path"
storage_options.use_ssl = false

[obs-fs]
root_path = "/arch/observations"
drs_format = "custom"
# define your specs_dir/specs_file or inherit from another dialect
```

## Concepts

### Schema (facet definitions)

Each canonical facet describes:

- ``key``: where to read the value from (e.g. ``"project"``, ``"variable"``)
- ``type``: ``string``, ``integer``, ``float``, ``datetime``, with arrays like
  ``float[4]``, ``string[]``, ``datetime[2]``, or special types like ``file``,
  ``uri``, ``fs_type``, ``dataset``, ``fmt``
- flags: ``required``, ``default``, ``indexed``, ``unique``, ``multi_valued``
  (a sketch of a resulting record follows below)
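
For orientation, this is a minimal sketch of what one catalogue record shaped
by the schema above could look like. All values are invented for illustration;
the actual record layout is produced by the package (presumably in
``metadata_crawler.api.metadata_stores``):

```python
# Hypothetical catalogue record conforming to the canonical schema above.
# Values are invented for illustration, not taken from a real dataset.
record = {
    "file": "/data/model/global/cmip6/.../tas_Amon_...nc",   # path, unique
    "uri": "file:///data/model/global/cmip6/.../tas_Amon_...nc",
    "variable": ["tas"],                                      # string[]
    "time": ["2000-01-01T00:00:00", "2009-12-31T23:59:59"],  # datetime[2]: [start, end]
    "bbox": [0.0, 360.0, -90.0, 90.0],                        # float[4]: [W, E, S, N]
}
```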

### Dialects

A dialect tells the crawler how to **interpret paths** and **read data**:

- ``sources``: which sources to consult (``path``, ``data``, ``storage``), in priority order
- ``specs_dir`` / ``specs_file``: ordered facet names encoded in directory and file names
- ``data_specs``: pull values from dataset content (attrs/variables); supports
  ``__variable__`` and templated specs
- ``special``: computed fields (``conditional`` | ``method`` | ``function``)
- optional lookup tables (e.g. CORDEX ``domains`` for bbox); see the template
  sketch below
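
The templated ``call`` rule for the CORDEX bbox above can be pictured like
this: a rough sketch assuming the expression is rendered with Jinja2 (a
declared dependency) and then evaluated against the dialect's lookup tables.
The package's actual mechanism may differ:

```python
from jinja2 import Template

# Lookup table as in [drs_settings.dialect.cordex.domains] above.
domains = {"EUR-11": [-44.14, 64.40, 22.20, 72.42]}

# Render the facet value into the expression, then evaluate the result.
expr = Template("domains.get('{{ domain | upper }}', [0, 360, -90, 90])")
rendered = expr.render(domain="eur-11")  # -> "domains.get('EUR-11', ...)"
bbox = eval(rendered, {"domains": domains})  # sketch only; production code
                                             # should avoid a bare eval()
assert bbox == [-44.14, 64.40, 22.20, 72.42]
```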

### Path specs vs data specs

- **Path specs** parse segments from the path, e.g.
  ``/project/product/institute/model/experiment/.../variable_time.nc``;
  see the parsing sketch below
- **Data specs** read from the dataset itself (global attributes, variable
  attributes, per-variable stats). Example: gather all variables via
  ``__variable__``, then their units with a templated selector.
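
To make the path-spec idea concrete, here is a minimal sketch, not the
package's actual parser, that zips directory segments and filename parts onto
the facet names from ``specs_dir``/``specs_file`` (mirroring the cmip6 dialect
above). The real crawler also handles roots, defaults, and type coercion:

```python
from pathlib import PurePosixPath

# Facet lists as in the cmip6 dialect above.
specs_dir = ["mip_era", "activity_id", "institution_id", "source_id",
             "experiment_id", "member_id", "table_id", "variable_id",
             "grid_label", "version"]
specs_file = ["variable_id", "table_id", "source_id", "experiment_id",
              "member_id", "grid_label", "time"]


def parse_path(path: str, root: str) -> dict[str, str]:
    """Map directory segments and '_'-separated filename parts to facets."""
    rel = PurePosixPath(path).relative_to(root)
    facets = dict(zip(specs_dir, rel.parts[:-1]))       # directory segments
    stem = rel.parts[-1].rsplit(".", 1)[0]              # drop the extension
    facets.update(zip(specs_file, stem.split("_")))     # filename parts
    return facets


# Hypothetical CMIP6-style path, invented for illustration:
facets = parse_path(
    "/data/model/global/cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/"
    "r1i1p1f1/Amon/tas/gn/v20190710/"
    "tas_Amon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_200001-200912.nc",
    root="/data/model/global/cmip6",
)
assert facets["variable_id"] == "tas"
assert facets["time"] == "200001-200912"
```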

### Inheritance

Create new dialects/datasets by inheriting:

```toml
[drs_settings.dialect.reana]
inherits_from = "cmip5"
sources = ["path", "data"]

[drs_settings.dialect.reana.data_specs.read_kws]
engine = "h5netcdf"
```
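
Conceptually, ``inherits_from`` layers the child's settings over the parent's.
A rough sketch of that merge semantics (illustrative only, with invented
parent values; the actual resolution lives inside the package, presumably in
``metadata_crawler.api.config``):

```python
# Illustrative deep merge: child keys override parent keys, and nested
# tables merge recursively. Not the package's actual implementation.
def merge_dialect(parent: dict, child: dict) -> dict:
    merged = dict(parent)
    for key, value in child.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_dialect(merged[key], value)
        else:
            merged[key] = value
    return merged


# Hypothetical parent values for cmip5, invented for illustration:
cmip5 = {"sources": ["path"], "data_specs": {"read_kws": {"engine": "netcdf4"}}}
reana = {"sources": ["path", "data"],
         "data_specs": {"read_kws": {"engine": "h5netcdf"}}}
assert merge_dialect(cmip5, reana)["data_specs"]["read_kws"]["engine"] == "h5netcdf"
```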

## Python API

### Async

```python
import asyncio

from metadata_crawler.run import async_add, async_index, async_delete


async def main():
    # crawl → catalogue
    await async_add(
        "cat.yaml",
        config_file="drs_config.toml",
        dataset_names=["cmip6-fs"],
        threads=4,
        batch_size=100,
    )
    # index → backend
    await async_index(
        "solr",
        "cat.yaml",
        config_file="drs_config.toml",
        server="localhost:8983",
    )
    # delete by facets
    await async_delete(
        config_path="drs_config.toml",
        index_store="solr",
        facets=[("file", "*.nc")],
    )


asyncio.run(main())
```

### Sync (simple wrapper)

```python
from metadata_crawler import add

add(
    store="cat.yaml",
    config_file="drs_config.toml",
    dataset_names=["cmip6-fs"],
)
```

## Index backends

- **MongoDB** (Motor): upserts by unique facet (e.g. ``file``), bulk deletes
  (glob → regex; see the sketch below)
- **Solr**: fields align with the managed schema; supports multi-valued facets
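
The glob-to-regex translation for deletes can be pictured with the standard
library. This is a sketch under the assumption that facet globs map onto
anchored regex filters; the package's real query construction may differ:

```python
import fnmatch
import re

# fnmatch.translate() turns a shell glob into an anchored regex string,
# which is the shape a MongoDB $regex filter could consume.
pattern = fnmatch.translate("*.nc")          # e.g. r'(?s:.*\.nc)\Z'
query = {"file": {"$regex": pattern}}        # hypothetical Mongo filter
assert re.match(pattern, "/arch/obs/tas_day.nc")
assert not re.match(pattern, "/arch/obs/tas_day.zarr")
```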

## Contributing

Development install:

```console
git clone https://github.com/freva-org/metadata-crawler.git
cd metadata-crawler
pip install -e .
```

PRs and issues welcome. Please add tests and keep examples minimal &
reproducible (use the MinIO compose stack). Run:

```console
python -m pip install tox
tox -e test,lint,types
```

### Benchmarks

For benchmarking you can create a directory tree with roughly 1.5 M files by
calling the ``create-cordex.sh`` script in the ``dev-env`` folder:

```console
./dev-env/create-cordex.sh
python dev-env/benchmark.py --max-files 20000
```

See ``code-of-conduct.rst`` and ``whatsnew.rst`` for guidelines and changelog.

### S3 testing

Use MinIO or LocalStack via ``docker-compose`` and seed a bucket (e.g.
``test-bucket``). Then point a dataset's ``fs_type = "s3"`` at it and set
``storage_options``.

### Documentation

Built with Sphinx + ``pydata_sphinx_theme``. Build locally:

```console
tox -e docs
```

@@ -0,0 +1,35 @@
metadata_crawler/__init__.py,sha256=dT4ZOngmtO-7fiWqdo80JmeRacG09fy1T8C0bZpFR6Q,7167
metadata_crawler/__main__.py,sha256=4m56VOh7bb5xmZqb09fFbquke8g6KZfMbb3CUdBA60M,163
metadata_crawler/_version.py,sha256=_KJS3jBkMy--QJjEfRuFFutGcU0bPMFJLUgB2EZNym4,25
metadata_crawler/cli.py,sha256=qi77QXtuwO1N3MvLbacdaOZwzpT22FJMpnnp1k6yj-Y,17347
metadata_crawler/data_collector.py,sha256=7N0zQcxjsqITUVr0JnkFu_beMzrTW-paaw69ESC9rkQ,9063
metadata_crawler/logger.py,sha256=wNImwUVw0ycvIYrxzthWAgOCujJZhVDCSiCH5KKX5EA,4743
metadata_crawler/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
metadata_crawler/run.py,sha256=_6mx29Co1HwfPNFWtzTR65CNlopqubj-McmavRM7i80,12869
metadata_crawler/api/__init__.py,sha256=UUF0_FKgfqgcXYmknxB0Wt1jaLNaf-w_q0tWVJhgV0M,28
metadata_crawler/api/cli.py,sha256=pgj3iB_Irt74VbG3ZKStLRHKYY_I4bZpbOW1famKDnQ,1498
metadata_crawler/api/config.py,sha256=8C-qJC8lPmms61vHgmv3Dijppjjrtfnu35fD_SOjk68,29225
metadata_crawler/api/drs_config.toml,sha256=c3Gc8MGH22xlDOLH_y2TXiiEydmhjzvish-fQi5aGRA,10622
metadata_crawler/api/index.py,sha256=0yqtXYOyWJJKKkCkIJbUUVG1w2Wt_icYJjXJPZZjSvU,4715
metadata_crawler/api/metadata_stores.py,sha256=UekPl16KlaF7xiD4X7KVo3EMWz9KE-MT7gKxvgZyvXU,24016
metadata_crawler/api/storage_backend.py,sha256=jdZZ_3SZcP3gJgw_NmPPdpDEx4D7qfLJDABfupTH9p0,7803
metadata_crawler/api/mixin/__init__.py,sha256=4Y0T1eM4vLlgFazuC1q2briqx67LyfeCpY_pCICTnjk,197
metadata_crawler/api/mixin/lookup_mixin.py,sha256=WxJ-ZNs8DcIXS9ThSoIZiepD07jfmLlzyTp65-Z1fLc,3558
metadata_crawler/api/mixin/lookup_tables.py,sha256=za63xfZB0EvAm66uTTYo52zC0z7Y6VL8DUrP6CJ-DnQ,308683
metadata_crawler/api/mixin/path_mixin.py,sha256=WKpesEjlwVSJ-VdoYYLEY5oBSAQTsvuv1B38ragAVIM,1247
metadata_crawler/api/mixin/template_mixin.py,sha256=hxQXiP_JND3fuxBNcs1pZ7cvP-k-lTm5MQg40t0kF54,5105
metadata_crawler/backends/__init__.py,sha256=yrk1L00ubQlMj3yXI73PPbhAahDKp792PJB-xcXUJIM,35
metadata_crawler/backends/intake.py,sha256=TkvzBU8Rk49L0Y8e7Exz2nE3iLSWrBAwZnpEJtdlNR8,6595
metadata_crawler/backends/posix.py,sha256=6sjAoCQHiOOjp_Hvwxn247wHBnoAJYUGequqphyZWaA,3409
metadata_crawler/backends/s3.py,sha256=2ki-O_rRIb5dJVS9KyMmDDPczGOQTBUa-hmImllqeeE,4602
metadata_crawler/backends/swift.py,sha256=az3ctF_npadjzAybX65CQbDLGoxRnk0ZR7vByo6lQOM,10954
metadata_crawler/ingester/__init__.py,sha256=Y-c9VkQWMHDLb9WagwITCaEODlYa4p8xW-BkzzSRZXw,55
metadata_crawler/ingester/mongo.py,sha256=Ntt3zKVtAX6wDB5aQYCoYrkVWrnvJU2oJJyfYGW30lU,6546
metadata_crawler/ingester/solr.py,sha256=WrdyOdwMiutmOE1lP_3rOx7h99gbvDjkxU1FMG9zmbs,9560
metadata_crawler/utils/__init__.py,sha256=VSIoAtorPSiGkkwjnEcO6gKZJzXlOewDzSNUMsGAoo0,14125
metadata_crawler/utils/cftime_utils.py,sha256=gd64D3kEKOtGmQ7wHnnSJc7Emnw2_LflV52bCZlhTwU,5586
metadata_crawler-2510.1.0.dist-info/entry_points.txt,sha256=4LzS7pbqwUPTD6C-iW42vuhXdtsOJmKXqFZpdpaKwF8,428
metadata_crawler-2510.1.0.dist-info/licenses/LICENSE,sha256=GAUualebvSlegSVqb86FUqHrHM8WyM145__Nm2r_dfA,1496
metadata_crawler-2510.1.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
metadata_crawler-2510.1.0.dist-info/METADATA,sha256=hmv6Gvv9c-rmGSy59SHJnK2Nn9fq4sNQwlXF35-GnC4,13006
metadata_crawler-2510.1.0.dist-info/RECORD,,

@@ -0,0 +1,14 @@
[console_scripts]
mdc=metadata_crawler.cli:cli
metadata-crawler=metadata_crawler.cli:cli

[metadata_crawler.ingester]
mongo=metadata_crawler.ingester.mongo:MongoIndex
solr=metadata_crawler.ingester.solr:SolrIndex

[metadata_crawler.storage]
intake=metadata_crawler.backends.intake:IntakePath
posix=metadata_crawler.backends.posix:PosixPath
s3=metadata_crawler.backends.s3:S3Path
swift=metadata_crawler.backends.swift:SwiftPath

@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright (c) 2025, freva-org

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.