metadata-crawler 2509.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metadata-crawler might be problematic. Click here for more details.

Files changed (34) hide show
  1. metadata_crawler/__init__.py +248 -0
  2. metadata_crawler/__main__.py +8 -0
  3. metadata_crawler/_version.py +1 -0
  4. metadata_crawler/api/__init__.py +1 -0
  5. metadata_crawler/api/cli.py +57 -0
  6. metadata_crawler/api/config.py +801 -0
  7. metadata_crawler/api/drs_config.toml +439 -0
  8. metadata_crawler/api/index.py +132 -0
  9. metadata_crawler/api/metadata_stores.py +749 -0
  10. metadata_crawler/api/mixin/__init__.py +7 -0
  11. metadata_crawler/api/mixin/lookup_mixin.py +112 -0
  12. metadata_crawler/api/mixin/lookup_tables.py +10010 -0
  13. metadata_crawler/api/mixin/path_mixin.py +46 -0
  14. metadata_crawler/api/mixin/template_mixin.py +145 -0
  15. metadata_crawler/api/storage_backend.py +277 -0
  16. metadata_crawler/backends/__init__.py +1 -0
  17. metadata_crawler/backends/intake.py +211 -0
  18. metadata_crawler/backends/posix.py +121 -0
  19. metadata_crawler/backends/s3.py +136 -0
  20. metadata_crawler/backends/swift.py +305 -0
  21. metadata_crawler/cli.py +539 -0
  22. metadata_crawler/data_collector.py +258 -0
  23. metadata_crawler/ingester/__init__.py +1 -0
  24. metadata_crawler/ingester/mongo.py +193 -0
  25. metadata_crawler/ingester/solr.py +152 -0
  26. metadata_crawler/logger.py +142 -0
  27. metadata_crawler/py.typed +0 -0
  28. metadata_crawler/run.py +373 -0
  29. metadata_crawler/utils.py +411 -0
  30. metadata_crawler-2509.0.0.dist-info/METADATA +399 -0
  31. metadata_crawler-2509.0.0.dist-info/RECORD +34 -0
  32. metadata_crawler-2509.0.0.dist-info/WHEEL +4 -0
  33. metadata_crawler-2509.0.0.dist-info/entry_points.txt +14 -0
  34. metadata_crawler-2509.0.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,7 @@
1
+ """Mixin definitions."""
2
+
3
+ from .lookup_mixin import LookupMixin
4
+ from .path_mixin import PathMixin
5
+ from .template_mixin import TemplateMixin
6
+
7
+ __all__ = ["LookupMixin", "PathMixin", "TemplateMixin"]
@@ -0,0 +1,112 @@
1
+ """Definitions for lookup table mixins."""
2
+
3
+ import atexit
4
+ import os
5
+ from types import MappingProxyType
6
+ from typing import Any, Dict, Mapping, Tuple
7
+
8
+ from appdirs import user_cache_dir
9
+ from diskcache import Cache
10
+
11
+ from .lookup_tables import cmor_lookup as _NESTED
12
+
13
+ Key = Tuple[str, ...]
14
+
15
+
16
def _flatten_static(
    prefix: Tuple[str, ...],
    node: Mapping[str, Any],
    out: Dict[Tuple[str, ...], Any],
) -> None:
    """Flatten a nested CMOR-like mapping into tuple-keyed entries.

    A nested structure such as

        cmip6 -> CF3hr -> tas -> {'realm': 'atmos', 'time-frequency': '3hrPt'}

    is written into ``out`` as ``('cmip6', 'CF3hr', 'tas', 'realm') -> 'atmos'``
    and so on.

    Parameters
    ^^^^^^^^^^

    prefix:
        Key components collected on the way down to ``node``.
    node:
        The (sub-)mapping to walk.
    out:
        Dictionary that receives the flattened entries; modified in place.
    """
    for name, child in node.items():
        path = prefix + (name,)
        if not isinstance(child, Mapping):
            # Plain value directly below the current level.
            out[path] = child
            continue
        # A mapping with at least one non-mapping value is treated as an
        # attribute table: all of its items are emitted as they are, without
        # descending any further. An empty mapping never qualifies.
        holds_leaf = any(
            not isinstance(item, Mapping) for item in child.values()
        )
        if holds_leaf:
            out.update((path + (attr,), value) for attr, value in child.items())
        else:
            _flatten_static(path, child, out)
33
+
34
+
35
# Module-level target that the flattened CMOR table is written into.
_flat: Dict[Key, Any] = {}

# Location of the on-disk lookup cache: MDC_LOOKUP_CACHE_DIR wins when it is
# set to a non-empty value, otherwise a "lookup" folder inside the user
# cache directory is used.
_dir = os.getenv("MDC_LOOKUP_CACHE_DIR")
if not _dir:
    _dir = os.path.join(user_cache_dir("metadata-crawler", "freva"), "lookup")
os.makedirs(_dir, exist_ok=True)

# Disk cache used by all lookups in this module, capped at 2 GiB.
_DC = Cache(
    _dir,
    size_limit=2 * 1024**3,
    eviction="least-recently-used",
    cull_limit=10,
)
# Close the cache when the interpreter exits.
atexit.register(_DC.close)
45
+
46
+
47
class LookupMixin:
    """Provide a mixin with a cached ``lookup()`` for metadata attributes.

    A lookup is answered from, in this order:
        - ``CMOR_STATIC``, the flattened CMOR table that is filled by
          ``set_static_from_nested()``
        - the module-level disk cache, which is shared by all instances
        - ``read_attr()``, whose result is written back to the disk cache

    Subclass must implement:
        def read_attr(self, attribute: str, path: str, **read_kws: Any) -> Any
    """

    CMOR_STATIC: Mapping[Tuple[str, ...], Any] = {}

    def set_static_from_nested(self) -> None:
        """Flatten the cmor lookup table and expose it as ``CMOR_STATIC``.

        The flattened entries are kept in a module-level dictionary, so the
        nested table is only walked when that dictionary is still empty.
        Instances created later reuse the already flattened data. The
        read-only view is assigned through ``self``.
        """
        if self.CMOR_STATIC:
            return
        if not _flat:
            _flatten_static((), _NESTED, _flat)
        self.CMOR_STATIC = MappingProxyType(_flat)

    def read_attr(self, attribute: str, path: str, **read_kws: Any) -> Any:
        """Get a metadata attribute from a datastore object."""
        raise NotImplementedError  # pragma: no cover

    def lookup(
        self, path: str, attribute: str, *tree: str, **read_kws: Any
    ) -> Any:
        """Get metadata from a lookup table.

        This function will read metadata from a pre-defined cache table and if
        the metadata is not present in the cache table it'll read the
        object store and add the metadata to the cache table.

        Parameters
        ^^^^^^^^^^

        path:
            Path to the object store / file name
        attribute:
            The attribute that is retrieved from the data.
            variable attributes can be defined by a ``.``.
            For example: ``tas.long_name`` would get attribute ``long_name``
            from variable ``tas``.
        *tree:
            A tuple representing nested attributes. Attributes are nested for
            more efficient lookup. ('atmos', '1hr', 'tas') will translate into
            a tree of ['atmos']['1hr']['tas']. The tuple is the only cache
            key; ``path`` and ``attribute`` are not part of it.

        Other Parameters
        ^^^^^^^^^^^^^^^^
        **read_kws:
            Keyword arguments passed to open the datasets.

        Returns
        ^^^^^^^
        The value found in the static table or the disk cache, otherwise the
        value returned by ``read_attr()``. A ``None`` is treated as a cache
        miss and is never written to the disk cache.
        """
        # 1) static fast-path
        try:
            return self.CMOR_STATIC[tree]
        except KeyError:
            pass
        # 2) shared disk cache, keyed by ``tree`` only
        cached = _DC.get(tree)
        if cached is not None:
            return cached
        # 3) cache miss: read from the data itself
        val = self.read_attr(attribute, path, **read_kws)
        if val is None:
            # Storing ``None`` would make every later ``add`` fail while
            # ``get`` still reports a miss, so nothing is cached here.
            return None
        if not _DC.add(tree, val):
            # The key already exists; prefer the stored value if it is usable.
            cached = _DC.get(tree)
            if cached is not None:
                return cached
            # The stored entry is ``None`` or vanished in the meantime:
            # replace it instead of throwing the fresh value away.
            _DC.set(tree, val)
        return val