statwrapper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statwrapper/__init__.py +31 -0
- statwrapper/api_clients/__init__.py +11 -0
- statwrapper/api_clients/dst_client.py +249 -0
- statwrapper/api_clients/eurostat_client.py +359 -0
- statwrapper/api_clients/pxweb2_client.py +170 -0
- statwrapper/api_clients/pxweb_client.py +244 -0
- statwrapper/base_api_client.py +79 -0
- statwrapper/exceptions.py +22 -0
- statwrapper/http.py +126 -0
- statwrapper/models.py +103 -0
- statwrapper/parsers.py +260 -0
- statwrapper/provider_registry.py +74 -0
- statwrapper/providers.json +662 -0
- statwrapper/statwrapper.py +103 -0
- statwrapper/utils.py +134 -0
- statwrapper-0.1.0.dist-info/METADATA +123 -0
- statwrapper-0.1.0.dist-info/RECORD +19 -0
- statwrapper-0.1.0.dist-info/WHEEL +5 -0
- statwrapper-0.1.0.dist-info/top_level.txt +1 -0
statwrapper/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Public package surface for statwrapper.

Re-exports the base client class, the rate-limited HTTP session, the data
models, the provider registry helpers, and the high-level StatWrapper entry
points so callers can import everything from ``statwrapper`` directly.
"""

from .base_api_client import APIWrapper
from .http import RateLimitedSession
from .models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
from .provider_registry import (
    create_wrapper,
    get_provider,
    get_wrapper_class,
    load_providers,
)
from .statwrapper import (
    StatWrapper,
    discover_provider_datasets,
    get_datasets,
    resolve_dataset_metadata,
)

__all__ = [
    "APIWrapper",
    "DiscoveredDataset",
    "Provider",
    "RateLimitedSession",
    "ResolvedDatasetMetadata",
    "StatWrapper",
    "create_wrapper",
    "discover_provider_datasets",
    "get_datasets",
    "get_provider",
    "get_wrapper_class",
    "load_providers",
    "resolve_dataset_metadata",
]
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..base_api_client import APIWrapper
|
|
7
|
+
from ..models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
|
|
8
|
+
from ..utils import (
|
|
9
|
+
detect_role,
|
|
10
|
+
determine_time_unit,
|
|
11
|
+
normalize_contact,
|
|
12
|
+
normalize_note,
|
|
13
|
+
parse_dt,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _collect_tables(subjects_tree: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
|
|
18
|
+
tables_by_id: dict[str, dict[str, Any]] = {}
|
|
19
|
+
|
|
20
|
+
def visit(node: dict[str, Any]) -> None:
|
|
21
|
+
for table in node.get("tables") or []:
|
|
22
|
+
if not isinstance(table, dict):
|
|
23
|
+
continue
|
|
24
|
+
table_id = str(table.get("id") or "").strip()
|
|
25
|
+
if not table_id:
|
|
26
|
+
continue
|
|
27
|
+
existing = tables_by_id.setdefault(table_id, {"id": table_id})
|
|
28
|
+
for key, value in table.items():
|
|
29
|
+
if existing.get(key) in (None, "", []) and value not in (None, "", []):
|
|
30
|
+
existing[key] = value
|
|
31
|
+
for child in node.get("subjects") or []:
|
|
32
|
+
if isinstance(child, dict):
|
|
33
|
+
visit(child)
|
|
34
|
+
|
|
35
|
+
for root in subjects_tree:
|
|
36
|
+
if isinstance(root, dict):
|
|
37
|
+
visit(root)
|
|
38
|
+
return tables_by_id
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _parse_paths(subjects_tree: list[dict[str, Any]]) -> dict[str, list[list[dict[str, str]]]]:
|
|
42
|
+
code_to_paths: dict[str, list[list[dict[str, str]]]] = {}
|
|
43
|
+
|
|
44
|
+
def visit(node: dict[str, Any], path: list[dict[str, str]]) -> None:
|
|
45
|
+
node_id = str(node.get("id") or "").strip()
|
|
46
|
+
node_label = str(node.get("description") or node.get("text") or node_id).strip()
|
|
47
|
+
current_path = path + ([{"id": node_id, "label": node_label}] if node_id else [])
|
|
48
|
+
for table in node.get("tables") or []:
|
|
49
|
+
if not isinstance(table, dict):
|
|
50
|
+
continue
|
|
51
|
+
table_id = str(table.get("id") or "").strip()
|
|
52
|
+
if not table_id:
|
|
53
|
+
continue
|
|
54
|
+
code_to_paths.setdefault(table_id, []).append(
|
|
55
|
+
current_path + [{"id": table_id, "label": str(table.get("text") or table_id).strip()}]
|
|
56
|
+
)
|
|
57
|
+
for child in node.get("subjects") or []:
|
|
58
|
+
if isinstance(child, dict):
|
|
59
|
+
visit(child, current_path)
|
|
60
|
+
|
|
61
|
+
for root in subjects_tree:
|
|
62
|
+
if isinstance(root, dict):
|
|
63
|
+
visit(root, [])
|
|
64
|
+
return code_to_paths
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class DstClient(APIWrapper):
    """Client for the Statistics Denmark (DST / statbank.dk) API.

    Discovery walks the ``/subjects`` tree with tables included; metadata
    resolution reads ``/tableinfo/{code}``; data is addressed via
    ``/data/{code}/JSONSTAT``.  All requests go through the handlers
    installed on :class:`APIWrapper`.
    """

    def __init__(self, provider: Provider, **kwargs: Any) -> None:
        # Required kwargs: "language" and "json_request_handler" (pop raises
        # KeyError if missing).  Optional: text/bytes handlers and logger.
        super().__init__(
            provider_code=provider.provider_code,
            label=provider.label,
            language=kwargs.pop("language"),
            json_request_handler=kwargs.pop("json_request_handler"),
            text_request_handler=kwargs.pop("text_request_handler", None),
            bytes_request_handler=kwargs.pop("bytes_request_handler", None),
            logger=kwargs.pop("logger", None),
        )
        self.provider = provider
        self.api_type = provider.api_type
        # base_api_url may be None on the provider; normalise to "" so the
        # URL helpers below can always call .rstrip("/").
        self.base_api_url = provider.base_api_url or ""
        self.base_web_url = provider.base_web_url

    def _subjects_url(self) -> str:
        """Return the subjects-tree endpoint URL."""
        return f"{self.base_api_url.rstrip('/')}/subjects"

    def _tableinfo_url(self, dataset_code: str) -> str:
        """Return the table-metadata endpoint URL for *dataset_code*."""
        return f"{self.base_api_url.rstrip('/')}/tableinfo/{dataset_code}"

    def _data_url(self, dataset_code: str) -> str:
        """Return the JSON-stat data endpoint URL for *dataset_code*."""
        return f"{self.base_api_url.rstrip('/')}/data/{dataset_code}/JSONSTAT"

    async def health_check(self, dataset_code: str | None = None) -> bool:
        """Return True when the probed endpoint yields a non-None JSON payload.

        Probes the subjects endpoint by default, or the tableinfo endpoint
        for *dataset_code* when one is given.
        """
        url = self._subjects_url() if dataset_code is None else self._tableinfo_url(dataset_code)
        payload = await self._get_json(url, params={"lang": self.language})
        return payload is not None

    async def discover_datasets(
        self,
        task_id: uuid.UUID,
        **_: Any,
    ) -> list[DiscoveredDataset]:
        """Fetch the full subjects tree and emit one DiscoveredDataset per table.

        Returns an empty list when the payload is not the expected JSON list.
        Tables are emitted sorted by dataset code.
        """
        payload = await self._get_json(
            self._subjects_url(),
            params={
                "lang": self.language,
                "recursive": "true",
                "omitSubjectsWithoutTables": "true",
                "includeTables": "true",
                "format": "JSON",
            },
        )
        if not isinstance(payload, list):
            return []
        tables_by_id = _collect_tables(payload)
        paths_by_id = _parse_paths(payload)
        output: list[DiscoveredDataset] = []
        for dataset_code, table in sorted(tables_by_id.items()):
            paths = paths_by_id.get(dataset_code, [])
            primary_path = paths[0] if paths else []
            # First element of the primary path is treated as the subject.
            subject = primary_path[0] if primary_path else None
            first_period = str(table.get("firstPeriod") or "").strip() or None
            last_period = str(table.get("latestPeriod") or table.get("lastPeriod") or "").strip() or None
            # Fall back to the Unix epoch when the table carries no usable
            # "updated" value.
            updated = parse_dt(table.get("updated")) or parse_dt("1970-01-01T00:00:00+00:00")
            if updated is None:
                # NOTE(review): only reachable if parse_dt also rejects the
                # epoch fallback string — looks defensive/dead; confirm
                # against parse_dt's contract.
                continue
            output.append(
                DiscoveredDataset(
                    task_id=task_id,
                    provider_code=self.provider_code,
                    dataset_code=dataset_code,
                    language=self.language,
                    updated=updated,
                    label=str(table.get("text") or dataset_code).strip(),
                    description=str(table.get("description")).strip() if table.get("description") is not None else None,
                    source=f"Statistics Denmark - statbank.dk/{dataset_code}",
                    subject_code=subject["id"] if subject else None,
                    subject_label=subject["label"] if subject else None,
                    time_unit=determine_time_unit(first_period, last_period) if first_period and last_period else None,
                    first_period=first_period,
                    last_period=last_period,
                    # "active" is only trusted when it is an actual bool.
                    discontinued=(not bool(table.get("active")) if isinstance(table.get("active"), bool) else False),
                    metadata_url=f"{self._tableinfo_url(dataset_code)}?lang={self.language}",
                    data_url=f"{self._data_url(dataset_code)}?lang={self.language}",
                    web_url=f"{self.base_web_url.rstrip('/')}/{dataset_code}" if self.base_web_url else None,
                    paths=paths or None,
                    extension={"unit": table.get("unit"), "variable_names": table.get("variables")},
                )
            )
        return output

    async def resolve_dataset_metadata(
        self,
        discovered: DiscoveredDataset,
        task_id: uuid.UUID | None = None,
        **_: Any,
    ) -> ResolvedDatasetMetadata:
        """Resolve full metadata for *discovered* via the tableinfo endpoint.

        Falls back to the values already present on *discovered* whenever the
        tableinfo payload is missing or malformed.
        """
        payload = await self._get_json(
            self._tableinfo_url(discovered.dataset_code),
            params={"lang": self.language},
        )
        dimensions: dict[str, dict[str, Any]] = {}
        dimension_ids: list[str] = []
        required_dimensions: dict[str, bool | None] = {}
        role_map: dict[str, list[str]] = {}
        extension = dict(discovered.extension)
        if isinstance(payload, dict):
            for position, variable in enumerate(payload.get("variables") or []):
                if not isinstance(variable, dict):
                    continue
                dimension_id = str(variable.get("id") or "").strip()
                if not dimension_id:
                    continue
                values = variable.get("values") if isinstance(variable.get("values"), list) else []
                # Build JSON-stat-style category index/label maps.
                index: dict[str, int] = {}
                labels: dict[str, str] = {}
                for ordinal, item in enumerate(values):
                    if not isinstance(item, dict):
                        continue
                    value_id = str(item.get("id") or "").strip()
                    if not value_id:
                        continue
                    index[value_id] = ordinal
                    labels[value_id] = str(item.get("text") or value_id).strip()
                if not index:
                    # Dimensions without any usable values are dropped.
                    continue
                label = str(variable.get("text") or dimension_id).strip()
                dimensions[dimension_id] = {
                    "label": label,
                    "category": {"index": index, "label": labels},
                }
                dimension_ids.append(dimension_id)
                # DST "elimination" means the dimension can be omitted, so a
                # required dimension is the negation; None when unspecified.
                elimination = variable.get("elimination")
                required_dimensions[dimension_id] = None if elimination is None else not bool(elimination)
                # A "map" attribute marks a geographic dimension; otherwise
                # delegate role detection to the shared heuristic.
                role = "geo" if variable.get("map") else detect_role(dimension_id, label, bool(variable.get("time")))
                if role:
                    role_map.setdefault(role, []).append(dimension_id)
                extension.setdefault("dimension_extensions", {})[dimension_id] = {
                    "extension": {"position": position}
                }
        label = (
            str(payload.get("text")).strip()
            if isinstance(payload, dict) and payload.get("text") is not None
            else (discovered.label or discovered.dataset_code)
        )
        description = (
            str(payload.get("description")).strip()
            if isinstance(payload, dict) and payload.get("description") is not None
            else discovered.description
        )
        updated = (
            parse_dt(payload.get("updated"))
            if isinstance(payload, dict)
            else None
        ) or discovered.updated
        return ResolvedDatasetMetadata(
            task_id=task_id or discovered.task_id,
            provider_code=discovered.provider_code,
            dataset_code=discovered.dataset_code,
            language=discovered.language,
            updated=updated,
            label=label,
            time_unit=discovered.time_unit or "Other",
            first_period=discovered.first_period or "",
            last_period=discovered.last_period or discovered.first_period or "",
            paths=discovered.paths or [],
            role=role_map,
            metadata_url=discovered.metadata_url or f"{self._tableinfo_url(discovered.dataset_code)}?lang={self.language}",
            data_url=discovered.data_url or f"{self._data_url(discovered.dataset_code)}?lang={self.language}",
            dimension=dimensions,
            required_dimensions=required_dimensions,
            note=(normalize_note(payload.get("footnote")) if isinstance(payload, dict) else None) or discovered.note,
            source=discovered.source,
            description=description,
            discontinued=not bool(payload.get("active")) if isinstance(payload, dict) and isinstance(payload.get("active"), bool) else discovered.discontinued,
            subject_code=discovered.subject_code,
            subject_label=discovered.subject_label,
            web_url=discovered.web_url,
            doc_url=(
                str(payload.get("documentation", {}).get("url")).strip()
                if isinstance(payload, dict)
                and isinstance(payload.get("documentation"), dict)
                and payload["documentation"].get("url") is not None
                else discovered.doc_url
            ),
            official_statistics=discovered.official_statistics,
            contact=(normalize_contact(payload.get("contacts")) if isinstance(payload, dict) else None) or discovered.contact,
            dimension_ids=dimension_ids or None,
            extension=extension,
        )
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import csv
|
|
5
|
+
import gzip
|
|
6
|
+
import io
|
|
7
|
+
import re
|
|
8
|
+
import uuid
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ..base_api_client import APIWrapper
|
|
14
|
+
from ..models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
|
|
15
|
+
from ..utils import parse_dt
|
|
16
|
+
|
|
17
|
+
# Eurostat dissemination API endpoints.  The *_TEMPLATE values are
# str.format templates; the plain *_URL values are fetched as-is.
DATA_URL_TEMPLATE = "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/{dataset_code}"
METADATA_URL_TEMPLATE = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/"
    "ESTAT/{dataset_code}/1.0?format=JSON&lang={language}&compress=false&attributes=none&measures=none"
)
WEB_URL_TEMPLATE = "https://ec.europa.eu/eurostat/databrowser/product/page/{dataset_code}"
DOC_URL_TEMPLATE = "https://ec.europa.eu/eurostat/cache/metadata/en/{lowercase_dataset_code}.htm"
TOC_URL_TEMPLATE = "https://ec.europa.eu/eurostat/api/dissemination/catalogue/toc/txt?lang={language}"
DATAFLOWS_URL = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/structure/dataflow/"
    "ESTAT/all/1.0?format=JSON&compress=false"
)
CODELISTS_URL = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/codelist/ESTAT/all/latest?format=json"
METABASE_URL = "https://ec.europa.eu/eurostat/api/dissemination/catalogue/metabase.txt.gz"
# Process-wide cache of the bulk metadata, keyed by lower-cased language:
# (dataflows by code, codelists payload, metabase, TOC paths).  The forward
# reference to _Dataflow relies on `from __future__ import annotations`.
_GLOBAL_CACHE: dict[
    str,
    tuple[
        dict[str, _Dataflow],
        dict[str, Any],
        dict[str, dict[str, list[str]]],
        dict[str, list[list[dict[str, str]]]],
    ],
] = {}
# Serialises the expensive bulk fetch so concurrent callers populate the
# cache only once per language (double-checked in _fetch_global_metadata).
_GLOBAL_CACHE_LOCK = asyncio.Lock()
|
|
41
|
+
_CLEAN_TAGS_PATTERN = re.compile(r"<.*?>")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _remove_html_tags(text: str) -> str:
|
|
45
|
+
return re.sub(_CLEAN_TAGS_PATTERN, "", text)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _parse_toc_paths(toc_text: str) -> dict[str, list[list[dict[str, str]]]]:
|
|
49
|
+
reader = csv.reader(io.StringIO(toc_text), delimiter="\t", quotechar='"')
|
|
50
|
+
next(reader, None)
|
|
51
|
+
folder_stack: list[dict[str, str]] = []
|
|
52
|
+
full_stack: list[dict[str, str]] = []
|
|
53
|
+
code_to_paths: dict[str, list[list[dict[str, str]]]] = {}
|
|
54
|
+
for row in reader:
|
|
55
|
+
if len(row) < 3:
|
|
56
|
+
continue
|
|
57
|
+
title, code, item_type = row[0], row[1], row[2]
|
|
58
|
+
if not title.strip() or not code.strip():
|
|
59
|
+
continue
|
|
60
|
+
indent_level = (len(title) - len(title.lstrip())) // 4
|
|
61
|
+
full_stack = full_stack[:indent_level]
|
|
62
|
+
full_stack.append({"id": code, "label": title.strip(), "type": item_type})
|
|
63
|
+
folder_stack = [
|
|
64
|
+
{"id": node["id"], "label": node["label"]}
|
|
65
|
+
for node in full_stack
|
|
66
|
+
if node.get("type") == "folder"
|
|
67
|
+
]
|
|
68
|
+
if item_type != "folder":
|
|
69
|
+
code_to_paths.setdefault(code.upper(), []).append(list(folder_stack))
|
|
70
|
+
return code_to_paths
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _parse_metabase(gzipped_content: bytes) -> dict[str, dict[str, list[str]]]:
|
|
74
|
+
metabase: dict[str, dict[str, list[str]]] = {}
|
|
75
|
+
with gzip.open(io.BytesIO(gzipped_content), "rt", encoding="utf-8") as handle:
|
|
76
|
+
dataset_dim_values: defaultdict[str, defaultdict[str, list[str]]] = defaultdict(
|
|
77
|
+
lambda: defaultdict(list)
|
|
78
|
+
)
|
|
79
|
+
for line in handle:
|
|
80
|
+
stripped = line.strip()
|
|
81
|
+
if not stripped:
|
|
82
|
+
continue
|
|
83
|
+
parts = stripped.split("\t")
|
|
84
|
+
if len(parts) < 3:
|
|
85
|
+
continue
|
|
86
|
+
dataset_code, dimension, value = parts[:3]
|
|
87
|
+
dataset_dim_values[dataset_code.upper()][dimension].append(value)
|
|
88
|
+
for dataset_code, dimensions in dataset_dim_values.items():
|
|
89
|
+
metabase[dataset_code] = dict(dimensions)
|
|
90
|
+
return metabase
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass(slots=True)
class _Dataflow:
    """One entry of the Eurostat dataflow catalogue, as extracted by
    ``EurostatClient._fetch_global_metadata`` from the bulk dataflow payload.
    """

    # Upper-cased dataset identifier.
    dataset_code: str
    # Human-readable title (may still contain HTML until cleaned by callers).
    label: str
    # Lexicographic max of the UPDATE_DATA / UPDATE_STRUCTURE annotations;
    # still a raw string, parsed with parse_dt at discovery time.
    updated: str
    # SOURCE_INSTITUTIONS annotation, defaulting to "Eurostat".
    source: str | None
    # Tag-stripped description from the dataflow extension, if present.
    description: str | None
    # OBS_PERIOD_OVERALL_OLDEST annotation, stripped; None when absent.
    first_period: str | None
    # OBS_PERIOD_OVERALL_LATEST annotation, stripped; None when absent.
    last_period: str | None
    # {"dataflow_extension": <raw extension dict>} or {}.
    extension: dict[str, Any]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class EurostatClient(APIWrapper):
    """Client for the Eurostat dissemination API.

    Discovery is assembled entirely from four bulk resources (TOC, dataflow
    catalogue, codelists, metabase) fetched once per language and cached
    process-wide, so per-dataset discovery needs no extra requests and
    ``resolve_dataset_metadata`` is a pure copy of the discovered record.
    """

    def __init__(self, provider: Provider, **kwargs: Any) -> None:
        # Required kwargs: "language" and "json_request_handler" (pop raises
        # KeyError if missing).  Optional: text/bytes handlers and logger.
        super().__init__(
            provider_code=provider.provider_code,
            label=provider.label,
            language=kwargs.pop("language"),
            json_request_handler=kwargs.pop("json_request_handler"),
            text_request_handler=kwargs.pop("text_request_handler", None),
            bytes_request_handler=kwargs.pop("bytes_request_handler", None),
            logger=kwargs.pop("logger", None),
        )
        self.provider = provider
        self.api_type = provider.api_type

    async def health_check(self, dataset_code: str | None = None) -> bool:
        """Return True when the probed endpoint yields a non-None JSON payload.

        Probes the bulk dataflow listing by default, or the SDMX metadata
        endpoint for *dataset_code* when one is given.
        """
        url = (
            DATAFLOWS_URL
            if dataset_code is None
            else METADATA_URL_TEMPLATE.format(
                dataset_code=dataset_code,
                language=self.language,
            )
        )
        payload = await self._get_json(url)
        return payload is not None

    async def _fetch_global_metadata(
        self,
    ) -> tuple[
        dict[str, _Dataflow],
        dict[str, Any],
        dict[str, dict[str, list[str]]],
        dict[str, list[list[dict[str, str]]]],
    ]:
        """Fetch and cache the four bulk metadata resources for this language.

        Returns ``(dataflows, codelists, metabase, toc_paths)``.  Uses
        double-checked locking on the module-level cache so concurrent
        callers trigger the expensive download at most once per language.
        """
        cache_key = self.language.lower()
        cached = _GLOBAL_CACHE.get(cache_key)
        if cached is not None:
            return cached
        async with _GLOBAL_CACHE_LOCK:
            # Re-check inside the lock: another task may have populated the
            # cache while we were waiting.
            cached = _GLOBAL_CACHE.get(cache_key)
            if cached is not None:
                return cached
            toc_text, dataflows_payload, codelists_payload, metabase_payload = (
                await asyncio.gather(
                    self._get_text(TOC_URL_TEMPLATE.format(language=self.language)),
                    self._get_json(DATAFLOWS_URL),
                    self._get_json(CODELISTS_URL),
                    self._get_bytes(METABASE_URL),
                )
            )
            dataflows: dict[str, _Dataflow] = {}
            if isinstance(dataflows_payload, dict):
                items = dataflows_payload.get("link", {}).get("item", [])
                for item in items:
                    if not isinstance(item, dict):
                        continue
                    extension = item.get("extension", {})
                    if not isinstance(extension, dict):
                        extension = {}
                    dataset_code = str(extension.get("id") or "").upper()
                    label = str(item.get("label") or "").strip()
                    if not dataset_code or not label:
                        continue
                    # Flatten annotations to {type: first of date/title/text}.
                    annotation_map: dict[str, Any] = {}
                    for annotation in extension.get("annotation", []):
                        if not isinstance(annotation, dict):
                            continue
                        annotation_type = annotation.get("type")
                        if not annotation_type:
                            continue
                        for key in ("date", "title", "text"):
                            if key in annotation:
                                annotation_map[str(annotation_type)] = annotation[key]
                                break
                    # NOTE(review): lexicographic max of two timestamp
                    # strings — assumes both annotations share an ISO-like,
                    # sortable format; confirm against the API.
                    updated = max(
                        str(annotation_map.get("UPDATE_DATA") or ""),
                        str(annotation_map.get("UPDATE_STRUCTURE") or ""),
                    )
                    if not updated:
                        # Dataflows without any update annotation are dropped.
                        continue
                    description = extension.get("description")
                    dataflows[dataset_code] = _Dataflow(
                        dataset_code=dataset_code,
                        label=label,
                        updated=updated,
                        source=str(annotation_map.get("SOURCE_INSTITUTIONS") or "Eurostat"),
                        description=_remove_html_tags(description)
                        if isinstance(description, str)
                        else None,
                        first_period=str(
                            annotation_map.get("OBS_PERIOD_OVERALL_OLDEST") or ""
                        ).strip()
                        or None,
                        last_period=str(
                            annotation_map.get("OBS_PERIOD_OVERALL_LATEST") or ""
                        ).strip()
                        or None,
                        extension={"dataflow_extension": extension} if extension else {},
                    )
            codelists = codelists_payload if isinstance(codelists_payload, dict) else {}
            metabase = (
                _parse_metabase(metabase_payload)
                if isinstance(metabase_payload, bytes)
                else {}
            )
            toc_paths = _parse_toc_paths(toc_text) if isinstance(toc_text, str) else {}
            _GLOBAL_CACHE[cache_key] = (dataflows, codelists, metabase, toc_paths)
            return _GLOBAL_CACHE[cache_key]

    def _build_dimensions(
        self,
        dataset_code: str,
        codelists: dict[str, Any],
        metabase: dict[str, dict[str, list[str]]],
    ) -> tuple[dict[str, dict[str, Any]], list[str], dict[str, list[str]], str]:
        """Build JSON-stat-style dimensions for *dataset_code*.

        Returns ``(dimensions, dimension_ids, role, time_unit)``.  Dimension
        value order comes from the metabase; labels come from the matching
        codelist entry when one exists, else the raw ids.
        """
        dimensions: dict[str, dict[str, Any]] = {}
        role: dict[str, list[str]] = {}
        time_unit = "Other"
        items = codelists.get("link", {}).get("item", []) if isinstance(codelists, dict) else []
        for dimension_id, category_ids in metabase.get(dataset_code, {}).items():
            if dimension_id.lower() == "time":
                # Heuristic: 4-character time values (checked on the first
                # two entries) are treated as annual periods.
                time_unit = (
                    "Annual"
                    if category_ids and all(len(value) == 4 for value in category_ids[:2])
                    else "Other"
                )
                labels = {value: value for value in category_ids}
                index = {value: ordinal for ordinal, value in enumerate(category_ids)}
                dimensions[dimension_id] = {
                    "label": "Time",
                    "category": {"index": index, "label": labels},
                }
                role.setdefault("time", []).append(dimension_id)
                continue

            dimension_label = dimension_id
            label_map = {value: value for value in category_ids}
            # Linear scan of the codelists for this dimension's entry.
            for item in items:
                if not isinstance(item, dict):
                    continue
                extension = item.get("extension", {})
                if not isinstance(extension, dict):
                    continue
                if str(extension.get("id") or "").upper() != dimension_id.upper():
                    continue
                dimension_label = str(item.get("label") or dimension_id)
                labels = item.get("category", {}).get("label", {})
                if isinstance(labels, dict):
                    label_map = {
                        value: str(labels.get(value, value)) for value in category_ids
                    }
                break

            dimensions[dimension_id] = {
                "label": dimension_label,
                "category": {
                    "index": {
                        value: ordinal for ordinal, value in enumerate(category_ids)
                    },
                    "label": label_map,
                },
            }
            if dimension_id.lower() == "geo":
                role.setdefault("geo", []).append(dimension_id)
        return dimensions, list(dimensions.keys()), role, time_unit

    async def discover_datasets(
        self,
        task_id: uuid.UUID,
        **_: Any,
    ) -> list[DiscoveredDataset]:
        """Emit one DiscoveredDataset per catalogued dataflow.

        Built entirely from the cached bulk metadata — no per-dataset
        requests.  Dataflows with an unparsable update timestamp are skipped.
        """
        dataflows, codelists, metabase, toc_paths = await self._fetch_global_metadata()
        output: list[DiscoveredDataset] = []
        for dataset_code, dataflow in dataflows.items():
            if "$dv_" in dataset_code.lower():
                # NOTE(review): skips codes containing "$dv_" — presumably
                # derived/visualisation artifacts; verify against the
                # catalogue's naming conventions.
                continue
            dimensions, dimension_ids, role, time_unit = self._build_dimensions(
                dataset_code,
                codelists,
                metabase,
            )
            updated = parse_dt(dataflow.updated)
            if updated is None:
                continue
            paths = toc_paths.get(dataset_code, [])
            # First folder of the first path is treated as the subject.
            subject = paths[0][0] if paths and paths[0] else None
            output.append(
                DiscoveredDataset(
                    task_id=task_id,
                    provider_code=self.provider_code,
                    dataset_code=dataset_code,
                    language=self.language,
                    updated=updated,
                    label=_remove_html_tags(dataflow.label),
                    source=dataflow.source,
                    description=dataflow.description,
                    time_unit=time_unit,
                    first_period=dataflow.first_period,
                    last_period=dataflow.last_period,
                    paths=paths or None,
                    subject_code=subject["id"] if subject else None,
                    subject_label=subject["label"] if subject else None,
                    metadata_url=METADATA_URL_TEMPLATE.format(
                        dataset_code=dataset_code,
                        language=self.language,
                    ),
                    data_url=DATA_URL_TEMPLATE.format(dataset_code=dataset_code),
                    web_url=WEB_URL_TEMPLATE.format(dataset_code=dataset_code),
                    doc_url=DOC_URL_TEMPLATE.format(
                        lowercase_dataset_code=dataset_code.lower()
                    ),
                    dimension=dimensions,
                    required_dimensions=dict.fromkeys(dimension_ids, None),
                    role=role,
                    dimension_ids=dimension_ids,
                    extension=dataflow.extension,
                )
            )
        return output

    async def resolve_dataset_metadata(
        self,
        discovered: DiscoveredDataset,
        task_id: uuid.UUID | None = None,
        **_: Any,
    ) -> ResolvedDatasetMetadata:
        """Promote *discovered* to ResolvedDatasetMetadata without new requests.

        Discovery already captured everything the bulk resources provide, so
        this only copies fields, substituting defaults for the required
        non-optional ones ("Other" time unit, empty strings/collections).
        """
        return ResolvedDatasetMetadata(
            task_id=task_id or discovered.task_id,
            provider_code=discovered.provider_code,
            dataset_code=discovered.dataset_code,
            language=discovered.language,
            updated=discovered.updated,
            label=discovered.label or discovered.dataset_code,
            time_unit=discovered.time_unit or "Other",
            first_period=discovered.first_period or "",
            last_period=discovered.last_period or discovered.first_period or "",
            paths=discovered.paths or [],
            role=discovered.role or {},
            metadata_url=discovered.metadata_url or "",
            data_url=discovered.data_url or "",
            dimension=discovered.dimension or {},
            required_dimensions=discovered.required_dimensions or {},
            note=discovered.note,
            source=discovered.source or "Eurostat",
            description=discovered.description,
            discontinued=discovered.discontinued,
            subject_code=discovered.subject_code,
            subject_label=discovered.subject_label,
            web_url=discovered.web_url,
            doc_url=discovered.doc_url,
            official_statistics=discovered.official_statistics,
            contact=discovered.contact,
            dimension_ids=discovered.dimension_ids,
            extension=discovered.extension,
        )
|