statwrapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
"""Public API of the ``statwrapper`` package.

Re-exports the wrapper base class, the rate-limited HTTP session, the data
models, the provider-registry helpers, and the high-level ``StatWrapper``
entry points so callers can import everything from the package root.
"""

from .base_api_client import APIWrapper
from .http import RateLimitedSession
from .models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
from .provider_registry import (
    create_wrapper,
    get_provider,
    get_wrapper_class,
    load_providers,
)
from .statwrapper import (
    StatWrapper,
    discover_provider_datasets,
    get_datasets,
    resolve_dataset_metadata,
)

# Sorted alphabetically (classes first by ASCII order, then functions).
__all__ = [
    "APIWrapper",
    "DiscoveredDataset",
    "Provider",
    "RateLimitedSession",
    "ResolvedDatasetMetadata",
    "StatWrapper",
    "create_wrapper",
    "discover_provider_datasets",
    "get_datasets",
    "get_provider",
    "get_wrapper_class",
    "load_providers",
    "resolve_dataset_metadata",
]
@@ -0,0 +1,11 @@
1
"""Provider client implementations exposed by ``statwrapper.clients``."""

from .dst_client import DstClient
from .eurostat_client import EurostatClient
from .pxweb2_client import PxWeb2Client
from .pxweb_client import PxWebClient

# Sorted alphabetically (ASCII order), matching the package-root __all__
# convention: "PxWeb2Client" sorts before "PxWebClient".
__all__ = [
    "DstClient",
    "EurostatClient",
    "PxWeb2Client",
    "PxWebClient",
]
@@ -0,0 +1,249 @@
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+ from typing import Any
5
+
6
+ from ..base_api_client import APIWrapper
7
+ from ..models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
8
+ from ..utils import (
9
+ detect_role,
10
+ determine_time_unit,
11
+ normalize_contact,
12
+ normalize_note,
13
+ parse_dt,
14
+ )
15
+
16
+
17
+ def _collect_tables(subjects_tree: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
18
+ tables_by_id: dict[str, dict[str, Any]] = {}
19
+
20
+ def visit(node: dict[str, Any]) -> None:
21
+ for table in node.get("tables") or []:
22
+ if not isinstance(table, dict):
23
+ continue
24
+ table_id = str(table.get("id") or "").strip()
25
+ if not table_id:
26
+ continue
27
+ existing = tables_by_id.setdefault(table_id, {"id": table_id})
28
+ for key, value in table.items():
29
+ if existing.get(key) in (None, "", []) and value not in (None, "", []):
30
+ existing[key] = value
31
+ for child in node.get("subjects") or []:
32
+ if isinstance(child, dict):
33
+ visit(child)
34
+
35
+ for root in subjects_tree:
36
+ if isinstance(root, dict):
37
+ visit(root)
38
+ return tables_by_id
39
+
40
+
41
+ def _parse_paths(subjects_tree: list[dict[str, Any]]) -> dict[str, list[list[dict[str, str]]]]:
42
+ code_to_paths: dict[str, list[list[dict[str, str]]]] = {}
43
+
44
+ def visit(node: dict[str, Any], path: list[dict[str, str]]) -> None:
45
+ node_id = str(node.get("id") or "").strip()
46
+ node_label = str(node.get("description") or node.get("text") or node_id).strip()
47
+ current_path = path + ([{"id": node_id, "label": node_label}] if node_id else [])
48
+ for table in node.get("tables") or []:
49
+ if not isinstance(table, dict):
50
+ continue
51
+ table_id = str(table.get("id") or "").strip()
52
+ if not table_id:
53
+ continue
54
+ code_to_paths.setdefault(table_id, []).append(
55
+ current_path + [{"id": table_id, "label": str(table.get("text") or table_id).strip()}]
56
+ )
57
+ for child in node.get("subjects") or []:
58
+ if isinstance(child, dict):
59
+ visit(child, current_path)
60
+
61
+ for root in subjects_tree:
62
+ if isinstance(root, dict):
63
+ visit(root, [])
64
+ return code_to_paths
65
+
66
+
67
class DstClient(APIWrapper):
    """Wrapper for Statistics Denmark's Statbank API.

    Discovery walks the ``/subjects`` tree in one recursive request; per-table
    detail comes from ``/tableinfo/{code}``; data is served as JSON-stat from
    ``/data/{code}/JSONSTAT``.
    """

    def __init__(self, provider: Provider, **kwargs: Any) -> None:
        # ``language`` and ``json_request_handler`` are required: kwargs.pop
        # without a default raises KeyError when they are missing.  The other
        # handlers and the logger are optional.
        super().__init__(
            provider_code=provider.provider_code,
            label=provider.label,
            language=kwargs.pop("language"),
            json_request_handler=kwargs.pop("json_request_handler"),
            text_request_handler=kwargs.pop("text_request_handler", None),
            bytes_request_handler=kwargs.pop("bytes_request_handler", None),
            logger=kwargs.pop("logger", None),
        )
        self.provider = provider
        self.api_type = provider.api_type
        # Fall back to "" so the URL helpers below never call .rstrip on None.
        self.base_api_url = provider.base_api_url or ""
        self.base_web_url = provider.base_web_url

    def _subjects_url(self) -> str:
        """URL of the subjects (topic tree) endpoint."""
        return f"{self.base_api_url.rstrip('/')}/subjects"

    def _tableinfo_url(self, dataset_code: str) -> str:
        """URL of the per-table metadata endpoint."""
        return f"{self.base_api_url.rstrip('/')}/tableinfo/{dataset_code}"

    def _data_url(self, dataset_code: str) -> str:
        """URL of the JSON-stat data endpoint for one table."""
        return f"{self.base_api_url.rstrip('/')}/data/{dataset_code}/JSONSTAT"

    async def health_check(self, dataset_code: str | None = None) -> bool:
        """Return True when the API answers with any JSON payload.

        With no ``dataset_code`` the subjects endpoint is probed; otherwise the
        table's own metadata endpoint is used.
        """
        url = self._subjects_url() if dataset_code is None else self._tableinfo_url(dataset_code)
        payload = await self._get_json(url, params={"lang": self.language})
        return payload is not None

    async def discover_datasets(
        self,
        task_id: uuid.UUID,
        **_: Any,
    ) -> list[DiscoveredDataset]:
        """Return one ``DiscoveredDataset`` per table found in the subjects tree."""
        # One recursive request that inlines tables under every subject.
        payload = await self._get_json(
            self._subjects_url(),
            params={
                "lang": self.language,
                "recursive": "true",
                "omitSubjectsWithoutTables": "true",
                "includeTables": "true",
                "format": "JSON",
            },
        )
        if not isinstance(payload, list):
            return []
        tables_by_id = _collect_tables(payload)
        paths_by_id = _parse_paths(payload)
        output: list[DiscoveredDataset] = []
        for dataset_code, table in sorted(tables_by_id.items()):
            paths = paths_by_id.get(dataset_code, [])
            primary_path = paths[0] if paths else []
            # First node of the first path is taken as the dataset's subject.
            subject = primary_path[0] if primary_path else None
            first_period = str(table.get("firstPeriod") or "").strip() or None
            last_period = str(table.get("latestPeriod") or table.get("lastPeriod") or "").strip() or None
            # Epoch fallback when the table carries no usable timestamp, so the
            # subsequent None check can only trip if parse_dt rejects the epoch
            # string itself.
            updated = parse_dt(table.get("updated")) or parse_dt("1970-01-01T00:00:00+00:00")
            if updated is None:
                continue
            output.append(
                DiscoveredDataset(
                    task_id=task_id,
                    provider_code=self.provider_code,
                    dataset_code=dataset_code,
                    language=self.language,
                    updated=updated,
                    label=str(table.get("text") or dataset_code).strip(),
                    description=str(table.get("description")).strip() if table.get("description") is not None else None,
                    source=f"Statistics Denmark - statbank.dk/{dataset_code}",
                    subject_code=subject["id"] if subject else None,
                    subject_label=subject["label"] if subject else None,
                    time_unit=determine_time_unit(first_period, last_period) if first_period and last_period else None,
                    first_period=first_period,
                    last_period=last_period,
                    # "active" is only trusted when it is an actual bool.
                    discontinued=(not bool(table.get("active")) if isinstance(table.get("active"), bool) else False),
                    metadata_url=f"{self._tableinfo_url(dataset_code)}?lang={self.language}",
                    data_url=f"{self._data_url(dataset_code)}?lang={self.language}",
                    web_url=f"{self.base_web_url.rstrip('/')}/{dataset_code}" if self.base_web_url else None,
                    paths=paths or None,
                    extension={"unit": table.get("unit"), "variable_names": table.get("variables")},
                )
            )
        return output

    async def resolve_dataset_metadata(
        self,
        discovered: DiscoveredDataset,
        task_id: uuid.UUID | None = None,
        **_: Any,
    ) -> ResolvedDatasetMetadata:
        """Enrich a discovered dataset with the full ``tableinfo`` response.

        Falls back to the values already present on ``discovered`` whenever the
        tableinfo payload is missing or lacks a field.
        """
        payload = await self._get_json(
            self._tableinfo_url(discovered.dataset_code),
            params={"lang": self.language},
        )
        dimensions: dict[str, dict[str, Any]] = {}
        dimension_ids: list[str] = []
        required_dimensions: dict[str, bool | None] = {}
        role_map: dict[str, list[str]] = {}
        # NOTE(review): dict() is a shallow copy — nested dicts (e.g. a
        # pre-existing "dimension_extensions") would still be shared with
        # discovered.extension and mutated below; confirm callers never
        # pre-populate that key.
        extension = dict(discovered.extension)
        if isinstance(payload, dict):
            for position, variable in enumerate(payload.get("variables") or []):
                if not isinstance(variable, dict):
                    continue
                dimension_id = str(variable.get("id") or "").strip()
                if not dimension_id:
                    continue
                values = variable.get("values") if isinstance(variable.get("values"), list) else []
                index: dict[str, int] = {}
                labels: dict[str, str] = {}
                for ordinal, item in enumerate(values):
                    if not isinstance(item, dict):
                        continue
                    value_id = str(item.get("id") or "").strip()
                    if not value_id:
                        continue
                    index[value_id] = ordinal
                    labels[value_id] = str(item.get("text") or value_id).strip()
                # Dimensions without any usable category values are dropped.
                if not index:
                    continue
                label = str(variable.get("text") or dimension_id).strip()
                dimensions[dimension_id] = {
                    "label": label,
                    "category": {"index": index, "label": labels},
                }
                dimension_ids.append(dimension_id)
                # DST "elimination" means the dimension may be omitted, so
                # required == not elimination; None when the flag is absent.
                elimination = variable.get("elimination")
                required_dimensions[dimension_id] = None if elimination is None else not bool(elimination)
                # A "map" attribute marks a geographic dimension; otherwise the
                # shared role heuristic decides (with the "time" flag as a hint).
                role = "geo" if variable.get("map") else detect_role(dimension_id, label, bool(variable.get("time")))
                if role:
                    role_map.setdefault(role, []).append(dimension_id)
                extension.setdefault("dimension_extensions", {})[dimension_id] = {
                    "extension": {"position": position}
                }
        label = (
            str(payload.get("text")).strip()
            if isinstance(payload, dict) and payload.get("text") is not None
            else (discovered.label or discovered.dataset_code)
        )
        description = (
            str(payload.get("description")).strip()
            if isinstance(payload, dict) and payload.get("description") is not None
            else discovered.description
        )
        updated = (
            parse_dt(payload.get("updated"))
            if isinstance(payload, dict)
            else None
        ) or discovered.updated
        return ResolvedDatasetMetadata(
            task_id=task_id or discovered.task_id,
            provider_code=discovered.provider_code,
            dataset_code=discovered.dataset_code,
            language=discovered.language,
            updated=updated,
            label=label,
            time_unit=discovered.time_unit or "Other",
            first_period=discovered.first_period or "",
            last_period=discovered.last_period or discovered.first_period or "",
            paths=discovered.paths or [],
            role=role_map,
            metadata_url=discovered.metadata_url or f"{self._tableinfo_url(discovered.dataset_code)}?lang={self.language}",
            data_url=discovered.data_url or f"{self._data_url(discovered.dataset_code)}?lang={self.language}",
            dimension=dimensions,
            required_dimensions=required_dimensions,
            note=(normalize_note(payload.get("footnote")) if isinstance(payload, dict) else None) or discovered.note,
            source=discovered.source,
            description=description,
            discontinued=not bool(payload.get("active")) if isinstance(payload, dict) and isinstance(payload.get("active"), bool) else discovered.discontinued,
            subject_code=discovered.subject_code,
            subject_label=discovered.subject_label,
            web_url=discovered.web_url,
            doc_url=(
                str(payload.get("documentation", {}).get("url")).strip()
                if isinstance(payload, dict)
                and isinstance(payload.get("documentation"), dict)
                and payload["documentation"].get("url") is not None
                else discovered.doc_url
            ),
            official_statistics=discovered.official_statistics,
            contact=(normalize_contact(payload.get("contacts")) if isinstance(payload, dict) else None) or discovered.contact,
            dimension_ids=dimension_ids or None,
            extension=extension,
        )
@@ -0,0 +1,359 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import csv
5
+ import gzip
6
+ import io
7
+ import re
8
+ import uuid
9
+ from collections import defaultdict
10
+ from dataclasses import dataclass
11
+ from typing import Any
12
+
13
+ from ..base_api_client import APIWrapper
14
+ from ..models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
15
+ from ..utils import parse_dt
16
+
17
# Endpoint templates for the Eurostat dissemination API; the {placeholders}
# are filled in with str.format by EurostatClient.
DATA_URL_TEMPLATE = "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/{dataset_code}"
METADATA_URL_TEMPLATE = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/"
    "ESTAT/{dataset_code}/1.0?format=JSON&lang={language}&compress=false&attributes=none&measures=none"
)
WEB_URL_TEMPLATE = "https://ec.europa.eu/eurostat/databrowser/product/page/{dataset_code}"
DOC_URL_TEMPLATE = "https://ec.europa.eu/eurostat/cache/metadata/en/{lowercase_dataset_code}.htm"
TOC_URL_TEMPLATE = "https://ec.europa.eu/eurostat/api/dissemination/catalogue/toc/txt?lang={language}"
# Catalogue-wide downloads (no per-dataset placeholder).
DATAFLOWS_URL = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/structure/dataflow/"
    "ESTAT/all/1.0?format=JSON&compress=false"
)
CODELISTS_URL = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/codelist/ESTAT/all/latest?format=json"
METABASE_URL = "https://ec.europa.eu/eurostat/api/dissemination/catalogue/metabase.txt.gz"
# Process-wide cache keyed by lower-cased language, holding the four parsed
# catalogue artefacts (dataflows, codelists, metabase, TOC paths).  The
# _Dataflow forward reference is legal because `from __future__ import
# annotations` makes this annotation lazy.
_GLOBAL_CACHE: dict[
    str,
    tuple[
        dict[str, _Dataflow],
        dict[str, Any],
        dict[str, dict[str, list[str]]],
        dict[str, list[list[dict[str, str]]]],
    ],
] = {}
# Guards the cache fill in _fetch_global_metadata against concurrent tasks.
_GLOBAL_CACHE_LOCK = asyncio.Lock()
# Non-greedy match of any HTML/XML tag, used by _remove_html_tags.
_CLEAN_TAGS_PATTERN = re.compile(r"<.*?>")
42
+
43
+
44
def _remove_html_tags(text: str) -> str:
    """Strip every ``<...>`` tag from *text* (non-greedy, leaves inner text)."""
    return _CLEAN_TAGS_PATTERN.sub("", text)
46
+
47
+
48
+ def _parse_toc_paths(toc_text: str) -> dict[str, list[list[dict[str, str]]]]:
49
+ reader = csv.reader(io.StringIO(toc_text), delimiter="\t", quotechar='"')
50
+ next(reader, None)
51
+ folder_stack: list[dict[str, str]] = []
52
+ full_stack: list[dict[str, str]] = []
53
+ code_to_paths: dict[str, list[list[dict[str, str]]]] = {}
54
+ for row in reader:
55
+ if len(row) < 3:
56
+ continue
57
+ title, code, item_type = row[0], row[1], row[2]
58
+ if not title.strip() or not code.strip():
59
+ continue
60
+ indent_level = (len(title) - len(title.lstrip())) // 4
61
+ full_stack = full_stack[:indent_level]
62
+ full_stack.append({"id": code, "label": title.strip(), "type": item_type})
63
+ folder_stack = [
64
+ {"id": node["id"], "label": node["label"]}
65
+ for node in full_stack
66
+ if node.get("type") == "folder"
67
+ ]
68
+ if item_type != "folder":
69
+ code_to_paths.setdefault(code.upper(), []).append(list(folder_stack))
70
+ return code_to_paths
71
+
72
+
73
+ def _parse_metabase(gzipped_content: bytes) -> dict[str, dict[str, list[str]]]:
74
+ metabase: dict[str, dict[str, list[str]]] = {}
75
+ with gzip.open(io.BytesIO(gzipped_content), "rt", encoding="utf-8") as handle:
76
+ dataset_dim_values: defaultdict[str, defaultdict[str, list[str]]] = defaultdict(
77
+ lambda: defaultdict(list)
78
+ )
79
+ for line in handle:
80
+ stripped = line.strip()
81
+ if not stripped:
82
+ continue
83
+ parts = stripped.split("\t")
84
+ if len(parts) < 3:
85
+ continue
86
+ dataset_code, dimension, value = parts[:3]
87
+ dataset_dim_values[dataset_code.upper()][dimension].append(value)
88
+ for dataset_code, dimensions in dataset_dim_values.items():
89
+ metabase[dataset_code] = dict(dimensions)
90
+ return metabase
91
+
92
+
93
@dataclass(slots=True)
class _Dataflow:
    """One entry parsed from the Eurostat SDMX dataflow catalogue."""

    # Upper-cased dataset identifier taken from the item's extension "id".
    dataset_code: str
    # Raw catalogue title; HTML tags are stripped later, when the
    # DiscoveredDataset is built.
    label: str
    # Lexicographic max of the UPDATE_DATA / UPDATE_STRUCTURE annotations,
    # still a string at this point (parsed with parse_dt in discover_datasets).
    updated: str
    # SOURCE_INSTITUTIONS annotation, defaulting to "Eurostat".
    source: str | None
    # Catalogue description with HTML tags removed, when present.
    description: str | None
    # OBS_PERIOD_OVERALL_OLDEST / _LATEST annotations, when present.
    first_period: str | None
    last_period: str | None
    # Original dataflow extension payload, kept under "dataflow_extension".
    extension: dict[str, Any]
103
+
104
+
105
class EurostatClient(APIWrapper):
    """Wrapper for the Eurostat dissemination API.

    Discovery downloads four catalogue-wide artefacts once per language
    (dataflows, codelists, metabase, TOC), caches them in the module-level
    ``_GLOBAL_CACHE``, and derives every dataset from them — so
    ``resolve_dataset_metadata`` only repackages what discovery already found.
    """

    def __init__(self, provider: Provider, **kwargs: Any) -> None:
        # ``language`` and ``json_request_handler`` are required: kwargs.pop
        # without a default raises KeyError when they are missing.
        super().__init__(
            provider_code=provider.provider_code,
            label=provider.label,
            language=kwargs.pop("language"),
            json_request_handler=kwargs.pop("json_request_handler"),
            text_request_handler=kwargs.pop("text_request_handler", None),
            bytes_request_handler=kwargs.pop("bytes_request_handler", None),
            logger=kwargs.pop("logger", None),
        )
        self.provider = provider
        self.api_type = provider.api_type

    async def health_check(self, dataset_code: str | None = None) -> bool:
        """Return True when the catalogue (or one dataset's metadata) responds."""
        url = (
            DATAFLOWS_URL
            if dataset_code is None
            else METADATA_URL_TEMPLATE.format(
                dataset_code=dataset_code,
                language=self.language,
            )
        )
        payload = await self._get_json(url)
        return payload is not None

    async def _fetch_global_metadata(
        self,
    ) -> tuple[
        dict[str, _Dataflow],
        dict[str, Any],
        dict[str, dict[str, list[str]]],
        dict[str, list[list[dict[str, str]]]],
    ]:
        """Download and cache the four catalogue artefacts for this language.

        Uses double-checked locking around the async lock so concurrent tasks
        trigger at most one download per language.
        """
        cache_key = self.language.lower()
        cached = _GLOBAL_CACHE.get(cache_key)
        if cached is not None:
            return cached
        async with _GLOBAL_CACHE_LOCK:
            # Re-check under the lock: another task may have filled the cache.
            cached = _GLOBAL_CACHE.get(cache_key)
            if cached is not None:
                return cached
            # The four downloads are independent, so run them concurrently.
            toc_text, dataflows_payload, codelists_payload, metabase_payload = (
                await asyncio.gather(
                    self._get_text(TOC_URL_TEMPLATE.format(language=self.language)),
                    self._get_json(DATAFLOWS_URL),
                    self._get_json(CODELISTS_URL),
                    self._get_bytes(METABASE_URL),
                )
            )
            dataflows: dict[str, _Dataflow] = {}
            if isinstance(dataflows_payload, dict):
                items = dataflows_payload.get("link", {}).get("item", [])
                for item in items:
                    if not isinstance(item, dict):
                        continue
                    extension = item.get("extension", {})
                    if not isinstance(extension, dict):
                        extension = {}
                    dataset_code = str(extension.get("id") or "").upper()
                    label = str(item.get("label") or "").strip()
                    if not dataset_code or not label:
                        continue
                    # Flatten the annotation list to {type: first of date/title/text}.
                    annotation_map: dict[str, Any] = {}
                    for annotation in extension.get("annotation", []):
                        if not isinstance(annotation, dict):
                            continue
                        annotation_type = annotation.get("type")
                        if not annotation_type:
                            continue
                        for key in ("date", "title", "text"):
                            if key in annotation:
                                annotation_map[str(annotation_type)] = annotation[key]
                                break
                    # Lexicographic max works here only if both stamps share a
                    # sortable format (e.g. ISO dates) — TODO confirm.
                    updated = max(
                        str(annotation_map.get("UPDATE_DATA") or ""),
                        str(annotation_map.get("UPDATE_STRUCTURE") or ""),
                    )
                    if not updated:
                        continue
                    description = extension.get("description")
                    dataflows[dataset_code] = _Dataflow(
                        dataset_code=dataset_code,
                        label=label,
                        updated=updated,
                        source=str(annotation_map.get("SOURCE_INSTITUTIONS") or "Eurostat"),
                        description=_remove_html_tags(description)
                        if isinstance(description, str)
                        else None,
                        first_period=str(
                            annotation_map.get("OBS_PERIOD_OVERALL_OLDEST") or ""
                        ).strip()
                        or None,
                        last_period=str(
                            annotation_map.get("OBS_PERIOD_OVERALL_LATEST") or ""
                        ).strip()
                        or None,
                        extension={"dataflow_extension": extension} if extension else {},
                    )
            codelists = codelists_payload if isinstance(codelists_payload, dict) else {}
            metabase = (
                _parse_metabase(metabase_payload)
                if isinstance(metabase_payload, bytes)
                else {}
            )
            toc_paths = _parse_toc_paths(toc_text) if isinstance(toc_text, str) else {}
            _GLOBAL_CACHE[cache_key] = (dataflows, codelists, metabase, toc_paths)
            return _GLOBAL_CACHE[cache_key]

    def _build_dimensions(
        self,
        dataset_code: str,
        codelists: dict[str, Any],
        metabase: dict[str, dict[str, list[str]]],
    ) -> tuple[dict[str, dict[str, Any]], list[str], dict[str, list[str]], str]:
        """Build JSON-stat-style dimensions for one dataset from the metabase.

        Returns ``(dimensions, dimension_ids, role_map, time_unit)``.  Labels
        are looked up in the codelists; "time" and "geo" dimensions (matched
        case-insensitively by id) are assigned roles.
        """
        dimensions: dict[str, dict[str, Any]] = {}
        role: dict[str, list[str]] = {}
        time_unit = "Other"
        items = codelists.get("link", {}).get("item", []) if isinstance(codelists, dict) else []
        for dimension_id, category_ids in metabase.get(dataset_code, {}).items():
            if dimension_id.lower() == "time":
                # Heuristic: 4-character period values (checked on the first
                # two only) are taken to be plain years.
                time_unit = (
                    "Annual"
                    if category_ids and all(len(value) == 4 for value in category_ids[:2])
                    else "Other"
                )
                labels = {value: value for value in category_ids}
                index = {value: ordinal for ordinal, value in enumerate(category_ids)}
                dimensions[dimension_id] = {
                    "label": "Time",
                    "category": {"index": index, "label": labels},
                }
                role.setdefault("time", []).append(dimension_id)
                continue

            # Default to the raw ids; overwritten when a matching codelist exists.
            dimension_label = dimension_id
            label_map = {value: value for value in category_ids}
            for item in items:
                if not isinstance(item, dict):
                    continue
                extension = item.get("extension", {})
                if not isinstance(extension, dict):
                    continue
                if str(extension.get("id") or "").upper() != dimension_id.upper():
                    continue
                dimension_label = str(item.get("label") or dimension_id)
                labels = item.get("category", {}).get("label", {})
                if isinstance(labels, dict):
                    label_map = {
                        value: str(labels.get(value, value)) for value in category_ids
                    }
                break

            dimensions[dimension_id] = {
                "label": dimension_label,
                "category": {
                    "index": {
                        value: ordinal for ordinal, value in enumerate(category_ids)
                    },
                    "label": label_map,
                },
            }
            if dimension_id.lower() == "geo":
                role.setdefault("geo", []).append(dimension_id)
        return dimensions, list(dimensions.keys()), role, time_unit

    async def discover_datasets(
        self,
        task_id: uuid.UUID,
        **_: Any,
    ) -> list[DiscoveredDataset]:
        """Return one fully-populated ``DiscoveredDataset`` per catalogue dataflow."""
        dataflows, codelists, metabase, toc_paths = await self._fetch_global_metadata()
        output: list[DiscoveredDataset] = []
        for dataset_code, dataflow in dataflows.items():
            # Skip derived/visualisation codes — presumably internal; confirm
            # against the catalogue conventions.
            if "$dv_" in dataset_code.lower():
                continue
            dimensions, dimension_ids, role, time_unit = self._build_dimensions(
                dataset_code,
                codelists,
                metabase,
            )
            updated = parse_dt(dataflow.updated)
            if updated is None:
                continue
            paths = toc_paths.get(dataset_code, [])
            # The first folder of the first TOC path is used as the subject.
            subject = paths[0][0] if paths and paths[0] else None
            output.append(
                DiscoveredDataset(
                    task_id=task_id,
                    provider_code=self.provider_code,
                    dataset_code=dataset_code,
                    language=self.language,
                    updated=updated,
                    label=_remove_html_tags(dataflow.label),
                    source=dataflow.source,
                    description=dataflow.description,
                    time_unit=time_unit,
                    first_period=dataflow.first_period,
                    last_period=dataflow.last_period,
                    paths=paths or None,
                    subject_code=subject["id"] if subject else None,
                    subject_label=subject["label"] if subject else None,
                    metadata_url=METADATA_URL_TEMPLATE.format(
                        dataset_code=dataset_code,
                        language=self.language,
                    ),
                    data_url=DATA_URL_TEMPLATE.format(dataset_code=dataset_code),
                    web_url=WEB_URL_TEMPLATE.format(dataset_code=dataset_code),
                    doc_url=DOC_URL_TEMPLATE.format(
                        lowercase_dataset_code=dataset_code.lower()
                    ),
                    dimension=dimensions,
                    # Eurostat does not expose a required/optional flag here,
                    # so every dimension is recorded as "unknown" (None).
                    required_dimensions=dict.fromkeys(dimension_ids, None),
                    role=role,
                    dimension_ids=dimension_ids,
                    extension=dataflow.extension,
                )
            )
        return output

    async def resolve_dataset_metadata(
        self,
        discovered: DiscoveredDataset,
        task_id: uuid.UUID | None = None,
        **_: Any,
    ) -> ResolvedDatasetMetadata:
        """Convert a discovered dataset to resolved metadata without new requests.

        Discovery already captured everything the catalogue offers, so this is
        a pure repackaging with defaults for the fields the model requires.
        """
        return ResolvedDatasetMetadata(
            task_id=task_id or discovered.task_id,
            provider_code=discovered.provider_code,
            dataset_code=discovered.dataset_code,
            language=discovered.language,
            updated=discovered.updated,
            label=discovered.label or discovered.dataset_code,
            time_unit=discovered.time_unit or "Other",
            first_period=discovered.first_period or "",
            last_period=discovered.last_period or discovered.first_period or "",
            paths=discovered.paths or [],
            role=discovered.role or {},
            metadata_url=discovered.metadata_url or "",
            data_url=discovered.data_url or "",
            dimension=discovered.dimension or {},
            required_dimensions=discovered.required_dimensions or {},
            note=discovered.note,
            source=discovered.source or "Eurostat",
            description=discovered.description,
            discontinued=discovered.discontinued,
            subject_code=discovered.subject_code,
            subject_label=discovered.subject_label,
            web_url=discovered.web_url,
            doc_url=discovered.doc_url,
            official_statistics=discovered.official_statistics,
            contact=discovered.contact,
            dimension_ids=discovered.dimension_ids,
            extension=discovered.extension,
        )