statwrapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statwrapper/models.py ADDED
@@ -0,0 +1,103 @@
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
# Closed set of time-unit labels accepted by the dataset models.
_VALID_TIME_UNITS = {"Annual", "Quarterly", "Monthly", "Weekly", "Other"}


def _validate_time_unit(value: str | None) -> None:
    """Check that *value* is an accepted time unit.

    ``None`` is treated as "not provided" and passes silently.

    Args:
        value: Candidate time-unit label, or ``None``.

    Raises:
        ValueError: If *value* is not ``None`` and is not one of
            ``_VALID_TIME_UNITS``.
    """
    if value is None:
        return
    # The isinstance guard keeps unhashable input (e.g. a list) from escaping
    # as a TypeError out of the set membership test.
    if not isinstance(value, str) or value not in _VALID_TIME_UNITS:
        # Sort the options so the message is identical from run to run; a raw
        # set repr changes order with string hash randomisation.
        raise ValueError(
            f"Invalid time_unit: {value}. "
            f"Must be one of {sorted(_VALID_TIME_UNITS)}."
        )
18
+
19
+
20
@dataclass(slots=True)
class Provider:
    """Static description of one statistics provider and its API settings.

    The first six fields are required; every other field falls back to the
    default declared next to it.
    """

    provider_code: str  # identifier a provider is looked up by
    api_type: str  # selects which client class handles this provider
    default_language: str
    languages: list[str]  # languages a client may be created for
    label: str
    country_code: str
    base_api_url: str | None = None
    base_web_url: str | None = None
    # NOTE(review): the units of the limits below are not visible in this
    # module -- confirm against the API clients before relying on them.
    rate_limit: float = 1.0
    cell_limit: int = 5000
    variable_limit: int = 1000
    max_concurrency: int | None = None
    # default_factory gives every instance its own container.
    data_formats: list[str] = field(default_factory=list)
    extension: dict[str, Any] = field(default_factory=dict)
36
+
37
+
38
@dataclass(slots=True)
class DiscoveredDataset:
    """Dataset record in which most descriptive metadata is optional.

    Only the task/provider/dataset identifiers, ``language`` and ``updated``
    are required. ``time_unit`` is validated when the instance is created.
    """

    # Required identity fields.
    task_id: uuid.UUID
    provider_code: str
    dataset_code: str
    language: str
    updated: datetime

    # Optional descriptive metadata; None means "not provided".
    label: str | None = None
    source: str | None = None
    note: list[str] | None = None
    role: dict[str, list[str]] | None = None
    description: str | None = None
    time_unit: str | None = None  # checked in __post_init__
    first_period: str | None = None
    last_period: str | None = None
    discontinued: bool | None = None
    paths: list[list[dict[str, str]]] | None = None
    subject_code: str | None = None
    subject_label: str | None = None
    metadata_url: str | None = None
    data_url: str | None = None
    web_url: str | None = None
    doc_url: str | None = None
    dimension: dict[str, dict[str, Any]] | None = None
    required_dimensions: dict[str, bool | None] | None = None
    official_statistics: bool | None = None
    contact: dict[str, Any] | None = None
    dimension_ids: list[str] | None = None
    # default_factory gives every instance its own dict.
    extension: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject a ``time_unit`` outside the accepted set (``None`` passes)."""
        _validate_time_unit(self.time_unit)
70
+
71
+
72
@dataclass(slots=True)
class ResolvedDatasetMetadata:
    """Dataset record whose core metadata fields are all required.

    Label, time unit, period range, paths, roles, URLs and the dimension
    structure have no defaults here; the remaining fields stay optional.
    ``time_unit`` is validated when the instance is created.
    """

    # Required fields.
    task_id: uuid.UUID
    provider_code: str
    dataset_code: str
    language: str
    updated: datetime
    label: str
    # NOTE(review): annotated as str, but the validator lets None through --
    # confirm whether None should be rejected for this class.
    time_unit: str
    first_period: str
    last_period: str
    paths: list[list[dict[str, str]]]
    role: dict[str, list[str]]
    metadata_url: str
    data_url: str
    dimension: dict[str, dict[str, Any]]
    required_dimensions: dict[str, bool | None]

    # Optional fields; None means "not provided".
    note: list[str] | None = None
    source: str | None = None
    description: str | None = None
    discontinued: bool | None = None
    subject_code: str | None = None
    subject_label: str | None = None
    web_url: str | None = None
    doc_url: str | None = None
    official_statistics: bool | None = None
    contact: dict[str, Any] | None = None
    dimension_ids: list[str] | None = None
    # default_factory gives every instance its own dict.
    extension: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate ``time_unit`` against the accepted set."""
        _validate_time_unit(self.time_unit)
statwrapper/parsers.py ADDED
@@ -0,0 +1,260 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from .utils import normalize_contact, normalize_note, parse_dt
6
+
7
# Time-unit labels passed through unchanged by parse_pxweb2_discovery_table;
# any other value is reported as "Other".
_VALID_TIME_UNITS = {"Annual", "Quarterly", "Monthly", "Weekly", "Other"}
8
+
9
+
10
def _first_usable(item: dict[str, Any], keys: tuple[str, ...]) -> Any:
    """Return the first usable value among *keys* in *item*, else ``None``."""
    for key in keys:
        candidate = item.get(key)
        # Truthy values are usable. A numeric zero is usable too (bools are
        # excluded), so an identifier that is literally 0 is not discarded.
        if candidate or (
            isinstance(candidate, (int, float))
            and not isinstance(candidate, bool)
        ):
            return candidate
    return None


def normalize_paths(raw_paths: Any) -> list[list[dict[str, str]]] | None:
    """Coerce a loosely structured paths payload into a uniform shape.

    Accepted inputs:
      * ``{"paths": [...]}`` -- the inner list is used;
      * ``{"path": [...]}``  -- treated as a single path;
      * a list of paths, each a list of items or a ``{"path": [...]}`` dict;
      * any other value      -- treated as a single one-item path.

    Each item becomes ``{"id": ..., "label": ...}`` with both values as
    stripped strings. For dict items the id comes from ``id``, ``code``,
    ``key`` or ``value`` (first usable one), falling back to the item's
    position; the label comes from ``label``, ``title`` or ``name``, falling
    back to the id. Non-dict items use their string form for both.

    Args:
        raw_paths: Raw payload in any of the shapes above, or ``None``.

    Returns:
        The normalized paths, or ``None`` when the input is ``None`` or no
        path yields at least one item.
    """
    if raw_paths is None:
        return None

    if isinstance(raw_paths, dict) and isinstance(raw_paths.get("paths"), list):
        raw_paths = raw_paths["paths"]
    elif isinstance(raw_paths, dict) and isinstance(raw_paths.get("path"), list):
        raw_paths = [raw_paths]
    elif not isinstance(raw_paths, list):
        raw_paths = [[raw_paths]]

    output: list[list[dict[str, str]]] = []
    for raw_path in raw_paths:
        items = raw_path.get("path") if isinstance(raw_path, dict) else raw_path
        if not isinstance(items, list):
            continue
        normalized_path: list[dict[str, str]] = []
        for index, item in enumerate(items):
            if item is None:
                # A missing entry must not turn into the literal id "None".
                continue
            if isinstance(item, dict):
                raw_id = _first_usable(item, ("id", "code", "key", "value"))
                item_id = str(index if raw_id is None else raw_id).strip()
                if not item_id:
                    continue
                raw_label = _first_usable(item, ("label", "title", "name"))
                label = str(item_id if raw_label is None else raw_label).strip()
            else:
                item_id = str(item).strip()
                if not item_id:
                    continue
                label = item_id
            normalized_path.append({"id": item_id, "label": label})
        if normalized_path:
            output.append(normalized_path)
    return output or None
53
+
54
+
55
def normalize_role(raw_role: Any) -> dict[str, list[str]] | None:
    """Extract the ``time``, ``geo`` and ``metric`` role lists from *raw_role*.

    Each member is converted to a stripped string; blanks are dropped and
    duplicates removed while keeping first-seen order. Roles that are absent,
    not lists, or end up empty are omitted.

    Returns:
        The cleaned mapping, or ``None`` if *raw_role* is not a dict or no
        role survives.
    """
    if not isinstance(raw_role, dict):
        return None

    roles: dict[str, list[str]] = {}
    for role_name in ("time", "geo", "metric"):
        members = raw_role.get(role_name)
        if isinstance(members, list):
            cleaned = (str(member).strip() for member in members)
            # dict.fromkeys de-duplicates while preserving insertion order.
            unique = list(dict.fromkeys(text for text in cleaned if text))
            if unique:
                roles[role_name] = unique
    return roles if roles else None
71
+
72
+
73
def normalize_dimensions(
    raw_dimensions: Any,
    raw_dimension_ids: Any,
) -> tuple[
    dict[str, dict[str, Any]] | None,
    list[str] | None,
    dict[str, bool | None] | None,
    dict[str, dict[str, Any]] | None,
]:
    """Normalize a dimension mapping into index/label tables and side data.

    Dimension ids come from *raw_dimension_ids* when it is a list, otherwise
    from the keys of *raw_dimensions*; ids are stripped strings and blanks
    are dropped. A dimension is skipped when its entry is not a dict or when
    no category index can be derived from ``category.index`` (dict or list)
    or, failing that, from the keys of ``category.label``.

    Returns:
        A 4-tuple ``(dimensions, dimension_ids, required_dimensions,
        dimension_extensions)``. Every element is ``None`` when it would be
        empty, and all four are ``None`` if *raw_dimensions* is not a dict.
        ``required_dimensions`` holds ``None`` when ``elimination`` is absent,
        otherwise the negation of its truthiness. ``dimension_extensions``
        wraps any keys other than label/category/note/elimination under an
        ``"extension"`` key.
    """
    if not isinstance(raw_dimensions, dict):
        return None, None, None, None

    id_source = raw_dimension_ids if isinstance(raw_dimension_ids, list) else raw_dimensions
    dimension_ids = [
        text for text in (str(entry).strip() for entry in id_source) if text
    ]

    handled_keys = {"label", "category", "note", "elimination"}
    dimensions: dict[str, dict[str, Any]] = {}
    required_dimensions: dict[str, bool | None] = {}
    dimension_extensions: dict[str, dict[str, Any]] = {}

    for dimension_id in dimension_ids:
        raw_dimension = raw_dimensions.get(dimension_id)
        if not isinstance(raw_dimension, dict):
            continue

        category = raw_dimension.get("category")
        if not isinstance(category, dict):
            category = {}
        raw_index = category.get("index")
        raw_labels = category.get("label")
        has_labels = isinstance(raw_labels, dict)

        # Category code -> ordinal position.
        if isinstance(raw_index, dict):
            index = {str(code): int(position) for code, position in raw_index.items()}
        elif isinstance(raw_index, list):
            index = {str(code): position for position, code in enumerate(raw_index)}
        elif has_labels:
            # No explicit index: fall back to the order of the label keys.
            index = {str(code): position for position, code in enumerate(raw_labels)}
        else:
            index = {}
        if not index:
            continue

        # Category code -> display label; the code doubles as its own label
        # when none is supplied.
        if has_labels:
            labels = {code: str(raw_labels.get(code, code)) for code in index}
        else:
            labels = {code: code for code in index}

        entry: dict[str, Any] = {
            "label": str(raw_dimension.get("label") or dimension_id),
            "category": {"index": index, "label": labels},
        }
        note = normalize_note(raw_dimension.get("note"))
        if note:
            entry["note"] = note
        dimensions[dimension_id] = entry

        elimination = raw_dimension.get("elimination")
        required_dimensions[dimension_id] = (
            None if elimination is None else not elimination
        )

        extras = {
            name: payload
            for name, payload in raw_dimension.items()
            if name not in handled_keys
        }
        if extras:
            dimension_extensions[dimension_id] = {"extension": extras}

    return (
        dimensions or None,
        dimension_ids or None,
        required_dimensions or None,
        dimension_extensions or None,
    )
150
+
151
+
152
def parse_pxweb2_discovery_table(
    table: dict[str, Any],
    *,
    base_api_url: str,
    base_web_url: str | None,
    language: str,
) -> dict[str, Any] | None:
    """Turn one discovery table entry into a flat dict of dataset fields.

    Args:
        table: Raw table entry; its ``id`` becomes the dataset code.
        base_api_url: API root used to build the metadata and data URLs
            (trailing slashes are ignored).
        base_web_url: Web root used to build ``web_url``; when falsy,
            ``web_url`` is ``None``.
        language: Language code embedded in the data and web URLs.

    Returns:
        The parsed fields, or ``None`` when the entry has no usable ``id``.
    """
    dataset_code = str(table.get("id") or "").strip()
    if not dataset_code:
        return None

    def optional_text(key: str) -> str | None:
        # None stays None; anything else becomes a stripped string.
        raw = table.get(key)
        return None if raw is None else str(raw).strip()

    def period(key: str) -> str | None:
        # Missing, falsy or blank values collapse to None.
        return str(table.get(key) or "").strip() or None

    # Unknown or non-string time units are reported as "Other".
    time_unit = table.get("timeUnit")
    if not (isinstance(time_unit, str) and time_unit in _VALID_TIME_UNITS):
        time_unit = "Other"

    api_root = base_api_url.rstrip("/")
    variable_names = table.get("variableNames")

    return {
        "dataset_code": dataset_code,
        "label": str(table.get("label") or table.get("title") or dataset_code),
        "updated": parse_dt(table.get("updated")),
        "time_unit": time_unit,
        "first_period": period("firstPeriod"),
        "last_period": period("lastPeriod"),
        "description": optional_text("description"),
        "source": optional_text("source"),
        "note": normalize_note(table.get("note")),
        "subject_code": optional_text("subjectCode"),
        "paths": normalize_paths(table.get("paths")),
        "discontinued": bool(table.get("discontinued", False)),
        "metadata_url": f"{api_root}/tables/{dataset_code}/metadata",
        "data_url": f"{api_root}/tables/{dataset_code}/data?lang={language}",
        "web_url": (
            f"{base_web_url.rstrip('/')}/{language}/table/{dataset_code}"
            if base_web_url
            else None
        ),
        "extension": {
            "links": table.get("links"),
            "variable_names": (
                list(variable_names) if isinstance(variable_names, list) else []
            ),
        },
    }
204
+
205
+
206
def parse_pxweb2_metadata_payload(
    payload: dict[str, Any],
    *,
    default_note: list[str] | None,
    default_subject_label: str | None,
    default_official_statistics: bool | None,
    default_contact: dict[str, Any] | None,
    default_extension: dict[str, Any],
) -> dict[str, Any]:
    """Combine a metadata payload with caller-supplied defaults.

    Dimensions, roles and notes are normalized from ``payload``; the
    ``default_*`` arguments supply values the payload does not override.

    Returns:
        A dict with the keys ``dimensions``, ``dimension_ids``,
        ``required_dimensions``, ``role``, ``note``, ``subject_label``,
        ``official_statistics``, ``contact`` and ``extension``.
    """
    (
        dimensions,
        dimension_ids,
        required_dimensions,
        dimension_extensions,
    ) = normalize_dimensions(payload.get("dimension"), payload.get("id"))

    # Shallow copy so the caller's default_extension is not mutated below.
    extension = dict(default_extension)
    # A falsy normalized note falls back to the default.
    note = normalize_note(payload.get("note")) or default_note
    role = normalize_role(payload.get("role"))
    subject_label = default_subject_label
    official_statistics = default_official_statistics
    contact = default_contact

    raw_extension = payload.get("extension")
    if isinstance(raw_extension, dict):
        # Payload extension keys override the defaults of the same name.
        extension.update(raw_extension)
        px = raw_extension.get("px")
        if isinstance(px, dict):
            # The payload's subject area only fills in a missing default...
            if subject_label is None and px.get("subject-area") is not None:
                subject_label = str(px.get("subject-area")).strip() or None
            # ...whereas an explicit official-statistics flag always wins.
            if px.get("official-statistics") is not None:
                official_statistics = bool(px.get("official-statistics"))
        # NOTE(review): indentation was lost in the reviewed copy; this lookup
        # reads raw_extension (not px), so it is assumed to sit at the
        # raw_extension level rather than inside the px branch -- confirm.
        normalized_contact = normalize_contact(raw_extension.get("contact"))
        if normalized_contact is not None:
            contact = normalized_contact

    if dimension_extensions is not None:
        # Freshly parsed per-dimension extras take precedence over any
        # pre-existing "dimension_extensions" entry for the same dimension.
        merged = {}
        existing = extension.get("dimension_extensions")
        if isinstance(existing, dict):
            merged.update(existing)
        merged.update(dimension_extensions)
        extension["dimension_extensions"] = merged

    return {
        "dimensions": dimensions,
        "dimension_ids": dimension_ids,
        "required_dimensions": required_dimensions,
        "role": role,
        "note": note,
        "subject_label": subject_label,
        "official_statistics": official_statistics,
        "contact": contact,
        "extension": extension,
    }
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from importlib.resources import files
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from .api_clients import DstClient, EurostatClient, PxWeb2Client, PxWebClient
9
+ from .exceptions import (
10
+ ProviderNotFoundError,
11
+ UnsupportedAPITypeError,
12
+ UnsupportedLanguageError,
13
+ )
14
+ from .models import Provider
15
+
16
# Maps a lower-case API type name to the client class that implements it.
WRAPPER_REGISTRY = {
    "dst": DstClient,
    "eurostat": EurostatClient,
    "pxweb": PxWebClient,
    "pxweb2": PxWeb2Client,
}
22
+
23
+
24
def _default_provider_path() -> Path:
    """Locate the default provider definition file.

    A ``PROVIDERS.json`` sitting one directory above this package wins when
    it exists; otherwise the ``providers.json`` resource of the
    ``statwrapper`` package is used.
    """
    package_dir = Path(__file__).resolve().parent
    checkout_copy = package_dir.parent / "PROVIDERS.json"
    if not checkout_copy.exists():
        # NOTE(review): wrapping the resource in Path() presumably requires
        # the package to live on a real filesystem -- confirm for zipped
        # installs.
        return Path(files("statwrapper").joinpath("providers.json"))
    return checkout_copy
29
+
30
+
31
def load_providers(path: str | Path | None = None) -> list[Provider]:
    """Read provider definitions from a UTF-8 JSON file.

    Args:
        path: File to read; ``None`` selects the default provider file.

    Returns:
        One ``Provider`` per JSON entry, in file order, each built by passing
        the entry's keys as keyword arguments.
    """
    source = _default_provider_path() if path is None else Path(path)
    records = json.loads(source.read_text(encoding="utf-8"))
    providers = []
    for record in records:
        providers.append(Provider(**record))
    return providers
35
+
36
+
37
def get_provider(
    provider_code: str,
    providers: list[Provider] | None = None,
) -> Provider:
    """Return the provider whose ``provider_code`` equals *provider_code*.

    Args:
        provider_code: Code to look for (exact match).
        providers: Candidates to search. Only ``None`` triggers loading the
            default provider list; an explicitly empty list is searched as
            given.

    Returns:
        The first matching provider.

    Raises:
        ProviderNotFoundError: If no candidate carries the requested code.
    """
    # Test for None explicitly: a truthiness check would treat an empty list
    # as "not supplied" and silently search the default providers instead.
    candidates = load_providers() if providers is None else providers
    for provider in candidates:
        if provider.provider_code == provider_code:
            return provider
    raise ProviderNotFoundError(provider_code)
46
+
47
+
48
def get_wrapper_class(api_type: str) -> type:
    """Look up the client class registered for *api_type* (case-insensitive).

    Raises:
        UnsupportedAPITypeError: If no class is registered for the type.
    """
    registry_key = api_type.lower()
    found = WRAPPER_REGISTRY.get(registry_key)
    if found is not None:
        return found
    raise UnsupportedAPITypeError(api_type)
53
+
54
+
55
def create_wrapper(
    provider: Provider,
    *,
    language: str,
    json_request_handler: Any,
    text_request_handler: Any = None,
    bytes_request_handler: Any = None,
    logger: Any = None,
):
    """Instantiate the client class matching ``provider.api_type``.

    Args:
        provider: Provider the client is created for.
        language: Must be one of ``provider.languages``.
        json_request_handler: Forwarded unchanged to the client.
        text_request_handler: Forwarded unchanged to the client.
        bytes_request_handler: Forwarded unchanged to the client.
        logger: Forwarded unchanged to the client.

    Raises:
        UnsupportedLanguageError: If *language* is not offered by the provider.
        UnsupportedAPITypeError: If the provider's API type has no client.
    """
    # Validate the language before resolving the client class.
    if language not in provider.languages:
        raise UnsupportedLanguageError(provider.provider_code, language)

    client_cls = get_wrapper_class(provider.api_type)
    client_kwargs = {
        "language": language,
        "json_request_handler": json_request_handler,
        "text_request_handler": text_request_handler,
        "bytes_request_handler": bytes_request_handler,
        "logger": logger,
    }
    return client_cls(provider, **client_kwargs)