statwrapper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statwrapper/__init__.py +31 -0
- statwrapper/api_clients/__init__.py +11 -0
- statwrapper/api_clients/dst_client.py +249 -0
- statwrapper/api_clients/eurostat_client.py +359 -0
- statwrapper/api_clients/pxweb2_client.py +170 -0
- statwrapper/api_clients/pxweb_client.py +244 -0
- statwrapper/base_api_client.py +79 -0
- statwrapper/exceptions.py +22 -0
- statwrapper/http.py +126 -0
- statwrapper/models.py +103 -0
- statwrapper/parsers.py +260 -0
- statwrapper/provider_registry.py +74 -0
- statwrapper/providers.json +662 -0
- statwrapper/statwrapper.py +103 -0
- statwrapper/utils.py +134 -0
- statwrapper-0.1.0.dist-info/METADATA +123 -0
- statwrapper-0.1.0.dist-info/RECORD +19 -0
- statwrapper-0.1.0.dist-info/WHEEL +5 -0
- statwrapper-0.1.0.dist-info/top_level.txt +1 -0
statwrapper/models.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Closed set of labels accepted for a dataset's ``time_unit``; membership is
# enforced by ``_validate_time_unit``.
_VALID_TIME_UNITS = {"Annual", "Quarterly", "Monthly", "Weekly", "Other"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _validate_time_unit(value: str | None) -> None:
    """Ensure ``value`` is an accepted time-unit label.

    ``None`` means "not provided" and always passes.

    Raises:
        ValueError: If ``value`` is set but is not a member of
            ``_VALID_TIME_UNITS``.
    """
    if value is not None and value not in _VALID_TIME_UNITS:
        raise ValueError(
            f"Invalid time_unit: {value}. Must be one of {_VALID_TIME_UNITS}."
        )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(slots=True)
class Provider:
    """Configuration record describing one statistics provider.

    ``provider_registry.load_providers`` creates one instance per entry of the
    providers JSON file by passing the entry as keyword arguments, so the field
    names here have to match the keys used in that file.
    """

    provider_code: str  # identifier that ``get_provider`` matches against
    api_type: str  # selects the client class (looked up lower-cased)
    default_language: str
    languages: list[str]  # ``create_wrapper`` rejects languages not listed here
    label: str
    country_code: str
    base_api_url: str | None = None
    base_web_url: str | None = None
    # NOTE(review): the units of the limits below are not visible in this
    # module (presumably requests/second and per-request caps) -- confirm
    # against the API clients.
    rate_limit: float = 1.0
    cell_limit: int = 5000
    variable_limit: int = 1000
    max_concurrency: int | None = None
    # Mutable defaults use ``default_factory`` so every instance gets its own
    # list/dict object.
    data_formats: list[str] = field(default_factory=list)
    extension: dict[str, Any] = field(default_factory=dict)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(slots=True)
class DiscoveredDataset:
    """Dataset record in which only the identifying fields are mandatory.

    Everything after ``updated`` is optional and defaults to ``None`` (or an
    empty dict for ``extension``). ``time_unit`` is validated on construction.
    """

    # --- required identification ---
    task_id: uuid.UUID
    provider_code: str
    dataset_code: str
    language: str
    updated: datetime

    # --- optional descriptive metadata ---
    label: str | None = None
    source: str | None = None
    note: list[str] | None = None
    role: dict[str, list[str]] | None = None
    description: str | None = None
    time_unit: str | None = None  # must be in ``_VALID_TIME_UNITS`` when set
    first_period: str | None = None
    last_period: str | None = None
    discontinued: bool | None = None
    # Each path is a list of ``{"id": ..., "label": ...}``-style dicts
    # (presumably a subject hierarchy -- confirm against the parsers).
    paths: list[list[dict[str, str]]] | None = None
    subject_code: str | None = None
    subject_label: str | None = None

    # --- optional links ---
    metadata_url: str | None = None
    data_url: str | None = None
    web_url: str | None = None
    doc_url: str | None = None

    # --- optional dimension information ---
    dimension: dict[str, dict[str, Any]] | None = None
    required_dimensions: dict[str, bool | None] | None = None
    official_statistics: bool | None = None
    contact: dict[str, Any] | None = None
    dimension_ids: list[str] | None = None
    # Free-form provider-specific data; a fresh dict per instance.
    extension: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Raises ValueError when ``time_unit`` is set to an unknown label.
        _validate_time_unit(self.time_unit)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(slots=True)
class ResolvedDatasetMetadata:
    """Dataset record whose core metadata fields are all mandatory.

    Unlike ``DiscoveredDataset``, the label, time unit, period range, paths,
    roles, metadata/data URLs and dimension information have no defaults and
    must be supplied. ``time_unit`` is validated on construction.
    """

    # --- required identification ---
    task_id: uuid.UUID
    provider_code: str
    dataset_code: str
    language: str
    updated: datetime

    # --- required metadata ---
    label: str
    time_unit: str  # must be in ``_VALID_TIME_UNITS``
    first_period: str
    last_period: str
    paths: list[list[dict[str, str]]]
    role: dict[str, list[str]]
    metadata_url: str
    data_url: str
    dimension: dict[str, dict[str, Any]]
    required_dimensions: dict[str, bool | None]

    # --- optional metadata ---
    note: list[str] | None = None
    source: str | None = None
    description: str | None = None
    discontinued: bool | None = None
    subject_code: str | None = None
    subject_label: str | None = None
    web_url: str | None = None
    doc_url: str | None = None
    official_statistics: bool | None = None
    contact: dict[str, Any] | None = None
    dimension_ids: list[str] | None = None
    # Free-form provider-specific data; a fresh dict per instance.
    extension: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # NOTE(review): the validator lets ``None`` through even though the
        # annotation above is plain ``str``; dataclasses do not enforce
        # annotations at runtime.
        _validate_time_unit(self.time_unit)
|
statwrapper/parsers.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from .utils import normalize_contact, normalize_note, parse_dt
|
|
6
|
+
|
|
7
|
+
# Time-unit labels passed through unchanged by ``parse_pxweb2_discovery_table``;
# any other value is reported as "Other".
_VALID_TIME_UNITS = {"Annual", "Quarterly", "Monthly", "Weekly", "Other"}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _first_nonblank(item: dict[str, Any], keys: tuple[str, ...]) -> str | None:
    """Return the first truthy value under ``keys`` that is non-blank once stripped."""
    for key in keys:
        value = item.get(key)
        if not value:
            continue
        text = str(value).strip()
        if text:
            return text
    return None


def normalize_paths(raw_paths: Any) -> list[list[dict[str, str]]] | None:
    """Coerce the accepted ``paths`` shapes into a list of id/label paths.

    Accepted inputs:
      * ``None`` -> ``None``
      * ``{"paths": [...]}`` -> the inner list is used
      * ``{"path": [...]}`` -> treated as a single path
      * a list of paths, each either a list of items or ``{"path": [...]}``
      * any other value -> treated as a single path with one item

    Every item becomes ``{"id": ..., "label": ...}``. For dict items the id is
    taken from ``id``/``code``/``key``/``value`` (falling back to the item's
    position) and the label from ``label``/``title``/``name`` (falling back to
    the id). Whitespace-only candidates are ignored, so a blank label can no
    longer produce an empty string and a blank id no longer drops the item.
    Non-dict items are stringified; ``None`` and blank items are skipped.

    Returns:
        The normalized paths, or ``None`` when nothing usable remains.
    """
    if raw_paths is None:
        return None

    if isinstance(raw_paths, dict) and isinstance(raw_paths.get("paths"), list):
        raw_paths = raw_paths["paths"]
    elif isinstance(raw_paths, dict) and isinstance(raw_paths.get("path"), list):
        raw_paths = [raw_paths]
    elif not isinstance(raw_paths, list):
        raw_paths = [[raw_paths]]

    output: list[list[dict[str, str]]] = []
    for raw_path in raw_paths:
        items = raw_path.get("path") if isinstance(raw_path, dict) else raw_path
        if not isinstance(items, list):
            continue
        normalized_path: list[dict[str, str]] = []
        for index, item in enumerate(items):
            if isinstance(item, dict):
                # The positional index guarantees a non-empty id for dict items.
                item_id = _first_nonblank(
                    item, ("id", "code", "key", "value")
                ) or str(index)
                label = _first_nonblank(item, ("label", "title", "name")) or item_id
            else:
                # ``str(None)`` would otherwise yield the bogus id "None".
                if item is None:
                    continue
                item_id = str(item).strip()
                if not item_id:
                    continue
                label = item_id
            normalized_path.append({"id": item_id, "label": label})
        if normalized_path:
            output.append(normalized_path)
    return output or None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize_role(raw_role: Any) -> dict[str, list[str]] | None:
    """Extract the ``time``/``geo``/``metric`` role lists from ``raw_role``.

    Only those three keys are read, and only when their value is a list.
    Entries are stringified and stripped; blanks and repeats are dropped while
    the first-seen order is kept.

    Returns:
        A mapping of role name to its cleaned entries, or ``None`` when
        ``raw_role`` is not a dict or no role ends up with any entry.
    """
    if not isinstance(raw_role, dict):
        return None

    roles: dict[str, list[str]] = {}
    for role_name in ("time", "geo", "metric"):
        entries = raw_role.get(role_name)
        if isinstance(entries, list):
            cleaned = (str(entry).strip() for entry in entries)
            # dict.fromkeys keeps insertion order, giving an ordered dedupe.
            unique = list(dict.fromkeys(text for text in cleaned if text))
            if unique:
                roles[role_name] = unique
    return roles if roles else None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def normalize_dimensions(
    raw_dimensions: Any,
    raw_dimension_ids: Any,
) -> tuple[
    dict[str, dict[str, Any]] | None,
    list[str] | None,
    dict[str, bool | None] | None,
    dict[str, dict[str, Any]] | None,
]:
    """Normalize a ``dimension`` mapping and its id list.

    Args:
        raw_dimensions: Mapping of dimension id to its raw definition. Anything
            that is not a dict makes the function return four ``None`` values.
        raw_dimension_ids: Ordered list of dimension ids. When it is not a
            list, the keys of ``raw_dimensions`` are used instead.

    Returns:
        A 4-tuple ``(dimensions, dimension_ids, required_dimensions,
        dimension_extensions)`` in which every empty member is replaced by
        ``None``:

        * ``dimensions`` -- per id: ``label``, ``category`` (``index`` and
          ``label`` maps) and, when present, ``note``.
        * ``dimension_ids`` -- the cleaned id list (may include ids that were
          skipped while building ``dimensions``).
        * ``required_dimensions`` -- per id: ``None`` when ``elimination`` is
          absent, otherwise the negation of its truthiness.
        * ``dimension_extensions`` -- per id: ``{"extension": {...}}`` holding
          every raw key other than label/category/note/elimination.
    """
    if not isinstance(raw_dimensions, dict):
        return None, None, None, None

    # Iterating the dict itself yields its keys, which is the fallback source.
    id_source = raw_dimension_ids if isinstance(raw_dimension_ids, list) else raw_dimensions
    stripped_ids = (str(entry).strip() for entry in id_source)
    dimension_ids = [entry for entry in stripped_ids if entry]

    handled_keys = {"label", "category", "note", "elimination"}
    dimensions: dict[str, dict[str, Any]] = {}
    required_dimensions: dict[str, bool | None] = {}
    dimension_extensions: dict[str, dict[str, Any]] = {}

    for dimension_id in dimension_ids:
        raw_dimension = raw_dimensions.get(dimension_id)
        if not isinstance(raw_dimension, dict):
            continue

        category = raw_dimension.get("category")
        if not isinstance(category, dict):
            category = {}
        raw_labels = category.get("label")
        has_labels = isinstance(raw_labels, dict)

        # Build code -> position, preferring an explicit index and falling back
        # to the order of the label mapping.
        raw_index = category.get("index")
        if isinstance(raw_index, dict):
            index = {str(code): int(position) for code, position in raw_index.items()}
        elif isinstance(raw_index, list):
            index = {str(code): position for position, code in enumerate(raw_index)}
        elif has_labels:
            index = {str(code): position for position, code in enumerate(raw_labels)}
        else:
            index = {}
        if not index:
            continue

        if has_labels:
            labels = {code: str(raw_labels.get(code, code)) for code in index}
        else:
            labels = {code: code for code in index}

        parsed_dimension: dict[str, Any] = {
            "label": str(raw_dimension.get("label") or dimension_id),
            "category": {"index": index, "label": labels},
        }
        note = normalize_note(raw_dimension.get("note"))
        if note:
            parsed_dimension["note"] = note
        dimensions[dimension_id] = parsed_dimension

        elimination = raw_dimension.get("elimination")
        if elimination is None:
            required_dimensions[dimension_id] = None
        else:
            required_dimensions[dimension_id] = not elimination

        extras = {
            name: value
            for name, value in raw_dimension.items()
            if name not in handled_keys
        }
        if extras:
            dimension_extensions[dimension_id] = {"extension": extras}

    return (
        dimensions or None,
        dimension_ids or None,
        required_dimensions or None,
        dimension_extensions or None,
    )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def parse_pxweb2_discovery_table(
    table: dict[str, Any],
    *,
    base_api_url: str,
    base_web_url: str | None,
    language: str,
) -> dict[str, Any] | None:
    """Convert one table entry of a PxWeb2 listing into a flat field dict.

    Args:
        table: Raw table entry; read via ``.get`` only.
        base_api_url: API root used to build the metadata and data URLs
            (trailing slashes are removed).
        base_web_url: Web root used to build ``web_url``; when falsy,
            ``web_url`` is ``None``.
        language: Language code placed into the data and web URLs.

    Returns:
        The parsed fields, or ``None`` when the entry has no usable ``id``.
    """
    dataset_code = str(table.get("id") or "").strip()
    if not dataset_code:
        return None

    def optional_text(key: str) -> str | None:
        # ``None``/missing stays ``None``; any other value is stringified and
        # stripped (so an empty string stays an empty string).
        value = table.get(key)
        return None if value is None else str(value).strip()

    def period_text(key: str) -> str | None:
        # Falsy or blank values collapse to ``None``.
        return str(table.get(key) or "").strip() or None

    raw_time_unit = table.get("timeUnit")
    if isinstance(raw_time_unit, str) and raw_time_unit in _VALID_TIME_UNITS:
        time_unit = raw_time_unit
    else:
        # Unknown or missing units are reported as "Other".
        time_unit = "Other"

    variable_names = table.get("variableNames")
    clean_api = base_api_url.rstrip("/")

    return {
        "dataset_code": dataset_code,
        "label": str(table.get("label") or table.get("title") or dataset_code),
        "updated": parse_dt(table.get("updated")),
        "time_unit": time_unit,
        "first_period": period_text("firstPeriod"),
        "last_period": period_text("lastPeriod"),
        "description": optional_text("description"),
        "source": optional_text("source"),
        "note": normalize_note(table.get("note")),
        "subject_code": optional_text("subjectCode"),
        "paths": normalize_paths(table.get("paths")),
        "discontinued": bool(table.get("discontinued", False)),
        "metadata_url": f"{clean_api}/tables/{dataset_code}/metadata",
        "data_url": f"{clean_api}/tables/{dataset_code}/data?lang={language}",
        "web_url": (
            f"{base_web_url.rstrip('/')}/{language}/table/{dataset_code}"
            if base_web_url
            else None
        ),
        "extension": {
            "links": table.get("links"),
            # Copy the list so the result does not alias the raw payload.
            "variable_names": (
                list(variable_names) if isinstance(variable_names, list) else []
            ),
        },
    }
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def parse_pxweb2_metadata_payload(
    payload: dict[str, Any],
    *,
    default_note: list[str] | None,
    default_subject_label: str | None,
    default_official_statistics: bool | None,
    default_contact: dict[str, Any] | None,
    default_extension: dict[str, Any],
) -> dict[str, Any]:
    """Parse a PxWeb2 metadata payload, filling gaps from the given defaults.

    The ``default_*`` arguments are used when the payload does not provide the
    corresponding value. ``default_extension`` is shallow-copied, so the
    caller's dict is not modified at its top level.

    Returns:
        A dict with the keys ``dimensions``, ``dimension_ids``,
        ``required_dimensions``, ``role``, ``note``, ``subject_label``,
        ``official_statistics``, ``contact`` and ``extension``.
    """
    dimensions, dimension_ids, required_dimensions, dimension_extensions = (
        normalize_dimensions(payload.get("dimension"), payload.get("id"))
    )

    extension = dict(default_extension)
    note = normalize_note(payload.get("note")) or default_note
    role = normalize_role(payload.get("role"))
    subject_label = default_subject_label
    official_statistics = default_official_statistics
    contact = default_contact

    raw_extension = payload.get("extension")
    if isinstance(raw_extension, dict):
        # Payload keys win over the defaults.
        extension.update(raw_extension)

        px = raw_extension.get("px")
        if isinstance(px, dict):
            # The subject area only fills a missing default...
            subject_area = px.get("subject-area")
            if subject_label is None and subject_area is not None:
                subject_label = str(subject_area).strip() or None
            # ...whereas the official-statistics flag always overrides it.
            official_flag = px.get("official-statistics")
            if official_flag is not None:
                official_statistics = bool(official_flag)

        normalized_contact = normalize_contact(raw_extension.get("contact"))
        if normalized_contact is not None:
            contact = normalized_contact

    if dimension_extensions is not None:
        existing = extension.get("dimension_extensions")
        base = existing if isinstance(existing, dict) else {}
        # Build a new dict so an inherited mapping is not mutated in place;
        # freshly parsed entries override existing ones with the same id.
        extension["dimension_extensions"] = {**base, **dimension_extensions}

    return {
        "dimensions": dimensions,
        "dimension_ids": dimension_ids,
        "required_dimensions": required_dimensions,
        "role": role,
        "note": note,
        "subject_label": subject_label,
        "official_statistics": official_statistics,
        "contact": contact,
        "extension": extension,
    }
|
|
statwrapper/provider_registry.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from importlib.resources import files
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .api_clients import DstClient, EurostatClient, PxWeb2Client, PxWebClient
|
|
9
|
+
from .exceptions import (
|
|
10
|
+
ProviderNotFoundError,
|
|
11
|
+
UnsupportedAPITypeError,
|
|
12
|
+
UnsupportedLanguageError,
|
|
13
|
+
)
|
|
14
|
+
from .models import Provider
|
|
15
|
+
|
|
16
|
+
# Maps a lower-case ``api_type`` to the client class that handles it;
# consulted by ``get_wrapper_class``.
WRAPPER_REGISTRY = {
    "dst": DstClient,
    "eurostat": EurostatClient,
    "pxweb": PxWebClient,
    "pxweb2": PxWeb2Client,
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _default_provider_path() -> Path:
    """Locate the provider definitions file used when no path is given.

    A ``PROVIDERS.json`` file one directory above this package is preferred
    when it exists; otherwise the ``providers.json`` resource of the
    ``statwrapper`` package is used.
    """
    package_dir = Path(__file__).resolve().parent
    candidate = package_dir.parent / "PROVIDERS.json"
    if not candidate.exists():
        return Path(files("statwrapper").joinpath("providers.json"))
    return candidate
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_providers(path: str | Path | None = None) -> list[Provider]:
    """Read provider definitions from a UTF-8 JSON file.

    Args:
        path: File to read. When ``None``, the location returned by
            ``_default_provider_path`` is used.

    Returns:
        One ``Provider`` per entry of the JSON document; each entry is passed
        to ``Provider`` as keyword arguments.
    """
    if path is None:
        source = _default_provider_path()
    else:
        source = Path(path)

    raw_entries = json.loads(source.read_text(encoding="utf-8"))
    loaded = []
    for entry in raw_entries:
        loaded.append(Provider(**entry))
    return loaded
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_provider(
    provider_code: str,
    providers: list[Provider] | None = None,
) -> Provider:
    """Return the provider whose ``provider_code`` equals ``provider_code``.

    Args:
        provider_code: Code to look up (exact match).
        providers: Candidates to search. Only ``None`` triggers loading the
            default registry via ``load_providers``; an explicitly passed
            empty list is searched as-is, so the lookup fails instead of
            silently falling back to the defaults.

    Raises:
        ProviderNotFoundError: If no candidate matches.
    """
    # ``providers or load_providers()`` would treat ``[]`` like ``None``.
    candidates = load_providers() if providers is None else providers
    for provider in candidates:
        if provider.provider_code == provider_code:
            return provider
    raise ProviderNotFoundError(provider_code)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_wrapper_class(api_type: str) -> type:
    """Look up the client class registered for ``api_type``.

    The lookup lower-cases ``api_type`` before consulting
    ``WRAPPER_REGISTRY``.

    Raises:
        UnsupportedAPITypeError: If nothing is registered for ``api_type``.
    """
    wrapper_cls = WRAPPER_REGISTRY.get(api_type.lower())
    if wrapper_cls is not None:
        return wrapper_cls
    raise UnsupportedAPITypeError(api_type)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def create_wrapper(
    provider: Provider,
    *,
    language: str,
    json_request_handler: Any,
    text_request_handler: Any = None,
    bytes_request_handler: Any = None,
    logger: Any = None,
):
    """Instantiate the API client matching ``provider.api_type``.

    The language is checked against ``provider.languages`` before the client
    class is resolved. All keyword arguments are forwarded unchanged to the
    client constructor.

    Raises:
        UnsupportedLanguageError: If ``language`` is not listed for the
            provider.
        UnsupportedAPITypeError: If no client is registered for the
            provider's ``api_type`` (raised by ``get_wrapper_class``).
    """
    if language not in provider.languages:
        raise UnsupportedLanguageError(provider.provider_code, language)

    client_cls = get_wrapper_class(provider.api_type)
    client_options = {
        "language": language,
        "json_request_handler": json_request_handler,
        "text_request_handler": text_request_handler,
        "bytes_request_handler": bytes_request_handler,
        "logger": logger,
    }
    return client_cls(provider, **client_options)
|