statwrapper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.4
2
+ Name: statwrapper
3
+ Version: 0.1.0
4
+ Summary: A completely dependency-free unified wrapper around common statistical api types
5
+ Author-email: Nordic Intel <info@nordicintel.com>
6
+ License-Expression: Apache-2.0
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ Provides-Extra: dev
15
+ Requires-Dist: pytest>=8.3.5; extra == "dev"
16
+ Requires-Dist: pytest-asyncio>=0.26.0; extra == "dev"
17
+ Requires-Dist: build>=1.2.2; extra == "dev"
18
+ Requires-Dist: ruff>=0.11.0; extra == "dev"
19
+
20
+ # statwrapper
21
+
22
+ `statwrapper` is a dependency-free Python library that provides a unified interface for common statistical API families. It currently ships wrapper support for:
23
+
24
+ - `pxweb`
25
+ - `pxweb2`
26
+ - `dst`
27
+ - `eurostat`
28
+
29
+ The package standardizes three operations across providers:
30
+
31
+ - health checks
32
+ - dataset discovery
33
+ - dataset metadata resolution
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install statwrapper
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ import asyncio
45
+
46
+ from statwrapper import StatWrapper
47
+
48
+
49
+ async def main() -> None:
50
+ wrapper = StatWrapper()
51
+
52
+ discovered = await wrapper.discover_datasets("scb", "en")
53
+ first = discovered[0]
54
+
55
+ metadata = await wrapper.resolve_dataset_metadata(first)
56
+ print(metadata.label)
57
+ print(metadata.dimension_ids)
58
+
59
+
60
+ asyncio.run(main())
61
+ ```
62
+
63
+ ## Public API
64
+
65
+ ### `StatWrapper`
66
+
67
+ ```python
68
+ from statwrapper import StatWrapper
69
+
70
+ wrapper = StatWrapper()
71
+ ```
72
+
73
+ Methods:
74
+
75
+ - `await wrapper.discover_datasets(provider_code, language, task_id=None)`
76
+ - `await wrapper.resolve_dataset_metadata(discovered, task_id=None)`
77
+ - `wrapper.get_provider(provider_code)`
78
+ - `wrapper.get_wrapper(provider_code, language)`
79
+
80
+ ### Convenience Functions
81
+
82
+ ```python
83
+ from statwrapper import discover_provider_datasets, get_datasets
84
+ ```
85
+
86
+ - `await discover_provider_datasets(provider_code, language)`
87
+ - `get_datasets(provider_code, language)`
88
+
89
+ ## Request Layer
90
+
91
+ The default request layer is a stdlib-backed async helper:
92
+
93
+ ```python
94
+ from statwrapper import RateLimitedSession
95
+ ```
96
+
97
+ `StatWrapper` uses `RateLimitedSession` automatically, but you can inject your own session object if it exposes:
98
+
99
+ - `async def get_json(url, **kwargs)`
100
+ - `async def get_text(url, **kwargs)`
101
+ - `async def get_bytes(url, **kwargs)`
102
+
103
+ ## Provider Registry
104
+
105
+ Provider metadata is loaded from `PROVIDERS.json`. During development, the package reads the repository file. In built distributions, the same data is bundled inside the package as `statwrapper/providers.json`.
106
+
107
+ ## Development
108
+
109
+ Run tests:
110
+
111
+ ```bash
112
+ python -m pytest tests/unit tests/integration -q
113
+ ```
114
+
115
+ Build locally:
116
+
117
+ ```bash
118
+ python -m build
119
+ ```
120
+
121
+ ## Publishing
122
+
123
+ PyPI publishing is handled by GitHub Actions through `.github/workflows/pypi_publish.yml` and the repository secret `PYPI_API_TOKEN`.
@@ -0,0 +1,104 @@
1
+ # statwrapper
2
+
3
+ `statwrapper` is a dependency-free Python library that provides a unified interface for common statistical API families. It currently ships wrapper support for:
4
+
5
+ - `pxweb`
6
+ - `pxweb2`
7
+ - `dst`
8
+ - `eurostat`
9
+
10
+ The package standardizes three operations across providers:
11
+
12
+ - health checks
13
+ - dataset discovery
14
+ - dataset metadata resolution
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install statwrapper
20
+ ```
21
+
22
+ ## Quick Start
23
+
24
+ ```python
25
+ import asyncio
26
+
27
+ from statwrapper import StatWrapper
28
+
29
+
30
+ async def main() -> None:
31
+ wrapper = StatWrapper()
32
+
33
+ discovered = await wrapper.discover_datasets("scb", "en")
34
+ first = discovered[0]
35
+
36
+ metadata = await wrapper.resolve_dataset_metadata(first)
37
+ print(metadata.label)
38
+ print(metadata.dimension_ids)
39
+
40
+
41
+ asyncio.run(main())
42
+ ```
43
+
44
+ ## Public API
45
+
46
+ ### `StatWrapper`
47
+
48
+ ```python
49
+ from statwrapper import StatWrapper
50
+
51
+ wrapper = StatWrapper()
52
+ ```
53
+
54
+ Methods:
55
+
56
+ - `await wrapper.discover_datasets(provider_code, language, task_id=None)`
57
+ - `await wrapper.resolve_dataset_metadata(discovered, task_id=None)`
58
+ - `wrapper.get_provider(provider_code)`
59
+ - `wrapper.get_wrapper(provider_code, language)`
60
+
61
+ ### Convenience Functions
62
+
63
+ ```python
64
+ from statwrapper import discover_provider_datasets, get_datasets
65
+ ```
66
+
67
+ - `await discover_provider_datasets(provider_code, language)`
68
+ - `get_datasets(provider_code, language)`
69
+
70
+ ## Request Layer
71
+
72
+ The default request layer is a stdlib-backed async helper:
73
+
74
+ ```python
75
+ from statwrapper import RateLimitedSession
76
+ ```
77
+
78
+ `StatWrapper` uses `RateLimitedSession` automatically, but you can inject your own session object if it exposes:
79
+
80
+ - `async def get_json(url, **kwargs)`
81
+ - `async def get_text(url, **kwargs)`
82
+ - `async def get_bytes(url, **kwargs)`
83
+
84
+ ## Provider Registry
85
+
86
+ Provider metadata is loaded from `PROVIDERS.json`. During development, the package reads the repository file. In built distributions, the same data is bundled inside the package as `statwrapper/providers.json`.
87
+
88
+ ## Development
89
+
90
+ Run tests:
91
+
92
+ ```bash
93
+ python -m pytest tests/unit tests/integration -q
94
+ ```
95
+
96
+ Build locally:
97
+
98
+ ```bash
99
+ python -m build
100
+ ```
101
+
102
+ ## Publishing
103
+
104
+ PyPI publishing is handled by GitHub Actions through `.github/workflows/pypi_publish.yml` and the repository secret `PYPI_API_TOKEN`.
@@ -0,0 +1,74 @@
1
+ [project]
2
+ name = "statwrapper"
3
+ version = "0.1.0"
4
+ description = "A completely dependency-free unified wrapper around common statistical api types"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = "Apache-2.0"
8
+ authors = [
9
+ { name = "Nordic Intel", email = "info@nordicintel.com" },
10
+ ]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ ]
18
+
19
+ [project.optional-dependencies]
20
+ dev = [
21
+ "pytest>=8.3.5",
22
+ "pytest-asyncio>=0.26.0",
23
+ "build>=1.2.2",
24
+ "ruff>=0.11.0",
25
+ ]
26
+
27
+ [build-system]
28
+ requires = ["setuptools>=61.0"]
29
+ build-backend = "setuptools.build_meta"
30
+
31
+ [tool.setuptools]
32
+ include-package-data = true
33
+
34
+ [tool.setuptools.packages.find]
35
+ include = ["statwrapper*"]
36
+
37
+ [tool.setuptools.package-data]
38
+ statwrapper = ["providers.json"]
39
+
40
+ [tool.ruff]
41
+ target-version = "py310"
42
+
43
+ [tool.ruff.lint]
44
+ select = [
45
+ "E", # pycodestyle errors
46
+ "W", # pycodestyle warnings
47
+ "F", # pyflakes
48
+ "I", # isort
49
+ "B", # flake8-bugbear
50
+ "C4", # flake8-comprehensions
51
+ "UP", # pyupgrade
52
+ "ARG001", # unused arguments in functions
53
+ "T201", # print statements are not allowed
54
+ ]
55
+ ignore = [
56
+     "E501", # line too long, handled by the formatter
57
+ "B008", # do not perform function calls in argument defaults
58
+ "W191", # indentation contains tabs
59
+ "B904", # Allow raising exceptions without from e, for HTTPException
60
+ ]
61
+
62
+ [tool.ruff.lint.pyupgrade]
63
+ # Preserve types, even if a file imports `from __future__ import annotations`.
64
+ keep-runtime-typing = true
65
+
66
+
67
+ [tool.pytest.ini_options]
68
+ asyncio_mode = "auto"
69
+ asyncio_default_fixture_loop_scope = "session"
70
+ asyncio_default_test_loop_scope = "session"
71
+
72
+ # live wrapper tests are excluded from the default run; invoke them explicitly:
73
+ # pytest tests/live/ -v
74
+ testpaths = ["tests/unit", "tests/integration"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,31 @@
1
"""Public package interface for ``statwrapper``.

Re-exports the wrapper base class, the default rate-limited HTTP session,
the data models, the provider-registry helpers, and the high-level
``StatWrapper`` facade so callers can import everything from the package
root, as shown in the README quick-start.
"""

from .base_api_client import APIWrapper
from .http import RateLimitedSession
from .models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
from .provider_registry import (
    create_wrapper,
    get_provider,
    get_wrapper_class,
    load_providers,
)
from .statwrapper import (
    StatWrapper,
    discover_provider_datasets,
    get_datasets,
    resolve_dataset_metadata,
)

# Explicit public API; ASCII-sorted (classes first, then functions).
__all__ = [
    "APIWrapper",
    "DiscoveredDataset",
    "Provider",
    "RateLimitedSession",
    "ResolvedDatasetMetadata",
    "StatWrapper",
    "create_wrapper",
    "discover_provider_datasets",
    "get_datasets",
    "get_provider",
    "get_wrapper_class",
    "load_providers",
    "resolve_dataset_metadata",
]
@@ -0,0 +1,11 @@
1
"""Provider client implementations exposed as a flat namespace.

One client class per supported API family: DST, Eurostat, PxWeb and
PxWeb2.
"""

from .dst_client import DstClient
from .eurostat_client import EurostatClient
from .pxweb2_client import PxWeb2Client
from .pxweb_client import PxWebClient

# Explicit public API of the clients subpackage.
__all__ = [
    "DstClient",
    "EurostatClient",
    "PxWebClient",
    "PxWeb2Client",
]
@@ -0,0 +1,249 @@
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+ from typing import Any
5
+
6
+ from ..base_api_client import APIWrapper
7
+ from ..models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
8
+ from ..utils import (
9
+ detect_role,
10
+ determine_time_unit,
11
+ normalize_contact,
12
+ normalize_note,
13
+ parse_dt,
14
+ )
15
+
16
+
17
+ def _collect_tables(subjects_tree: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
18
+ tables_by_id: dict[str, dict[str, Any]] = {}
19
+
20
+ def visit(node: dict[str, Any]) -> None:
21
+ for table in node.get("tables") or []:
22
+ if not isinstance(table, dict):
23
+ continue
24
+ table_id = str(table.get("id") or "").strip()
25
+ if not table_id:
26
+ continue
27
+ existing = tables_by_id.setdefault(table_id, {"id": table_id})
28
+ for key, value in table.items():
29
+ if existing.get(key) in (None, "", []) and value not in (None, "", []):
30
+ existing[key] = value
31
+ for child in node.get("subjects") or []:
32
+ if isinstance(child, dict):
33
+ visit(child)
34
+
35
+ for root in subjects_tree:
36
+ if isinstance(root, dict):
37
+ visit(root)
38
+ return tables_by_id
39
+
40
+
41
+ def _parse_paths(subjects_tree: list[dict[str, Any]]) -> dict[str, list[list[dict[str, str]]]]:
42
+ code_to_paths: dict[str, list[list[dict[str, str]]]] = {}
43
+
44
+ def visit(node: dict[str, Any], path: list[dict[str, str]]) -> None:
45
+ node_id = str(node.get("id") or "").strip()
46
+ node_label = str(node.get("description") or node.get("text") or node_id).strip()
47
+ current_path = path + ([{"id": node_id, "label": node_label}] if node_id else [])
48
+ for table in node.get("tables") or []:
49
+ if not isinstance(table, dict):
50
+ continue
51
+ table_id = str(table.get("id") or "").strip()
52
+ if not table_id:
53
+ continue
54
+ code_to_paths.setdefault(table_id, []).append(
55
+ current_path + [{"id": table_id, "label": str(table.get("text") or table_id).strip()}]
56
+ )
57
+ for child in node.get("subjects") or []:
58
+ if isinstance(child, dict):
59
+ visit(child, current_path)
60
+
61
+ for root in subjects_tree:
62
+ if isinstance(root, dict):
63
+ visit(root, [])
64
+ return code_to_paths
65
+
66
+
67
class DstClient(APIWrapper):
    """Wrapper around the Statistics Denmark (DST) StatBank API.

    Implements the package's three standard operations — health check,
    dataset discovery, and dataset metadata resolution — on top of DST's
    ``/subjects``, ``/tableinfo/<code>`` and ``/data/<code>/JSONSTAT``
    endpoints.
    """

    def __init__(self, provider: Provider, **kwargs: Any) -> None:
        """Build a client for *provider*.

        ``language`` and ``json_request_handler`` are required keyword
        arguments (``kwargs.pop`` without a default raises ``KeyError`` if
        missing); the text/bytes handlers and the logger are optional.
        """
        super().__init__(
            provider_code=provider.provider_code,
            label=provider.label,
            language=kwargs.pop("language"),
            json_request_handler=kwargs.pop("json_request_handler"),
            text_request_handler=kwargs.pop("text_request_handler", None),
            bytes_request_handler=kwargs.pop("bytes_request_handler", None),
            logger=kwargs.pop("logger", None),
        )
        self.provider = provider
        self.api_type = provider.api_type
        # Fall back to "" so the URL helpers below never call .rstrip on None.
        self.base_api_url = provider.base_api_url or ""
        self.base_web_url = provider.base_web_url

    def _subjects_url(self) -> str:
        """URL of the subjects (topic tree) endpoint."""
        return f"{self.base_api_url.rstrip('/')}/subjects"

    def _tableinfo_url(self, dataset_code: str) -> str:
        """URL of the table-metadata endpoint for *dataset_code*."""
        return f"{self.base_api_url.rstrip('/')}/tableinfo/{dataset_code}"

    def _data_url(self, dataset_code: str) -> str:
        """URL of the JSON-stat data endpoint for *dataset_code*."""
        return f"{self.base_api_url.rstrip('/')}/data/{dataset_code}/JSONSTAT"

    async def health_check(self, dataset_code: str | None = None) -> bool:
        """Return True when the API answers with any non-None JSON payload.

        With no *dataset_code* the subjects endpoint is probed; otherwise
        that dataset's tableinfo endpoint is used.
        """
        url = self._subjects_url() if dataset_code is None else self._tableinfo_url(dataset_code)
        payload = await self._get_json(url, params={"lang": self.language})
        return payload is not None

    async def discover_datasets(
        self,
        task_id: uuid.UUID,
        **_: Any,
    ) -> list[DiscoveredDataset]:
        """Fetch the full subjects tree and return one entry per table.

        Tables are deduplicated by id via ``_collect_tables`` and annotated
        with their subject paths from ``_parse_paths``; output is sorted by
        dataset code.  Returns ``[]`` when the API response is not a list.
        """
        payload = await self._get_json(
            self._subjects_url(),
            params={
                "lang": self.language,
                "recursive": "true",
                "omitSubjectsWithoutTables": "true",
                "includeTables": "true",
                "format": "JSON",
            },
        )
        if not isinstance(payload, list):
            return []
        tables_by_id = _collect_tables(payload)
        paths_by_id = _parse_paths(payload)
        output: list[DiscoveredDataset] = []
        for dataset_code, table in sorted(tables_by_id.items()):
            paths = paths_by_id.get(dataset_code, [])
            primary_path = paths[0] if paths else []
            # First element of the first path is the root subject.
            subject = primary_path[0] if primary_path else None
            first_period = str(table.get("firstPeriod") or "").strip() or None
            last_period = str(table.get("latestPeriod") or table.get("lastPeriod") or "").strip() or None
            # Missing/unparsable "updated" falls back to the Unix epoch.
            updated = parse_dt(table.get("updated")) or parse_dt("1970-01-01T00:00:00+00:00")
            if updated is None:
                # Defensive: only reachable if parse_dt cannot parse the
                # epoch literal above — presumably never; TODO confirm.
                continue
            output.append(
                DiscoveredDataset(
                    task_id=task_id,
                    provider_code=self.provider_code,
                    dataset_code=dataset_code,
                    language=self.language,
                    updated=updated,
                    label=str(table.get("text") or dataset_code).strip(),
                    description=str(table.get("description")).strip() if table.get("description") is not None else None,
                    source=f"Statistics Denmark - statbank.dk/{dataset_code}",
                    subject_code=subject["id"] if subject else None,
                    subject_label=subject["label"] if subject else None,
                    time_unit=determine_time_unit(first_period, last_period) if first_period and last_period else None,
                    first_period=first_period,
                    last_period=last_period,
                    # Only trust "active" when it is an actual bool.
                    discontinued=(not bool(table.get("active")) if isinstance(table.get("active"), bool) else False),
                    metadata_url=f"{self._tableinfo_url(dataset_code)}?lang={self.language}",
                    data_url=f"{self._data_url(dataset_code)}?lang={self.language}",
                    web_url=f"{self.base_web_url.rstrip('/')}/{dataset_code}" if self.base_web_url else None,
                    paths=paths or None,
                    extension={"unit": table.get("unit"), "variable_names": table.get("variables")},
                )
            )
        return output

    async def resolve_dataset_metadata(
        self,
        discovered: DiscoveredDataset,
        task_id: uuid.UUID | None = None,
        **_: Any,
    ) -> ResolvedDatasetMetadata:
        """Resolve full metadata for a previously discovered dataset.

        Calls the tableinfo endpoint and merges its response over the
        values already present on *discovered*; every field falls back to
        the discovered value when the payload is missing or malformed.
        """
        payload = await self._get_json(
            self._tableinfo_url(discovered.dataset_code),
            params={"lang": self.language},
        )
        dimensions: dict[str, dict[str, Any]] = {}
        dimension_ids: list[str] = []
        required_dimensions: dict[str, bool | None] = {}
        role_map: dict[str, list[str]] = {}
        extension = dict(discovered.extension)
        if isinstance(payload, dict):
            for position, variable in enumerate(payload.get("variables") or []):
                if not isinstance(variable, dict):
                    continue
                dimension_id = str(variable.get("id") or "").strip()
                if not dimension_id:
                    continue
                values = variable.get("values") if isinstance(variable.get("values"), list) else []
                # JSON-stat style category maps: value id -> ordinal / label.
                index: dict[str, int] = {}
                labels: dict[str, str] = {}
                for ordinal, item in enumerate(values):
                    if not isinstance(item, dict):
                        continue
                    value_id = str(item.get("id") or "").strip()
                    if not value_id:
                        continue
                    index[value_id] = ordinal
                    labels[value_id] = str(item.get("text") or value_id).strip()
                if not index:
                    # A dimension without any usable values is dropped entirely.
                    continue
                label = str(variable.get("text") or dimension_id).strip()
                dimensions[dimension_id] = {
                    "label": label,
                    "category": {"index": index, "label": labels},
                }
                dimension_ids.append(dimension_id)
                # elimination=True means the dimension may be omitted, so
                # required is its negation; None when the flag is absent.
                elimination = variable.get("elimination")
                required_dimensions[dimension_id] = None if elimination is None else not bool(elimination)
                # Variables carrying a "map" reference are treated as
                # geographic; otherwise the role is heuristically detected.
                role = "geo" if variable.get("map") else detect_role(dimension_id, label, bool(variable.get("time")))
                if role:
                    role_map.setdefault(role, []).append(dimension_id)
                extension.setdefault("dimension_extensions", {})[dimension_id] = {
                    "extension": {"position": position}
                }
        label = (
            str(payload.get("text")).strip()
            if isinstance(payload, dict) and payload.get("text") is not None
            else (discovered.label or discovered.dataset_code)
        )
        description = (
            str(payload.get("description")).strip()
            if isinstance(payload, dict) and payload.get("description") is not None
            else discovered.description
        )
        updated = (
            parse_dt(payload.get("updated"))
            if isinstance(payload, dict)
            else None
        ) or discovered.updated
        return ResolvedDatasetMetadata(
            task_id=task_id or discovered.task_id,
            provider_code=discovered.provider_code,
            dataset_code=discovered.dataset_code,
            language=discovered.language,
            updated=updated,
            label=label,
            time_unit=discovered.time_unit or "Other",
            first_period=discovered.first_period or "",
            last_period=discovered.last_period or discovered.first_period or "",
            paths=discovered.paths or [],
            role=role_map,
            metadata_url=discovered.metadata_url or f"{self._tableinfo_url(discovered.dataset_code)}?lang={self.language}",
            data_url=discovered.data_url or f"{self._data_url(discovered.dataset_code)}?lang={self.language}",
            dimension=dimensions,
            required_dimensions=required_dimensions,
            note=(normalize_note(payload.get("footnote")) if isinstance(payload, dict) else None) or discovered.note,
            source=discovered.source,
            description=description,
            discontinued=not bool(payload.get("active")) if isinstance(payload, dict) and isinstance(payload.get("active"), bool) else discovered.discontinued,
            subject_code=discovered.subject_code,
            subject_label=discovered.subject_label,
            web_url=discovered.web_url,
            doc_url=(
                str(payload.get("documentation", {}).get("url")).strip()
                if isinstance(payload, dict)
                and isinstance(payload.get("documentation"), dict)
                and payload["documentation"].get("url") is not None
                else discovered.doc_url
            ),
            official_statistics=discovered.official_statistics,
            contact=(normalize_contact(payload.get("contacts")) if isinstance(payload, dict) else None) or discovered.contact,
            dimension_ids=dimension_ids or None,
            extension=extension,
        )