statwrapper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statwrapper-0.1.0/PKG-INFO +123 -0
- statwrapper-0.1.0/README.md +104 -0
- statwrapper-0.1.0/pyproject.toml +74 -0
- statwrapper-0.1.0/setup.cfg +4 -0
- statwrapper-0.1.0/statwrapper/__init__.py +31 -0
- statwrapper-0.1.0/statwrapper/api_clients/__init__.py +11 -0
- statwrapper-0.1.0/statwrapper/api_clients/dst_client.py +249 -0
- statwrapper-0.1.0/statwrapper/api_clients/eurostat_client.py +359 -0
- statwrapper-0.1.0/statwrapper/api_clients/pxweb2_client.py +170 -0
- statwrapper-0.1.0/statwrapper/api_clients/pxweb_client.py +244 -0
- statwrapper-0.1.0/statwrapper/base_api_client.py +79 -0
- statwrapper-0.1.0/statwrapper/exceptions.py +22 -0
- statwrapper-0.1.0/statwrapper/http.py +126 -0
- statwrapper-0.1.0/statwrapper/models.py +103 -0
- statwrapper-0.1.0/statwrapper/parsers.py +260 -0
- statwrapper-0.1.0/statwrapper/provider_registry.py +74 -0
- statwrapper-0.1.0/statwrapper/providers.json +662 -0
- statwrapper-0.1.0/statwrapper/statwrapper.py +103 -0
- statwrapper-0.1.0/statwrapper/utils.py +134 -0
- statwrapper-0.1.0/statwrapper.egg-info/PKG-INFO +123 -0
- statwrapper-0.1.0/statwrapper.egg-info/SOURCES.txt +22 -0
- statwrapper-0.1.0/statwrapper.egg-info/dependency_links.txt +1 -0
- statwrapper-0.1.0/statwrapper.egg-info/requires.txt +6 -0
- statwrapper-0.1.0/statwrapper.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: statwrapper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A completely dependency-free unified wrapper around common statistical api types
|
|
5
|
+
Author-email: Nordic Intel <info@nordicintel.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-asyncio>=0.26.0; extra == "dev"
|
|
17
|
+
Requires-Dist: build>=1.2.2; extra == "dev"
|
|
18
|
+
Requires-Dist: ruff>=0.11.0; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# statwrapper
|
|
21
|
+
|
|
22
|
+
`statwrapper` is a dependency-free Python library that provides a unified interface for common statistical API families. It currently ships wrapper support for:
|
|
23
|
+
|
|
24
|
+
- `pxweb`
|
|
25
|
+
- `pxweb2`
|
|
26
|
+
- `dst`
|
|
27
|
+
- `eurostat`
|
|
28
|
+
|
|
29
|
+
The package standardizes three operations across providers:
|
|
30
|
+
|
|
31
|
+
- health checks
|
|
32
|
+
- dataset discovery
|
|
33
|
+
- dataset metadata resolution
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install statwrapper
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import asyncio
|
|
45
|
+
|
|
46
|
+
from statwrapper import StatWrapper
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
async def main() -> None:
|
|
50
|
+
wrapper = StatWrapper()
|
|
51
|
+
|
|
52
|
+
discovered = await wrapper.discover_datasets("scb", "en")
|
|
53
|
+
first = discovered[0]
|
|
54
|
+
|
|
55
|
+
metadata = await wrapper.resolve_dataset_metadata(first)
|
|
56
|
+
print(metadata.label)
|
|
57
|
+
print(metadata.dimension_ids)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
asyncio.run(main())
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Public API
|
|
64
|
+
|
|
65
|
+
### `StatWrapper`
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from statwrapper import StatWrapper
|
|
69
|
+
|
|
70
|
+
wrapper = StatWrapper()
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Methods:
|
|
74
|
+
|
|
75
|
+
- `await wrapper.discover_datasets(provider_code, language, task_id=None)`
|
|
76
|
+
- `await wrapper.resolve_dataset_metadata(discovered, task_id=None)`
|
|
77
|
+
- `wrapper.get_provider(provider_code)`
|
|
78
|
+
- `wrapper.get_wrapper(provider_code, language)`
|
|
79
|
+
|
|
80
|
+
### Convenience Functions
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from statwrapper import discover_provider_datasets, get_datasets
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
- `await discover_provider_datasets(provider_code, language)`
|
|
87
|
+
- `get_datasets(provider_code, language)`
|
|
88
|
+
|
|
89
|
+
## Request Layer
|
|
90
|
+
|
|
91
|
+
The default request layer is a stdlib-backed async helper:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from statwrapper import RateLimitedSession
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
`StatWrapper` uses `RateLimitedSession` automatically, but you can inject your own session object if it exposes:
|
|
98
|
+
|
|
99
|
+
- `async def get_json(url, **kwargs)`
|
|
100
|
+
- `async def get_text(url, **kwargs)`
|
|
101
|
+
- `async def get_bytes(url, **kwargs)`
|
|
102
|
+
|
|
103
|
+
## Provider Registry
|
|
104
|
+
|
|
105
|
+
Provider metadata is loaded from `PROVIDERS.json`. During development, the package reads the repository file. In built distributions, the same data is bundled inside the package as `statwrapper/providers.json`.
|
|
106
|
+
|
|
107
|
+
## Development
|
|
108
|
+
|
|
109
|
+
Run tests:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
python -m pytest tests/unit tests/integration -q
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Build locally:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
python -m build
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Publishing
|
|
122
|
+
|
|
123
|
+
PyPI publishing is handled by GitHub Actions through `.github/workflows/pypi_publish.yml` and the repository secret `PYPI_API_TOKEN`.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# statwrapper
|
|
2
|
+
|
|
3
|
+
`statwrapper` is a dependency-free Python library that provides a unified interface for common statistical API families. It currently ships wrapper support for:
|
|
4
|
+
|
|
5
|
+
- `pxweb`
|
|
6
|
+
- `pxweb2`
|
|
7
|
+
- `dst`
|
|
8
|
+
- `eurostat`
|
|
9
|
+
|
|
10
|
+
The package standardizes three operations across providers:
|
|
11
|
+
|
|
12
|
+
- health checks
|
|
13
|
+
- dataset discovery
|
|
14
|
+
- dataset metadata resolution
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install statwrapper
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
import asyncio
|
|
26
|
+
|
|
27
|
+
from statwrapper import StatWrapper
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def main() -> None:
|
|
31
|
+
wrapper = StatWrapper()
|
|
32
|
+
|
|
33
|
+
discovered = await wrapper.discover_datasets("scb", "en")
|
|
34
|
+
first = discovered[0]
|
|
35
|
+
|
|
36
|
+
metadata = await wrapper.resolve_dataset_metadata(first)
|
|
37
|
+
print(metadata.label)
|
|
38
|
+
print(metadata.dimension_ids)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
asyncio.run(main())
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Public API
|
|
45
|
+
|
|
46
|
+
### `StatWrapper`
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from statwrapper import StatWrapper
|
|
50
|
+
|
|
51
|
+
wrapper = StatWrapper()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Methods:
|
|
55
|
+
|
|
56
|
+
- `await wrapper.discover_datasets(provider_code, language, task_id=None)`
|
|
57
|
+
- `await wrapper.resolve_dataset_metadata(discovered, task_id=None)`
|
|
58
|
+
- `wrapper.get_provider(provider_code)`
|
|
59
|
+
- `wrapper.get_wrapper(provider_code, language)`
|
|
60
|
+
|
|
61
|
+
### Convenience Functions
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from statwrapper import discover_provider_datasets, get_datasets
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
- `await discover_provider_datasets(provider_code, language)`
|
|
68
|
+
- `get_datasets(provider_code, language)`
|
|
69
|
+
|
|
70
|
+
## Request Layer
|
|
71
|
+
|
|
72
|
+
The default request layer is a stdlib-backed async helper:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from statwrapper import RateLimitedSession
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
`StatWrapper` uses `RateLimitedSession` automatically, but you can inject your own session object if it exposes:
|
|
79
|
+
|
|
80
|
+
- `async def get_json(url, **kwargs)`
|
|
81
|
+
- `async def get_text(url, **kwargs)`
|
|
82
|
+
- `async def get_bytes(url, **kwargs)`
|
|
83
|
+
|
|
84
|
+
## Provider Registry
|
|
85
|
+
|
|
86
|
+
Provider metadata is loaded from `PROVIDERS.json`. During development, the package reads the repository file. In built distributions, the same data is bundled inside the package as `statwrapper/providers.json`.
|
|
87
|
+
|
|
88
|
+
## Development
|
|
89
|
+
|
|
90
|
+
Run tests:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
python -m pytest tests/unit tests/integration -q
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Build locally:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
python -m build
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Publishing
|
|
103
|
+
|
|
104
|
+
PyPI publishing is handled by GitHub Actions through `.github/workflows/pypi_publish.yml` and the repository secret `PYPI_API_TOKEN`.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "statwrapper"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A completely dependency-free unified wrapper around common statistical api types"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Nordic Intel", email = "info@nordicintel.com" },
|
|
10
|
+
]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.optional-dependencies]
|
|
20
|
+
dev = [
|
|
21
|
+
"pytest>=8.3.5",
|
|
22
|
+
"pytest-asyncio>=0.26.0",
|
|
23
|
+
"build>=1.2.2",
|
|
24
|
+
"ruff>=0.11.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["setuptools>=61.0"]
|
|
29
|
+
build-backend = "setuptools.build_meta"
|
|
30
|
+
|
|
31
|
+
[tool.setuptools]
|
|
32
|
+
include-package-data = true
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.packages.find]
|
|
35
|
+
include = ["statwrapper*"]
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.package-data]
|
|
38
|
+
statwrapper = ["providers.json"]
|
|
39
|
+
|
|
40
|
+
[tool.ruff]
|
|
41
|
+
target-version = "py310"
|
|
42
|
+
|
|
43
|
+
[tool.ruff.lint]
|
|
44
|
+
select = [
|
|
45
|
+
"E", # pycodestyle errors
|
|
46
|
+
"W", # pycodestyle warnings
|
|
47
|
+
"F", # pyflakes
|
|
48
|
+
"I", # isort
|
|
49
|
+
"B", # flake8-bugbear
|
|
50
|
+
"C4", # flake8-comprehensions
|
|
51
|
+
"UP", # pyupgrade
|
|
52
|
+
"ARG001", # unused arguments in functions
|
|
53
|
+
"T201", # print statements are not allowed
|
|
54
|
+
]
|
|
55
|
+
ignore = [
|
|
56
|
+
"E501", # line too long, handled by black
|
|
57
|
+
"B008", # do not perform function calls in argument defaults
|
|
58
|
+
"W191", # indentation contains tabs
|
|
59
|
+
"B904", # Allow raising exceptions without from e, for HTTPException
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint.pyupgrade]
|
|
63
|
+
# Preserve types, even if a file imports `from __future__ import annotations`.
|
|
64
|
+
keep-runtime-typing = true
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
asyncio_mode = "auto"
|
|
69
|
+
asyncio_default_fixture_loop_scope = "session"
|
|
70
|
+
asyncio_default_test_loop_scope = "session"
|
|
71
|
+
|
|
72
|
+
# live wrapper tests are excluded from the default run; invoke them explicitly:
|
|
73
|
+
# pytest tests/live/ -v
|
|
74
|
+
testpaths = ["tests/unit", "tests/integration"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from .base_api_client import APIWrapper
|
|
2
|
+
from .http import RateLimitedSession
|
|
3
|
+
from .models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
|
|
4
|
+
from .provider_registry import (
|
|
5
|
+
create_wrapper,
|
|
6
|
+
get_provider,
|
|
7
|
+
get_wrapper_class,
|
|
8
|
+
load_providers,
|
|
9
|
+
)
|
|
10
|
+
from .statwrapper import (
|
|
11
|
+
StatWrapper,
|
|
12
|
+
discover_provider_datasets,
|
|
13
|
+
get_datasets,
|
|
14
|
+
resolve_dataset_metadata,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"APIWrapper",
|
|
19
|
+
"DiscoveredDataset",
|
|
20
|
+
"Provider",
|
|
21
|
+
"RateLimitedSession",
|
|
22
|
+
"ResolvedDatasetMetadata",
|
|
23
|
+
"StatWrapper",
|
|
24
|
+
"create_wrapper",
|
|
25
|
+
"discover_provider_datasets",
|
|
26
|
+
"get_datasets",
|
|
27
|
+
"get_provider",
|
|
28
|
+
"get_wrapper_class",
|
|
29
|
+
"load_providers",
|
|
30
|
+
"resolve_dataset_metadata",
|
|
31
|
+
]
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..base_api_client import APIWrapper
|
|
7
|
+
from ..models import DiscoveredDataset, Provider, ResolvedDatasetMetadata
|
|
8
|
+
from ..utils import (
|
|
9
|
+
detect_role,
|
|
10
|
+
determine_time_unit,
|
|
11
|
+
normalize_contact,
|
|
12
|
+
normalize_note,
|
|
13
|
+
parse_dt,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _collect_tables(subjects_tree: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Flatten a subjects tree into one merged record per table id.

    The tree is walked depth-first in document order (a node's own tables
    first, then its child subjects). A table that appears under several
    subjects is merged into a single record: for every key the first
    non-empty value seen is kept, and later occurrences only fill keys that
    are still missing or empty (``None``, ``""`` or ``[]``).

    Args:
        subjects_tree: Root subject nodes; non-dict entries are ignored.

    Returns:
        Mapping of stripped table id to its merged table record. Each record
        always carries the stripped id under ``"id"``.
    """
    blank = (None, "", [])
    merged: dict[str, dict[str, Any]] = {}

    # Explicit stack instead of recursion. Entries are pushed in reverse so
    # that they are popped (and therefore processed) in their original order.
    roots = [root for root in subjects_tree if isinstance(root, dict)]
    pending = roots[::-1]
    while pending:
        node = pending.pop()

        for entry in node.get("tables") or []:
            if not isinstance(entry, dict):
                continue
            table_id = str(entry.get("id") or "").strip()
            if not table_id:
                continue
            record = merged.setdefault(table_id, {"id": table_id})
            for field, candidate in entry.items():
                if candidate in blank:
                    continue
                if record.get(field) in blank:
                    record[field] = candidate

        children = [child for child in node.get("subjects") or [] if isinstance(child, dict)]
        pending.extend(reversed(children))

    return merged
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _parse_paths(subjects_tree: list[dict[str, Any]]) -> dict[str, list[list[dict[str, str]]]]:
    """Map every table id to the breadcrumb paths that lead to it.

    Each path is a list of ``{"id": ..., "label": ...}`` entries running from
    the outermost subject down to the table itself (the table is always the
    last entry). Subject nodes without an id contribute no breadcrumb. A
    subject's label is its ``description``, then ``text``, then its id; a
    table's label is its ``text`` or its id.

    Args:
        subjects_tree: Root subject nodes; non-dict entries are ignored.

    Returns:
        Mapping of stripped table id to all paths under which the table was
        found, in depth-first document order.
    """
    paths_by_table: dict[str, list[list[dict[str, str]]]] = {}

    # Explicit stack of (node, breadcrumbs-of-ancestors). Entries are pushed
    # in reverse so they are popped in their original order.
    roots = [root for root in subjects_tree if isinstance(root, dict)]
    pending: list[tuple[dict[str, Any], list[dict[str, str]]]] = [
        (root, []) for root in reversed(roots)
    ]
    while pending:
        node, ancestors = pending.pop()

        node_id = str(node.get("id") or "").strip()
        node_label = str(node.get("description") or node.get("text") or node_id).strip()
        if node_id:
            trail = ancestors + [{"id": node_id, "label": node_label}]
        else:
            trail = list(ancestors)

        for entry in node.get("tables") or []:
            if not isinstance(entry, dict):
                continue
            table_id = str(entry.get("id") or "").strip()
            if not table_id:
                continue
            table_label = str(entry.get("text") or table_id).strip()
            leaf = {"id": table_id, "label": table_label}
            paths_by_table.setdefault(table_id, []).append(trail + [leaf])

        children = [child for child in node.get("subjects") or [] if isinstance(child, dict)]
        pending.extend((child, trail) for child in reversed(children))

    return paths_by_table
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class DstClient(APIWrapper):
    """API client for the Statistics Denmark (``dst``) statistics API.

    Discovers tables through the ``subjects`` endpoint and resolves per-table
    metadata through the ``tableinfo`` endpoint. HTTP access goes through
    ``self._get_json`` (not defined in this class; presumably provided by
    ``APIWrapper`` on top of the injected ``json_request_handler``).
    """

    def __init__(self, provider: Provider, **kwargs: Any) -> None:
        """Initialise the client from a provider record.

        Args:
            provider: Provider record supplying code, label, API type and
                base URLs.
            **kwargs: ``language`` and ``json_request_handler`` are required
                (a ``KeyError`` is raised when either is missing);
                ``text_request_handler``, ``bytes_request_handler`` and
                ``logger`` are optional and default to ``None``. Any other
                keyword is ignored.
        """
        super().__init__(
            provider_code=provider.provider_code,
            label=provider.label,
            language=kwargs.pop("language"),
            json_request_handler=kwargs.pop("json_request_handler"),
            text_request_handler=kwargs.pop("text_request_handler", None),
            bytes_request_handler=kwargs.pop("bytes_request_handler", None),
            logger=kwargs.pop("logger", None),
        )
        self.provider = provider
        self.api_type = provider.api_type
        # Normalised to "" so the URL builders can call ``.rstrip`` safely.
        self.base_api_url = provider.base_api_url or ""
        self.base_web_url = provider.base_web_url

    def _subjects_url(self) -> str:
        """Return the URL of the ``subjects`` endpoint."""
        return f"{self.base_api_url.rstrip('/')}/subjects"

    def _tableinfo_url(self, dataset_code: str) -> str:
        """Return the URL of the ``tableinfo`` endpoint for ``dataset_code``."""
        return f"{self.base_api_url.rstrip('/')}/tableinfo/{dataset_code}"

    def _data_url(self, dataset_code: str) -> str:
        """Return the JSON-stat data URL for ``dataset_code``."""
        return f"{self.base_api_url.rstrip('/')}/data/{dataset_code}/JSONSTAT"

    async def health_check(self, dataset_code: str | None = None) -> bool:
        """Probe the API and report whether a payload came back.

        Args:
            dataset_code: When given, the table's ``tableinfo`` endpoint is
                probed; otherwise the ``subjects`` endpoint is used.

        Returns:
            ``True`` when ``_get_json`` returned anything other than ``None``.
        """
        url = self._subjects_url() if dataset_code is None else self._tableinfo_url(dataset_code)
        payload = await self._get_json(url, params={"lang": self.language})
        return payload is not None

    async def discover_datasets(
        self,
        task_id: uuid.UUID,
        **_: Any,
    ) -> list[DiscoveredDataset]:
        """List every table reachable from the recursive subjects tree.

        Args:
            task_id: Identifier stamped on every returned dataset.
            **_: Ignored; accepted for signature compatibility.

        Returns:
            One ``DiscoveredDataset`` per distinct table id, ordered by table
            id. An empty list is returned when the response is not a list.
        """
        payload = await self._get_json(
            self._subjects_url(),
            params={
                "lang": self.language,
                "recursive": "true",
                "omitSubjectsWithoutTables": "true",
                "includeTables": "true",
                "format": "JSON",
            },
        )
        if not isinstance(payload, list):
            return []
        tables_by_id = _collect_tables(payload)
        paths_by_id = _parse_paths(payload)
        output: list[DiscoveredDataset] = []
        for dataset_code, table in sorted(tables_by_id.items()):
            paths = paths_by_id.get(dataset_code, [])
            primary_path = paths[0] if paths else []
            # Every path ends with the table's own entry, so a one-element
            # path has no subject ancestor. Requiring a second element keeps
            # the table from being reported as its own subject.
            subject = primary_path[0] if len(primary_path) > 1 else None
            first_period = str(table.get("firstPeriod") or "").strip() or None
            last_period = str(table.get("latestPeriod") or table.get("lastPeriod") or "").strip() or None
            # Tables without a parseable timestamp fall back to the Unix epoch.
            updated = parse_dt(table.get("updated")) or parse_dt("1970-01-01T00:00:00+00:00")
            if updated is None:
                continue
            active = table.get("active")
            description = table.get("description")
            output.append(
                DiscoveredDataset(
                    task_id=task_id,
                    provider_code=self.provider_code,
                    dataset_code=dataset_code,
                    language=self.language,
                    updated=updated,
                    label=str(table.get("text") or dataset_code).strip(),
                    description=str(description).strip() if description is not None else None,
                    source=f"Statistics Denmark - statbank.dk/{dataset_code}",
                    subject_code=subject["id"] if subject else None,
                    subject_label=subject["label"] if subject else None,
                    time_unit=determine_time_unit(first_period, last_period) if first_period and last_period else None,
                    first_period=first_period,
                    last_period=last_period,
                    # Only an explicit boolean ``active`` flag is trusted.
                    discontinued=(not active) if isinstance(active, bool) else False,
                    metadata_url=f"{self._tableinfo_url(dataset_code)}?lang={self.language}",
                    data_url=f"{self._data_url(dataset_code)}?lang={self.language}",
                    web_url=f"{self.base_web_url.rstrip('/')}/{dataset_code}" if self.base_web_url else None,
                    paths=paths or None,
                    extension={"unit": table.get("unit"), "variable_names": table.get("variables")},
                )
            )
        return output

    async def resolve_dataset_metadata(
        self,
        discovered: DiscoveredDataset,
        task_id: uuid.UUID | None = None,
        **_: Any,
    ) -> ResolvedDatasetMetadata:
        """Fetch ``tableinfo`` for a discovered table and build its metadata.

        Values from the ``tableinfo`` payload take precedence; anything the
        payload does not provide falls back to the ``discovered`` record.

        Args:
            discovered: Dataset previously returned by ``discover_datasets``.
                It is not modified.
            task_id: Overrides ``discovered.task_id`` when given.
            **_: Ignored; accepted for signature compatibility.

        Returns:
            The resolved metadata, including one dimension per variable that
            has an id and at least one value with an id.
        """
        payload = await self._get_json(
            self._tableinfo_url(discovered.dataset_code),
            params={"lang": self.language},
        )
        has_payload = isinstance(payload, dict)
        dimensions: dict[str, dict[str, Any]] = {}
        dimension_ids: list[str] = []
        required_dimensions: dict[str, bool | None] = {}
        role_map: dict[str, list[str]] = {}
        # Shallow copy; ``or {}`` tolerates a missing extension mapping.
        extension = dict(discovered.extension or {})
        # The nested "dimension_extensions" mapping is copied on first write
        # (below) so the caller's ``discovered.extension`` is never mutated
        # through the shallow copy.
        owns_dimension_extensions = False
        if has_payload:
            for position, variable in enumerate(payload.get("variables") or []):
                if not isinstance(variable, dict):
                    continue
                dimension_id = str(variable.get("id") or "").strip()
                if not dimension_id:
                    continue
                values = variable.get("values") if isinstance(variable.get("values"), list) else []
                index: dict[str, int] = {}
                labels: dict[str, str] = {}
                for ordinal, item in enumerate(values):
                    if not isinstance(item, dict):
                        continue
                    value_id = str(item.get("id") or "").strip()
                    if not value_id:
                        continue
                    # ``ordinal`` is the position in the raw value list, so
                    # skipped entries leave gaps in the index.
                    index[value_id] = ordinal
                    labels[value_id] = str(item.get("text") or value_id).strip()
                if not index:
                    # A variable without usable values is not a dimension.
                    continue
                label = str(variable.get("text") or dimension_id).strip()
                dimensions[dimension_id] = {
                    "label": label,
                    "category": {"index": index, "label": labels},
                }
                dimension_ids.append(dimension_id)
                elimination = variable.get("elimination")
                # A variable that can be eliminated is optional; unknown stays None.
                required_dimensions[dimension_id] = None if elimination is None else not bool(elimination)
                role = "geo" if variable.get("map") else detect_role(dimension_id, label, bool(variable.get("time")))
                if role:
                    role_map.setdefault(role, []).append(dimension_id)
                if not owns_dimension_extensions:
                    extension["dimension_extensions"] = dict(extension.get("dimension_extensions") or {})
                    owns_dimension_extensions = True
                extension["dimension_extensions"][dimension_id] = {
                    "extension": {"position": position}
                }
        label = (
            str(payload.get("text")).strip()
            if has_payload and payload.get("text") is not None
            else (discovered.label or discovered.dataset_code)
        )
        description = (
            str(payload.get("description")).strip()
            if has_payload and payload.get("description") is not None
            else discovered.description
        )
        updated = (
            parse_dt(payload.get("updated"))
            if has_payload
            else None
        ) or discovered.updated
        return ResolvedDatasetMetadata(
            task_id=task_id or discovered.task_id,
            provider_code=discovered.provider_code,
            dataset_code=discovered.dataset_code,
            language=discovered.language,
            updated=updated,
            label=label,
            time_unit=discovered.time_unit or "Other",
            first_period=discovered.first_period or "",
            last_period=discovered.last_period or discovered.first_period or "",
            paths=discovered.paths or [],
            role=role_map,
            metadata_url=discovered.metadata_url or f"{self._tableinfo_url(discovered.dataset_code)}?lang={self.language}",
            data_url=discovered.data_url or f"{self._data_url(discovered.dataset_code)}?lang={self.language}",
            dimension=dimensions,
            required_dimensions=required_dimensions,
            note=(normalize_note(payload.get("footnote")) if has_payload else None) or discovered.note,
            source=discovered.source,
            description=description,
            # Only an explicit boolean ``active`` flag overrides the discovered value.
            discontinued=not bool(payload.get("active")) if has_payload and isinstance(payload.get("active"), bool) else discovered.discontinued,
            subject_code=discovered.subject_code,
            subject_label=discovered.subject_label,
            web_url=discovered.web_url,
            doc_url=(
                str(payload.get("documentation", {}).get("url")).strip()
                if has_payload
                and isinstance(payload.get("documentation"), dict)
                and payload["documentation"].get("url") is not None
                else discovered.doc_url
            ),
            official_statistics=discovered.official_statistics,
            contact=(normalize_contact(payload.get("contacts")) if has_payload else None) or discovered.contact,
            dimension_ids=dimension_ids or None,
            extension=extension,
        )
|