sdmxflow 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdmxflow/__init__.py +21 -0
- sdmxflow/_csv.py +148 -0
- sdmxflow/_json.py +18 -0
- sdmxflow/_logging.py +11 -0
- sdmxflow/_paths.py +22 -0
- sdmxflow/_types.py +11 -0
- sdmxflow/dataset.py +737 -0
- sdmxflow/download/__init__.py +4 -0
- sdmxflow/download/native.py +455 -0
- sdmxflow/download/providers/__init__.py +1 -0
- sdmxflow/download/providers/eurostat_bulk_csv.py +338 -0
- sdmxflow/download/structures.py +61 -0
- sdmxflow/errors.py +31 -0
- sdmxflow/extract/__init__.py +1 -0
- sdmxflow/extract/codelists.py +241 -0
- sdmxflow/extras/__init__.py +4 -0
- sdmxflow/extras/parquet.py +5 -0
- sdmxflow/metadata/__init__.py +1 -0
- sdmxflow/metadata/models.py +202 -0
- sdmxflow/metadata/schema.py +9 -0
- sdmxflow/metadata/writer.py +378 -0
- sdmxflow/models.py +50 -0
- sdmxflow/query/__init__.py +1 -0
- sdmxflow/query/last_updated_data.py +213 -0
- sdmxflow-0.1.0.dist-info/METADATA +484 -0
- sdmxflow-0.1.0.dist-info/RECORD +28 -0
- sdmxflow-0.1.0.dist-info/WHEEL +4 -0
- sdmxflow-0.1.0.dist-info/licenses/LICENSE.md +201 -0
sdmxflow/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""sdmxflow: download SDMX datasets to a versioned folder layout.
|
|
2
|
+
|
|
3
|
+
Public API (stable):
|
|
4
|
+
- `SdmxDataset`: configure dataset + output folder, then call `.setup()` and `.fetch()`.
|
|
5
|
+
- `FetchResult`: paths + whether a new version was appended.
|
|
6
|
+
|
|
7
|
+
This package intentionally uses stdlib `logging` and does not configure handlers.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
from .dataset import FetchResult, SdmxDataset
|
|
15
|
+
|
|
16
|
+
# Attach a no-op handler to the package logger; no formatter or output handler
# is configured here, which is left to the host application.
logging.getLogger(__name__).addHandler(logging.NullHandler())

# Names re-exported as the public API of the package.
__all__ = ["FetchResult", "SdmxDataset"]
|
sdmxflow/_csv.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""CSV helpers.
|
|
2
|
+
|
|
3
|
+
This module contains small, focused helpers for writing dataset slices into a
|
|
4
|
+
canonical CSV layout.
|
|
5
|
+
|
|
6
|
+
The canonical format used by sdmxflow prepends a `last_updated` column to the
|
|
7
|
+
provider CSV schema. This column is used to tag appended rows with the upstream
|
|
8
|
+
"last updated" timestamp.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import csv
|
|
14
|
+
import io
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from .errors import SdmxMetadataError
|
|
18
|
+
|
|
19
|
+
# Name of the column that sdmxflow prepends to the provider CSV schema.
LAST_UPDATED_COLUMN = "last_updated"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _parse_header_line(line: str) -> list[str]:
    """Split one raw CSV header line into its column names.

    The line is parsed with the stdlib ``csv`` reader, so quoted names that
    contain commas are kept intact. Any failure while parsing is re-raised as
    ``SdmxMetadataError`` with the original exception chained.
    """
    try:
        parsed = next(csv.reader([line]))
    except Exception as exc:  # noqa: BLE001
        raise SdmxMetadataError(f"Failed to parse CSV header: {exc}") from exc
    else:
        return parsed
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _format_header_line(columns: list[str]) -> str:
    """Serialize column names into a single CSV line terminated by ``"\\n"``.

    Quoting is delegated to the stdlib ``csv`` writer.
    """
    sink = io.StringIO()
    csv.writer(sink, lineterminator="\n").writerow(columns)
    return sink.getvalue()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _normalize_provider_header(columns: list[str]) -> list[str]:
    """Clean up the column names read from a provider CSV header.

    A byte-order mark at the start of the first name is removed, surrounding
    whitespace is stripped from every name, and any column already called
    ``last_updated`` is dropped.

    Raises:
        SdmxMetadataError: If no column is left after normalization.
    """
    # A UTF-8 BOM, when present, ends up glued to the first column name.
    if columns and columns[0].startswith("\ufeff"):
        first, *rest = columns
        columns = [first.lstrip("\ufeff"), *rest]

    normalized: list[str] = []
    for raw in columns:
        if raw is None:
            continue
        name = raw.strip()
        if name != LAST_UPDATED_COLUMN:
            normalized.append(name)

    if not normalized:
        raise SdmxMetadataError("Source CSV has an empty/invalid header")
    return normalized
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def ensure_last_updated_first_column(*, csv_path: Path) -> None:
    """Ensure the CSV header starts with the `last_updated` column.

    Nothing happens when `csv_path` is missing, is not a regular file, is empty,
    or already has `last_updated` as the exact first header cell.

    Otherwise the file is rewritten through a sibling ``<name>.tmp`` file that
    then replaces the original, so that:

    - the header is normalized and has `last_updated` as the first column, and
    - each subsequent non-empty data row is prefixed with a blank value for
      `last_updated`.

    If the first header cell is `last_updated` only after removing a BOM or
    surrounding whitespace, the rows already carry that column: only the header
    is cleaned up and the rows are copied without an extra blank value.

    Raises:
        SdmxMetadataError: If the header cannot be parsed or has no usable column.
    """
    if not csv_path.exists() or not csv_path.is_file():
        return

    tmp_path = csv_path.with_name(csv_path.name + ".tmp")

    with csv_path.open("r", encoding="utf-8", newline="") as f:
        header_line = f.readline()
        if not header_line:
            return
        header_cols = _parse_header_line(header_line)

        if header_cols and header_cols[0] == LAST_UPDATED_COLUMN:
            return

        provider_cols = _normalize_provider_header(header_cols)
        new_header_cols = [LAST_UPDATED_COLUMN, *provider_cols]

        # Apply the same clean-up the header normalization applies to the first
        # cell. If that yields `last_updated`, the data rows already contain the
        # column and must not receive a second, blank one.
        first_col = header_cols[0].lstrip("\ufeff").strip() if header_cols else ""
        row_prefix = "" if first_col == LAST_UPDATED_COLUMN else ","
        # NOTE(review): a `last_updated` column at any other position is removed
        # from the header by the normalization while its values stay in the
        # rows; this function does not handle that layout.

        try:
            with tmp_path.open("w", encoding="utf-8", newline="") as out:
                out.write(_format_header_line(new_header_cols))

                # Copy the remaining lines as-is (blank lines dropped, final
                # newline enforced), prefixing the blank last_updated value.
                for line in f:
                    if not line.strip():
                        continue
                    if not line.endswith("\n"):
                        line += "\n"
                    out.write(row_prefix + line)
        except BaseException:
            # Do not leave a half-written temporary file next to the dataset.
            tmp_path.unlink(missing_ok=True)
            raise

    # Both handles are closed here; swap the rewritten file into place.
    tmp_path.replace(csv_path)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def append_version_slice(*, src_csv: Path, dst_csv: Path, upstream_last_updated: str) -> int:
    """Append the data rows of a downloaded CSV to a destination CSV.

    The source CSV is expected to *not* contain a `last_updated` column. Each
    non-blank source row is copied to `dst_csv` with `upstream_last_updated`
    prepended as the first column.

    If `dst_csv` exists and is non-empty, its header is first brought into the
    `last_updated`-first layout and must then match the source columns.
    Otherwise the parent directory is created and a new file with a fresh
    header is written.

    Args:
        src_csv: CSV file to read; its first line is the header.
        dst_csv: CSV file to create or extend.
        upstream_last_updated: Value written into the first column of every
            appended row.

    Returns:
        The number of appended data rows (blank lines are skipped).

    Raises:
        SdmxMetadataError: If the source is missing or empty, if the destination
            header is missing or lacks `last_updated`, or if the schemas differ.
    """
    if not src_csv.exists() or not src_csv.is_file():
        raise SdmxMetadataError(f"Source CSV does not exist: {src_csv}")

    # CSV-encode the tag once. Writing it next to an empty second field yields
    # "<tag>," and lets the csv module quote the tag only if it needs quoting.
    encoded = io.StringIO()
    csv.writer(encoded, lineterminator="\n").writerow([upstream_last_updated, ""])
    row_prefix = encoded.getvalue()[:-1]  # drop the line terminator

    with src_csv.open("r", encoding="utf-8", newline="") as src:
        src_header_line = src.readline()
        if not src_header_line:
            raise SdmxMetadataError(f"Source CSV is empty: {src_csv}")

        src_cols = _normalize_provider_header(_parse_header_line(src_header_line))

        if dst_csv.exists() and dst_csv.is_file() and dst_csv.stat().st_size > 0:
            ensure_last_updated_first_column(csv_path=dst_csv)

            with dst_csv.open("r", encoding="utf-8", newline="") as dst_in:
                dst_header_line = dst_in.readline()
            if not dst_header_line:
                raise SdmxMetadataError(f"Destination CSV has no header: {dst_csv}")
            dst_cols = _parse_header_line(dst_header_line)
            if not dst_cols or dst_cols[0] != LAST_UPDATED_COLUMN:
                raise SdmxMetadataError(
                    f"Destination CSV header is missing {LAST_UPDATED_COLUMN}: {dst_csv}"
                )
            if dst_cols[1:] != src_cols:
                raise SdmxMetadataError(
                    "CSV schema mismatch: source columns differ from destination columns "
                    f"(dst={dst_cols[1:]}, src={src_cols})"
                )

            # If the existing file does not end with a newline, the first
            # appended row would be glued onto its last row.
            with dst_csv.open("rb") as tail:
                tail.seek(-1, io.SEEK_END)
                ends_with_newline = tail.read(1) == b"\n"
            mode = "a"
            preamble = "" if ends_with_newline else "\n"
        else:
            dst_csv.parent.mkdir(parents=True, exist_ok=True)
            mode = "w"
            preamble = _format_header_line([LAST_UPDATED_COLUMN, *src_cols])

        rows = 0
        # The context manager closes the destination even if a write fails.
        with dst_csv.open(mode, encoding="utf-8", newline="") as dst:
            dst.write(preamble)
            for line in src:
                if not line.strip():
                    continue
                if not line.endswith("\n"):
                    line += "\n"
                dst.write(row_prefix + line)
                rows += 1

    return rows
|
sdmxflow/_json.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from ._types import JsonValue
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def read_json(path: Path) -> JsonValue:
    """Load and return the JSON document stored at `path` (read as UTF-8)."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def write_json(path: Path, data: JsonValue, *, indent: int = 2) -> None:
    """Write `data` to `path` as UTF-8 JSON, creating parent directories.

    Keys are sorted, non-ASCII characters are written unescaped, and the file
    ends with a single trailing newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=indent, sort_keys=True, ensure_ascii=False)
    path.write_text(f"{payload}\n", encoding="utf-8")
|
sdmxflow/_logging.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_logger(logger: logging.Logger | None, *, name: str = "sdmxflow") -> logging.Logger:
|
|
7
|
+
"""Return a usable stdlib logger.
|
|
8
|
+
|
|
9
|
+
The package never configures handlers/formatters; the host application owns that.
|
|
10
|
+
"""
|
|
11
|
+
return logger if logger is not None else logging.getLogger(name)
|
sdmxflow/_paths.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class DatasetPaths:
    """Immutable bundle of the filesystem paths belonging to one dataset folder."""

    # Base output directory.
    out_dir: Path
    # Dataset CSV file (`dataset.csv` under the base when built by `dataset_paths`).
    dataset_csv: Path
    # Metadata JSON file (`metadata.json` under the base when built by `dataset_paths`).
    metadata_json: Path
    # Codelists directory (`codelists` under the base when built by `dataset_paths`).
    codelists_dir: Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def dataset_paths(out_dir: str | Path) -> DatasetPaths:
    """Build the `DatasetPaths` for the dataset folder rooted at `out_dir`.

    `out_dir` is user-expanded (`~`) and resolved to an absolute path; the
    remaining paths are fixed names directly beneath it.
    """
    root = Path(out_dir).expanduser().resolve()
    return DatasetPaths(
        out_dir=root,
        dataset_csv=root.joinpath("dataset.csv"),
        metadata_json=root.joinpath("metadata.json"),
        codelists_dir=root.joinpath("codelists"),
    )
|
sdmxflow/_types.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Literal, TypeAlias
|
|
5
|
+
|
|
6
|
+
JsonScalar: TypeAlias = str | int | float | bool | None
|
|
7
|
+
JsonValue: TypeAlias = JsonScalar | list["JsonValue"] | dict[str, "JsonValue"]
|
|
8
|
+
|
|
9
|
+
IfExists = Literal["skip", "overwrite"]
|
|
10
|
+
|
|
11
|
+
PathLike: TypeAlias = str | Path
|