sdmxflow 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdmxflow/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """sdmxflow: download SDMX datasets to a versioned folder layout.
2
+
3
+ Public API (stable):
4
+ - `SdmxDataset`: configure dataset + output folder, then call `.setup()` and `.fetch()`.
5
+ - `FetchResult`: paths + whether a new version was appended.
6
+
7
+ This package intentionally uses stdlib `logging` and does not configure handlers.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+
14
+ from .dataset import FetchResult, SdmxDataset
15
+
16
# Library convention: attach a no-op handler to the package logger so that the
# host application stays in charge of handlers, formatters and levels.
logging.getLogger(__name__).addHandler(logging.NullHandler())

# Names re-exported as the public API of the package.
__all__ = ["FetchResult", "SdmxDataset"]
sdmxflow/_csv.py ADDED
@@ -0,0 +1,148 @@
1
+ """CSV helpers.
2
+
3
+ This module contains small, focused helpers for writing dataset slices into a
4
+ canonical CSV layout.
5
+
6
+ The canonical format used by sdmxflow prepends a `last_updated` column to the
7
+ provider CSV schema. This column is used to tag appended rows with the upstream
8
+ "last updated" timestamp.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import csv
14
+ import io
15
+ from pathlib import Path
16
+
17
+ from .errors import SdmxMetadataError
18
+
19
+ LAST_UPDATED_COLUMN = "last_updated"
20
+
21
+
22
def _parse_header_line(line: str) -> list[str]:
    """Split one raw CSV header line into its column names.

    Args:
        line: The first physical line of a CSV file, line terminator included
            or not.

    Returns:
        The column names in file order (an empty list for a blank line).

    Raises:
        SdmxMetadataError: If the line cannot be parsed as a CSV record.
    """
    try:
        columns = next(csv.reader([line]))
    except Exception as exc:  # noqa: BLE001
        raise SdmxMetadataError(f"Failed to parse CSV header: {exc}") from exc
    else:
        return columns
27
+
28
+
29
def _format_header_line(columns: list[str]) -> str:
    """Serialise column names as one CSV line terminated by ``"\\n"``.

    Quoting follows the `csv` module defaults, so names containing commas or
    quotes are escaped.
    """
    with io.StringIO() as sink:
        csv.writer(sink, lineterminator="\n").writerow(columns)
        return sink.getvalue()
34
+
35
+
36
def _normalize_provider_header(columns: list[str]) -> list[str]:
    """Return the provider column names in canonical form.

    A byte-order mark is removed from the first name, every name is stripped
    of surrounding whitespace, ``None`` entries are ignored and any column
    named `LAST_UPDATED_COLUMN` is dropped.

    Raises:
        SdmxMetadataError: If no column is left after normalisation.
    """
    if columns:
        first = columns[0]
        if first.startswith("\ufeff"):
            # The BOM sticks to the first column name when the file is read
            # as plain UTF-8.
            columns = [first.lstrip("\ufeff")] + list(columns[1:])

    kept: list[str] = []
    for name in columns:
        if name is None:
            continue
        trimmed = name.strip()
        if trimmed != LAST_UPDATED_COLUMN:
            kept.append(trimmed)

    if kept:
        return kept
    raise SdmxMetadataError("Source CSV has an empty/invalid header")
46
+
47
+
48
def _copy_records(src: io.TextIOBase, dst: io.TextIOBase, *, prefix: str) -> int:
    """Copy the remaining CSV records of *src* to *dst*, prepending *prefix*.

    Record boundaries are found with `csv.reader`, so a quoted field that
    contains line breaks is kept together and receives the prefix only once.
    The text of each record is written back exactly as it was read; it is not
    re-serialised. Records that are blank outside of quotes are skipped and a
    missing final ``"\\n"`` is added.

    Args:
        src: Text stream positioned just after the header line. It must have
            been opened with ``newline=""``.
        dst: Text stream the records are written to.
        prefix: Text written in front of every record.

    Returns:
        The number of records written.

    Raises:
        SdmxMetadataError: If the `csv` module rejects the data.
    """
    consumed: list[str] = []

    def _tap():
        # Remember every physical line handed to the parser so the original
        # text of the record can be written out untouched.
        for physical_line in src:
            consumed.append(physical_line)
            yield physical_line

    written = 0
    try:
        for _fields in csv.reader(_tap()):
            record = "".join(consumed)
            consumed.clear()
            if not record.strip():
                continue
            if not record.endswith("\n"):
                record += "\n"
            dst.write(prefix + record)
            written += 1
    except csv.Error as exc:
        raise SdmxMetadataError(f"Failed to parse CSV data: {exc}") from exc
    return written


def _row_prefix(value: str) -> str:
    """Return *value* CSV-quoted as a leading field, followed by a comma."""
    buf = io.StringIO()
    # Writing a second, empty field makes the writer emit the separator and
    # keeps an empty *value* unquoted; the line terminator is cut off again.
    csv.writer(buf, lineterminator="\n").writerow([value, ""])
    return buf.getvalue()[:-1]


def _ends_with_line_break(path: Path) -> bool:
    """Tell whether the file at *path* is empty or ends with a line break."""
    with path.open("rb") as fh:
        if fh.seek(0, io.SEEK_END) == 0:
            return True
        fh.seek(-1, io.SEEK_END)
        return fh.read(1) in (b"\n", b"\r")


def _require_matching_schema(dst_csv: Path, src_cols: list[str]) -> None:
    """Raise `SdmxMetadataError` unless *dst_csv* can take rows shaped like *src_cols*."""
    with dst_csv.open("r", encoding="utf-8", newline="") as dst_in:
        dst_header_line = dst_in.readline()
    if not dst_header_line:
        raise SdmxMetadataError(f"Destination CSV has no header: {dst_csv}")

    dst_cols = _parse_header_line(dst_header_line)
    if not dst_cols or dst_cols[0] != LAST_UPDATED_COLUMN:
        raise SdmxMetadataError(
            f"Destination CSV header is missing {LAST_UPDATED_COLUMN}: {dst_csv}"
        )
    if dst_cols[1:] != src_cols:
        raise SdmxMetadataError(
            "CSV schema mismatch: source columns differ from destination columns "
            f"(dst={dst_cols[1:]}, src={src_cols})"
        )


def ensure_last_updated_first_column(*, csv_path: Path) -> None:
    """Ensure the CSV header starts with the `last_updated` column.

    Nothing happens when the file is missing, empty, or already has
    `last_updated` as its exact first header cell. Otherwise the file is
    rewritten through a sibling ``<name>.tmp`` file so that:

    - the header is normalised and starts with `last_updated`, and
    - each non-empty data record is prefixed with a blank `last_updated`
      value. If the first header cell already was `last_updated` apart from a
      byte-order mark or padding, only the header is cleaned up and the
      records are left as they are.

    Raises:
        SdmxMetadataError: If the header is empty or invalid, if a
            `last_updated` column exists in a position other than the first
            (the rows could not be realigned safely), or if the data cannot
            be parsed.
    """
    if not csv_path.is_file():
        return

    tmp_path = csv_path.with_name(csv_path.name + ".tmp")
    with csv_path.open("r", encoding="utf-8", newline="") as src:
        header_line = src.readline()
        if not header_line:
            return
        header_cols = _parse_header_line(header_line)
        if header_cols and header_cols[0] == LAST_UPDATED_COLUMN:
            return

        # Same clean-up as `_normalize_provider_header`, but without dropping
        # the column, so that its position can be inspected.
        names = [cell.strip() for cell in header_cols]
        if names:
            names[0] = header_cols[0].lstrip("\ufeff").strip()
        if LAST_UPDATED_COLUMN in names[1:]:
            raise SdmxMetadataError(
                f"Cannot move {LAST_UPDATED_COLUMN} to the first column, it already "
                f"exists at another position: {csv_path}"
            )
        already_tagged = bool(names) and names[0] == LAST_UPDATED_COLUMN

        provider_cols = _normalize_provider_header(header_cols)
        try:
            with tmp_path.open("w", encoding="utf-8", newline="") as out:
                out.write(_format_header_line([LAST_UPDATED_COLUMN, *provider_cols]))
                _copy_records(src, out, prefix="" if already_tagged else ",")
        except BaseException:
            # Do not leave a half-written temporary file behind.
            tmp_path.unlink(missing_ok=True)
            raise

    # Swap only after both handles are closed.
    tmp_path.replace(csv_path)


def append_version_slice(*, src_csv: Path, dst_csv: Path, upstream_last_updated: str) -> int:
    """Append a downloaded CSV into a destination CSV.

    The source CSV must not contain a `last_updated` column. Its data records
    are appended to `dst_csv` with `upstream_last_updated` (CSV-quoted when
    necessary) prepended as the first field. A missing or empty destination
    is created together with its header; an existing one is first migrated
    with `ensure_last_updated_first_column` and must have the same provider
    columns as the source.

    Returns the number of appended data rows.

    Raises:
        SdmxMetadataError: If the source is missing, empty, has an invalid
            header or already contains a `last_updated` column, if the
            destination header is missing or does not match the source
            columns, or if the data cannot be parsed.
    """
    if not src_csv.is_file():
        raise SdmxMetadataError(f"Source CSV does not exist: {src_csv}")

    with src_csv.open("r", encoding="utf-8", newline="") as src:
        src_header_line = src.readline()
        if not src_header_line:
            raise SdmxMetadataError(f"Source CSV is empty: {src_csv}")

        raw_src_cols = _parse_header_line(src_header_line)
        src_cols = _normalize_provider_header(raw_src_cols)
        if len(src_cols) != len(raw_src_cols):
            # Normalisation only drops `last_updated` cells. Dropping the name
            # while the rows keep the value would shift every appended row.
            raise SdmxMetadataError(
                f"Source CSV must not contain a {LAST_UPDATED_COLUMN} column: {src_csv}"
            )

        if dst_csv.is_file() and dst_csv.stat().st_size > 0:
            ensure_last_updated_first_column(csv_path=dst_csv)
            _require_matching_schema(dst_csv, src_cols)
            mode = "a"
            # Without this, the first appended row would be glued onto an
            # unterminated last row of the existing file.
            preamble = "" if _ends_with_line_break(dst_csv) else "\n"
        else:
            dst_csv.parent.mkdir(parents=True, exist_ok=True)
            mode = "w"
            preamble = _format_header_line([LAST_UPDATED_COLUMN, *src_cols])

        prefix = _row_prefix(upstream_last_updated)
        with dst_csv.open(mode, encoding="utf-8", newline="") as dst:
            dst.write(preamble)
            return _copy_records(src, dst, prefix=prefix)
sdmxflow/_json.py ADDED
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from ._types import JsonValue
7
+
8
+
9
def read_json(path: Path) -> JsonValue:
    """Load and return the JSON document stored at *path* (read as UTF-8)."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
11
+
12
+
13
def write_json(path: Path, data: JsonValue, *, indent: int = 2) -> None:
    """Write *data* to *path* as UTF-8 JSON followed by a newline.

    Missing parent directories are created. Keys are sorted and non-ASCII
    characters are written as-is rather than escaped.

    Args:
        path: Target file; overwritten if it exists.
        data: JSON-serialisable value.
        indent: Indentation passed to `json.dumps`.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, ensure_ascii=False, sort_keys=True, indent=indent)
    path.write_text(f"{serialized}\n", encoding="utf-8")
sdmxflow/_logging.py ADDED
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+
6
def get_logger(logger: logging.Logger | None, *, name: str = "sdmxflow") -> logging.Logger:
    """Return *logger* if one was supplied, otherwise the stdlib logger *name*.

    No handlers or formatters are configured here; that is left to the host
    application.
    """
    if logger is None:
        return logging.getLogger(name)
    return logger
sdmxflow/_paths.py ADDED
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+
7
@dataclass(frozen=True)
class DatasetPaths:
    """Immutable bundle of the filesystem locations belonging to one dataset."""

    # Folder that holds everything below.
    out_dir: Path
    # Dataset CSV file.
    dataset_csv: Path
    # Metadata JSON file.
    metadata_json: Path
    # Folder for code lists.
    codelists_dir: Path
13
+
14
+
15
def dataset_paths(out_dir: str | Path) -> DatasetPaths:
    """Build the `DatasetPaths` for *out_dir*.

    *out_dir* is expanded (``~``) and resolved to an absolute path; the other
    locations are fixed names directly below it.
    """
    root = Path(out_dir).expanduser().resolve()
    leaf_names = {
        "dataset_csv": "dataset.csv",
        "metadata_json": "metadata.json",
        "codelists_dir": "codelists",
    }
    children = {field: root / leaf for field, leaf in leaf_names.items()}
    return DatasetPaths(out_dir=root, **children)
sdmxflow/_types.py ADDED
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Literal, TypeAlias
5
+
6
+ JsonScalar: TypeAlias = str | int | float | bool | None
7
+ JsonValue: TypeAlias = JsonScalar | list["JsonValue"] | dict[str, "JsonValue"]
8
+
9
+ IfExists = Literal["skip", "overwrite"]
10
+
11
+ PathLike: TypeAlias = str | Path