eolas-data 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eolas_data/__init__.py +16 -0
- eolas_data/_dataset_names.py +1455 -0
- eolas_data/_regen_names.py +57 -0
- eolas_data/cli.py +617 -0
- eolas_data/client.py +333 -0
- eolas_data/dataset.py +66 -0
- eolas_data/exceptions.py +20 -0
- eolas_data/schedule.py +258 -0
- eolas_data-1.2.0.dist-info/METADATA +214 -0
- eolas_data-1.2.0.dist-info/RECORD +12 -0
- eolas_data-1.2.0.dist-info/WHEEL +4 -0
- eolas_data-1.2.0.dist-info/entry_points.txt +2 -0
eolas_data/client.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Optional, Union
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from .dataset import Dataset
|
|
10
|
+
from .exceptions import APIError, AuthenticationError, NotFoundError, RateLimitError
|
|
11
|
+
|
|
12
|
+
# Imported separately so the names module is also re-exportable for users who
|
|
13
|
+
# want IDE autocomplete on dataset names without instantiating a Client.
|
|
14
|
+
from ._dataset_names import DatasetName # noqa: F401 (public re-export)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Production API root; pass Client(base_url=...) to point at a test server.
BASE_URL = "https://api.eolas.fyi"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _to_geodataframe(df: "pd.DataFrame", force: bool = False):
|
|
21
|
+
"""Convert a DataFrame with a ``geometry_wkt`` column to a GeoDataFrame (CRS WGS84).
|
|
22
|
+
|
|
23
|
+
Returns the GeoDataFrame on success, or ``None`` when geopandas isn't installed
|
|
24
|
+
(and ``force`` is False) so the caller can fall back to the plain DataFrame.
|
|
25
|
+
Raises ImportError when ``force=True`` but geopandas is missing.
|
|
26
|
+
"""
|
|
27
|
+
try:
|
|
28
|
+
import geopandas as gpd
|
|
29
|
+
from shapely import wkt as _wkt
|
|
30
|
+
except ImportError:
|
|
31
|
+
if force:
|
|
32
|
+
raise ImportError(
|
|
33
|
+
"geopandas + shapely are required to return geospatial datasets "
|
|
34
|
+
"as GeoDataFrames. Install with: pip install eolas-data[geo]"
|
|
35
|
+
)
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
geom = df["geometry_wkt"].apply(lambda s: _wkt.loads(s) if isinstance(s, str) and s else None)
|
|
39
|
+
gdf = gpd.GeoDataFrame(df.drop(columns=["geometry_wkt"]), geometry=geom, crs="EPSG:4326")
|
|
40
|
+
for attr in ("eolas_name", "eolas_source"):
|
|
41
|
+
if hasattr(df, attr):
|
|
42
|
+
try:
|
|
43
|
+
setattr(gdf, attr, getattr(df, attr))
|
|
44
|
+
except Exception:
|
|
45
|
+
pass
|
|
46
|
+
return gdf
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Client:
    """Client for the eolas.fyi statistical data API.

    Args:
        api_key: Your API key. Falls back to the ``EOLAS_API_KEY`` env var
            (or ``VS_API_KEY`` for back-compat with the legacy library).
        base_url: Override the API base URL (useful for testing).
        cache: Cache responses in memory for the lifetime of the client.
            Useful in notebooks to avoid re-fetching on re-runs.

    Examples::

        from eolas_data import Client
        client = Client("your_api_key")

        # Source-specific helpers
        df = client.statsnz("nz_cpi", start="2020-01-01")
        df = client.oecd("nz_gdp")

        # Generic
        df = client.get("nz_cpi")

        # Discovery
        all_datasets = client.list()
        nz_datasets = client.list("Stats NZ")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = BASE_URL,
        cache: bool = False,
    ):
        self._key = api_key or os.getenv("EOLAS_API_KEY") or os.getenv("VS_API_KEY") or ""
        self._base = base_url.rstrip("/")
        # None = caching disabled; {} = enabled, keyed on the full request params.
        self._cache: dict | None = {} if cache else None
        self._session = requests.Session()
        self._session.headers.update({"X-API-Key": self._key})

    def __repr__(self) -> str:
        # Show only a prefix of the key so repr() output is safe to paste in logs.
        masked = self._key[:8] + "..." if len(self._key) > 8 else self._key
        cache = " cache=on" if self._cache is not None else ""
        return f"<eolas_data.Client key={masked!r}{cache}>"

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    def list(self, source: Optional[str] = None) -> list[dict]:
        """Return metadata for all available datasets.

        Args:
            source: Optional filter, e.g. ``"Stats NZ"``, ``"OECD"``.
        """
        data = self._get("/v1/datasets")
        items = data.get("datasets", data) if isinstance(data, dict) else data
        if source:
            items = [s for s in items if s.get("source") == source]
        return items

    def info(self, name: Union[str, "DatasetName"]) -> dict:
        """Return metadata for a single dataset."""
        return self._get(f"/v1/datasets/{name}")

    # ------------------------------------------------------------------
    # Integrations (Enterprise plan only)
    # ------------------------------------------------------------------

    def integration(self, platform: str, datasets: list[str]) -> dict[str, str]:
        """Generate connector config files for a third-party data-pipeline tool.

        Enterprise plan only. Other plans receive an
        :class:`AuthenticationError` with the upgrade message in the detail.

        Args:
            platform: One of ``"meltano"``, ``"fivetran"``, ``"azure-data-factory"``.
            datasets: Dataset names to include in the generated config.

        Returns:
            ``{filename: file_contents}`` ready to write to disk.

        Raises:
            ValueError: If ``datasets`` is empty.

        Examples::

            files = client.integration("meltano", ["nz_cpi", "nz_gdp"])
            for filename, content in files.items():
                (Path("./tap-eolas") / filename).write_text(content)
        """
        # (Docstring example above fixed: it previously dropped the opening paren.)
        if not datasets:
            raise ValueError("datasets cannot be empty")
        resp = self._get(
            f"/v1/integrations/{platform}",
            params={"datasets": ",".join(datasets)},
        )
        return resp.get("files", {})

    # ------------------------------------------------------------------
    # Source-specific helpers
    # ------------------------------------------------------------------

    def statsnz(self, name, **kwargs) -> Dataset:
        """Fetch a Stats NZ dataset."""
        return self._get_source(name, "Stats NZ", **kwargs)

    def oecd(self, name, **kwargs) -> Dataset:
        """Fetch an OECD dataset."""
        return self._get_source(name, "OECD", **kwargs)

    def rbnz(self, name, **kwargs) -> Dataset:
        """Fetch an RBNZ dataset."""
        return self._get_source(name, "RBNZ", **kwargs)

    def treasury(self, name, **kwargs) -> Dataset:
        """Fetch an NZ Treasury dataset."""
        return self._get_source(name, "NZ Treasury", **kwargs)

    def linz(self, name, **kwargs) -> Dataset:
        """Fetch a LINZ dataset."""
        return self._get_source(name, "LINZ", **kwargs)

    def statsnz_geo(self, name, **kwargs) -> Dataset:
        """Fetch a Stats NZ Geospatial dataset."""
        return self._get_source(name, "Stats NZ Geospatial", **kwargs)

    def mbie(self, name, **kwargs) -> Dataset:
        """Fetch an MBIE dataset."""
        return self._get_source(name, "MBIE", **kwargs)

    def nzta(self, name, **kwargs) -> Dataset:
        """Fetch a Waka Kotahi (NZTA) dataset."""
        return self._get_source(name, "Waka Kotahi", **kwargs)

    def msd(self, name, **kwargs) -> Dataset:
        """Fetch an MSD dataset."""
        return self._get_source(name, "MSD", **kwargs)

    def police(self, name, **kwargs) -> Dataset:
        """Fetch an NZ Police / MoJ dataset."""
        return self._get_source(name, "NZ Police / MoJ", **kwargs)

    def acc(self, name, **kwargs) -> Dataset:
        """Fetch an ACC dataset."""
        return self._get_source(name, "ACC", **kwargs)

    def edcounts(self, name, **kwargs) -> Dataset:
        """Fetch an Education Counts dataset."""
        return self._get_source(name, "Education Counts", **kwargs)

    def worksafe(self, name, **kwargs) -> Dataset:
        """Fetch a WorkSafe NZ dataset."""
        return self._get_source(name, "WorkSafe NZ", **kwargs)

    def _get_source(self, name, source: str, **kwargs) -> Dataset:
        """Fetch via :meth:`get` and stamp the source label onto the result."""
        df = self.get(name, **kwargs)
        df.eolas_source = source
        return df

    # ------------------------------------------------------------------
    # Core data fetch
    # ------------------------------------------------------------------

    def get(
        self,
        name: Union[str, "DatasetName"],
        start: Optional[str] = None,
        end: Optional[str] = None,
        format: str = "json",
        engine: str = "pandas",
        limit: Optional[int] = None,
        as_geo: Optional[bool] = None,
    ) -> Dataset:
        """Fetch dataset rows as a pandas (or polars / geopandas) DataFrame.

        Args:
            name: Dataset identifier, e.g. ``"nz_cpi"``. Type-checked against
                the ``DatasetName`` Literal at static-analysis time so
                IDEs autocomplete the catalog.
            start: ISO date lower bound, e.g. ``"2020-01-01"``.
            end: ISO date upper bound, e.g. ``"2024-12-31"``.
            format: ``"json"`` (default) or ``"csv"``.
            engine: ``"pandas"`` (default) or ``"polars"``.
            limit: Max rows to return. Default ``None`` requests the full dataset
                (server enforces a 50,000-row cap on Free/Starter plans; Pro is
                unlimited). Pass an explicit integer to request fewer rows.
            as_geo: Convert geospatial datasets to a ``GeoDataFrame``.
                ``None`` (default) auto-converts when the dataset has a
                ``geometry_wkt`` column AND ``geopandas`` is importable.
                ``True`` forces the conversion (raises if geopandas missing).
                ``False`` keeps the raw WKT string column.
                Install with ``pip install eolas-data[geo]``.

        Returns:
            A :class:`Dataset` (pandas DataFrame subclass), a polars DataFrame
            when ``engine="polars"``, or a ``geopandas.GeoDataFrame`` when
            geometry is present and conversion is enabled.
        """
        params: dict = {}
        if start:
            params["start"] = start
        if end:
            params["end"] = end
        # Server-side: limit=0 means "as much as the plan allows" (full dataset
        # for Pro, 50K cap for Free/Starter). limit=None maps to limit=0.
        params["limit"] = 0 if limit is None else int(limit)

        cache_key = f"{name}:{start}:{end}:{format}:{params['limit']}:{as_geo}"
        if self._cache is not None and cache_key in self._cache:
            # NOTE(review): the cached object is returned by reference — a caller
            # that mutates it mutates the cache. Acceptable for notebook use.
            return self._cache[cache_key]

        if format == "csv":
            from io import StringIO

            resp = self._raw_get(f"/v1/datasets/{name}/data", params={"format": "csv", **params})
            df = pd.read_csv(StringIO(resp.text))
        else:
            data = self._get(f"/v1/datasets/{name}/data", params=params)
            records = data.get("data", data) if isinstance(data, dict) else data
            df = pd.DataFrame(records)
        # Parse the date column regardless of transport format so CSV and JSON
        # responses both yield proper datetimes.
        if "date" in df.columns:
            df["date"] = pd.to_datetime(df["date"])

        result = Dataset(df)
        result.eolas_name = name
        result.eolas_source = ""

        if engine == "polars":
            try:
                import polars as pl
            except ImportError:
                raise ImportError(
                    "polars is required for engine='polars'. "
                    "Install with: pip install eolas-data[polars]"
                )
            # Outside the try so a genuine ImportError raised inside
            # pl.from_pandas (e.g. missing pyarrow) isn't misreported as
            # "polars is required".
            return pl.from_pandas(result)

        # Optional geopandas conversion. When as_geo=None we auto-convert if both
        # (a) the dataset has a geometry_wkt column AND (b) geopandas is importable.
        if as_geo is not False and "geometry_wkt" in result.columns:
            converted = _to_geodataframe(result, force=as_geo is True)
            if converted is not None:
                result = converted

        if self._cache is not None:
            self._cache[cache_key] = result

        return result

    # ------------------------------------------------------------------
    # HTTP helpers
    # ------------------------------------------------------------------

    def _get(self, path: str, params: Optional[dict] = None) -> dict:
        """GET ``path`` and decode the JSON body."""
        return self._raw_get(path, params=params).json()

    def _raw_get(self, path: str, params: Optional[dict] = None) -> requests.Response:
        """GET ``path`` and raise a typed exception on any non-200 status."""
        # NOTE(review): no request timeout is set — a hung server blocks
        # forever. Consider timeout= once an appropriate bound is agreed.
        url = f"{self._base}{path}"
        resp = self._session.get(url, params=params)
        self._raise_for_status(resp)
        return resp

    @staticmethod
    def _detail(resp: requests.Response, fallback: str) -> str:
        """Best-effort extraction of the server's JSON ``detail`` field."""
        try:
            return resp.json().get("detail", fallback)
        except Exception:
            return fallback

    @staticmethod
    def _raise_for_status(resp: requests.Response) -> None:
        """Map non-200 responses onto the package exception hierarchy.

        Raises:
            AuthenticationError: 401 (bad/missing key) or 403 (inactive key,
                or a plan-gated endpoint — the server's detail is surfaced).
            RateLimitError: 429 — monthly quota exhausted.
            NotFoundError: 404.
            APIError: any other non-200 status.
        """
        if resp.status_code == 200:
            return
        if resp.status_code == 401:
            raise AuthenticationError("Invalid or missing API key.")
        if resp.status_code == 403:
            raise AuthenticationError(Client._detail(resp, "API key is inactive."))
        if resp.status_code == 429:
            raise RateLimitError(
                "Monthly request limit reached. Upgrade for higher limits."
            )
        if resp.status_code == 404:
            raise NotFoundError(Client._detail(resp, "Not found."))
        raise APIError(resp.status_code, Client._detail(resp, resp.text))
|
eolas_data/dataset.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Dataset(pd.DataFrame):
    """A pandas DataFrame with eolas dataset metadata.

    Behaves exactly like a DataFrame — all pandas operations work normally.
    Extra attributes:
        eolas_name: Dataset identifier (e.g. ``"nz_cpi"``).
        eolas_source: Data source label (e.g. ``"Stats NZ"``).
    """

    # Registered with pandas so these attributes survive slicing/copying.
    _metadata = ["eolas_name", "eolas_source"]

    @property
    def _constructor(self):
        # Keep pandas operations returning Dataset rather than plain DataFrame.
        return Dataset

    def __repr__(self) -> str:
        """DataFrame repr, prefixed with a dataset/source header when known."""
        name = getattr(self, "eolas_name", "") or ""
        source = getattr(self, "eolas_source", "") or ""
        if name:
            header = f"# Dataset: {name}"
            if source:
                header += f" [{source}]"
            header += f"\n# {len(self)} rows\n"
            return header + pd.DataFrame.__repr__(self)
        return pd.DataFrame.__repr__(self)

    def plot_dataset(self, ax=None, **kwargs):
        """Quick line chart using matplotlib.

        Returns the matplotlib Axes object so you can customise further.
        Requires matplotlib: ``pip install eolas-data[plot]``.

        Raises:
            ImportError: If matplotlib is not installed.
            ValueError: If the frame has fewer than two columns (previously
                this surfaced as an opaque IndexError).
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError(
                "matplotlib is required for plot_dataset(). "
                "Install with: pip install eolas-data[plot]"
            )

        # Fix: guard against frames too narrow to plot instead of letting the
        # positional fallbacks below raise IndexError.
        if len(self.columns) < 2 and not ("date" in self.columns and "value" in self.columns):
            raise ValueError(
                "plot_dataset() needs at least two columns (x and y); "
                f"got columns {list(self.columns)!r}"
            )

        date_col = "date" if "date" in self.columns else self.columns[0]
        value_col = "value" if "value" in self.columns else self.columns[1]

        if ax is None:
            _, ax = plt.subplots(figsize=(10, 4))

        ax.plot(self[date_col], self[value_col], color="#2563eb", linewidth=1.5, **kwargs)

        name = getattr(self, "eolas_name", "") or ""
        source = getattr(self, "eolas_source", "") or ""

        if name:
            ax.set_title(name, fontweight="bold", fontsize=13)
        ax.set_xlabel("")
        ax.spines[["top", "right"]].set_visible(False)

        caption = f"Source: {source} · eolas.fyi" if source else "eolas.fyi"
        ax.figure.text(0.99, 0.01, caption, ha="right", fontsize=8, color="#9ca3af")

        plt.tight_layout()
        return ax
eolas_data/exceptions.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
class EolasError(Exception):
    """Base exception for the eolas-data client."""


class AuthenticationError(EolasError):
    """Invalid, missing, or inactive API key (HTTP 401/403)."""


class RateLimitError(EolasError):
    """Monthly request quota exhausted (HTTP 429)."""


class NotFoundError(EolasError):
    """Unknown dataset or endpoint (HTTP 404)."""


class APIError(EolasError):
    """Any other non-200 API response.

    Attributes:
        status_code: HTTP status code of the failed response.
        message: Server-provided detail (or raw response text).
    """

    def __init__(self, status_code: int, message: str):
        self.status_code = status_code
        # Fix: keep the raw detail accessible; previously it was only baked
        # into the formatted str() and could not be inspected by callers.
        self.message = message
        super().__init__(f"HTTP {status_code}: {message}")
|
eolas_data/schedule.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Cross-platform scheduling backend for `eolas schedule add|list|remove`.
|
|
2
|
+
|
|
3
|
+
POSIX (Linux/macOS): edits the user's crontab via `crontab -l` / `crontab -`.
|
|
4
|
+
Windows: uses `schtasks` to create per-user scheduled tasks.
|
|
5
|
+
|
|
6
|
+
Both backends only manage entries tagged with a sentinel so the user's other
|
|
7
|
+
cron jobs / scheduled tasks are never touched.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import csv
|
|
12
|
+
import io
|
|
13
|
+
import platform
|
|
14
|
+
import re
|
|
15
|
+
import shlex
|
|
16
|
+
import shutil
|
|
17
|
+
import subprocess
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
# Tag appended to every crontab line this package owns; entries lacking it are
# never touched, so the user's own cron jobs stay intact.
SENTINEL = "# eolas-schedule:"
TASK_PREFIX = "eolas-"  # Windows task name prefix

# Interval shortcut → cron expression (minute hour dom month dow). Daily/weekly/
# monthly all default to 6am because datasets typically refresh in the early
# hours; running at 6am gets the freshest data without competing for resources.
INTERVALS = {
    "hourly": "0 * * * *",
    "daily": "0 6 * * *",
    "weekly": "0 6 * * 1",  # Monday 6am
    "monthly": "0 6 1 * *",  # 1st of month, 6am
}

# schtasks (/sc value, /st start time) per interval. Custom cron exprs aren't
# supported on the Windows backend — see _windows_add for the fallback message.
WIN_SCHED = {
    "hourly": ("HOURLY", None),
    "daily": ("DAILY", "06:00"),
    "weekly": ("WEEKLY", "06:00"),  # schtasks defaults to today's weekday; _windows_add pins MON
    "monthly": ("MONTHLY", "06:00"),
}

# Loose shape check only: exactly five whitespace-separated fields.
CRON_EXPR_RE = re.compile(r"^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s*$")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class ScheduleEntry:
    """One managed schedule entry, normalised across both backends."""

    name: str
    # Cron expression on POSIX, or a human-readable schedule type on Windows.
    schedule: str
    command: str
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
54
|
+
# Public API — dispatches per OS
|
|
55
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
def is_windows() -> bool:
    """True on a Windows host; selects the schtasks backend over crontab."""
    return platform.system() == "Windows"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def add(name: str, schedule_expr: str, command: str) -> None:
    """Register a scheduled task.

    ``schedule_expr`` is a cron expression on POSIX, or one of
    {'hourly','daily','weekly','monthly'} on Windows.
    """
    backend = _windows_add if is_windows() else _cron_add
    backend(name, schedule_expr, command)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def remove(name: str) -> bool:
    """Remove a managed task. Returns True if removed, False if not found."""
    return _windows_remove(name) if is_windows() else _cron_remove(name)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def list_entries() -> list[ScheduleEntry]:
    """Return every eolas-managed schedule entry on this machine."""
    return _windows_list() if is_windows() else _cron_list()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def interval_to_cron(interval: str) -> str:
    """Map an interval shortcut to its cron expression; raise on unknown."""
    cron = INTERVALS.get(interval)
    if cron is None:
        raise ValueError(f"unknown interval {interval!r}; expected one of {list(INTERVALS)}")
    return cron
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def validate_cron_expr(expr: str) -> None:
    """Shape-check a 5-field cron expression; raises ValueError when malformed."""
    if CRON_EXPR_RE.match(expr):
        return
    raise ValueError(
        f"invalid cron expression {expr!r}; expected 5 fields "
        "(minute hour day-of-month month day-of-week)"
    )
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
101
|
+
# POSIX cron backend
|
|
102
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
def _crontab_available() -> bool:
|
|
105
|
+
return shutil.which("crontab") is not None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _cron_read() -> list[str]:
    """Return the user's crontab as a list of lines; [] when none is set."""
    if not _crontab_available():
        raise RuntimeError(
            "crontab is not installed on this system. "
            "On Debian/Ubuntu: sudo apt-get install cron. On Alpine: apk add busybox-suid."
        )
    proc = subprocess.run(["crontab", "-l"], capture_output=True, text=True)
    if proc.returncode == 0:
        return proc.stdout.splitlines()
    # Many implementations exit non-zero with a "no crontab for <user>"
    # message when the user has never installed one — treat that as empty.
    stderr_lower = (proc.stderr or "").lower()
    if "no crontab" in stderr_lower:
        return []
    raise RuntimeError(f"crontab -l failed: {proc.stderr.strip() or proc.stdout.strip()}")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _cron_write(lines: list[str]) -> None:
    """Install ``lines`` as the user's crontab (newline-terminated)."""
    text = "\n".join(lines).rstrip() + "\n"
    proc = subprocess.run(["crontab", "-"], input=text, text=True, capture_output=True)
    if proc.returncode != 0:
        raise RuntimeError(f"crontab - failed: {proc.stderr.strip()}")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _cron_format_line(name: str, cron_expr: str, command: str) -> str:
    """Render one managed crontab line: '<expr> <command> <SENTINEL> <name>'."""
    return " ".join((cron_expr, command, SENTINEL, name))
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _cron_match_name(line: str, name: str) -> bool:
    """True when *line* is a managed entry whose tag name is exactly *name*.

    Fix: the previous check used ``line.rstrip().endswith(name)``, which also
    matched any managed entry whose name merely ends with *name* — e.g.
    removing "daily" would match (and delete) "cpi-daily". Compare the text
    after the sentinel for exact equality instead.
    """
    if SENTINEL not in line:
        return False
    _, _, tail = line.partition(SENTINEL)
    return tail.strip() == name
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _cron_add(name: str, cron_expr: str, command: str) -> None:
    """Append (or replace) the managed entry *name* in the user's crontab."""
    validate_cron_expr(cron_expr)
    # Drop any existing entry with this name first so re-adding is idempotent.
    kept = [line for line in _cron_read() if not _cron_match_name(line, name)]
    kept.append(_cron_format_line(name, cron_expr, command))
    _cron_write(kept)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _cron_remove(name: str) -> bool:
    """Delete the managed entry *name*; True when something was removed."""
    current = _cron_read()
    remaining = [line for line in current if not _cron_match_name(line, name)]
    if len(remaining) == len(current):
        return False
    _cron_write(remaining)
    return True
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _cron_list() -> list[ScheduleEntry]:
    """Parse the managed entries back out of the user's crontab."""
    entries: list[ScheduleEntry] = []
    for line in _cron_read():
        if SENTINEL not in line:
            continue  # user's own cron job — not ours
        head, _, tail = line.partition(SENTINEL)
        fields = head.strip().split(maxsplit=5)
        if len(fields) < 6:
            continue  # malformed managed line; skip silently
        entries.append(ScheduleEntry(
            name=tail.strip(),
            schedule=" ".join(fields[:5]),
            command=fields[5],
        ))
    return entries
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
176
|
+
# Windows schtasks backend
|
|
177
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
178
|
+
|
|
179
|
+
def _schtasks_available() -> bool:
|
|
180
|
+
return shutil.which("schtasks") is not None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _windows_add(name: str, interval: str, command: str) -> None:
    """Create (or overwrite) a per-user Windows scheduled task via schtasks."""
    if not _schtasks_available():
        raise RuntimeError("schtasks not found — required on Windows for scheduling")
    if interval not in WIN_SCHED:
        raise ValueError(
            f"Windows backend supports interval shortcuts only "
            f"({list(WIN_SCHED)}); got {interval!r}. "
            "Custom cron expressions aren't translatable; use schtasks GUI for advanced cases."
        )
    sc, start_time = WIN_SCHED[interval]
    cmd = [
        "schtasks", "/create",
        "/tn", f"{TASK_PREFIX}{name}",
        "/tr", command,
        "/sc", sc,
        "/f",  # overwrite an existing task so repeated adds are idempotent
    ]
    if start_time:
        cmd += ["/st", start_time]
    if interval == "weekly":
        # Pin weekly runs to Monday, matching the cron backend's "0 6 * * 1".
        cmd += ["/d", "MON"]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"schtasks /create failed: {proc.stderr.strip()}")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _windows_remove(name: str) -> bool:
    """Delete the managed task *name*; True when schtasks removed it."""
    proc = subprocess.run(
        ["schtasks", "/delete", "/tn", f"{TASK_PREFIX}{name}", "/f"],
        capture_output=True, text=True,
    )
    if proc.returncode == 0:
        return True
    # schtasks exits non-zero when the task doesn't exist; treat as "not found".
    combined = (proc.stderr + proc.stdout).lower()
    if "cannot find" in combined:
        return False
    raise RuntimeError(f"schtasks /delete failed: {proc.stderr.strip()}")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _windows_list() -> list[ScheduleEntry]:
    """Query schtasks (verbose CSV) and keep only eolas-prefixed tasks."""
    proc = subprocess.run(
        ["schtasks", "/query", "/fo", "CSV", "/v"],
        capture_output=True, text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"schtasks /query failed: {proc.stderr.strip()}")
    entries: list[ScheduleEntry] = []
    for row in csv.DictReader(io.StringIO(proc.stdout)):
        raw_name = (row.get("TaskName") or "").lstrip("\\").strip()
        if not raw_name.startswith(TASK_PREFIX):
            continue  # someone else's task
        entries.append(ScheduleEntry(
            name=raw_name[len(TASK_PREFIX):],
            schedule=row.get("Schedule Type") or "",
            command=row.get("Task To Run") or "",
        ))
    return entries
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
244
|
+
# Helpers used by cli.py
|
|
245
|
+
# ────────────────────────────────────────────────────────────────────────────
|
|
246
|
+
|
|
247
|
+
def build_command(eolas_path: str, dataset: str, out_path: str,
                  start: Optional[str] = None, end: Optional[str] = None,
                  fmt: str = "csv") -> str:
    """Build the shell-quoted command line embedded in a cron entry."""
    argv = [
        shlex.quote(eolas_path), "get", shlex.quote(dataset),
        "--format", shlex.quote(fmt),
        "--out", shlex.quote(str(out_path)),
    ]
    for flag, value in (("--start", start), ("--end", end)):
        if value:
            argv += [flag, shlex.quote(value)]
    return " ".join(argv)
|