fpu-barometer-admin 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fpu_barometer_admin/__init__.py +6 -0
- fpu_barometer_admin/cli/__init__.py +5 -0
- fpu_barometer_admin/cli/commands.py +199 -0
- fpu_barometer_admin/cli/deploy.py +719 -0
- fpu_barometer_admin/connectors/__init__.py +56 -0
- fpu_barometer_admin/connectors/acled_connector.py +77 -0
- fpu_barometer_admin/connectors/base_connector.py +60 -0
- fpu_barometer_admin/connectors/cpj_connector.py +92 -0
- fpu_barometer_admin/connectors/ert_connector.py +134 -0
- fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
- fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
- fpu_barometer_admin/connectors/rr_connector.py +84 -0
- fpu_barometer_admin/connectors/static_sources.py +41 -0
- fpu_barometer_admin/connectors/vdem_connector.py +165 -0
- fpu_barometer_admin/handlers/__init__.py +6 -0
- fpu_barometer_admin/handlers/function_app.py +543 -0
- fpu_barometer_admin/processors/__init__.py +46 -0
- fpu_barometer_admin/processors/acled_processor.py +263 -0
- fpu_barometer_admin/processors/base_processor.py +23 -0
- fpu_barometer_admin/processors/cpj_processor.py +147 -0
- fpu_barometer_admin/processors/ert_processor.py +72 -0
- fpu_barometer_admin/processors/gdelt_processor.py +260 -0
- fpu_barometer_admin/processors/mfrr_processor.py +327 -0
- fpu_barometer_admin/processors/rr_processor.py +208 -0
- fpu_barometer_admin/processors/vdem_processor.py +70 -0
- fpu_barometer_admin/runners/__init__.py +19 -0
- fpu_barometer_admin/runners/definitions.py +159 -0
- fpu_barometer_admin/runners/runners.py +291 -0
- fpu_barometer_admin/runners/scheduler.py +148 -0
- fpu_barometer_admin/runners/seed.py +399 -0
- fpu_barometer_admin/schemas/__init__.py +1 -0
- fpu_barometer_admin/schemas/event.py +362 -0
- fpu_barometer_admin/schemas/predictor.py +418 -0
- fpu_barometer_admin/storage/__init__.py +39 -0
- fpu_barometer_admin/storage/catalog.py +359 -0
- fpu_barometer_admin/storage/factory.py +165 -0
- fpu_barometer_admin/storage/objects.py +463 -0
- fpu_barometer_admin/storage/reader.py +410 -0
- fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
- fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
- fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
- fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
- fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""VDEM connector — GitHub release asset style for vdeminstitute/vdemdata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
import pyreadr
|
|
9
|
+
import requests
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from fpu_barometer_admin.connectors.base_connector import (
|
|
13
|
+
NoNewSourceArtifact,
|
|
14
|
+
SourceArtifactSet,
|
|
15
|
+
SourceValidationError,
|
|
16
|
+
)
|
|
17
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)

# Coordinates of the upstream GitHub repository and of the RData file in it.
VDEM_GITHUB_OWNER = "vdeminstitute"
VDEM_GITHUB_REPO = "vdemdata"
VDEM_RAW_FILE_PATH = "data/vdem.RData"

# GitHub REST endpoint describing the most recent release of the repository.
VDEM_API_LATEST_RELEASE = (
    "https://api.github.com/repos/"
    f"{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}/releases/latest"
)

# Prefix for raw file downloads; "/<tag>/<path>" is appended by the connector.
VDEM_RAW_BASE_URL = (
    "https://raw.githubusercontent.com/"
    f"{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}"
)

# Timeout, in seconds, passed to every HTTP request made by this module.
VDEM_TIMEOUT_SECONDS = 120

# Object names accepted as the VDEM table inside the parsed RData file.
VDEM_EXPECTED_PYREADR_KEYS = {"vdem"}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class VdemConnector:
    """Connector for VDEM annual GitHub release RData assets.

    The connector looks up the latest GitHub release of the VDEM data
    repository, downloads the RData file at that release tag, converts the
    contained table to a DataFrame and writes it to object storage.
    """

    dataset = "vdem"

    def __init__(self, object_storage: ObjectStorage):
        """Store the object storage used to persist fetched artifacts."""
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Download the latest VDEM release and store it as Parquet.

        Args:
            run_id: Identifier of the current run. Not used by this method.
            artifact_id: Identifier used to build the storage path of the
                written artifact.
            since_watermark: Watermark of the previously processed release,
                or ``None`` when nothing has been processed yet.

        Returns:
            A ``SourceArtifactSet`` describing the written artifact and the
            watermark of the release it was built from.

        Raises:
            NoNewSourceArtifact: The latest release's watermark equals
                ``since_watermark``.
            SourceValidationError: The RData file is missing (HTTP 404),
                cannot be parsed, lacks the expected object, or yields an
                empty table.
            requests.HTTPError: Any other unsuccessful HTTP status.
        """
        release_info = self._latest_release_info()
        tag_name = release_info["tag_name"]
        # NOTE(review): the ETag belongs to the release-metadata response,
        # not to the RData file itself -- confirm it is stable between runs.
        # `_latest_release_info` always sets "etag" (possibly to ""), so the
        # `tag_name` default below only applies to other implementations.
        etag = release_info.get("etag", tag_name)

        watermark_after = self._compound_watermark(tag_name, etag=etag)

        # The watermark is always a str, so a None `since_watermark` can
        # never compare equal and needs no separate guard.
        if since_watermark == watermark_after:
            raise NoNewSourceArtifact(
                f"VDEM release {tag_name} already processed",
                watermark_after=watermark_after,
            )

        raw_url = f"{VDEM_RAW_BASE_URL}/{tag_name}/{VDEM_RAW_FILE_PATH}"
        logger.info("Fetching VDEM RData from %s", raw_url)
        response = requests.get(raw_url, timeout=VDEM_TIMEOUT_SECONDS)
        if response.status_code == 404:
            raise SourceValidationError(
                f"VDEM RData not found at {raw_url}",
                metadata={
                    "dataset": self.dataset,
                    "tag": tag_name,
                    "expected_path": VDEM_RAW_FILE_PATH,
                },
            )
        response.raise_for_status()

        try:
            parsed = pyreadr.read_r(io.BytesIO(response.content))
        except Exception as exc:
            # Any parser failure is reported as a validation problem of the
            # source, with the original exception chained for debugging.
            raise SourceValidationError(
                f"Failed to parse VDEM RData: {exc}",
                metadata={"dataset": self.dataset, "tag": tag_name},
            ) from exc

        matched_keys = VDEM_EXPECTED_PYREADR_KEYS & set(parsed.keys())
        if not matched_keys:
            available_keys = list(parsed.keys())
            raise SourceValidationError(
                f"VDEM RData missing expected key. Available: {available_keys}",
                metadata={
                    "dataset": self.dataset,
                    "tag": tag_name,
                    "expected_keys": list(VDEM_EXPECTED_PYREADR_KEYS),
                    "available_keys": available_keys,
                },
            )

        # Sort so the choice is deterministic should several keys match.
        df = parsed[sorted(matched_keys)[0]]
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise SourceValidationError(
                "VDEM RData produced empty DataFrame",
                metadata={"dataset": self.dataset, "tag": tag_name},
            )

        df = self._storage_normalized(df)

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/vdem.parquet"
        self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
            watermark_after=watermark_after,
        )

    def _storage_normalized(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize RData-loaded columns so they survive Parquet round-trip.

        Returns a copy of ``df`` in which every value of every object-dtype
        column has been passed through ``_source_value``. The input frame is
        left untouched.
        """
        normalized = df.copy()
        for column in normalized.select_dtypes(include="object").columns:
            normalized[column] = normalized[column].map(self._source_value)
        return normalized

    def _source_value(self, value: object) -> object:
        """Convert one object-column cell to ``None`` or a string.

        Lists, tuples and dicts are JSON-encoded, scalar missing values
        (``None``, NaN, ``pd.NA``, ``pd.NaT``) become ``None`` and everything
        else is converted with ``str``.
        """
        if value is None:
            return None
        # Containers are handled before the missing-value test: `pd.isna`
        # works element-wise on them, and the truthiness of the resulting
        # array would turn a one-element container such as [nan] into None.
        if isinstance(value, (list, tuple, dict)):
            import json

            return json.dumps(value, ensure_ascii=False)
        # Restrict the missing-value test to scalars so array-like cells
        # cannot trigger an ambiguous truth-value evaluation.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return str(value)

    def _latest_release_info(self) -> dict:
        """Fetch the latest GitHub release metadata.

        Returns:
            The decoded JSON object of the release, with the response's
            ``etag`` header (or ``""`` when absent) stored under ``"etag"``.

        Raises:
            SourceValidationError: The endpoint answers 404, or the payload
                is not a JSON object with a non-empty string ``tag_name``.
            requests.HTTPError: Any other unsuccessful HTTP status.
        """
        response = requests.get(
            VDEM_API_LATEST_RELEASE, timeout=VDEM_TIMEOUT_SECONDS
        )
        if response.status_code == 404:
            raise SourceValidationError(
                "VDEM GitHub repository not found or has no releases",
                metadata={"dataset": self.dataset, "repo": f"{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}"},
            )
        response.raise_for_status()
        info = response.json()
        # A null/empty tag or a non-object payload would otherwise surface
        # later as a download URL containing "None" or as a TypeError.
        tag_name = info.get("tag_name") if isinstance(info, dict) else None
        if not isinstance(tag_name, str) or not tag_name:
            raise SourceValidationError(
                "VDEM GitHub release response missing tag_name",
                metadata={
                    "dataset": self.dataset,
                    "response_keys": list(info) if isinstance(info, dict) else [],
                },
            )
        info["etag"] = response.headers.get("etag", "")
        return info

    def _compound_watermark(self, tag_name: str, *, etag: str) -> str:
        """Build a compound watermark from release identity.

        The watermark combines the repository, the release tag, the path of
        the RData file and the given ``etag`` into a single string.
        """
        return f"{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}@{tag_name}:{VDEM_RAW_FILE_PATH}:{etag}"
|