fpu-barometer-admin 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. fpu_barometer_admin/__init__.py +6 -0
  2. fpu_barometer_admin/cli/__init__.py +5 -0
  3. fpu_barometer_admin/cli/commands.py +199 -0
  4. fpu_barometer_admin/cli/deploy.py +719 -0
  5. fpu_barometer_admin/connectors/__init__.py +56 -0
  6. fpu_barometer_admin/connectors/acled_connector.py +77 -0
  7. fpu_barometer_admin/connectors/base_connector.py +60 -0
  8. fpu_barometer_admin/connectors/cpj_connector.py +92 -0
  9. fpu_barometer_admin/connectors/ert_connector.py +134 -0
  10. fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
  11. fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
  12. fpu_barometer_admin/connectors/rr_connector.py +84 -0
  13. fpu_barometer_admin/connectors/static_sources.py +41 -0
  14. fpu_barometer_admin/connectors/vdem_connector.py +165 -0
  15. fpu_barometer_admin/handlers/__init__.py +6 -0
  16. fpu_barometer_admin/handlers/function_app.py +543 -0
  17. fpu_barometer_admin/processors/__init__.py +46 -0
  18. fpu_barometer_admin/processors/acled_processor.py +263 -0
  19. fpu_barometer_admin/processors/base_processor.py +23 -0
  20. fpu_barometer_admin/processors/cpj_processor.py +147 -0
  21. fpu_barometer_admin/processors/ert_processor.py +72 -0
  22. fpu_barometer_admin/processors/gdelt_processor.py +260 -0
  23. fpu_barometer_admin/processors/mfrr_processor.py +327 -0
  24. fpu_barometer_admin/processors/rr_processor.py +208 -0
  25. fpu_barometer_admin/processors/vdem_processor.py +70 -0
  26. fpu_barometer_admin/runners/__init__.py +19 -0
  27. fpu_barometer_admin/runners/definitions.py +159 -0
  28. fpu_barometer_admin/runners/runners.py +291 -0
  29. fpu_barometer_admin/runners/scheduler.py +148 -0
  30. fpu_barometer_admin/runners/seed.py +399 -0
  31. fpu_barometer_admin/schemas/__init__.py +1 -0
  32. fpu_barometer_admin/schemas/event.py +362 -0
  33. fpu_barometer_admin/schemas/predictor.py +418 -0
  34. fpu_barometer_admin/storage/__init__.py +39 -0
  35. fpu_barometer_admin/storage/catalog.py +359 -0
  36. fpu_barometer_admin/storage/factory.py +165 -0
  37. fpu_barometer_admin/storage/objects.py +463 -0
  38. fpu_barometer_admin/storage/reader.py +410 -0
  39. fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
  40. fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
  41. fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
  42. fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
  43. fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
@@ -0,0 +1,165 @@
1
+ """VDEM connector — GitHub release asset style for vdeminstitute/vdemdata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+
8
+ import pyreadr
9
+ import requests
10
+ import pandas as pd
11
+
12
+ from fpu_barometer_admin.connectors.base_connector import (
13
+ NoNewSourceArtifact,
14
+ SourceArtifactSet,
15
+ SourceValidationError,
16
+ )
17
+ from fpu_barometer_admin.storage.objects import ObjectStorage
18
+
19
+
20
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# GitHub repository (owner / name) that the connector reads from.
VDEM_GITHUB_OWNER = "vdeminstitute"
VDEM_GITHUB_REPO = "vdemdata"
# Path of the RData file inside the repository tree at a given tag.
VDEM_RAW_FILE_PATH = "data/vdem.RData"
# GitHub REST API endpoint queried for the latest release metadata.
VDEM_API_LATEST_RELEASE = (
    f"https://api.github.com/repos/{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}/releases/latest"
)
# Base URL for raw file downloads; the connector appends "/<tag>/<file path>".
VDEM_RAW_BASE_URL = (
    f"https://raw.githubusercontent.com/{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}"
)

# Timeout, in seconds, passed to every requests.get call in this module.
VDEM_TIMEOUT_SECONDS = 120
# Object names looked up in the parsed RData; at least one must be present.
VDEM_EXPECTED_PYREADR_KEYS = {"vdem"}
34
+
35
+
36
+ class VdemConnector:
37
+ """Connector for VDEM annual GitHub release RData assets."""
38
+
39
+ dataset = "vdem"
40
+
41
+ def __init__(self, object_storage: ObjectStorage):
42
+ self.object_storage = object_storage
43
+
44
+ def fetch(
45
+ self,
46
+ *,
47
+ run_id: str,
48
+ artifact_id: str,
49
+ since_watermark: str | None = None,
50
+ ) -> SourceArtifactSet:
51
+ release_info = self._latest_release_info()
52
+ tag_name = release_info["tag_name"]
53
+ etag = release_info.get("etag", tag_name)
54
+
55
+ watermark_after = self._compound_watermark(tag_name, etag=etag)
56
+
57
+ if since_watermark is not None and since_watermark == watermark_after:
58
+ raise NoNewSourceArtifact(
59
+ f"VDEM release {tag_name} already processed",
60
+ watermark_after=watermark_after,
61
+ )
62
+
63
+ raw_url = f"{VDEM_RAW_BASE_URL}/{tag_name}/{VDEM_RAW_FILE_PATH}"
64
+ logger.info("Fetching VDEM RData from %s", raw_url)
65
+ response = requests.get(raw_url, timeout=VDEM_TIMEOUT_SECONDS)
66
+ if response.status_code == 404:
67
+ raise SourceValidationError(
68
+ f"VDEM RData not found at {raw_url}",
69
+ metadata={
70
+ "dataset": self.dataset,
71
+ "tag": tag_name,
72
+ "expected_path": VDEM_RAW_FILE_PATH,
73
+ },
74
+ )
75
+ response.raise_for_status()
76
+
77
+ try:
78
+ parsed = pyreadr.read_r(io.BytesIO(response.content))
79
+ except Exception as exc:
80
+ raise SourceValidationError(
81
+ f"Failed to parse VDEM RData: {exc}",
82
+ metadata={"dataset": self.dataset, "tag": tag_name},
83
+ ) from exc
84
+
85
+ matched_keys = VDEM_EXPECTED_PYREADR_KEYS & set(parsed.keys())
86
+ if not matched_keys:
87
+ available_keys = list(parsed.keys())
88
+ raise SourceValidationError(
89
+ f"VDEM RData missing expected key. Available: {available_keys}",
90
+ metadata={
91
+ "dataset": self.dataset,
92
+ "tag": tag_name,
93
+ "expected_keys": list(VDEM_EXPECTED_PYREADR_KEYS),
94
+ "available_keys": available_keys,
95
+ },
96
+ )
97
+
98
+ df = parsed[list(matched_keys)[0]]
99
+ if not isinstance(df, pd.DataFrame) or df.empty:
100
+ raise SourceValidationError(
101
+ "VDEM RData produced empty DataFrame",
102
+ metadata={"dataset": self.dataset, "tag": tag_name},
103
+ )
104
+
105
+ df = self._storage_normalized(df)
106
+
107
+ logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/vdem.parquet"
108
+ self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)
109
+ return SourceArtifactSet(
110
+ dataset=self.dataset,
111
+ artifact_id=artifact_id,
112
+ logical_path=logical_path,
113
+ watermark_after=watermark_after,
114
+ )
115
+
116
+ def _storage_normalized(self, df: pd.DataFrame) -> pd.DataFrame:
117
+ """Normalize RData-loaded columns so they survive Parquet round-trip."""
118
+
119
+ normalized = df.copy()
120
+ for column in normalized.select_dtypes(include="object").columns:
121
+ normalized[column] = normalized[column].map(self._source_value)
122
+ return normalized
123
+
124
+ def _source_value(self, value: object) -> object:
125
+ if value is None:
126
+ return None
127
+ try:
128
+ if pd.isna(value):
129
+ return None
130
+ except (TypeError, ValueError):
131
+ pass
132
+ if isinstance(value, (list, tuple, dict)):
133
+ import json
134
+
135
+ return json.dumps(value, ensure_ascii=False)
136
+ return str(value)
137
+
138
+ def _latest_release_info(self) -> dict:
139
+ """Fetch the latest GitHub release metadata."""
140
+
141
+ response = requests.get(
142
+ VDEM_API_LATEST_RELEASE, timeout=VDEM_TIMEOUT_SECONDS
143
+ )
144
+ if response.status_code == 404:
145
+ raise SourceValidationError(
146
+ "VDEM GitHub repository not found or has no releases",
147
+ metadata={"dataset": self.dataset, "repo": f"{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}"},
148
+ )
149
+ response.raise_for_status()
150
+ info = response.json()
151
+ if "tag_name" not in info:
152
+ raise SourceValidationError(
153
+ "VDEM GitHub release response missing tag_name",
154
+ metadata={
155
+ "dataset": self.dataset,
156
+ "response_keys": list(info.keys()),
157
+ },
158
+ )
159
+ info["etag"] = response.headers.get("etag", "")
160
+ return info
161
+
162
+ def _compound_watermark(self, tag_name: str, *, etag: str) -> str:
163
+ """Build a compound watermark from release identity."""
164
+
165
+ return f"{VDEM_GITHUB_OWNER}/{VDEM_GITHUB_REPO}@{tag_name}:{VDEM_RAW_FILE_PATH}:{etag}"
@@ -0,0 +1,6 @@
1
+ """Backend HTTP handler adapters.
2
+
3
+ Azure Functions and local development HTTP adapters belong here and should stay
4
+ thin: parse shared fpu_barometer models, read through storage, and serialize shared fpu_barometer
5
+ response models.
6
+ """