fpu-barometer-admin 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. fpu_barometer_admin/__init__.py +6 -0
  2. fpu_barometer_admin/cli/__init__.py +5 -0
  3. fpu_barometer_admin/cli/commands.py +199 -0
  4. fpu_barometer_admin/cli/deploy.py +719 -0
  5. fpu_barometer_admin/connectors/__init__.py +56 -0
  6. fpu_barometer_admin/connectors/acled_connector.py +77 -0
  7. fpu_barometer_admin/connectors/base_connector.py +60 -0
  8. fpu_barometer_admin/connectors/cpj_connector.py +92 -0
  9. fpu_barometer_admin/connectors/ert_connector.py +134 -0
  10. fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
  11. fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
  12. fpu_barometer_admin/connectors/rr_connector.py +84 -0
  13. fpu_barometer_admin/connectors/static_sources.py +41 -0
  14. fpu_barometer_admin/connectors/vdem_connector.py +165 -0
  15. fpu_barometer_admin/handlers/__init__.py +6 -0
  16. fpu_barometer_admin/handlers/function_app.py +543 -0
  17. fpu_barometer_admin/processors/__init__.py +46 -0
  18. fpu_barometer_admin/processors/acled_processor.py +263 -0
  19. fpu_barometer_admin/processors/base_processor.py +23 -0
  20. fpu_barometer_admin/processors/cpj_processor.py +147 -0
  21. fpu_barometer_admin/processors/ert_processor.py +72 -0
  22. fpu_barometer_admin/processors/gdelt_processor.py +260 -0
  23. fpu_barometer_admin/processors/mfrr_processor.py +327 -0
  24. fpu_barometer_admin/processors/rr_processor.py +208 -0
  25. fpu_barometer_admin/processors/vdem_processor.py +70 -0
  26. fpu_barometer_admin/runners/__init__.py +19 -0
  27. fpu_barometer_admin/runners/definitions.py +159 -0
  28. fpu_barometer_admin/runners/runners.py +291 -0
  29. fpu_barometer_admin/runners/scheduler.py +148 -0
  30. fpu_barometer_admin/runners/seed.py +399 -0
  31. fpu_barometer_admin/schemas/__init__.py +1 -0
  32. fpu_barometer_admin/schemas/event.py +362 -0
  33. fpu_barometer_admin/schemas/predictor.py +418 -0
  34. fpu_barometer_admin/storage/__init__.py +39 -0
  35. fpu_barometer_admin/storage/catalog.py +359 -0
  36. fpu_barometer_admin/storage/factory.py +165 -0
  37. fpu_barometer_admin/storage/objects.py +463 -0
  38. fpu_barometer_admin/storage/reader.py +410 -0
  39. fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
  40. fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
  41. fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
  42. fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
  43. fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
@@ -0,0 +1,56 @@
1
+ """Source data connectors for FPU admin."""
2
+
3
+ from .base_connector import (
4
+ BaseConnector,
5
+ NoNewSourceArtifact,
6
+ SourceArtifact,
7
+ SourceArtifactSet,
8
+ SourceValidationError,
9
+ )
10
+
11
+ __all__ = [
12
+ "AcledConnector",
13
+ "BaseConnector",
14
+ "CpjConnector",
15
+ "ErtConnector",
16
+ "GdeltConnector",
17
+ "MFRRConnector",
18
+ "NoNewSourceArtifact",
19
+ "RrConnector",
20
+ "SourceArtifact",
21
+ "SourceArtifactSet",
22
+ "SourceValidationError",
23
+ "VdemConnector",
24
+ ]
25
+
26
+
27
# Lazily exported connector classes, keyed by public name and mapped to the
# sibling module (relative to this package) that defines each one.
_LAZY_CONNECTOR_MODULES = {
    "AcledConnector": "acled_connector",
    "CpjConnector": "cpj_connector",
    "ErtConnector": "ert_connector",
    "MFRRConnector": "mfrr_connector",
    "GdeltConnector": "gdelt_connector",
    "RrConnector": "rr_connector",
    "VdemConnector": "vdem_connector",
}


def __getattr__(name: str):
    """Resolve a connector class on first attribute access (PEP 562 hook).

    The concrete connector modules are imported only when one of their
    classes is requested from this package. The resolved class is cached in
    the module namespace, so later lookups of the same name no longer reach
    this hook.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The connector class registered under ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported
            connector classes.
    """
    module_name = _LAZY_CONNECTOR_MODULES.get(name)
    if module_name is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported locally so the block does not depend on a file-level import.
    from importlib import import_module

    # For a package ``__init__`` the module name is also the anchor package
    # for relative imports.
    connector = getattr(import_module(f".{module_name}", __name__), name)
    globals()[name] = connector
    return connector
@@ -0,0 +1,77 @@
1
+ """ACLED connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+
10
+ from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
11
+ from fpu_barometer_admin.connectors.static_sources import static_source_for_dataset
12
+ from fpu_barometer_admin.storage.objects import ObjectStorage
13
+
14
+
15
class AcledConnector:
    """Republish the static ACLED Event source file as a Source Artifact.

    The connector reads a CSV from object storage, checks that the expected
    columns are present and writes the table back as a parquet artifact.
    """

    dataset = "acled"
    # Columns the static CSV must contain to be accepted.
    required_columns = {
        "event_id",
        "event_date",
        "year",
        "event_type",
        "country",
        "latitude",
        "longitude",
        "fatalities",
    }

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Load the static ACLED CSV from logical storage and store it as an artifact.

        ``since_watermark`` is accepted for interface compatibility and is not
        consulted here.
        """

        static_file = static_source_for_dataset(self.dataset)
        raw_csv = self.object_storage.read_bytes(static_file.logical_path)
        frame = pd.read_csv(BytesIO(raw_csv), low_memory=False)
        # The artifact file is named after the static source file (without suffix).
        stem = Path(static_file.logical_path).stem
        return self._write_source_artifact(
            frame,
            source_name=stem,
            run_id=run_id,
            artifact_id=artifact_id,
        )

    def _write_source_artifact(
        self,
        df: pd.DataFrame,
        *,
        source_name: str,
        run_id: str,
        artifact_id: str,
    ) -> SourceArtifactSet:
        """Validate ``df`` and write it under the artifact's logical path.

        Raises:
            SourceValidationError: If any required column is absent.
        """
        absent = sorted(self.required_columns - set(df.columns))
        if absent:
            raise SourceValidationError(
                f"ACLED static source missing required columns: {absent}",
                metadata={"dataset": self.dataset, "missing_columns": absent},
            )

        target = f"source_artifacts/{self.dataset}/{artifact_id}/{source_name}.parquet"
        # fail_if_exists=True presumably refuses to overwrite an existing
        # artifact -- confirm against ObjectStorage.write_dataframe.
        self.object_storage.write_dataframe(target, df, fail_if_exists=True)

        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=target,
        )
@@ -0,0 +1,60 @@
1
+ """Connector interfaces and Source Artifact Set models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Protocol
7
+
8
+
9
class SourceValidationError(ValueError):
    """Signals that a Connector could not accept a source as a Source Artifact.

    The optional ``metadata`` mapping is copied onto the exception so callers
    can inspect structured details next to the message.
    """

    def __init__(self, message: str, *, metadata: dict[str, Any] | None = None):
        # Always store a fresh dict: later mutation of the caller's mapping
        # must not leak into the exception, and a missing mapping becomes {}.
        self.metadata = dict(metadata) if metadata else {}
        super().__init__(message)
15
+
16
+
17
class NoNewSourceArtifact(RuntimeError):
    """Signals that an incremental Connector found nothing new to ingest.

    ``watermark_after`` carries the watermark the connector observed, or
    ``None`` when none was supplied.
    """

    def __init__(self, message: str, *, watermark_after: str | None = None):
        self.watermark_after = watermark_after
        super().__init__(message)
23
+
24
+
25
@dataclass(frozen=True)
class SourceArtifactSet:
    """Immutable reference to the Source Artifact accepted for one dataset update."""

    # Name of the dataset the artifact belongs to.
    dataset: str
    # Identifier of this artifact.
    artifact_id: str
    # Logical storage path where the artifact was written.
    logical_path: str
    # Watermark reached by this fetch; None when the connector reports none.
    watermark_after: str | None = None
33
+
34
+
35
class BaseConnector(Protocol):
    """Structural interface every connector is expected to satisfy."""

    # Name of the dataset the connector produces artifacts for.
    dataset: str

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Fetch and validate source data and return an explicit artifact reference.

        All arguments are keyword-only; ``since_watermark`` defaults to
        ``None``.
        """
        ...
48
+
49
+
50
# Legacy alias: older imports written before the tracer refactor still refer
# to SourceArtifact, so the name stays bound to the same class.
SourceArtifact = SourceArtifactSet
52
+
53
+
54
+ __all__ = [
55
+ "BaseConnector",
56
+ "NoNewSourceArtifact",
57
+ "SourceArtifact",
58
+ "SourceArtifactSet",
59
+ "SourceValidationError",
60
+ ]
@@ -0,0 +1,92 @@
1
+ """CPJ (Committee to Protect Journalists) connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import requests
8
+ import pandas as pd
9
+
10
+ from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
11
+ from fpu_barometer_admin.storage.objects import ObjectStorage
12
+
13
logger = logging.getLogger(__name__)

CPJ_API_BASE = "https://cpj.org/wp-json/cpj-datamanager/v1"
CPJ_PEOPLE_LIST_ENDPOINT = f"{CPJ_API_BASE}/people_list"
CPJ_TIMEOUT_SECONDS = 120
# NOTE(review): sent as the "limit" query parameter. Whether the API truncates
# at this value in export mode is not visible from this file -- confirm the
# dataset cannot silently exceed it.
CPJ_PAGE_SIZE = 10000

# Columns required in every CPJ API response row
CPJ_REQUIRED_COLUMNS = {
    "fullName",
    "organizations",
    "location",
    "status",
    "startDisplay",
    "type",
    "mtpage",
}


class CpjConnector:
    """Connector for the CPJ (Committee to Protect Journalists) data API."""

    dataset = "cpj"
    required_columns = CPJ_REQUIRED_COLUMNS

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Fetch all CPJ records in one API call and store them as an artifact.

        The request uses the API's CSV-export mode, which returns JSON with
        all records, bypassing rate limits. ``run_id`` and ``since_watermark``
        are accepted for interface compatibility and are not consulted here.

        Returns:
            Reference to the parquet artifact that was written.

        Raises:
            SourceValidationError: If the API returns no rows or the rows lack
                a required column.
            requests.HTTPError: If the API responds with an error status.
        """
        rows = self._download_rows()
        if not rows:
            raise SourceValidationError(
                "No CPJ data returned",
                metadata={"dataset": self.dataset},
            )

        df = self._build_frame(rows)

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/cpj.parquet"
        self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)

        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
        )

    def _download_rows(self) -> list[dict]:
        """Request the people list and return the rows under the "data" key."""
        response = requests.get(
            CPJ_PEOPLE_LIST_ENDPOINT,
            params={
                "export": "csv",
                "limit": str(CPJ_PAGE_SIZE),
            },
            timeout=CPJ_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.json().get("data", [])

    def _build_frame(self, rows: list[dict]) -> pd.DataFrame:
        """Turn API rows into a validated DataFrame with normalized status."""
        df = pd.DataFrame(rows)

        # Validate before touching any column: reading df["status"] first
        # would turn a missing column into a bare KeyError instead of the
        # connector's own validation error.
        missing = self.required_columns - set(df.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"CPJ data missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )

        # Fill empty status fields with "Unknown"
        df["status"] = df["status"].fillna("").replace("", "Unknown")
        return df
@@ -0,0 +1,134 @@
1
+ """ERT connector — GitHub release asset style for vdeminstitute/ERT."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+
8
+ import pandas as pd
9
+ import requests
10
+
11
+ from fpu_barometer_admin.connectors.base_connector import (
12
+ NoNewSourceArtifact,
13
+ SourceArtifactSet,
14
+ SourceValidationError,
15
+ )
16
+ from fpu_barometer_admin.storage.objects import ObjectStorage
17
+
18
+
19
logger = logging.getLogger(__name__)

ERT_GITHUB_OWNER = "vdeminstitute"
ERT_GITHUB_REPO = "ERT"
# Path of the CSV inside the repository, fetched at the release tag.
ERT_RAW_FILE_PATH = "inst/ert.csv"
ERT_API_LATEST_RELEASE = (
    f"https://api.github.com/repos/{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}/releases/latest"
)
ERT_RAW_BASE_URL = (
    f"https://raw.githubusercontent.com/{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}"
)

ERT_TIMEOUT_SECONDS = 120
ERT_REQUIRED_COLUMNS = {"country_text_id", "country_name", "year"}


class ErtConnector:
    """Connector for the ERT CSV published with each vdeminstitute/ERT GitHub release."""

    dataset = "ert"

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Download the ERT CSV of the latest release and store it as an artifact.

        Args:
            run_id: Accepted for interface compatibility; not consulted here.
            artifact_id: Identifier used in the artifact's logical path.
            since_watermark: Watermark of the last processed release, if any.

        Returns:
            Reference to the written parquet artifact, including the new
            watermark.

        Raises:
            NoNewSourceArtifact: If the latest release matches
                ``since_watermark``.
            SourceValidationError: If the release or CSV is missing, empty, or
                lacks a required column.
            requests.HTTPError: For other HTTP error statuses.
        """
        release_info = self._latest_release_info()
        tag_name = release_info["tag_name"]
        # _latest_release_info always sets "etag", using "" when the header is
        # absent, so a dict default would never apply; `or` makes the tag-name
        # fallback effective.
        # NOTE(review): the ETag belongs to the release API response, not to
        # the CSV -- confirm it is stable enough to be part of the watermark.
        etag = release_info.get("etag") or tag_name

        watermark_after = self._compound_watermark(tag_name, etag=etag)

        if since_watermark is not None and since_watermark == watermark_after:
            raise NoNewSourceArtifact(
                f"ERT release {tag_name} already processed",
                watermark_after=watermark_after,
            )

        raw_url = f"{ERT_RAW_BASE_URL}/{tag_name}/{ERT_RAW_FILE_PATH}"
        logger.info("Fetching ERT CSV from %s", raw_url)
        response = requests.get(raw_url, timeout=ERT_TIMEOUT_SECONDS)
        if response.status_code == 404:
            raise SourceValidationError(
                f"ERT CSV not found at {raw_url}",
                metadata={
                    "dataset": self.dataset,
                    "tag": tag_name,
                    "expected_path": ERT_RAW_FILE_PATH,
                },
            )
        response.raise_for_status()

        try:
            df = pd.read_csv(io.BytesIO(response.content), low_memory=False)
        except pd.errors.EmptyDataError as exc:
            # A zero-byte body has no header row to parse; report it the same
            # way as a header-only file instead of leaking a pandas error.
            raise SourceValidationError(
                "ERT CSV produced empty DataFrame",
                metadata={"dataset": self.dataset, "tag": tag_name},
            ) from exc
        if df.empty:
            raise SourceValidationError(
                "ERT CSV produced empty DataFrame",
                metadata={"dataset": self.dataset, "tag": tag_name},
            )

        missing = ERT_REQUIRED_COLUMNS - set(df.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"ERT source missing required columns: {missing_columns}",
                metadata={
                    "dataset": self.dataset,
                    "tag": tag_name,
                    "missing_columns": missing_columns,
                },
            )

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/ert.parquet"
        self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
            watermark_after=watermark_after,
        )

    def _latest_release_info(self) -> dict:
        """Fetch the latest GitHub release metadata.

        Returns:
            The decoded release JSON with an added ``"etag"`` key holding the
            response's ETag header, or ``""`` when the header is absent.

        Raises:
            SourceValidationError: If the endpoint answers 404 or the response
                has no ``tag_name``.
            requests.HTTPError: For other HTTP error statuses.
        """

        response = requests.get(
            ERT_API_LATEST_RELEASE, timeout=ERT_TIMEOUT_SECONDS
        )
        if response.status_code == 404:
            raise SourceValidationError(
                "ERT GitHub repository not found or has no releases",
                metadata={
                    "dataset": self.dataset,
                    "repo": f"{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}",
                },
            )
        response.raise_for_status()
        info = response.json()
        if "tag_name" not in info:
            raise SourceValidationError(
                "ERT GitHub release response missing tag_name",
                metadata={
                    "dataset": self.dataset,
                    "response_keys": list(info.keys()),
                },
            )
        info["etag"] = response.headers.get("etag", "")
        return info

    def _compound_watermark(self, tag_name: str, *, etag: str) -> str:
        """Build a compound watermark from repository, tag, file path and ETag."""

        return f"{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}@{tag_name}:{ERT_RAW_FILE_PATH}:{etag}"