fpu-barometer-admin 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fpu_barometer_admin/__init__.py +6 -0
- fpu_barometer_admin/cli/__init__.py +5 -0
- fpu_barometer_admin/cli/commands.py +199 -0
- fpu_barometer_admin/cli/deploy.py +719 -0
- fpu_barometer_admin/connectors/__init__.py +56 -0
- fpu_barometer_admin/connectors/acled_connector.py +77 -0
- fpu_barometer_admin/connectors/base_connector.py +60 -0
- fpu_barometer_admin/connectors/cpj_connector.py +92 -0
- fpu_barometer_admin/connectors/ert_connector.py +134 -0
- fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
- fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
- fpu_barometer_admin/connectors/rr_connector.py +84 -0
- fpu_barometer_admin/connectors/static_sources.py +41 -0
- fpu_barometer_admin/connectors/vdem_connector.py +165 -0
- fpu_barometer_admin/handlers/__init__.py +6 -0
- fpu_barometer_admin/handlers/function_app.py +543 -0
- fpu_barometer_admin/processors/__init__.py +46 -0
- fpu_barometer_admin/processors/acled_processor.py +263 -0
- fpu_barometer_admin/processors/base_processor.py +23 -0
- fpu_barometer_admin/processors/cpj_processor.py +147 -0
- fpu_barometer_admin/processors/ert_processor.py +72 -0
- fpu_barometer_admin/processors/gdelt_processor.py +260 -0
- fpu_barometer_admin/processors/mfrr_processor.py +327 -0
- fpu_barometer_admin/processors/rr_processor.py +208 -0
- fpu_barometer_admin/processors/vdem_processor.py +70 -0
- fpu_barometer_admin/runners/__init__.py +19 -0
- fpu_barometer_admin/runners/definitions.py +159 -0
- fpu_barometer_admin/runners/runners.py +291 -0
- fpu_barometer_admin/runners/scheduler.py +148 -0
- fpu_barometer_admin/runners/seed.py +399 -0
- fpu_barometer_admin/schemas/__init__.py +1 -0
- fpu_barometer_admin/schemas/event.py +362 -0
- fpu_barometer_admin/schemas/predictor.py +418 -0
- fpu_barometer_admin/storage/__init__.py +39 -0
- fpu_barometer_admin/storage/catalog.py +359 -0
- fpu_barometer_admin/storage/factory.py +165 -0
- fpu_barometer_admin/storage/objects.py +463 -0
- fpu_barometer_admin/storage/reader.py +410 -0
- fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
- fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
- fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
- fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
- fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Source data connectors for FPU admin."""
|
|
2
|
+
|
|
3
|
+
from .base_connector import (
|
|
4
|
+
BaseConnector,
|
|
5
|
+
NoNewSourceArtifact,
|
|
6
|
+
SourceArtifact,
|
|
7
|
+
SourceArtifactSet,
|
|
8
|
+
SourceValidationError,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AcledConnector",
|
|
13
|
+
"BaseConnector",
|
|
14
|
+
"CpjConnector",
|
|
15
|
+
"ErtConnector",
|
|
16
|
+
"GdeltConnector",
|
|
17
|
+
"MFRRConnector",
|
|
18
|
+
"NoNewSourceArtifact",
|
|
19
|
+
"RrConnector",
|
|
20
|
+
"SourceArtifact",
|
|
21
|
+
"SourceArtifactSet",
|
|
22
|
+
"SourceValidationError",
|
|
23
|
+
"VdemConnector",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def __getattr__(name: str):
    """Resolve connector classes lazily on first attribute access.

    Each concrete connector lives in its own submodule; the submodule is
    imported only when the matching class name is requested from this
    package, so importing the package does not import every connector.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported
            connector class names.
    """
    if name == "AcledConnector":
        from .acled_connector import AcledConnector as resolved
    elif name == "CpjConnector":
        from .cpj_connector import CpjConnector as resolved
    elif name == "ErtConnector":
        from .ert_connector import ErtConnector as resolved
    elif name == "MFRRConnector":
        from .mfrr_connector import MFRRConnector as resolved
    elif name == "GdeltConnector":
        from .gdelt_connector import GdeltConnector as resolved
    elif name == "RrConnector":
        from .rr_connector import RrConnector as resolved
    elif name == "VdemConnector":
        from .vdem_connector import VdemConnector as resolved
    else:
        # Unknown name: mirror the standard module attribute error.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return resolved
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""ACLED connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
|
|
11
|
+
from fpu_barometer_admin.connectors.static_sources import static_source_for_dataset
|
|
12
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AcledConnector:
    """Connector that turns the static ACLED source file into a Source Artifact.

    The source location comes from ``static_source_for_dataset(dataset)``;
    the file is read through the injected object storage, parsed as CSV and
    re-written as a parquet artifact.
    """

    dataset = "acled"

    # Columns that must be present in the parsed CSV for it to be accepted.
    required_columns = {
        "event_id",
        "event_date",
        "year",
        "event_type",
        "country",
        "latitude",
        "longitude",
        "fatalities",
    }

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Read the static ACLED CSV from storage and store it as an artifact.

        ``since_watermark`` is accepted for interface compatibility but is
        not consulted here. The artifact file name is the stem of the
        static source's logical path.
        """
        static_file = static_source_for_dataset(self.dataset)
        raw_bytes = self.object_storage.read_bytes(static_file.logical_path)
        frame = pd.read_csv(BytesIO(raw_bytes), low_memory=False)
        stem = Path(static_file.logical_path).stem
        return self._write_source_artifact(
            frame,
            source_name=stem,
            run_id=run_id,
            artifact_id=artifact_id,
        )

    def _write_source_artifact(
        self,
        df: pd.DataFrame,
        *,
        source_name: str,
        run_id: str,
        artifact_id: str,
    ) -> SourceArtifactSet:
        """Validate ``df`` and write it under ``source_artifacts/acled/...``.

        Raises:
            SourceValidationError: when any of ``required_columns`` is
                absent; the sorted missing names are included in both the
                message and the error metadata.
        """
        missing_columns = sorted(self.required_columns.difference(df.columns))
        if missing_columns:
            raise SourceValidationError(
                f"ACLED static source missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )

        target = f"source_artifacts/{self.dataset}/{artifact_id}/{source_name}.parquet"
        # NOTE(review): fail_if_exists=True presumably refuses to overwrite an
        # existing artifact — confirm against ObjectStorage.write_dataframe.
        self.object_storage.write_dataframe(target, df, fail_if_exists=True)

        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=target,
        )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Connector interfaces and Source Artifact Set models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Protocol
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SourceValidationError(ValueError):
    """Signal that a Connector rejected a source during validation.

    Attributes:
        metadata: a shallow copy of the mapping supplied by the raiser
            (empty dict when none was given).
    """

    def __init__(self, message: str, *, metadata: dict[str, Any] | None = None):
        super().__init__(message)
        # Copy so later mutation of the caller's dict does not leak in.
        self.metadata = dict(metadata) if metadata else {}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NoNewSourceArtifact(RuntimeError):
    """Signal that an incremental Connector found nothing new to ingest.

    Attributes:
        watermark_after: the watermark value supplied by the raiser, or
            ``None`` when it gave none.
    """

    def __init__(self, message: str, *, watermark_after: str | None = None):
        RuntimeError.__init__(self, message)
        self.watermark_after = watermark_after
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class SourceArtifactSet:
    """Immutable pointer to the Source Artifact accepted for a dataset update."""

    # Dataset identifier, e.g. the connector's ``dataset`` attribute.
    dataset: str
    # Identifier of the artifact this reference belongs to.
    artifact_id: str
    # Logical storage path at which the artifact was written.
    logical_path: str
    # Watermark reported by the connector after the fetch, when it has one.
    watermark_after: str | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BaseConnector(Protocol):
    """Structural interface that every dataset connector is expected to satisfy."""

    # Identifier of the dataset the connector serves.
    dataset: str

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Fetch and validate source data, returning an explicit artifact reference.

        All arguments are keyword-only. ``since_watermark`` is optional and
        defaults to ``None``.
        """
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Backward-compatible name for older imports during the tracer refactor.
|
|
51
|
+
SourceArtifact = SourceArtifactSet
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
__all__ = [
|
|
55
|
+
"BaseConnector",
|
|
56
|
+
"NoNewSourceArtifact",
|
|
57
|
+
"SourceArtifact",
|
|
58
|
+
"SourceArtifactSet",
|
|
59
|
+
"SourceValidationError",
|
|
60
|
+
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""CPJ (Committee to Protect Journalists) connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
|
|
11
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
CPJ_API_BASE = "https://cpj.org/wp-json/cpj-datamanager/v1"
|
|
16
|
+
CPJ_PEOPLE_LIST_ENDPOINT = f"{CPJ_API_BASE}/people_list"
|
|
17
|
+
CPJ_TIMEOUT_SECONDS = 120
|
|
18
|
+
CPJ_PAGE_SIZE = 10000
|
|
19
|
+
|
|
20
|
+
# Columns required in every CPJ API response row
|
|
21
|
+
CPJ_REQUIRED_COLUMNS = {
|
|
22
|
+
"fullName",
|
|
23
|
+
"organizations",
|
|
24
|
+
"location",
|
|
25
|
+
"status",
|
|
26
|
+
"startDisplay",
|
|
27
|
+
"type",
|
|
28
|
+
"mtpage",
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CpjConnector:
    """Connector for the CPJ (Committee to Protect Journalists) data API."""

    dataset = "cpj"
    required_columns = CPJ_REQUIRED_COLUMNS

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Fetch CPJ records with one request and store them as a parquet artifact.

        The request hits ``CPJ_PEOPLE_LIST_ENDPOINT`` with ``export=csv`` and
        ``limit=CPJ_PAGE_SIZE``; rows are read from the ``data`` key of the
        JSON response. ``run_id`` and ``since_watermark`` are accepted for
        interface compatibility and are not used here.
        NOTE(review): the original docstring states this export mode returns
        all records and bypasses rate limits — not verifiable from this file.

        Returns:
            A ``SourceArtifactSet`` pointing at
            ``source_artifacts/cpj/<artifact_id>/cpj.parquet``.

        Raises:
            SourceValidationError: if the response is not a JSON object, has
                no rows, or lacks any of ``required_columns``.
            requests.HTTPError: if the API responds with an error status.
        """
        response = requests.get(
            CPJ_PEOPLE_LIST_ENDPOINT,
            params={
                "export": "csv",
                "limit": str(CPJ_PAGE_SIZE),
            },
            timeout=CPJ_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        payload = response.json()

        # A non-object body would otherwise fail with AttributeError on .get().
        if not isinstance(payload, dict):
            raise SourceValidationError(
                "CPJ response is not a JSON object",
                metadata={
                    "dataset": self.dataset,
                    "payload_type": type(payload).__name__,
                },
            )

        all_rows = payload.get("data", [])
        if not all_rows:
            raise SourceValidationError(
                "No CPJ data returned",
                metadata={"dataset": self.dataset},
            )

        df = pd.DataFrame(all_rows)

        # Validate BEFORE touching any column: previously the "status"
        # normalisation ran first, so a response without that column raised
        # a bare KeyError instead of SourceValidationError.
        missing = self.required_columns - set(df.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"CPJ data missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )

        # Fill empty status fields with "Unknown"
        df["status"] = df["status"].fillna("").replace("", "Unknown")

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/cpj.parquet"
        self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)

        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
        )
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""ERT connector — GitHub release asset style for vdeminstitute/ERT."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
from fpu_barometer_admin.connectors.base_connector import (
|
|
12
|
+
NoNewSourceArtifact,
|
|
13
|
+
SourceArtifactSet,
|
|
14
|
+
SourceValidationError,
|
|
15
|
+
)
|
|
16
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
ERT_GITHUB_OWNER = "vdeminstitute"
|
|
22
|
+
ERT_GITHUB_REPO = "ERT"
|
|
23
|
+
ERT_RAW_FILE_PATH = "inst/ert.csv"
|
|
24
|
+
ERT_API_LATEST_RELEASE = (
|
|
25
|
+
f"https://api.github.com/repos/{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}/releases/latest"
|
|
26
|
+
)
|
|
27
|
+
ERT_RAW_BASE_URL = (
|
|
28
|
+
f"https://raw.githubusercontent.com/{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
ERT_TIMEOUT_SECONDS = 120
|
|
32
|
+
ERT_REQUIRED_COLUMNS = {"country_text_id", "country_name", "year"}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ErtConnector:
    """Connector that pulls the ERT CSV for the latest GitHub release tag.

    The latest release is looked up through ``ERT_API_LATEST_RELEASE``; the
    CSV itself is downloaded from ``ERT_RAW_BASE_URL`` at that tag and path
    ``ERT_RAW_FILE_PATH``.
    """

    dataset = "ert"

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Download, validate and store the ERT CSV of the latest release.

        ``run_id`` is accepted for interface compatibility and not used here.

        Raises:
            NoNewSourceArtifact: when ``since_watermark`` equals the
                watermark computed for the latest release.
            SourceValidationError: when the CSV is missing (HTTP 404), is
                empty, or lacks one of ``ERT_REQUIRED_COLUMNS``.
        """
        release = self._latest_release_info()
        tag = release["tag_name"]
        new_watermark = self._compound_watermark(tag, etag=release.get("etag", tag))

        unchanged = since_watermark is not None and since_watermark == new_watermark
        if unchanged:
            raise NoNewSourceArtifact(
                f"ERT release {tag} already processed",
                watermark_after=new_watermark,
            )

        csv_url = f"{ERT_RAW_BASE_URL}/{tag}/{ERT_RAW_FILE_PATH}"
        logger.info("Fetching ERT CSV from %s", csv_url)
        download = requests.get(csv_url, timeout=ERT_TIMEOUT_SECONDS)
        if download.status_code == 404:
            # A missing file is reported as a validation problem, not an HTTP error.
            not_found_details = {
                "dataset": self.dataset,
                "tag": tag,
                "expected_path": ERT_RAW_FILE_PATH,
            }
            raise SourceValidationError(
                f"ERT CSV not found at {csv_url}",
                metadata=not_found_details,
            )
        download.raise_for_status()

        frame = pd.read_csv(io.BytesIO(download.content), low_memory=False)
        if frame.empty:
            raise SourceValidationError(
                "ERT CSV produced empty DataFrame",
                metadata={"dataset": self.dataset, "tag": tag},
            )

        missing_columns = sorted(ERT_REQUIRED_COLUMNS - set(frame.columns))
        if missing_columns:
            raise SourceValidationError(
                f"ERT source missing required columns: {missing_columns}",
                metadata={
                    "dataset": self.dataset,
                    "tag": tag,
                    "missing_columns": missing_columns,
                },
            )

        target = f"source_artifacts/{self.dataset}/{artifact_id}/ert.parquet"
        self.object_storage.write_dataframe(target, frame, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=target,
            watermark_after=new_watermark,
        )

    def _latest_release_info(self) -> dict:
        """Return the latest-release JSON with the response ETag added.

        The returned dict is the decoded response body plus an ``"etag"``
        key holding the ``etag`` response header (empty string when the
        header is absent).

        Raises:
            SourceValidationError: on HTTP 404 or when the body has no
                ``tag_name`` key.
        """
        reply = requests.get(ERT_API_LATEST_RELEASE, timeout=ERT_TIMEOUT_SECONDS)
        if reply.status_code == 404:
            raise SourceValidationError(
                "ERT GitHub repository not found or has no releases",
                metadata={
                    "dataset": self.dataset,
                    "repo": f"{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}",
                },
            )
        reply.raise_for_status()

        info = reply.json()
        if "tag_name" not in info:
            raise SourceValidationError(
                "ERT GitHub release response missing tag_name",
                metadata={
                    "dataset": self.dataset,
                    "response_keys": list(info.keys()),
                },
            )

        info["etag"] = reply.headers.get("etag", "")
        return info

    def _compound_watermark(self, tag_name: str, *, etag: str) -> str:
        """Combine repository, tag, file path and ETag into one watermark string."""
        release_ref = f"{ERT_GITHUB_OWNER}/{ERT_GITHUB_REPO}@{tag_name}"
        return f"{release_ref}:{ERT_RAW_FILE_PATH}:{etag}"
|