fpu-barometer-admin 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. fpu_barometer_admin/__init__.py +6 -0
  2. fpu_barometer_admin/cli/__init__.py +5 -0
  3. fpu_barometer_admin/cli/commands.py +199 -0
  4. fpu_barometer_admin/cli/deploy.py +719 -0
  5. fpu_barometer_admin/connectors/__init__.py +56 -0
  6. fpu_barometer_admin/connectors/acled_connector.py +77 -0
  7. fpu_barometer_admin/connectors/base_connector.py +60 -0
  8. fpu_barometer_admin/connectors/cpj_connector.py +92 -0
  9. fpu_barometer_admin/connectors/ert_connector.py +134 -0
  10. fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
  11. fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
  12. fpu_barometer_admin/connectors/rr_connector.py +84 -0
  13. fpu_barometer_admin/connectors/static_sources.py +41 -0
  14. fpu_barometer_admin/connectors/vdem_connector.py +165 -0
  15. fpu_barometer_admin/handlers/__init__.py +6 -0
  16. fpu_barometer_admin/handlers/function_app.py +543 -0
  17. fpu_barometer_admin/processors/__init__.py +46 -0
  18. fpu_barometer_admin/processors/acled_processor.py +263 -0
  19. fpu_barometer_admin/processors/base_processor.py +23 -0
  20. fpu_barometer_admin/processors/cpj_processor.py +147 -0
  21. fpu_barometer_admin/processors/ert_processor.py +72 -0
  22. fpu_barometer_admin/processors/gdelt_processor.py +260 -0
  23. fpu_barometer_admin/processors/mfrr_processor.py +327 -0
  24. fpu_barometer_admin/processors/rr_processor.py +208 -0
  25. fpu_barometer_admin/processors/vdem_processor.py +70 -0
  26. fpu_barometer_admin/runners/__init__.py +19 -0
  27. fpu_barometer_admin/runners/definitions.py +159 -0
  28. fpu_barometer_admin/runners/runners.py +291 -0
  29. fpu_barometer_admin/runners/scheduler.py +148 -0
  30. fpu_barometer_admin/runners/seed.py +399 -0
  31. fpu_barometer_admin/schemas/__init__.py +1 -0
  32. fpu_barometer_admin/schemas/event.py +362 -0
  33. fpu_barometer_admin/schemas/predictor.py +418 -0
  34. fpu_barometer_admin/storage/__init__.py +39 -0
  35. fpu_barometer_admin/storage/catalog.py +359 -0
  36. fpu_barometer_admin/storage/factory.py +165 -0
  37. fpu_barometer_admin/storage/objects.py +463 -0
  38. fpu_barometer_admin/storage/reader.py +410 -0
  39. fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
  40. fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
  41. fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
  42. fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
  43. fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
@@ -0,0 +1,263 @@
1
+ """ACLED processor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from fpu_barometer_admin.connectors import SourceArtifactSet
8
+ from fpu_barometer_admin.schemas.event import (
9
+ CountryCanonicalizer,
10
+ EventSchemaValidationError,
11
+ canonical_event_id,
12
+ incident_tags,
13
+ validate_event_dataframe,
14
+ )
15
+ from fpu_barometer_admin.storage.objects import ObjectStorage
16
+
17
+
18
class AcledProcessor:
    """Transform ACLED Source Artifacts into canonical Event data.

    ``process`` reads one artifact from object storage, derives the canonical
    Event columns from the ACLED source columns and returns the frame produced
    by ``validate_event_dataframe``.
    """

    dataset = "acled"

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage used to read artifacts and create a canonicalizer."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Build the canonical Event frame for one ACLED artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal ``"acled"``;
                its ``logical_path`` is read through ``object_storage``.
            *args: Accepted for interface compatibility; unused here.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The result of ``validate_event_dataframe`` on the assembled frame.

        Raises:
            ValueError: If the artifact set belongs to another dataset.
            EventSchemaValidationError: If required source columns are missing,
                a country cannot be resolved, an ``event_date`` cannot be
                parsed, or a coordinate is out of range.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"ACLED processor cannot process dataset {artifact_set.dataset!r}"
            )
        source = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(source)

        # Columns are assigned in a fixed order; the output frame shares the
        # source index so row-wise results line up.
        processed = pd.DataFrame(index=source.index)
        processed["event_id"] = source["event_id"].map(
            lambda value: canonical_event_id(self.dataset, value)
        )
        processed["dataset"] = self.dataset

        countries = source.apply(self._canonical_country, axis=1)
        unresolved = countries.isna()
        if unresolved.any():
            # Report at most five distinct offending names to keep the error short.
            unresolved_values = (
                source.loc[unresolved, "country"].dropna().astype(str).unique()[:5]
            )
            raise EventSchemaValidationError(
                "ACLED Event rows have unresolved countries: "
                f"{list(unresolved_values)}"
            )
        processed["iso3"] = countries.map(lambda country: country.iso3)
        processed["country_name"] = countries.map(lambda country: country.country_name)

        dates = self._parse_event_dates(source["event_date"])
        if dates.isna().any():
            bad_dates = source.loc[dates.isna(), "event_date"].head().tolist()
            raise EventSchemaValidationError(
                f"ACLED Event rows have invalid event_date values: {bad_dates}"
            )
        processed["date"] = dates.dt.date
        processed["date_precision"] = self._date_precision(source)
        processed["year"] = dates.dt.year.astype("Int64")

        # Each element is the 3-tuple returned by ``incident_tags``.
        incident_values = source.apply(self._incident_values, axis=1)
        processed["type_of_incident"] = incident_values.map(lambda values: values[0])
        processed["top_type_of_incident"] = incident_values.map(
            lambda values: values[1]
        )
        processed["type_of_incident_leaves"] = incident_values.map(
            lambda values: values[2]
        )

        processed["processed_at"] = pd.Timestamp.now(tz="UTC")
        processed["n_people_affected"] = self._people_affected(source)

        # Optional source columns: ``DataFrame.get`` yields None when absent.
        processed["region"] = source.get("admin1")
        processed["latitude"] = self._numeric(source.get("latitude"), source.index)
        processed["longitude"] = self._numeric(source.get("longitude"), source.index)
        processed["description"] = source.get("notes")
        processed["source_url"] = pd.NA
        processed["attacked_count"] = self._attacked_count(source)
        processed["gender"] = pd.NA
        processed["media_role"] = pd.NA
        processed["perpetrator_type"] = pd.NA
        self._validate_coordinates(processed)
        return validate_event_dataframe(processed)

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``EventSchemaValidationError`` if a required column is absent."""
        required = {"event_id", "event_date", "country", "event_type", "fatalities"}
        missing = required - set(source.columns)
        if missing:
            raise EventSchemaValidationError(
                f"ACLED Source Artifact missing processor columns: {sorted(missing)}"
            )

    def _canonical_country(self, row: pd.Series):
        """Resolve a row's country from its ``iso`` and ``country`` values."""
        return self.country_canonicalizer.from_values(
            iso_numeric=row.get("iso"), country_name=row.get("country")
        )

    def _parse_event_dates(self, values: pd.Series) -> pd.Series:
        """Parse ``DD-Month-YYYY`` strings with German or English month names.

        German month names are translated to English before parsing. The
        translation matches whole words only: a plain substring replacement
        would turn the English "January" into "Januaryy" (it contains
        "Januar"), and "February" into "Februaryy", making valid dates
        unparseable.

        Args:
            values: Raw ``event_date`` values; they are converted to ``str``.

        Returns:
            A datetime Series; entries that do not match ``%d-%B-%Y`` are NaT.
        """
        month_map = {
            "Januar": "January",
            "Februar": "February",
            "März": "March",
            "April": "April",
            "Mai": "May",
            "Juni": "June",
            "Juli": "July",
            "August": "August",
            "September": "September",
            "Oktober": "October",
            "November": "November",
            "Dezember": "December",
        }
        # One pass over the data; ``\b`` keeps "Januar" from matching inside
        # "January". Month names contain no regex metacharacters.
        pattern = r"\b(?:" + "|".join(month_map) + r")\b"
        normalized = values.astype(str).str.replace(
            pattern, lambda match: month_map[match.group(0)], regex=True
        )
        return pd.to_datetime(normalized, format="%d-%B-%Y", errors="coerce")

    def _date_precision(self, source: pd.DataFrame) -> pd.Series:
        """Map ``time_precision`` codes 1/2/3 to day/month/year.

        Rows with any other or non-numeric code, and frames without the
        column, get ``"day"``.
        """
        if "time_precision" not in source.columns:
            return pd.Series(["day"] * len(source), index=source.index, dtype="object")
        precision = pd.to_numeric(source["time_precision"], errors="coerce")
        return precision.map({1: "day", 2: "month", 3: "year"}).fillna("day")

    def _incident_values(
        self, row: pd.Series
    ) -> tuple[list[str], list[str], list[str]]:
        """Derive incident tags for one source row.

        Tags come from two kinds of evidence: the ``*_mentioned`` flag columns
        and keyword matches in the lower-cased concatenation of the row's
        descriptive text columns. The collected top-level and leaf labels are
        passed to ``incident_tags``; ``"Unknown"`` is used when nothing matched.
        """
        text = " ".join(
            str(row.get(column, ""))
            for column in (
                "disorder_type",
                "event_type",
                "sub_event_type",
                "notes",
                "tags",
            )
            if row.get(column) is not None and not pd.isna(row.get(column))
        ).lower()
        top: list[str] = []
        leaves: list[str] = []

        if self._flag_enabled(row.get("arrested_mentioned")):
            top.append("Legal incident")
            leaves.append("Arrest/detention/imprisonment")
        if self._flag_enabled(row.get("abducted_mentioned")) or self._flag_enabled(
            row.get("disappeared_mentioned")
        ):
            top.append("Physical assault")
            leaves.append("Abduction/kidnapping")
        if "sexual" in text:
            top.append("Physical assault")
            leaves.append("Sexual assault")
        if any(token in text for token in ("attack", "assault", "violence")):
            top.append("Physical assault")
            # A keyword-matched assault with a positive fatality count is
            # additionally tagged as a death.
            fatalities = pd.to_numeric(
                pd.Series([row.get("fatalities")]), errors="coerce"
            ).iloc[0]
            if pd.notna(fatalities) and fatalities > 0:
                leaves.append("Death (physical assault resulting in death)")
        if self._flag_enabled(row.get("killed_mentioned")):
            top.append("Physical assault")
            leaves.append("Death (physical assault resulting in death)")
        if self._flag_enabled(row.get("tortured_mentioned")):
            top.append("Physical assault")
            leaves.append("Injury (physical assault resulting in injury)")

        if any(token in text for token in ("property", "equipment", "raid")):
            top.append("Attack to property")
            # Exactly one property leaf: equipment wins over raid, raid over
            # the generic property label.
            if "equipment" in text:
                leaves.append("Equipment")
            elif "raid" in text:
                leaves.append("Raid")
            else:
                leaves.append("Property (incl. houses, cars, personal belongings)")
        if any(token in text for token in ("threat", "intimidat")):
            top.append("Verbal attack")
            leaves.append("Threatening")
        if any(token in text for token in ("harass", "insult", "discredit")):
            top.append("Verbal attack")
            leaves.append("Insult / harassment / discredit")
        if any(
            token in text
            for token in ("protest", "blocked", "censor", "journalistic activity")
        ):
            top.append("Interference")
            leaves.append("Blocked journalistic activity")

        if not top:
            top.append("Unknown")
        if not leaves:
            leaves.append("Unknown")
        return incident_tags(top=top, leaves=leaves)

    def _flag_enabled(self, value: object) -> bool:
        """Interpret a flag cell as a boolean.

        Missing values are False. Strings are True only for "1", "true",
        "yes" or "y" (case-insensitive, surrounding whitespace ignored). Other
        values are True when they convert to a non-zero number.
        """
        if value is None or pd.isna(value):
            return False
        if isinstance(value, str):
            return value.strip().lower() in {"1", "true", "yes", "y"}
        return bool(
            pd.to_numeric(pd.Series([value]), errors="coerce").fillna(0).iloc[0]
        )

    def _people_affected(self, source: pd.DataFrame) -> pd.Series:
        """Return the per-row sum of affected-count components, at least 1."""
        counts = self._affected_count_components(source).sum(axis=1)
        return counts.clip(lower=1).astype("Int64")

    def _attacked_count(self, source: pd.DataFrame) -> pd.Series:
        """Return the per-row sum of affected-count components (may be 0)."""
        counts = self._affected_count_components(source)
        if counts.empty:
            return pd.Series([pd.NA] * len(source), index=source.index, dtype="Int64")
        return counts.sum(axis=1).astype("Int64")

    def _affected_count_components(self, source: pd.DataFrame) -> pd.DataFrame:
        """Collect the numeric per-row counts that make up the affected total.

        ``n_killed`` is the larger of the ``n_killed`` column (when present)
        and ``fatalities``. The remaining ``n_*`` columns are included only if
        the source has them. Non-numeric or missing cells count as 0.
        """
        components = pd.DataFrame(index=source.index)
        fatalities = pd.to_numeric(source["fatalities"], errors="coerce").fillna(0)
        if "n_killed" in source.columns:
            components["n_killed"] = pd.concat(
                [pd.to_numeric(source["n_killed"], errors="coerce").fillna(0), fatalities],
                axis=1,
            ).max(axis=1)
        else:
            components["n_killed"] = fatalities
        for column in (
            "n_tortured",
            "n_abducted",
            "n_arrested",
            "n_disappeared",
        ):
            if column in source.columns:
                components[column] = pd.to_numeric(source[column], errors="coerce").fillna(0)
        return components

    # NOTE: disabled helper; process() currently emits pd.NA for media_role.
    # def _media_role(self, source: pd.DataFrame) -> pd.Series:
    #     for column in ("assoc_actor_2", "assoc_actor_1", "actor2"):
    #         if column in source.columns:
    #             return source[column]
    #     return pd.Series([pd.NA] * len(source), index=source.index, dtype="object")

    def _numeric(self, values: pd.Series | None, index: pd.Index) -> pd.Series:
        """Convert values to numbers, accepting a decimal comma.

        Returns an all-NA ``Float64`` Series on ``index`` when ``values`` is
        None; unparseable entries become NaN.
        """
        if values is None:
            return pd.Series([pd.NA] * len(index), index=index, dtype="Float64")
        return pd.to_numeric(
            values.astype(str).str.replace(",", ".", regex=False), errors="coerce"
        )

    def _validate_coordinates(self, processed: pd.DataFrame) -> None:
        """Raise if any non-missing latitude or longitude is out of range."""
        latitude = processed["latitude"].dropna()
        longitude = processed["longitude"].dropna()
        if ((latitude < -90) | (latitude > 90)).any():
            raise EventSchemaValidationError("ACLED Event latitude outside [-90, 90]")
        if ((longitude < -180) | (longitude > 180)).any():
            raise EventSchemaValidationError(
                "ACLED Event longitude outside [-180, 180]"
            )
@@ -0,0 +1,23 @@
1
+ """Shared processor interfaces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol
6
+
7
+ import pandas as pd
8
+
9
+ from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet
10
+
11
+
12
class BaseProcessor(Protocol):
    """Structural interface shared by dataset processors.

    A conforming class exposes a ``dataset`` string and a ``process`` method
    with the signature declared below.
    """

    # Name of the dataset the processor handles.
    dataset: str

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Convert a Source Artifact Set into a canonical DataFrame."""
        ...
@@ -0,0 +1,147 @@
1
+ """CPJ (Committee to Protect Journalists) processor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from fpu_barometer_admin.connectors import SourceArtifactSet
10
+ from fpu_barometer_admin.schemas.event import (
11
+ CountryCanonicalizer,
12
+ EventSchemaValidationError,
13
+ canonical_event_id,
14
+ incident_tags,
15
+ validate_event_dataframe,
16
+ )
17
+ from fpu_barometer_admin.storage.objects import ObjectStorage
18
+
19
# Map CPJ status values to Barometer incident taxonomy.
# Each entry maps status -> (top_type, leaf_type)
_CPJ_STATUS_INCIDENT_MAP: dict[str, tuple[str, str]] = {
    "Killed": ("Physical assault", "Death (physical assault resulting in death)"),
    "Imprisoned": ("Legal incident", "Arrest/detention/imprisonment"),
    "Missing": ("Legal incident", "Other"),
    "Unknown": ("Unknown", "Unknown"),
}

# Source columns required by the processor (subset of what the connector validates).
_PROCESSOR_REQUIRED_COLUMNS = {
    "fullName",
    "location",
    "status",
    "startDisplay",
    # CpjProcessor.process indexes source["mtpage"] directly for source_url;
    # listing it here turns a bare KeyError into a schema validation error.
    # NOTE(review): confirm the CPJ connector validates this column too.
    "mtpage",
}

# Override mappings for CPJ country names that the canonicalizer can't resolve.
_CPJ_COUNTRY_OVERRIDES: dict[str, str] = {
    "Yugoslavia": "Serbia",
    "Serbia and Montenegro": "Serbia",
    "Ethopia": "Ethiopia",
}
42
+
43
+
44
class CpjProcessor:
    """Processor that converts a CPJ Source Artifact into canonical Event rows."""

    dataset = "cpj"

    def __init__(self, object_storage: ObjectStorage):
        """Remember the object storage and set up country canonicalization."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()
        self._cached_source: pd.DataFrame | None = None

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Read the artifact, derive the Event columns and validate the result.

        Raises ``ValueError`` for an artifact set of another dataset and
        ``EventSchemaValidationError`` for missing columns, unresolved
        countries or unparseable dates.
        """
        requested = artifact_set.dataset
        if requested != self.dataset:
            raise ValueError(
                f"CPJ processor cannot process dataset {requested!r}"
            )

        frame = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(frame)

        events = pd.DataFrame(index=frame.index)
        # Event ids are derived from the row index (mtpage can have duplicates).
        events["event_id"] = frame.index.to_series().apply(self._row_event_id)
        events["dataset"] = self.dataset

        # Country names -> canonical country objects; fail on any unresolved row.
        resolved = frame["location"].map(self._canonical_country)
        missing_mask = resolved.isna()
        if missing_mask.any():
            distinct_names = (
                frame.loc[missing_mask, "location"].dropna().astype(str).unique()
            )
            unresolved_values = distinct_names[:5].tolist()
            raise EventSchemaValidationError(
                "CPJ Event rows have unresolved countries: "
                f"{unresolved_values}"
            )
        events["iso3"] = resolved.map(lambda entry: entry.iso3)
        events["country_name"] = resolved.map(lambda entry: entry.country_name)

        # Dates such as "Month DD, YYYY"; anything unparseable is an error.
        parsed = pd.to_datetime(frame["startDisplay"], errors="coerce")
        invalid = parsed.isna()
        if invalid.any():
            bad_dates = frame.loc[invalid, "startDisplay"].head().tolist()
            raise EventSchemaValidationError(
                f"CPJ Event rows have invalid date values: {bad_dates}"
            )
        events["date"] = parsed.dt.date
        events["date_precision"] = "day"
        events["year"] = parsed.dt.year.astype("Int64")

        # The status decides the incident tags; unpack the 3-tuple per column.
        tags = frame["status"].map(self._incident_values)
        tag_columns = (
            "type_of_incident",
            "top_type_of_incident",
            "type_of_incident_leaves",
        )
        for position, column in enumerate(tag_columns):
            events[column] = tags.map(lambda v, position=position: v[position])

        events["processed_at"] = pd.Timestamp.now(tz="UTC")
        events["n_people_affected"] = 1  # Each CPJ entry is one person

        for column in ("region", "latitude", "longitude"):
            events[column] = pd.NA
        events["description"] = frame.get("organizations", pd.NA)
        events["source_url"] = frame["mtpage"]
        events["attacked_count"] = pd.NA
        events["gender"] = pd.NA
        events["media_role"] = frame.get("type", pd.NA)
        events["perpetrator_type"] = frame.get("sourcesOfFire", pd.NA)

        return validate_event_dataframe(events)

    def _row_event_id(self, idx: object) -> object:
        """Build the canonical event id for the row with index label ``idx``."""
        return canonical_event_id(self.dataset, f"row-{idx}")

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise when the artifact lacks a column the processor requires."""
        absent = sorted(_PROCESSOR_REQUIRED_COLUMNS.difference(source.columns))
        if absent:
            raise EventSchemaValidationError(
                f"CPJ Source Artifact missing processor columns: {absent}"
            )

    def _canonical_country(self, country_name: object) -> Any | None:
        """Resolve a CPJ location; missing values resolve to ``None``."""
        if country_name is None or pd.isna(country_name):
            return None
        cleaned = str(country_name).strip()
        # CPJ-specific spellings are swapped before the canonical lookup.
        lookup_name = _CPJ_COUNTRY_OVERRIDES.get(cleaned, cleaned)
        return self.country_canonicalizer.from_values(country_name=lookup_name)

    def _incident_values(
        self, status: object
    ) -> tuple[list[str], list[str], list[str]]:
        """Translate a CPJ status into incident tags, defaulting to Unknown."""
        if status and not pd.isna(status):
            key = str(status).strip()
        else:
            key = ""
        top_type, leaf_type = _CPJ_STATUS_INCIDENT_MAP.get(
            key, ("Unknown", "Unknown")
        )
        return incident_tags(top=[top_type], leaves=[leaf_type])
@@ -0,0 +1,72 @@
1
+ """ERT processor — transforms CSV source artifacts into canonical Predictor data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import pandas as pd
8
+
9
+ from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet
10
+ from fpu_barometer_admin.schemas.predictor import (
11
+ PREDICTOR_CORE_COLUMNS,
12
+ canonicalize_predictor_dataframe,
13
+ validate_predictor_dataframe,
14
+ )
15
+ from fpu_barometer_admin.storage.objects import ObjectStorage
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class ErtProcessor:
    """Turn ERT Source Artifacts into canonical Predictor data.

    Canonicalization runs with ``strict=False``; unresolved-country rows and
    duplicate country-year rows reported by it are logged as warnings. The
    source ``Unnamed: 0`` column is removed from the output when present.
    """

    dataset = "ert"

    def __init__(self, object_storage: ObjectStorage):
        """Keep the object storage used to read the source artifact."""
        self.object_storage = object_storage

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Canonicalize and validate one ERT artifact set.

        Raises ``ValueError`` when the artifact set belongs to another dataset.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"ERT processor cannot process dataset {artifact_set.dataset!r}"
            )
        raw_frame = self.object_storage.read_dataframe(artifact_set.logical_path)

        predictors, report = canonicalize_predictor_dataframe(
            raw_frame,
            dataset=self.dataset,
            iso3_column="country_text_id",
            country_name_column="country_name",
            year_column="year",
            strict=False,
        )

        if "Unnamed: 0" in predictors.columns:
            predictors = predictors.drop(columns=["Unnamed: 0"])

        # Surface what the non-strict canonicalization discarded.
        unresolved_count = report.unresolved_country_rows
        if unresolved_count:
            logger.warning(
                "ERT canonicalization dropped %d unresolved country rows: %s",
                unresolved_count,
                report.unresolved_source_values,
            )
        duplicate_count = report.duplicate_country_year_rows_dropped
        if duplicate_count:
            logger.warning(
                "ERT canonicalization dropped %d duplicate country-year rows: %s",
                duplicate_count,
                report.duplicate_source_values_dropped,
            )

        return validate_predictor_dataframe(predictors)
+ return result