fpu-barometer-admin 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fpu_barometer_admin/__init__.py +6 -0
- fpu_barometer_admin/cli/__init__.py +5 -0
- fpu_barometer_admin/cli/commands.py +199 -0
- fpu_barometer_admin/cli/deploy.py +719 -0
- fpu_barometer_admin/connectors/__init__.py +56 -0
- fpu_barometer_admin/connectors/acled_connector.py +77 -0
- fpu_barometer_admin/connectors/base_connector.py +60 -0
- fpu_barometer_admin/connectors/cpj_connector.py +92 -0
- fpu_barometer_admin/connectors/ert_connector.py +134 -0
- fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
- fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
- fpu_barometer_admin/connectors/rr_connector.py +84 -0
- fpu_barometer_admin/connectors/static_sources.py +41 -0
- fpu_barometer_admin/connectors/vdem_connector.py +165 -0
- fpu_barometer_admin/handlers/__init__.py +6 -0
- fpu_barometer_admin/handlers/function_app.py +543 -0
- fpu_barometer_admin/processors/__init__.py +46 -0
- fpu_barometer_admin/processors/acled_processor.py +263 -0
- fpu_barometer_admin/processors/base_processor.py +23 -0
- fpu_barometer_admin/processors/cpj_processor.py +147 -0
- fpu_barometer_admin/processors/ert_processor.py +72 -0
- fpu_barometer_admin/processors/gdelt_processor.py +260 -0
- fpu_barometer_admin/processors/mfrr_processor.py +327 -0
- fpu_barometer_admin/processors/rr_processor.py +208 -0
- fpu_barometer_admin/processors/vdem_processor.py +70 -0
- fpu_barometer_admin/runners/__init__.py +19 -0
- fpu_barometer_admin/runners/definitions.py +159 -0
- fpu_barometer_admin/runners/runners.py +291 -0
- fpu_barometer_admin/runners/scheduler.py +148 -0
- fpu_barometer_admin/runners/seed.py +399 -0
- fpu_barometer_admin/schemas/__init__.py +1 -0
- fpu_barometer_admin/schemas/event.py +362 -0
- fpu_barometer_admin/schemas/predictor.py +418 -0
- fpu_barometer_admin/storage/__init__.py +39 -0
- fpu_barometer_admin/storage/catalog.py +359 -0
- fpu_barometer_admin/storage/factory.py +165 -0
- fpu_barometer_admin/storage/objects.py +463 -0
- fpu_barometer_admin/storage/reader.py +410 -0
- fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
- fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
- fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
- fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
- fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""ACLED processor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from fpu_barometer_admin.connectors import SourceArtifactSet
|
|
8
|
+
from fpu_barometer_admin.schemas.event import (
|
|
9
|
+
CountryCanonicalizer,
|
|
10
|
+
EventSchemaValidationError,
|
|
11
|
+
canonical_event_id,
|
|
12
|
+
incident_tags,
|
|
13
|
+
validate_event_dataframe,
|
|
14
|
+
)
|
|
15
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AcledProcessor:
    """Transform ACLED Source Artifacts into canonical Event data.

    ``process`` loads the artifact's DataFrame from object storage, derives
    the canonical Event columns from it and returns whatever
    ``validate_event_dataframe`` returns for the assembled frame.
    """

    dataset = "acled"

    # German month names whose spelling differs from English.  They are
    # translated as whole words only: a plain substring replacement would turn
    # the English "January" into "Januaryy" (it contains "Januar") and make
    # otherwise valid dates unparseable.
    _GERMAN_MONTHS = {
        "Januar": "January",
        "Februar": "February",
        "März": "March",
        "Mai": "May",
        "Juni": "June",
        "Juli": "July",
        "Oktober": "October",
        "Dezember": "December",
    }

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage handle and create a ``CountryCanonicalizer``."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Build the canonical Event frame for one ACLED artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal
                ``self.dataset``; its ``logical_path`` is read through
                ``self.object_storage.read_dataframe``.
            *args: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The result of ``validate_event_dataframe`` on the built frame.

        Raises:
            ValueError: If ``artifact_set.dataset`` is not ``"acled"``.
            EventSchemaValidationError: If required source columns are
                missing, a country cannot be resolved, an ``event_date`` does
                not parse, or a coordinate is out of range.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"ACLED processor cannot process dataset {artifact_set.dataset!r}"
            )
        source = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(source)

        processed = pd.DataFrame(index=source.index)
        processed["event_id"] = source["event_id"].map(
            lambda value: canonical_event_id(self.dataset, value)
        )
        processed["dataset"] = self.dataset

        countries = source.apply(self._canonical_country, axis=1)
        unresolved = countries.isna()
        if unresolved.any():
            # Report at most five distinct offending country names.
            unresolved_values = (
                source.loc[unresolved, "country"].dropna().astype(str).unique()[:5]
            )
            raise EventSchemaValidationError(
                "ACLED Event rows have unresolved countries: "
                f"{list(unresolved_values)}"
            )
        processed["iso3"] = countries.map(lambda country: country.iso3)
        processed["country_name"] = countries.map(lambda country: country.country_name)

        dates = self._parse_event_dates(source["event_date"])
        if dates.isna().any():
            bad_dates = source.loc[dates.isna(), "event_date"].head().tolist()
            raise EventSchemaValidationError(
                f"ACLED Event rows have invalid event_date values: {bad_dates}"
            )
        processed["date"] = dates.dt.date
        processed["date_precision"] = self._date_precision(source)
        processed["year"] = dates.dt.year.astype("Int64")

        # Each row yields the 3-tuple returned by ``incident_tags``.
        incident_values = source.apply(self._incident_values, axis=1)
        processed["type_of_incident"] = incident_values.map(lambda values: values[0])
        processed["top_type_of_incident"] = incident_values.map(
            lambda values: values[1]
        )
        processed["type_of_incident_leaves"] = incident_values.map(
            lambda values: values[2]
        )

        processed["processed_at"] = pd.Timestamp.now(tz="UTC")
        processed["n_people_affected"] = self._people_affected(source)

        # ``DataFrame.get`` yields None for absent optional columns.
        processed["region"] = source.get("admin1")
        processed["latitude"] = self._numeric(source.get("latitude"), source.index)
        processed["longitude"] = self._numeric(source.get("longitude"), source.index)
        processed["description"] = source.get("notes")
        processed["source_url"] = pd.NA
        processed["attacked_count"] = self._attacked_count(source)
        processed["gender"] = pd.NA
        processed["media_role"] = pd.NA
        processed["perpetrator_type"] = pd.NA
        self._validate_coordinates(processed)
        return validate_event_dataframe(processed)

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``EventSchemaValidationError`` if a required column is absent."""
        required = {"event_id", "event_date", "country", "event_type", "fatalities"}
        missing = required - set(source.columns)
        if missing:
            raise EventSchemaValidationError(
                f"ACLED Source Artifact missing processor columns: {sorted(missing)}"
            )

    def _canonical_country(self, row: pd.Series):
        """Resolve one source row through the canonicalizer.

        Passes the row's ``iso`` value as ``iso_numeric`` and its ``country``
        value as ``country_name``; returns whatever ``from_values`` returns.
        """
        return self.country_canonicalizer.from_values(
            iso_numeric=row.get("iso"), country_name=row.get("country")
        )

    def _parse_event_dates(self, values: pd.Series) -> pd.Series:
        """Parse ``DD-Month-YYYY`` strings with English or German month names.

        German month names are translated to English as whole words, then the
        strings are parsed with format ``%d-%B-%Y``.  Values that do not match
        become ``NaT``.
        """
        months = self._GERMAN_MONTHS
        # \b keeps "Januar"/"Februar" from matching inside "January"/"February".
        pattern = r"\b(?:" + "|".join(months) + r")\b"
        normalized = values.astype(str).str.replace(
            pattern, lambda match: months[match.group(0)], regex=True
        )
        return pd.to_datetime(normalized, format="%d-%B-%Y", errors="coerce")

    def _date_precision(self, source: pd.DataFrame) -> pd.Series:
        """Map ``time_precision`` codes 1/2/3 to ``day``/``month``/``year``.

        A missing column, or a value that is not one of those codes, yields
        ``"day"``.
        """
        if "time_precision" not in source.columns:
            return pd.Series(["day"] * len(source), index=source.index, dtype="object")
        precision = pd.to_numeric(source["time_precision"], errors="coerce")
        return precision.map({1: "day", 2: "month", 3: "year"}).fillna("day")

    def _incident_values(
        self, row: pd.Series
    ) -> tuple[list[str], list[str], list[str]]:
        """Derive incident tags for one row from flag columns and keywords.

        Keyword checks are substring matches against the lower-cased
        concatenation of the row's non-null ``disorder_type``, ``event_type``,
        ``sub_event_type``, ``notes`` and ``tags`` values.  The collected
        top-level and leaf labels are passed to ``incident_tags``; ``"Unknown"``
        is used for either list when nothing matched.
        """
        parts = []
        for column in (
            "disorder_type",
            "event_type",
            "sub_event_type",
            "notes",
            "tags",
        ):
            value = row.get(column)
            if value is not None and not pd.isna(value):
                parts.append(str(value))
        text = " ".join(parts).lower()
        top: list[str] = []
        leaves: list[str] = []

        if self._flag_enabled(row.get("arrested_mentioned")):
            top.append("Legal incident")
            leaves.append("Arrest/detention/imprisonment")
        if self._flag_enabled(row.get("abducted_mentioned")) or self._flag_enabled(
            row.get("disappeared_mentioned")
        ):
            top.append("Physical assault")
            leaves.append("Abduction/kidnapping")
        if "sexual" in text:
            top.append("Physical assault")
            leaves.append("Sexual assault")
        if any(token in text for token in ("attack", "assault", "violence")):
            top.append("Physical assault")
            # NOTE(review): the fatalities check is treated as part of this
            # branch; the original nesting was not recoverable - confirm.
            fatalities = pd.to_numeric(
                pd.Series([row.get("fatalities")]), errors="coerce"
            ).iloc[0]
            if pd.notna(fatalities) and fatalities > 0:
                leaves.append("Death (physical assault resulting in death)")
        if self._flag_enabled(row.get("killed_mentioned")):
            top.append("Physical assault")
            leaves.append("Death (physical assault resulting in death)")
        if self._flag_enabled(row.get("tortured_mentioned")):
            top.append("Physical assault")
            leaves.append("Injury (physical assault resulting in injury)")

        if any(token in text for token in ("property", "equipment", "raid")):
            top.append("Attack to property")
            if "equipment" in text:
                leaves.append("Equipment")
            elif "raid" in text:
                leaves.append("Raid")
            else:
                leaves.append("Property (incl. houses, cars, personal belongings)")
        if any(token in text for token in ("threat", "intimidat")):
            top.append("Verbal attack")
            leaves.append("Threatening")
        if any(token in text for token in ("harass", "insult", "discredit")):
            top.append("Verbal attack")
            leaves.append("Insult / harassment / discredit")
        if any(
            token in text
            for token in ("protest", "blocked", "censor", "journalistic activity")
        ):
            top.append("Interference")
            leaves.append("Blocked journalistic activity")

        if not top:
            top.append("Unknown")
        if not leaves:
            leaves.append("Unknown")
        return incident_tags(top=top, leaves=leaves)

    def _flag_enabled(self, value: object) -> bool:
        """Interpret a ``*_mentioned`` cell as a boolean.

        ``None``/NA is False.  Strings are True only for ``1``, ``true``,
        ``yes`` or ``y`` (case-insensitive, surrounding whitespace ignored).
        Any other value is True when it coerces to a non-zero number.
        """
        if value is None or pd.isna(value):
            return False
        if isinstance(value, str):
            return value.strip().lower() in {"1", "true", "yes", "y"}
        return bool(
            pd.to_numeric(pd.Series([value]), errors="coerce").fillna(0).iloc[0]
        )

    def _people_affected(self, source: pd.DataFrame) -> pd.Series:
        """Sum the per-row count components, with a floor of 1, as ``Int64``."""
        counts = self._affected_count_components(source).sum(axis=1)
        return counts.clip(lower=1).astype("Int64")

    def _attacked_count(self, source: pd.DataFrame) -> pd.Series:
        """Sum the per-row count components as ``Int64`` (no floor applied)."""
        counts = self._affected_count_components(source)
        if counts.empty:
            return pd.Series([pd.NA] * len(source), index=source.index, dtype="Int64")
        return counts.sum(axis=1).astype("Int64")

    def _affected_count_components(self, source: pd.DataFrame) -> pd.DataFrame:
        """Return the numeric count columns that feed the affected totals.

        ``n_killed`` is the row-wise maximum of the source ``n_killed`` (when
        present) and ``fatalities``.  ``n_tortured``, ``n_abducted``,
        ``n_arrested`` and ``n_disappeared`` are included when the source has
        them.  Non-numeric and missing values count as 0.
        """
        components = pd.DataFrame(index=source.index)
        fatalities = pd.to_numeric(source["fatalities"], errors="coerce").fillna(0)
        if "n_killed" in source.columns:
            components["n_killed"] = pd.concat(
                [pd.to_numeric(source["n_killed"], errors="coerce").fillna(0), fatalities],
                axis=1,
            ).max(axis=1)
        else:
            components["n_killed"] = fatalities
        for column in (
            "n_tortured",
            "n_abducted",
            "n_arrested",
            "n_disappeared",
        ):
            if column in source.columns:
                components[column] = pd.to_numeric(source[column], errors="coerce").fillna(0)
        return components

    # Disabled in the original source; ``process`` currently emits
    # ``media_role`` as NA.
    # def _media_role(self, source: pd.DataFrame) -> pd.Series:
    #     for column in ("assoc_actor_2", "assoc_actor_1", "actor2"):
    #         if column in source.columns:
    #             return source[column]
    #     return pd.Series([pd.NA] * len(source), index=source.index, dtype="object")

    def _numeric(self, values: pd.Series | None, index: pd.Index) -> pd.Series:
        """Coerce a column to numbers, accepting ``,`` as the decimal mark.

        Returns an all-NA ``Float64`` series over ``index`` when ``values`` is
        None; unparseable entries become NaN.
        """
        if values is None:
            return pd.Series([pd.NA] * len(index), index=index, dtype="Float64")
        return pd.to_numeric(
            values.astype(str).str.replace(",", ".", regex=False), errors="coerce"
        )

    def _validate_coordinates(self, processed: pd.DataFrame) -> None:
        """Raise if any non-null latitude/longitude is out of range."""
        latitude = processed["latitude"].dropna()
        longitude = processed["longitude"].dropna()
        if ((latitude < -90) | (latitude > 90)).any():
            raise EventSchemaValidationError("ACLED Event latitude outside [-90, 90]")
        if ((longitude < -180) | (longitude > 180)).any():
            raise EventSchemaValidationError(
                "ACLED Event longitude outside [-180, 180]"
            )
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Shared processor interfaces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseProcessor(Protocol):
    """Structural interface that every dataset processor satisfies.

    A conforming class exposes a ``dataset`` string and a ``process`` method
    with the signature declared below.
    """

    # Dataset identifier declared by the implementing processor.
    dataset: str

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Turn a Source Artifact Set into a canonical DataFrame.

        Args:
            artifact_set: The Source Artifact Set to transform.
            *args: Extra positional arguments an implementation may accept.
            **kwargs: Extra keyword arguments an implementation may accept.

        Returns:
            The canonical DataFrame produced from ``artifact_set``.
        """
        ...
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""CPJ (Committee to Protect Journalists) processor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from fpu_barometer_admin.connectors import SourceArtifactSet
|
|
10
|
+
from fpu_barometer_admin.schemas.event import (
|
|
11
|
+
CountryCanonicalizer,
|
|
12
|
+
EventSchemaValidationError,
|
|
13
|
+
canonical_event_id,
|
|
14
|
+
incident_tags,
|
|
15
|
+
validate_event_dataframe,
|
|
16
|
+
)
|
|
17
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
18
|
+
|
|
19
|
+
# CPJ ``status`` value -> (top-level incident type, leaf incident type) in the
# Barometer incident taxonomy.
_CPJ_STATUS_INCIDENT_MAP: dict[str, tuple[str, str]] = dict(
    Killed=("Physical assault", "Death (physical assault resulting in death)"),
    Imprisoned=("Legal incident", "Arrest/detention/imprisonment"),
    Missing=("Legal incident", "Other"),
    Unknown=("Unknown", "Unknown"),
)

# Columns the processor itself needs from the source frame (a subset of what
# the connector validates).
_PROCESSOR_REQUIRED_COLUMNS = {"fullName", "location", "status", "startDisplay"}

# CPJ country spellings the canonicalizer can't resolve, mapped to a name it
# can.  Keys are matched exactly, including the source misspelling "Ethopia".
_CPJ_COUNTRY_OVERRIDES: dict[str, str] = dict(
    [
        ("Yugoslavia", "Serbia"),
        ("Serbia and Montenegro", "Serbia"),
        ("Ethopia", "Ethiopia"),
    ]
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class CpjProcessor:
    """Transform CPJ Source Artifacts into canonical Event data.

    ``process`` loads the artifact's DataFrame from object storage, derives
    the canonical Event columns from it and returns whatever
    ``validate_event_dataframe`` returns for the assembled frame.
    """

    dataset = "cpj"

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage handle and create a ``CountryCanonicalizer``."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()
        # Not read anywhere in this class; kept for compatibility.
        self._cached_source: pd.DataFrame | None = None

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Build the canonical Event frame for one CPJ artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal
                ``self.dataset``; its ``logical_path`` is read through
                ``self.object_storage.read_dataframe``.
            *args: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The result of ``validate_event_dataframe`` on the built frame.

        Raises:
            ValueError: If ``artifact_set.dataset`` is not ``"cpj"``.
            EventSchemaValidationError: If required source columns are
                missing, a ``location`` cannot be resolved to a country, or a
                ``startDisplay`` value does not parse as a date.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"CPJ processor cannot process dataset {artifact_set.dataset!r}"
            )

        source = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(source)

        processed = pd.DataFrame(index=source.index)
        # Use row index as suffix to make event_id unique (mtpage can have duplicates)
        processed["event_id"] = source.index.to_series().apply(
            lambda idx: canonical_event_id(self.dataset, f"row-{idx}")
        )
        processed["dataset"] = self.dataset

        # Map country names to ISO3
        countries = source["location"].map(self._canonical_country)
        unresolved = countries.isna()
        if unresolved.any():
            # Report at most five distinct offending location values.
            unresolved_values = (
                source.loc[unresolved, "location"]
                .dropna()
                .astype(str)
                .unique()[:5]
                .tolist()
            )
            raise EventSchemaValidationError(
                "CPJ Event rows have unresolved countries: "
                f"{unresolved_values}"
            )
        processed["iso3"] = countries.map(lambda c: c.iso3)
        processed["country_name"] = countries.map(lambda c: c.country_name)

        # Parse dates from "Month DD, YYYY" format
        dates = pd.to_datetime(source["startDisplay"], errors="coerce")
        if dates.isna().any():
            bad_dates = source.loc[dates.isna(), "startDisplay"].head().tolist()
            raise EventSchemaValidationError(
                f"CPJ Event rows have invalid date values: {bad_dates}"
            )
        processed["date"] = dates.dt.date
        processed["date_precision"] = "day"
        processed["year"] = dates.dt.year.astype("Int64")

        # Map incident types based on CPJ status
        incident_values = source["status"].map(self._incident_values)
        processed["type_of_incident"] = incident_values.map(lambda v: v[0])
        processed["top_type_of_incident"] = incident_values.map(lambda v: v[1])
        processed["type_of_incident_leaves"] = incident_values.map(lambda v: v[2])

        processed["processed_at"] = pd.Timestamp.now(tz="UTC")
        processed["n_people_affected"] = 1  # Each CPJ entry is one person

        processed["region"] = pd.NA
        processed["latitude"] = pd.NA
        processed["longitude"] = pd.NA
        processed["description"] = source.get("organizations", pd.NA)
        # ``mtpage`` is not among the required columns, so read it like the
        # other optional columns instead of raising a bare KeyError.
        processed["source_url"] = source.get("mtpage", pd.NA)
        processed["attacked_count"] = pd.NA
        processed["gender"] = pd.NA
        processed["media_role"] = source.get("type", pd.NA)
        processed["perpetrator_type"] = source.get("sourcesOfFire", pd.NA)

        return validate_event_dataframe(processed)

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``EventSchemaValidationError`` if a required column is absent."""
        missing = _PROCESSOR_REQUIRED_COLUMNS - set(source.columns)
        if missing:
            raise EventSchemaValidationError(
                f"CPJ Source Artifact missing processor columns: {sorted(missing)}"
            )

    def _canonical_country(self, country_name: object) -> Any | None:
        """Resolve a CPJ ``location`` value through the canonicalizer.

        Returns None for ``None``/NA input.  Otherwise the stripped name is
        passed through ``_CPJ_COUNTRY_OVERRIDES`` and then to
        ``country_canonicalizer.from_values``.
        """
        if country_name is None or pd.isna(country_name):
            return None
        name = str(country_name).strip()
        # Apply CPJ-specific overrides before canonical lookup
        name = _CPJ_COUNTRY_OVERRIDES.get(name, name)
        return self.country_canonicalizer.from_values(country_name=name)

    def _incident_values(
        self, status: object
    ) -> tuple[list[str], list[str], list[str]]:
        """Map a CPJ ``status`` to incident tags via ``incident_tags``.

        The stripped status is looked up in ``_CPJ_STATUS_INCIDENT_MAP``;
        empty, NA or unmapped statuses are tagged ``"Unknown"``.
        """
        raw = str(status).strip() if status and not pd.isna(status) else ""
        mapped = _CPJ_STATUS_INCIDENT_MAP.get(raw)
        if mapped:
            return incident_tags(top=[mapped[0]], leaves=[mapped[1]])
        return incident_tags(top=["Unknown"], leaves=["Unknown"])
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""ERT processor — transforms CSV source artifacts into canonical Predictor data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet
|
|
10
|
+
from fpu_barometer_admin.schemas.predictor import (
|
|
11
|
+
PREDICTOR_CORE_COLUMNS,
|
|
12
|
+
canonicalize_predictor_dataframe,
|
|
13
|
+
validate_predictor_dataframe,
|
|
14
|
+
)
|
|
15
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ErtProcessor:
    """Turn ERT Source Artifacts into canonical Predictor data.

    Canonicalization is invoked with ``strict=False``.  Counts of unresolved
    country rows and dropped duplicate country-year rows reported by it are
    logged as warnings, and the source ``Unnamed: 0`` column is removed from
    the output when present.
    """

    dataset = "ert"

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage handle used to read source artifacts."""
        self.object_storage = object_storage

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Canonicalize and validate one ERT artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal
                ``self.dataset``; its ``logical_path`` is read through
                ``self.object_storage.read_dataframe``.
            *args: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The result of ``validate_predictor_dataframe``.

        Raises:
            ValueError: If ``artifact_set.dataset`` is not ``"ert"``.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"ERT processor cannot process dataset {artifact_set.dataset!r}"
            )

        frame = self.object_storage.read_dataframe(artifact_set.logical_path)
        canonical, report = canonicalize_predictor_dataframe(
            frame,
            dataset=self.dataset,
            iso3_column="country_text_id",
            country_name_column="country_name",
            year_column="year",
            strict=False,
        )

        leftover_index_column = "Unnamed: 0"
        if leftover_index_column in canonical.columns:
            canonical = canonical.drop(columns=[leftover_index_column])

        self._warn_about_dropped_rows(report)
        return validate_predictor_dataframe(canonical)

    def _warn_about_dropped_rows(self, report) -> None:
        """Log a warning for each non-zero drop count in ``report``."""
        unresolved = report.unresolved_country_rows
        if unresolved:
            logger.warning(
                "ERT canonicalization dropped %d unresolved country rows: %s",
                unresolved,
                report.unresolved_source_values,
            )
        duplicates = report.duplicate_country_year_rows_dropped
        if duplicates:
            logger.warning(
                "ERT canonicalization dropped %d duplicate country-year rows: %s",
                duplicates,
                report.duplicate_source_values_dropped,
            )
|