fpu-barometer-admin 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fpu_barometer_admin/__init__.py +6 -0
- fpu_barometer_admin/cli/__init__.py +5 -0
- fpu_barometer_admin/cli/commands.py +199 -0
- fpu_barometer_admin/cli/deploy.py +719 -0
- fpu_barometer_admin/connectors/__init__.py +56 -0
- fpu_barometer_admin/connectors/acled_connector.py +77 -0
- fpu_barometer_admin/connectors/base_connector.py +60 -0
- fpu_barometer_admin/connectors/cpj_connector.py +92 -0
- fpu_barometer_admin/connectors/ert_connector.py +134 -0
- fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
- fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
- fpu_barometer_admin/connectors/rr_connector.py +84 -0
- fpu_barometer_admin/connectors/static_sources.py +41 -0
- fpu_barometer_admin/connectors/vdem_connector.py +165 -0
- fpu_barometer_admin/handlers/__init__.py +6 -0
- fpu_barometer_admin/handlers/function_app.py +543 -0
- fpu_barometer_admin/processors/__init__.py +46 -0
- fpu_barometer_admin/processors/acled_processor.py +263 -0
- fpu_barometer_admin/processors/base_processor.py +23 -0
- fpu_barometer_admin/processors/cpj_processor.py +147 -0
- fpu_barometer_admin/processors/ert_processor.py +72 -0
- fpu_barometer_admin/processors/gdelt_processor.py +260 -0
- fpu_barometer_admin/processors/mfrr_processor.py +327 -0
- fpu_barometer_admin/processors/rr_processor.py +208 -0
- fpu_barometer_admin/processors/vdem_processor.py +70 -0
- fpu_barometer_admin/runners/__init__.py +19 -0
- fpu_barometer_admin/runners/definitions.py +159 -0
- fpu_barometer_admin/runners/runners.py +291 -0
- fpu_barometer_admin/runners/scheduler.py +148 -0
- fpu_barometer_admin/runners/seed.py +399 -0
- fpu_barometer_admin/schemas/__init__.py +1 -0
- fpu_barometer_admin/schemas/event.py +362 -0
- fpu_barometer_admin/schemas/predictor.py +418 -0
- fpu_barometer_admin/storage/__init__.py +39 -0
- fpu_barometer_admin/storage/catalog.py +359 -0
- fpu_barometer_admin/storage/factory.py +165 -0
- fpu_barometer_admin/storage/objects.py +463 -0
- fpu_barometer_admin/storage/reader.py +410 -0
- fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
- fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
- fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
- fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
- fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""GDELT processor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from fpu_barometer_admin.connectors import SourceArtifactSet
|
|
10
|
+
from fpu_barometer_admin.schemas.event import (
|
|
11
|
+
EVENT_COLUMNS,
|
|
12
|
+
CountryCanonicalizer,
|
|
13
|
+
EventSchemaValidationError,
|
|
14
|
+
canonical_event_id,
|
|
15
|
+
incident_tags,
|
|
16
|
+
validate_event_dataframe,
|
|
17
|
+
)
|
|
18
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Lookup table from a GDELT ``EventCode`` (as a string, leading zeros kept)
# to a ``(top-level incident type, leaf incident type)`` pair. The pair is
# handed to ``incident_tags(top=[top], leaves=[leaf])`` by the processor;
# codes that are not listed here are tagged ``("Unknown", "Unknown")``.
# NOTE(review): the codes look like CAMEO event codes -- confirm against the
# GDELT codebook before adding or re-mapping entries.
_GDELT_INCIDENT_MAP: dict[str, tuple[str, str]] = {
    "091": ("Legal incident", "Investigation"),
    "092": ("Legal incident", "Investigation"),
    "110": ("Verbal attack", "Insult / harassment / discredit"),
    "111": ("Verbal attack", "Insult / harassment / discredit"),
    "112": ("Verbal attack", "Insult / harassment / discredit"),
    "1121": ("Verbal attack", "Insult / harassment / discredit"),
    "1122": ("Verbal attack", "Insult / harassment / discredit"),
    "1123": ("Verbal attack", "Insult / harassment / discredit"),
    "1125": ("Verbal attack", "Insult / harassment / discredit"),
    "113": ("Verbal attack", "Insult / harassment / discredit"),
    "114": ("Legal incident", "Complaint"),
    "115": ("Legal incident", "Civil legal actions / lawsuits"),
    "116": ("Legal incident", "Conviction"),
    "1233": ("Interference", "Blocked journalistic activity"),
    "124": ("Interference", "Blocked journalistic activity"),
    "1234": ("Interference", "Blocked journalistic activity"),
    "1245": ("Interference", "Blocked journalistic activity"),
    "130": ("Verbal attack", "Threatening"),
    "131": ("Verbal attack", "Threatening"),
    "1321": ("Verbal attack", "Threatening"),
    "138": ("Verbal attack", "Threatening"),
    "1384": ("Verbal attack", "Threatening"),
    "139": ("Verbal attack", "Threatening"),
    "170": ("Interference", "Other"),
    "171": ("Attack to property", "Property (incl. houses, cars, personal belongings)"),
    "1711": ("Attack to property", "Property (incl. houses, cars, personal belongings)"),
    "1712": ("Attack to property", "Property (incl. houses, cars, personal belongings)"),
    "172": ("Interference", "Administrative or financial interference"),
    "1721": ("Interference", "Administrative or financial interference"),
    "173": ("Legal incident", "Arrest/detention/imprisonment"),
    "174": ("Legal incident", "Travel ban"),
    "176": ("Attack to property", "Hacking/DDoS"),
    "180": ("Physical assault", "Other"),
    "181": ("Physical assault", "Abduction/kidnapping"),
    "182": ("Physical assault", "Without injury (physical assault not resulting in injury)"),
    "1821": ("Physical assault", "Sexual assault"),
    "1822": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "1823": ("Physical assault", "Death (physical assault resulting in death)"),
    "183": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "1831": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "1832": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "1833": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "1834": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "185": ("Physical assault", "Injury (physical assault resulting in injury)"),
    "186": ("Physical assault", "Death (physical assault resulting in death)"),
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class GdeltProcessor:
    """Transform GDELT Source Artifacts into canonical Event data.

    Source rows whose country, date or coordinates cannot be resolved are
    dropped with a ``RuntimeWarning``. Surviving rows are de-duplicated on
    ``event_id`` (last occurrence wins), mapped onto ``EVENT_COLUMNS`` and
    validated with ``validate_event_dataframe``.
    """

    dataset = "gdelt"
    # Columns the Source Artifact must provide; checked before any row is used.
    required_columns = {
        "GlobalEventID",
        "SQLDATE",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor2Name",
        "Actor2CountryCode",
        "EventCode",
        "ActionGeo_CountryCode",
        "ActionGeo_FullName",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "SOURCEURL",
    }

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage handle and create the country canonicalizer."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        baseline_processed: pd.DataFrame | None = None,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Build the canonical Event frame for one GDELT artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal ``"gdelt"``;
                its ``logical_path`` is read through ``object_storage``.
            *args: Accepted and ignored.
            baseline_processed: Optional previously processed Events. When
                given and non-empty it is placed before the new rows and the
                result is de-duplicated on ``event_id`` keeping the last
                occurrence, so new rows replace baseline rows.
            **kwargs: Accepted and ignored.

        Returns:
            The result of ``validate_event_dataframe`` on the assembled frame.

        Raises:
            ValueError: If ``artifact_set`` belongs to another dataset.
            EventSchemaValidationError: If required source columns are missing.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"GDELT processor cannot process dataset {artifact_set.dataset!r}"
            )
        source = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(source)

        countries = source.apply(self._country, axis=1)
        valid_country = countries.notna()
        if (~valid_country).any():
            samples = (
                source.loc[
                    ~valid_country,
                    ["ActionGeo_CountryCode", "Actor1CountryCode", "Actor2CountryCode"],
                ]
                .head()
                .to_dict("records")
            )
            warnings.warn(
                f"GDELT dropped {(~valid_country).sum()} Event rows with unresolved countries: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        dates = source["SQLDATE"].map(self._date)
        # ``notna`` always yields a boolean mask, including for an empty source.
        valid_date = dates.notna()
        if (~valid_date).any():
            samples = source.loc[~valid_date, "SQLDATE"].head().tolist()
            warnings.warn(
                f"GDELT dropped {(~valid_date).sum()} Event rows with invalid dates: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        coordinates = source.apply(self._coordinates, axis=1)
        # ``None`` marks an invalid pair; ``(pd.NA, pd.NA)`` is a valid "no location".
        valid_coordinates = coordinates.notna()
        if (~valid_coordinates).any():
            samples = (
                source.loc[~valid_coordinates, ["ActionGeo_Lat", "ActionGeo_Long"]]
                .head()
                .to_dict("records")
            )
            warnings.warn(
                f"GDELT dropped {(~valid_coordinates).sum()} Event rows with invalid coordinates: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        keep = valid_country & valid_date & valid_coordinates
        source = source.loc[keep].copy()
        countries = countries.loc[keep]
        dates = dates.loc[keep]
        coordinates = coordinates.loc[keep]

        if source.empty:
            processed = pd.DataFrame(columns=EVENT_COLUMNS)
        else:
            processed = self._build_events(source, countries, dates, coordinates)

        if baseline_processed is not None and not baseline_processed.empty:
            baseline = baseline_processed.loc[:, EVENT_COLUMNS]
            # With no new rows the baseline must survive unchanged instead of
            # being replaced by an empty frame.
            if processed.empty:
                processed = baseline
            else:
                processed = pd.concat([baseline, processed], ignore_index=True)
            processed = processed.loc[~processed["event_id"].duplicated(keep="last")]
            processed = processed.loc[:, EVENT_COLUMNS].reset_index(drop=True)
        return validate_event_dataframe(processed)

    def _build_events(
        self,
        source: pd.DataFrame,
        countries: pd.Series,
        dates: pd.Series,
        coordinates: pd.Series,
    ) -> pd.DataFrame:
        """Map already-filtered, non-empty source rows onto ``EVENT_COLUMNS``.

        All four arguments share the same index. Rows are de-duplicated on
        the canonical ``event_id`` keeping the last occurrence.
        """
        processed = pd.DataFrame(index=source.index)
        processed["event_id"] = source["GlobalEventID"].map(
            lambda value: canonical_event_id(self.dataset, value)
        )
        processed = processed.loc[~processed["event_id"].duplicated(keep="last")].copy()
        # Re-align every per-row input with the rows that survived de-duplication.
        source = source.loc[processed.index]
        countries = countries.loc[processed.index]
        dates = dates.loc[processed.index]
        coordinates = coordinates.loc[processed.index]

        processed["dataset"] = self.dataset
        processed["iso3"] = countries.map(lambda country: country.iso3)
        processed["country_name"] = countries.map(lambda country: country.country_name)
        processed["date"] = dates.map(lambda value: value.date())
        processed["date_precision"] = "day"
        processed["year"] = dates.map(lambda value: int(value.year)).astype("Int64")
        tags = source["EventCode"].map(self._incident_tags)
        processed["type_of_incident"] = tags.map(lambda value: value[0])
        processed["top_type_of_incident"] = tags.map(lambda value: value[1])
        processed["type_of_incident_leaves"] = tags.map(lambda value: value[2])
        processed["processed_at"] = pd.Timestamp.now(tz="UTC")
        processed["n_people_affected"] = 1
        processed["region"] = source["ActionGeo_FullName"].map(self._blank_to_na)
        processed["latitude"] = coordinates.map(lambda value: value[0])
        processed["longitude"] = coordinates.map(lambda value: value[1])
        processed["description"] = source.apply(self._description, axis=1)
        processed["source_url"] = source["SOURCEURL"].map(self._source_url)
        processed["attacked_count"] = pd.NA
        processed["gender"] = pd.NA
        processed["media_role"] = pd.NA
        processed["perpetrator_type"] = pd.NA

        return processed.loc[:, EVENT_COLUMNS].reset_index(drop=True)

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``EventSchemaValidationError`` if a required column is absent."""
        missing = self.required_columns - set(source.columns)
        if missing:
            raise EventSchemaValidationError(
                f"GDELT Source Artifact missing processor columns: {sorted(missing)}"
            )

    def _country(self, row: pd.Series):
        """Resolve the row's country, or return ``None`` when nothing matches.

        ``Actor2CountryCode`` is tried first, then ``Actor1CountryCode``, and
        finally ``ActionGeo_FullName`` is offered as a country name.
        """
        # GDELT ActionGeo country codes are FIPS-ish, so prefer actor ISO3-like values
        # when available; fall back to location name.
        for value in (row.get("Actor2CountryCode"), row.get("Actor1CountryCode")):
            if not self._is_blank(value):
                country = self.country_canonicalizer.from_values(iso3=value)
                if country is not None:
                    return country
        if not self._is_blank(row.get("ActionGeo_FullName")):
            return self.country_canonicalizer.from_values(
                country_name=row.get("ActionGeo_FullName")
            )
        return None

    def _date(self, value: object) -> pd.Timestamp | None:
        """Parse a ``YYYYMMDD`` value into a Timestamp, or ``None`` if invalid.

        Integral floats are accepted: a numeric column that contains a missing
        value is loaded as float, turning ``20240115`` into ``20240115.0``.
        """
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        text = str(value).strip()
        if not re_match_yyyymmdd(text):
            return None
        parsed = pd.to_datetime(text, format="%Y%m%d", errors="coerce")
        return parsed if pd.notna(parsed) else None

    def _coordinates(self, row: pd.Series) -> tuple[float | object, float | object] | None:
        """Return ``(lat, long)`` for the row.

        Returns ``(pd.NA, pd.NA)`` when both values are missing, and ``None``
        (row is invalid) when only one is missing or either is out of range.
        """
        latitude = pd.to_numeric(pd.Series([row.get("ActionGeo_Lat")]), errors="coerce").iloc[0]
        longitude = pd.to_numeric(pd.Series([row.get("ActionGeo_Long")]), errors="coerce").iloc[0]
        if pd.isna(latitude) and pd.isna(longitude):
            return (pd.NA, pd.NA)
        if pd.isna(latitude) or pd.isna(longitude):
            return None
        if latitude < -90 or latitude > 90 or longitude < -180 or longitude > 180:
            return None
        return (float(latitude), float(longitude))

    @staticmethod
    def _event_code_text(event_code: object) -> str:
        """Return the event code as text in the form used by the incident map.

        Numeric codes lose their leading zero (``"091"`` -> ``91``) or gain a
        ``.0`` suffix when loaded as numbers; both are undone here.
        """
        if isinstance(event_code, float) and event_code.is_integer():
            event_code = int(event_code)
        text = str(event_code).strip()
        if text.isdigit():
            text = text.zfill(3)
        return text

    def _incident_tags(self, event_code: object) -> tuple[list[str], list[str], list[str]]:
        """Translate an EventCode into incident tags; unknown codes map to "Unknown"."""
        top, leaf = _GDELT_INCIDENT_MAP.get(
            self._event_code_text(event_code), ("Unknown", "Unknown")
        )
        return incident_tags(top=[top], leaves=[leaf])

    def _description(self, row: pd.Series) -> object:
        """Join the actor names and the event code with ``" | "``.

        Missing values are skipped rather than rendered as ``"nan"``/``"None"``.
        Returns ``pd.NA`` when nothing is available.
        """
        parts = [
            str(value).strip()
            for value in (row.get("Actor1Name"), row.get("Actor2Name"))
            if not self._is_blank(value)
        ]
        event_code = row.get("EventCode")
        if not self._is_blank(event_code):
            parts.append(f"GDELT EventCode {self._event_code_text(event_code)}")
        return " | ".join(parts) if parts else pd.NA

    def _source_url(self, value: object) -> object:
        """Return the stripped URL text, or ``pd.NA`` when the value is blank."""
        if self._is_blank(value):
            return pd.NA
        return str(value).strip()

    def _blank_to_na(self, value: object) -> object:
        """Return the stripped text of ``value``, or ``pd.NA`` when it is blank."""
        return pd.NA if self._is_blank(value) else str(value).strip()

    def _is_blank(self, value: object) -> bool:
        """Tell whether ``value`` is ``None``, missing, empty or a null-like word."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip() or value.strip().lower() in {"nan", "none", "null"}
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Containers make ``pd.isna`` return an array with no single truth value.
            return False


def re_match_yyyymmdd(value: str) -> bool:
    """Return ``True`` when ``value`` consists of exactly eight digits."""
    return len(value) == 8 and value.isdigit()
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
__all__ = ["GdeltProcessor"]
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""MFRR processor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import html
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
import warnings
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from fpu_barometer_admin.connectors import SourceArtifactSet
|
|
15
|
+
from fpu_barometer_admin.schemas.event import (
|
|
16
|
+
EVENT_COLUMNS,
|
|
17
|
+
INCIDENT_TAXONOMY,
|
|
18
|
+
LEAF_INCIDENT_TYPES,
|
|
19
|
+
TOP_INCIDENT_TYPES,
|
|
20
|
+
CountryCanonicalizer,
|
|
21
|
+
EventSchemaValidationError,
|
|
22
|
+
canonical_event_id,
|
|
23
|
+
incident_tags,
|
|
24
|
+
validate_event_dataframe,
|
|
25
|
+
)
|
|
26
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Reverse lookup from a leaf incident type to a top-level incident type,
# derived from INCIDENT_TAXONOMY (top -> iterable of leaves).
# NOTE(review): a leaf listed under several top-level types keeps only the
# last one iterated -- confirm leaves are unique or that this is acceptable.
_LEAF_TO_TOP = {
    leaf: top for top, leaves in INCIDENT_TAXONOMY.items() for leaf in leaves
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MFRRProcessor:
    """Transform MFRR Source Artifacts into canonical Event data.

    Rows whose country or date cannot be resolved are dropped with a
    ``RuntimeWarning``; the remaining rows are mapped onto ``EVENT_COLUMNS``,
    range-checked for coordinates and validated with
    ``validate_event_dataframe``.
    """

    dataset = "mfrr"
    # Columns the Source Artifact must provide; checked before any row is used.
    required_columns = {
        "id",
        "country",
        "date",
        "year",
        "published_at",
        "published_at_date",
        "type_of_incident",
        "top_type_of_incident",
        "type_of_incident_leaves",
        "attacked_count",
    }

    # Everything ``ast.literal_eval`` is documented to raise on malformed input.
    _LITERAL_PARSE_ERRORS = (SyntaxError, ValueError, TypeError, MemoryError, RecursionError)

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage handle and create the country canonicalizer."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Build the canonical Event frame for one MFRR artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal ``"mfrr"``;
                its ``logical_path`` is read through ``object_storage``.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The result of ``validate_event_dataframe`` on the assembled frame.

        Raises:
            ValueError: If ``artifact_set`` belongs to another dataset.
            EventSchemaValidationError: If required source columns are missing
                or a coordinate lies outside its valid range.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"MFRR processor cannot process dataset {artifact_set.dataset!r}"
            )
        source = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(source)

        countries = source["country"].map(
            lambda value: self.country_canonicalizer.from_values(country_name=value)
        )
        valid_country = countries.notna()
        if (~valid_country).any():
            samples = source.loc[~valid_country, "country"].dropna().astype(str).unique()[:5]
            warnings.warn(
                f"MFRR dropped {(~valid_country).sum()} Event rows with unresolved countries: {list(samples)}",
                RuntimeWarning,
                stacklevel=2,
            )

        dates = source.apply(self._canonical_date, axis=1)
        # ``notna`` always yields a boolean mask, including for an empty source.
        valid_date = dates.notna()
        if (~valid_date).any():
            samples = (
                source.loc[~valid_date, ["date", "year", "published_at_date", "published_at"]]
                .head()
                .to_dict("records")
            )
            warnings.warn(
                f"MFRR dropped {(~valid_date).sum()} Event rows with invalid dates: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        keep = valid_country & valid_date
        source = source.loc[keep].copy()
        countries = countries.loc[keep]
        dates = dates.loc[keep]

        processed = pd.DataFrame(index=source.index)
        processed["event_id"] = source["id"].map(
            lambda value: canonical_event_id(self.dataset, value)
        )
        processed["dataset"] = self.dataset
        processed["iso3"] = countries.map(lambda country: country.iso3)
        processed["country_name"] = countries.map(lambda country: country.country_name)
        # Each entry of ``dates`` is a ``(date, precision, year)`` triple.
        processed["date"] = dates.map(lambda value: value[0])
        processed["date_precision"] = dates.map(lambda value: value[1])
        processed["year"] = dates.map(lambda value: value[2]).astype("Int64")

        incident_values = source.apply(self._incident_values, axis=1)
        processed["type_of_incident"] = incident_values.map(lambda values: values[0])
        processed["top_type_of_incident"] = incident_values.map(lambda values: values[1])
        processed["type_of_incident_leaves"] = incident_values.map(lambda values: values[2])

        processed["processed_at"] = pd.Timestamp.now(tz="UTC")
        processed["n_people_affected"] = self._people_affected(source)
        processed["region"] = source.apply(self._region, axis=1)
        processed["latitude"] = source.apply(lambda row: self._coordinate(row, "lat"), axis=1)
        processed["longitude"] = source.apply(lambda row: self._coordinate(row, "lng"), axis=1)
        processed["description"] = source.apply(self._description, axis=1)
        processed["source_url"] = source.apply(self._source_url, axis=1)
        processed["attacked_count"] = self._attacked_count(source)
        # These three source columns are optional.
        processed["gender"] = (
            source["gender"].map(self._semicolon_join) if "gender" in source else pd.NA
        )
        processed["media_role"] = (
            source["type_of_journalist_or_media_actor"].map(self._semicolon_join)
            if "type_of_journalist_or_media_actor" in source
            else pd.NA
        )
        processed["perpetrator_type"] = (
            source["source_of_incident"].map(self._semicolon_join)
            if "source_of_incident" in source
            else pd.NA
        )

        processed = processed.loc[:, EVENT_COLUMNS]
        self._validate_coordinates(processed)
        return validate_event_dataframe(processed)

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``EventSchemaValidationError`` if a required column is absent."""
        missing = self.required_columns - set(source.columns)
        if missing:
            raise EventSchemaValidationError(
                f"MFRR Source Artifact missing processor columns: {sorted(missing)}"
            )

    def _canonical_date(self, row: pd.Series) -> tuple[object, str, int] | None:
        """Pick the best available date for a row.

        Preference order: the incident ``date`` (only when
        ``has_date_of_incidence`` is truthy), then 1 January of ``year``, then
        ``published_at_date``, then ``published_at``.

        Returns:
            ``(date, precision, year)`` with precision ``"day"``, ``"year"``
            or ``"publication_day"``; ``None`` when no source parses.
        """
        if self._truthy(row.get("has_date_of_incidence")):
            incident_date = pd.to_datetime(row.get("date"), errors="coerce")
            if pd.notna(incident_date):
                return incident_date.date(), "day", int(incident_date.year)

        year = pd.to_numeric(pd.Series([row.get("year")]), errors="coerce").iloc[0]
        if pd.notna(year):
            year_int = int(year)
            if 1000 <= year_int <= 9999:
                return pd.Timestamp(year=year_int, month=1, day=1).date(), "year", year_int

        for column in ("published_at_date", "published_at"):
            value = row.get(column)
            numeric = None
            if column == "published_at" and not self._is_blank(value):
                # A numeric ``published_at`` is read as seconds since the epoch.
                numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0]
            if numeric is not None and pd.notna(numeric):
                published = pd.to_datetime(numeric, unit="s", errors="coerce")
            else:
                published = pd.to_datetime(value, errors="coerce")
            if pd.notna(published):
                return published.date(), "publication_day", int(published.year)
        return None

    def _incident_values(self, row: pd.Series) -> tuple[list[str], list[str], list[str]]:
        """Collect the row's top-level and leaf incident types.

        Values are gathered from ``top_type_of_incident``,
        ``type_of_incident_leaves`` and ``type_of_incident``; anything not in
        the taxonomy is ignored. Leaves without an explicit parent contribute
        an inferred top-level type; tops without leaves get a fallback leaf.
        If nothing usable remains, both are ``["Unknown"]``.
        """
        source_top = [
            value
            for value in self._list_values(row.get("top_type_of_incident"))
            if value in TOP_INCIDENT_TYPES
        ]
        source_leaves = [
            value
            for value in self._list_values(row.get("type_of_incident_leaves"))
            if value in LEAF_INCIDENT_TYPES
        ]
        for value in self._list_values(row.get("type_of_incident")):
            if value in TOP_INCIDENT_TYPES:
                source_top.append(value)
            if value in LEAF_INCIDENT_TYPES:
                source_leaves.append(value)

        # A leaf name can sit under several top-level types, in which case
        # _LEAF_TO_TOP only remembers one of them. Infer a parent only when no
        # explicitly given top-level type already owns the leaf.
        inferred_top = [
            _LEAF_TO_TOP[leaf]
            for leaf in source_leaves
            if leaf in _LEAF_TO_TOP
            and not any(leaf in INCIDENT_TAXONOMY.get(top, ()) for top in source_top)
        ]
        top = self._dedupe([*source_top, *inferred_top])
        leaves = self._dedupe(source_leaves)
        if top and not leaves:
            leaves = [self._fallback_leaf_for_top(value) for value in top]
        if not top or not leaves:
            top = ["Unknown"]
            leaves = ["Unknown"]
        return incident_tags(top=top, leaves=leaves)

    def _fallback_leaf_for_top(self, top: str) -> str:
        """Choose a leaf for a top-level type that came without one.

        Prefers ``"Unknown"``, then a leaf named like the top-level type, then
        the first leaf listed for it; ``"Unknown"`` if the type has no leaves.
        """
        allowed = INCIDENT_TAXONOMY.get(top, ())
        if "Unknown" in allowed:
            return "Unknown"
        if top in allowed:
            return top
        return allowed[0] if allowed else "Unknown"

    def _people_affected(self, source: pd.DataFrame) -> pd.Series:
        """Return ``attacked_count`` as Int64, with missing or <1 values set to 1."""
        counts = pd.to_numeric(source["attacked_count"], errors="coerce")
        return counts.fillna(1).clip(lower=1).astype("Int64")

    def _attacked_count(self, source: pd.DataFrame) -> pd.Series:
        """Return ``attacked_count`` as nullable Int64; unparseable values become NA."""
        return pd.to_numeric(source["attacked_count"], errors="coerce").astype("Int64")

    def _description(self, row: pd.Series) -> object:
        """Join the cleaned ``title`` and ``content`` with a blank line, or ``pd.NA``."""
        title = self._clean_text(row.get("title"))
        content = self._clean_text(row.get("content"))
        parts = [part for part in (title, content) if part]
        return "\n\n".join(parts) if parts else pd.NA

    def _source_url(self, row: pd.Series) -> object:
        """Return the first usable URL of the row, or ``pd.NA``.

        Candidates are ``coe_link``, then ``news_source_links``, then
        ``internet_source_links``. A URL without a scheme gets ``https://``.
        """
        candidates = [
            row.get("coe_link"),
            *self._url_values(row.get("news_source_links")),
            *self._url_values(row.get("internet_source_links")),
        ]
        for value in candidates:
            url = self._clean_text(value)
            if not url:
                continue
            if not re.match(r"^[a-z][a-z0-9+.-]*://", url, flags=re.IGNORECASE):
                url = f"https://{url}"
            return url
        return pd.NA

    def _region(self, row: pd.Series) -> object:
        """Return the third usable entry of ``region_names``, or ``pd.NA``.

        Blank and unknown-like entries are removed before the third is taken.
        """
        region_names = [self._clean_text(value) for value in self._list_values(row.get("region_names"))]
        region_names = [value for value in region_names if value and not self._is_unknownish(value)]
        return region_names[2] if len(region_names) >= 3 else pd.NA

    def _coordinate(self, row: pd.Series, axis: str) -> object:
        """Read one coordinate (``axis`` is ``"lat"`` or ``"lng"``) from the row.

        Looks at ``_geo_<axis>``, then ``_geo.<axis>``, then the ``_geo``
        mapping. Returns the numeric value, or ``pd.NA`` when none is usable.
        """
        flat_column = "_geo_lat" if axis == "lat" else "_geo_lng"
        dotted_column = "_geo.lat" if axis == "lat" else "_geo.lng"
        value = row.get(flat_column)
        if self._is_blank(value):
            value = row.get(dotted_column)
        if self._is_blank(value):
            geo = self._maybe_mapping(row.get("_geo"))
            if isinstance(geo, dict):
                value = geo.get("lat" if axis == "lat" else "lng")
        numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0]
        return numeric if pd.notna(numeric) else pd.NA

    def _validate_coordinates(self, processed: pd.DataFrame) -> None:
        """Raise ``EventSchemaValidationError`` for any out-of-range coordinate."""
        latitude = processed["latitude"].dropna()
        longitude = processed["longitude"].dropna()
        if ((latitude < -90) | (latitude > 90)).any():
            raise EventSchemaValidationError("MFRR Event latitude outside [-90, 90]")
        if ((longitude < -180) | (longitude > 180)).any():
            raise EventSchemaValidationError("MFRR Event longitude outside [-180, 180]")

    def _semicolon_join(self, value: object) -> object:
        """Join the cleaned, de-duplicated, known entries with ``"; "``, or ``pd.NA``."""
        values = [self._clean_text(item) for item in self._list_values(value)]
        values = [item for item in values if item and not self._is_unknownish(item)]
        return "; ".join(self._dedupe(values)) if values else pd.NA

    def _url_values(self, value: object) -> list[object]:
        """Expand ``value`` into URL candidates, splitting strings on line breaks."""
        values: list[object] = []
        for item in self._list_values(value):
            if isinstance(item, str):
                values.extend(part for part in re.split(r"[\r\n]+", item) if part.strip())
            else:
                values.append(item)
        return values

    def _list_values(self, value: object) -> list[object]:
        """Coerce ``value`` into a list.

        Blank values give ``[]``. Strings that look like a Python or JSON
        list/tuple/set literal are parsed; any other string, and any literal
        that cannot be parsed, is returned as a one-element list.
        """
        if self._is_blank(value):
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        if isinstance(value, str):
            text = value.strip()
            if not text:
                return []
            if text[:1] in "[({":
                try:
                    parsed = ast.literal_eval(text)
                except self._LITERAL_PARSE_ERRORS:
                    # literal_eval also raises TypeError (e.g. unhashable dict
                    # keys); malformed text must not abort the whole run.
                    try:
                        parsed = json.loads(text)
                    except json.JSONDecodeError:
                        return [text]
                if isinstance(parsed, (list, tuple, set)):
                    return list(parsed)
            return [text]
        if isinstance(value, Iterable) and not isinstance(value, (str, bytes, dict)):
            return list(value)
        return [value]

    def _maybe_mapping(self, value: object) -> object:
        """Return ``value`` parsed into a dict when it is a JSON/Python dict string.

        Dicts and blank values are returned unchanged, and so is any string
        that does not parse into a dict.
        """
        if isinstance(value, dict) or self._is_blank(value):
            return value
        if isinstance(value, str) and value.strip()[:1] in "{":
            for loader in (json.loads, ast.literal_eval):
                try:
                    parsed = loader(value)
                except self._LITERAL_PARSE_ERRORS:
                    # json.JSONDecodeError is a ValueError, so it is covered too.
                    continue
                if isinstance(parsed, dict):
                    return parsed
        return value

    def _clean_text(self, value: object) -> str | None:
        """Strip markup and normalise whitespace; ``None`` when nothing is left."""
        if self._is_blank(value):
            return None
        text = str(value)
        # Remove tags before unescaping so escaped "<" / ">" survive as text.
        text = re.sub(r"<[^>]+>", " ", text)
        text = html.unescape(text)
        text = re.sub(r"\s+", " ", text).strip()
        # Tag removal can leave a space in front of punctuation.
        text = re.sub(r"\s+([.,;:!?])", r"\1", text)
        return text or None

    def _is_unknownish(self, value: str) -> bool:
        """Tell whether ``value`` is a placeholder such as "unknown" or "n/a"."""
        text = value.strip().lower()
        return (
            text in {"unknown", "nan", "none", "null", "not applicable", "n/a", "na"}
            or text.startswith("unknown ")
            or text.endswith(" unknown")
        )

    def _truthy(self, value: object) -> bool:
        """Interpret ``value`` as a flag; blank is false, strings use a yes-list."""
        if self._is_blank(value):
            return False
        if isinstance(value, str):
            return value.strip().lower() in {"1", "true", "yes", "y"}
        return bool(value)

    def _is_blank(self, value: object) -> bool:
        """Tell whether ``value`` is ``None``, missing, empty or a null-like word."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip() or value.strip().lower() in {"nan", "none", "null"}
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Containers make ``pd.isna`` return an array with no single truth value.
            return False

    def _dedupe(self, values: Iterable) -> list:
        """Return ``values`` without duplicates, keeping first-seen order.

        Uses equality rather than hashing so unhashable items are allowed.
        """
        result = []
        for value in values:
            if value not in result:
                result.append(value)
        return result
|