fpu-barometer-admin 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. fpu_barometer_admin/__init__.py +6 -0
  2. fpu_barometer_admin/cli/__init__.py +5 -0
  3. fpu_barometer_admin/cli/commands.py +199 -0
  4. fpu_barometer_admin/cli/deploy.py +719 -0
  5. fpu_barometer_admin/connectors/__init__.py +56 -0
  6. fpu_barometer_admin/connectors/acled_connector.py +77 -0
  7. fpu_barometer_admin/connectors/base_connector.py +60 -0
  8. fpu_barometer_admin/connectors/cpj_connector.py +92 -0
  9. fpu_barometer_admin/connectors/ert_connector.py +134 -0
  10. fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
  11. fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
  12. fpu_barometer_admin/connectors/rr_connector.py +84 -0
  13. fpu_barometer_admin/connectors/static_sources.py +41 -0
  14. fpu_barometer_admin/connectors/vdem_connector.py +165 -0
  15. fpu_barometer_admin/handlers/__init__.py +6 -0
  16. fpu_barometer_admin/handlers/function_app.py +543 -0
  17. fpu_barometer_admin/processors/__init__.py +46 -0
  18. fpu_barometer_admin/processors/acled_processor.py +263 -0
  19. fpu_barometer_admin/processors/base_processor.py +23 -0
  20. fpu_barometer_admin/processors/cpj_processor.py +147 -0
  21. fpu_barometer_admin/processors/ert_processor.py +72 -0
  22. fpu_barometer_admin/processors/gdelt_processor.py +260 -0
  23. fpu_barometer_admin/processors/mfrr_processor.py +327 -0
  24. fpu_barometer_admin/processors/rr_processor.py +208 -0
  25. fpu_barometer_admin/processors/vdem_processor.py +70 -0
  26. fpu_barometer_admin/runners/__init__.py +19 -0
  27. fpu_barometer_admin/runners/definitions.py +159 -0
  28. fpu_barometer_admin/runners/runners.py +291 -0
  29. fpu_barometer_admin/runners/scheduler.py +148 -0
  30. fpu_barometer_admin/runners/seed.py +399 -0
  31. fpu_barometer_admin/schemas/__init__.py +1 -0
  32. fpu_barometer_admin/schemas/event.py +362 -0
  33. fpu_barometer_admin/schemas/predictor.py +418 -0
  34. fpu_barometer_admin/storage/__init__.py +39 -0
  35. fpu_barometer_admin/storage/catalog.py +359 -0
  36. fpu_barometer_admin/storage/factory.py +165 -0
  37. fpu_barometer_admin/storage/objects.py +463 -0
  38. fpu_barometer_admin/storage/reader.py +410 -0
  39. fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
  40. fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
  41. fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
  42. fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
  43. fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
@@ -0,0 +1,260 @@
1
+ """GDELT processor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+
7
+ import pandas as pd
8
+
9
+ from fpu_barometer_admin.connectors import SourceArtifactSet
10
+ from fpu_barometer_admin.schemas.event import (
11
+ EVENT_COLUMNS,
12
+ CountryCanonicalizer,
13
+ EventSchemaValidationError,
14
+ canonical_event_id,
15
+ incident_tags,
16
+ validate_event_dataframe,
17
+ )
18
+ from fpu_barometer_admin.storage.objects import ObjectStorage
19
+
20
+
21
# Lookup from a GDELT ``EventCode`` string to the ``(top, leaf)`` incident
# labels used by the processor. Codes sharing the same labels are listed
# together; the groups are kept in the table's original key order, which is
# why "Injury" and "Death" each appear twice.
_GDELT_INCIDENT_MAP: dict[str, tuple[str, str]] = {
    code: labels
    for labels, codes in (
        (("Legal incident", "Investigation"), ("091", "092")),
        (
            ("Verbal attack", "Insult / harassment / discredit"),
            ("110", "111", "112", "1121", "1122", "1123", "1125", "113"),
        ),
        (("Legal incident", "Complaint"), ("114",)),
        (("Legal incident", "Civil legal actions / lawsuits"), ("115",)),
        (("Legal incident", "Conviction"), ("116",)),
        (
            ("Interference", "Blocked journalistic activity"),
            ("1233", "124", "1234", "1245"),
        ),
        (
            ("Verbal attack", "Threatening"),
            ("130", "131", "1321", "138", "1384", "139"),
        ),
        (("Interference", "Other"), ("170",)),
        (
            ("Attack to property", "Property (incl. houses, cars, personal belongings)"),
            ("171", "1711", "1712"),
        ),
        (
            ("Interference", "Administrative or financial interference"),
            ("172", "1721"),
        ),
        (("Legal incident", "Arrest/detention/imprisonment"), ("173",)),
        (("Legal incident", "Travel ban"), ("174",)),
        (("Attack to property", "Hacking/DDoS"), ("176",)),
        (("Physical assault", "Other"), ("180",)),
        (("Physical assault", "Abduction/kidnapping"), ("181",)),
        (
            ("Physical assault", "Without injury (physical assault not resulting in injury)"),
            ("182",),
        ),
        (("Physical assault", "Sexual assault"), ("1821",)),
        (
            ("Physical assault", "Injury (physical assault resulting in injury)"),
            ("1822",),
        ),
        (
            ("Physical assault", "Death (physical assault resulting in death)"),
            ("1823",),
        ),
        (
            ("Physical assault", "Injury (physical assault resulting in injury)"),
            ("183", "1831", "1832", "1833", "1834", "185"),
        ),
        (
            ("Physical assault", "Death (physical assault resulting in death)"),
            ("186",),
        ),
    )
    for code in codes
}
68
+
69
+
70
class GdeltProcessor:
    """Transform GDELT Source Artifacts into canonical Event data.

    Rows whose country, date or coordinates cannot be resolved are dropped
    with a ``RuntimeWarning``. Surviving rows are de-duplicated on
    ``event_id`` (last occurrence wins) and, when a baseline is supplied,
    merged on top of it.
    """

    dataset = "gdelt"
    # Columns read by this processor; a missing one raises
    # EventSchemaValidationError in ``_validate_source_columns``.
    required_columns = {
        "GlobalEventID",
        "SQLDATE",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor2Name",
        "Actor2CountryCode",
        "EventCode",
        "ActionGeo_CountryCode",
        "ActionGeo_FullName",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "SOURCEURL",
    }

    def __init__(self, object_storage: ObjectStorage):
        """Store the object storage handle and create a country canonicalizer."""
        self.object_storage = object_storage
        self.country_canonicalizer = CountryCanonicalizer()

    def process(
        self,
        artifact_set: SourceArtifactSet,
        *args: object,
        baseline_processed: pd.DataFrame | None = None,
        **kwargs: object,
    ) -> pd.DataFrame:
        """Build the canonical Event frame for one GDELT artifact set.

        Args:
            artifact_set: Artifact set whose ``dataset`` must equal ``"gdelt"``;
                its ``logical_path`` is read through the object storage.
            *args: Ignored.
            baseline_processed: Optional previously processed Events. When it
                is non-empty the new Events are appended to it and duplicates
                on ``event_id`` are resolved in favour of the newest row.
            **kwargs: Ignored.

        Returns:
            The frame returned by ``validate_event_dataframe``.

        Raises:
            ValueError: If ``artifact_set`` belongs to another dataset.
            EventSchemaValidationError: If required source columns are missing.
        """
        if artifact_set.dataset != self.dataset:
            raise ValueError(
                f"GDELT processor cannot process dataset {artifact_set.dataset!r}"
            )
        source = self.object_storage.read_dataframe(artifact_set.logical_path)
        self._validate_source_columns(source)
        if source.empty:
            # Nothing to transform: skip the row-wise applies (whose result
            # dtypes are ill-defined on empty input) but keep the baseline.
            return self._merge_with_baseline(
                pd.DataFrame(columns=EVENT_COLUMNS), baseline_processed
            )

        countries = source.apply(self._country, axis=1)
        valid_country = countries.notna()
        if (~valid_country).any():
            samples = (
                source.loc[
                    ~valid_country,
                    ["ActionGeo_CountryCode", "Actor1CountryCode", "Actor2CountryCode"],
                ]
                .head()
                .to_dict("records")
            )
            warnings.warn(
                f"GDELT dropped {(~valid_country).sum()} Event rows with unresolved countries: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        dates = source["SQLDATE"].map(self._date)
        valid_date = dates.notna()
        if (~valid_date).any():
            samples = source.loc[~valid_date, "SQLDATE"].head().tolist()
            warnings.warn(
                f"GDELT dropped {(~valid_date).sum()} Event rows with invalid dates: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        coordinates = source.apply(self._coordinates, axis=1)
        valid_coordinates = coordinates.map(lambda value: value is not None).astype(bool)
        if (~valid_coordinates).any():
            samples = (
                source.loc[~valid_coordinates, ["ActionGeo_Lat", "ActionGeo_Long"]]
                .head()
                .to_dict("records")
            )
            warnings.warn(
                f"GDELT dropped {(~valid_coordinates).sum()} Event rows with invalid coordinates: {samples}",
                RuntimeWarning,
                stacklevel=2,
            )

        keep = valid_country & valid_date & valid_coordinates
        source = source.loc[keep].copy()
        countries = countries.loc[keep]
        dates = dates.loc[keep]
        coordinates = coordinates.loc[keep]
        if source.empty:
            # Every new row was rejected; the baseline must still survive.
            return self._merge_with_baseline(
                pd.DataFrame(columns=EVENT_COLUMNS), baseline_processed
            )

        processed = pd.DataFrame(index=source.index)
        processed["event_id"] = source["GlobalEventID"].map(
            lambda value: canonical_event_id(self.dataset, value)
        )
        # Keep only the last row per event id, then realign every helper
        # series to the surviving index.
        processed = processed.loc[~processed["event_id"].duplicated(keep="last")].copy()
        source = source.loc[processed.index]
        countries = countries.loc[processed.index]
        dates = dates.loc[processed.index]
        coordinates = coordinates.loc[processed.index]

        processed["dataset"] = self.dataset
        processed["iso3"] = countries.map(lambda country: country.iso3)
        processed["country_name"] = countries.map(lambda country: country.country_name)
        processed["date"] = dates.map(lambda value: value.date())
        processed["date_precision"] = "day"
        processed["year"] = dates.map(lambda value: int(value.year)).astype("Int64")
        tags = source["EventCode"].map(self._incident_tags)
        processed["type_of_incident"] = tags.map(lambda value: value[0])
        processed["top_type_of_incident"] = tags.map(lambda value: value[1])
        processed["type_of_incident_leaves"] = tags.map(lambda value: value[2])
        processed["processed_at"] = pd.Timestamp.now(tz="UTC")
        processed["n_people_affected"] = 1
        processed["region"] = source["ActionGeo_FullName"].map(self._blank_to_na)
        processed["latitude"] = coordinates.map(lambda value: value[0])
        processed["longitude"] = coordinates.map(lambda value: value[1])
        processed["description"] = source.apply(self._description, axis=1)
        processed["source_url"] = source["SOURCEURL"].map(self._source_url)
        processed["attacked_count"] = pd.NA
        processed["gender"] = pd.NA
        processed["media_role"] = pd.NA
        processed["perpetrator_type"] = pd.NA

        processed = processed.loc[:, EVENT_COLUMNS].reset_index(drop=True)
        return self._merge_with_baseline(processed, baseline_processed)

    def _merge_with_baseline(
        self,
        processed: pd.DataFrame,
        baseline_processed: pd.DataFrame | None,
    ) -> pd.DataFrame:
        """Append new Events to the baseline, de-duplicate and validate.

        When no (non-empty) baseline is given, ``processed`` is validated
        unchanged. When ``processed`` is empty the baseline alone is used, so
        an artifact without usable rows never erases earlier Events.
        """
        if baseline_processed is not None and not baseline_processed.empty:
            baseline = baseline_processed.loc[:, EVENT_COLUMNS]
            if processed.empty:
                processed = baseline
            else:
                processed = pd.concat([baseline, processed], ignore_index=True)
            processed = processed.loc[~processed["event_id"].duplicated(keep="last")]
            processed = processed.loc[:, EVENT_COLUMNS].reset_index(drop=True)
        return validate_event_dataframe(processed)

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise EventSchemaValidationError when a required column is absent."""
        missing = self.required_columns - set(source.columns)
        if missing:
            raise EventSchemaValidationError(
                f"GDELT Source Artifact missing processor columns: {sorted(missing)}"
            )

    def _country(self, row: pd.Series):
        """Resolve the canonical country of a row, or ``None`` if unresolved."""
        # GDELT ActionGeo country codes are FIPS-ish, so prefer actor ISO3-like values
        # when available; fall back to location name.
        for value in (row.get("Actor2CountryCode"), row.get("Actor1CountryCode")):
            if not self._is_blank(value):
                country = self.country_canonicalizer.from_values(iso3=value)
                if country is not None:
                    return country
        if not self._is_blank(row.get("ActionGeo_FullName")):
            return self.country_canonicalizer.from_values(
                country_name=row.get("ActionGeo_FullName")
            )
        return None

    def _date(self, value: object) -> pd.Timestamp | None:
        """Parse a ``YYYYMMDD`` value into a Timestamp, or ``None`` if invalid."""
        # An integer column holding a missing value is read as float, which
        # would stringify as "20240101.0" and be rejected below.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        text = str(value).strip()
        if not re_match_yyyymmdd(text):
            return None
        parsed = pd.to_datetime(text, format="%Y%m%d", errors="coerce")
        return parsed if pd.notna(parsed) else None

    def _coordinates(self, row: pd.Series) -> tuple[float | object, float | object] | None:
        """Return ``(lat, long)`` for a row.

        Both values missing is accepted and yields ``(pd.NA, pd.NA)``. Exactly
        one missing value, or a value outside [-90, 90] / [-180, 180], yields
        ``None`` so that the caller drops the row.
        """
        latitude = pd.to_numeric(pd.Series([row.get("ActionGeo_Lat")]), errors="coerce").iloc[0]
        longitude = pd.to_numeric(pd.Series([row.get("ActionGeo_Long")]), errors="coerce").iloc[0]
        if pd.isna(latitude) and pd.isna(longitude):
            return (pd.NA, pd.NA)
        if pd.isna(latitude) or pd.isna(longitude):
            return None
        if latitude < -90 or latitude > 90 or longitude < -180 or longitude > 180:
            return None
        return (float(latitude), float(longitude))

    @staticmethod
    def _normalize_event_code(event_code: object) -> str:
        """Return the event code as the zero-padded string used by the map.

        Numeric inputs lose leading zeros (``"091"`` becomes ``91`` or
        ``91.0``); all-digit codes are therefore padded back to three
        characters. Non-numeric text is only stripped.
        """
        if isinstance(event_code, float) and event_code.is_integer():
            event_code = int(event_code)
        text = str(event_code).strip()
        if text.isascii() and text.isdigit():
            text = text.zfill(3)
        return text

    def _incident_tags(self, event_code: object) -> tuple[list[str], list[str], list[str]]:
        """Translate an event code into incident tags, defaulting to "Unknown"."""
        top, leaf = _GDELT_INCIDENT_MAP.get(
            self._normalize_event_code(event_code), ("Unknown", "Unknown")
        )
        return incident_tags(top=[top], leaves=[leaf])

    def _description(self, row: pd.Series) -> object:
        """Join actor names and event code with " | "; ``pd.NA`` if all blank."""
        # Missing names arrive as NaN/None; filter them before stringifying so
        # the text never contains a literal "nan" or "None".
        parts = [
            str(value).strip()
            for value in (row.get("Actor1Name"), row.get("Actor2Name"))
            if not self._is_blank(value)
        ]
        event_code = row.get("EventCode")
        if not self._is_blank(event_code):
            parts.append(f"GDELT EventCode {str(event_code).strip()}")
        return " | ".join(parts) if parts else pd.NA

    def _source_url(self, value: object) -> object:
        """Return the stripped URL text, or ``pd.NA`` for blank values."""
        if self._is_blank(value):
            return pd.NA
        return str(value).strip()

    def _blank_to_na(self, value: object) -> object:
        """Return stripped text, or ``pd.NA`` for blank values."""
        return pd.NA if self._is_blank(value) else str(value).strip()

    def _is_blank(self, value: object) -> bool:
        """Tell whether ``value`` is None, NaN-like, empty or a null-ish word."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip() or value.strip().lower() in {"nan", "none", "null"}
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna on a container gives an array whose truth value is ambiguous.
            return False
254
+
255
+
256
def re_match_yyyymmdd(value: str) -> bool:
    """Return True when *value* is exactly eight digit characters."""
    if len(value) != 8:
        return False
    return value.isdigit()
258
+
259
+
260
+ __all__ = ["GdeltProcessor"]
@@ -0,0 +1,327 @@
1
+ """MFRR processor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import html
7
+ import json
8
+ import re
9
+ import warnings
10
+ from collections.abc import Iterable
11
+
12
+ import pandas as pd
13
+
14
+ from fpu_barometer_admin.connectors import SourceArtifactSet
15
+ from fpu_barometer_admin.schemas.event import (
16
+ EVENT_COLUMNS,
17
+ INCIDENT_TAXONOMY,
18
+ LEAF_INCIDENT_TYPES,
19
+ TOP_INCIDENT_TYPES,
20
+ CountryCanonicalizer,
21
+ EventSchemaValidationError,
22
+ canonical_event_id,
23
+ incident_tags,
24
+ validate_event_dataframe,
25
+ )
26
+ from fpu_barometer_admin.storage.objects import ObjectStorage
27
+
28
+
29
# Reverse index of INCIDENT_TAXONOMY: each leaf incident type points at the
# top-level type it is listed under (a later listing overwrites an earlier one).
_LEAF_TO_TOP = dict(
    (leaf_name, top_name)
    for top_name, leaf_names in INCIDENT_TAXONOMY.items()
    for leaf_name in leaf_names
)
32
+
33
+
34
+ class MFRRProcessor:
35
+ """Transform MFRR Source Artifacts into canonical Event data."""
36
+
37
+ dataset = "mfrr"
38
+ required_columns = {
39
+ "id",
40
+ "country",
41
+ "date",
42
+ "year",
43
+ "published_at",
44
+ "published_at_date",
45
+ "type_of_incident",
46
+ "top_type_of_incident",
47
+ "type_of_incident_leaves",
48
+ "attacked_count",
49
+ }
50
+
51
+ def __init__(self, object_storage: ObjectStorage):
52
+ self.object_storage = object_storage
53
+ self.country_canonicalizer = CountryCanonicalizer()
54
+
55
+ def process(
56
+ self,
57
+ artifact_set: SourceArtifactSet,
58
+ *args: object,
59
+ **kwargs: object,
60
+ ) -> pd.DataFrame:
61
+ if artifact_set.dataset != self.dataset:
62
+ raise ValueError(
63
+ f"MFRR processor cannot process dataset {artifact_set.dataset!r}"
64
+ )
65
+ source = self.object_storage.read_dataframe(artifact_set.logical_path)
66
+ self._validate_source_columns(source)
67
+
68
+ countries = source["country"].map(
69
+ lambda value: self.country_canonicalizer.from_values(country_name=value)
70
+ )
71
+ valid_country = countries.notna()
72
+ if (~valid_country).any():
73
+ samples = source.loc[~valid_country, "country"].dropna().astype(str).unique()[:5]
74
+ warnings.warn(
75
+ f"MFRR dropped {(~valid_country).sum()} Event rows with unresolved countries: {list(samples)}",
76
+ RuntimeWarning,
77
+ stacklevel=2,
78
+ )
79
+
80
+ dates = source.apply(self._canonical_date, axis=1)
81
+ valid_date = dates.map(lambda value: value is not None)
82
+ if (~valid_date).any():
83
+ samples = source.loc[~valid_date, ["date", "year", "published_at_date", "published_at"]].head().to_dict("records")
84
+ warnings.warn(
85
+ f"MFRR dropped {(~valid_date).sum()} Event rows with invalid dates: {samples}",
86
+ RuntimeWarning,
87
+ stacklevel=2,
88
+ )
89
+
90
+ keep = valid_country & valid_date
91
+ source = source.loc[keep].copy()
92
+ countries = countries.loc[keep]
93
+ dates = dates.loc[keep]
94
+
95
+ processed = pd.DataFrame(index=source.index)
96
+ processed["event_id"] = source["id"].map(
97
+ lambda value: canonical_event_id(self.dataset, value)
98
+ )
99
+ processed["dataset"] = self.dataset
100
+ processed["iso3"] = countries.map(lambda country: country.iso3)
101
+ processed["country_name"] = countries.map(lambda country: country.country_name)
102
+ processed["date"] = dates.map(lambda value: value[0])
103
+ processed["date_precision"] = dates.map(lambda value: value[1])
104
+ processed["year"] = dates.map(lambda value: value[2]).astype("Int64")
105
+
106
+ incident_values = source.apply(self._incident_values, axis=1)
107
+ processed["type_of_incident"] = incident_values.map(lambda values: values[0])
108
+ processed["top_type_of_incident"] = incident_values.map(lambda values: values[1])
109
+ processed["type_of_incident_leaves"] = incident_values.map(lambda values: values[2])
110
+
111
+ processed["processed_at"] = pd.Timestamp.now(tz="UTC")
112
+ processed["n_people_affected"] = self._people_affected(source)
113
+ processed["region"] = source.apply(self._region, axis=1)
114
+ processed["latitude"] = source.apply(lambda row: self._coordinate(row, "lat"), axis=1)
115
+ processed["longitude"] = source.apply(lambda row: self._coordinate(row, "lng"), axis=1)
116
+ processed["description"] = source.apply(self._description, axis=1)
117
+ processed["source_url"] = source.apply(self._source_url, axis=1)
118
+ processed["attacked_count"] = self._attacked_count(source)
119
+ processed["gender"] = source["gender"].map(self._semicolon_join) if "gender" in source else pd.NA
120
+ processed["media_role"] = source["type_of_journalist_or_media_actor"].map(self._semicolon_join) if "type_of_journalist_or_media_actor" in source else pd.NA
121
+ processed["perpetrator_type"] = source["source_of_incident"].map(self._semicolon_join) if "source_of_incident" in source else pd.NA
122
+
123
+ processed = processed.loc[:, EVENT_COLUMNS]
124
+ self._validate_coordinates(processed)
125
+ return validate_event_dataframe(processed)
126
+
127
+ def _validate_source_columns(self, source: pd.DataFrame) -> None:
128
+ missing = self.required_columns - set(source.columns)
129
+ if missing:
130
+ raise EventSchemaValidationError(
131
+ f"MFRR Source Artifact missing processor columns: {sorted(missing)}"
132
+ )
133
+
134
+ def _canonical_date(self, row: pd.Series) -> tuple[object, str, int] | None:
135
+ if self._truthy(row.get("has_date_of_incidence")):
136
+ incident_date = pd.to_datetime(row.get("date"), errors="coerce")
137
+ if pd.notna(incident_date):
138
+ return incident_date.date(), "day", int(incident_date.year)
139
+
140
+ year = pd.to_numeric(pd.Series([row.get("year")]), errors="coerce").iloc[0]
141
+ if pd.notna(year):
142
+ year_int = int(year)
143
+ if 1000 <= year_int <= 9999:
144
+ return pd.Timestamp(year=year_int, month=1, day=1).date(), "year", year_int
145
+
146
+ for column in ("published_at_date", "published_at"):
147
+ value = row.get(column)
148
+ if column == "published_at" and not self._is_blank(value):
149
+ numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0]
150
+ if pd.notna(numeric):
151
+ published = pd.to_datetime(numeric, unit="s", errors="coerce")
152
+ else:
153
+ published = pd.to_datetime(value, errors="coerce")
154
+ else:
155
+ published = pd.to_datetime(value, errors="coerce")
156
+ if pd.notna(published):
157
+ return published.date(), "publication_day", int(published.year)
158
+ return None
159
+
160
+ def _incident_values(self, row: pd.Series) -> tuple[list[str], list[str], list[str]]:
161
+ source_top = [value for value in self._list_values(row.get("top_type_of_incident")) if value in TOP_INCIDENT_TYPES]
162
+ source_leaves = [value for value in self._list_values(row.get("type_of_incident_leaves")) if value in LEAF_INCIDENT_TYPES]
163
+ for value in self._list_values(row.get("type_of_incident")):
164
+ if value in TOP_INCIDENT_TYPES:
165
+ source_top.append(value)
166
+ if value in LEAF_INCIDENT_TYPES:
167
+ source_leaves.append(value)
168
+
169
+ inferred_top = [_LEAF_TO_TOP[leaf] for leaf in source_leaves if leaf in _LEAF_TO_TOP]
170
+ top = self._dedupe([*source_top, *inferred_top])
171
+ leaves = self._dedupe(source_leaves)
172
+ if top and not leaves:
173
+ leaves = [self._fallback_leaf_for_top(value) for value in top]
174
+ if not top or not leaves:
175
+ top = ["Unknown"]
176
+ leaves = ["Unknown"]
177
+ return incident_tags(top=top, leaves=leaves)
178
+
179
+ def _fallback_leaf_for_top(self, top: str) -> str:
180
+ allowed = INCIDENT_TAXONOMY.get(top, ())
181
+ if "Unknown" in allowed:
182
+ return "Unknown"
183
+ if top in allowed:
184
+ return top
185
+ return allowed[0] if allowed else "Unknown"
186
+
187
+ def _people_affected(self, source: pd.DataFrame) -> pd.Series:
188
+ counts = pd.to_numeric(source["attacked_count"], errors="coerce")
189
+ return counts.fillna(1).clip(lower=1).astype("Int64")
190
+
191
+ def _attacked_count(self, source: pd.DataFrame) -> pd.Series:
192
+ return pd.to_numeric(source["attacked_count"], errors="coerce").astype("Int64")
193
+
194
+ def _description(self, row: pd.Series) -> object:
195
+ title = self._clean_text(row.get("title"))
196
+ content = self._clean_text(row.get("content"))
197
+ parts = [part for part in (title, content) if part]
198
+ return "\n\n".join(parts) if parts else pd.NA
199
+
200
+ def _source_url(self, row: pd.Series) -> object:
201
+ for value in [row.get("coe_link"), *self._url_values(row.get("news_source_links")), *self._url_values(row.get("internet_source_links"))]:
202
+ url = self._clean_text(value)
203
+ if not url:
204
+ continue
205
+ if not re.match(r"^[a-z][a-z0-9+.-]*://", url, flags=re.IGNORECASE):
206
+ url = f"https://{url}"
207
+ return url
208
+ return pd.NA
209
+
210
+ def _region(self, row: pd.Series) -> object:
211
+ region_names = [self._clean_text(value) for value in self._list_values(row.get("region_names"))]
212
+ region_names = [value for value in region_names if value and not self._is_unknownish(value)]
213
+ return region_names[2] if len(region_names) >= 3 else pd.NA
214
+
215
+ def _coordinate(self, row: pd.Series, axis: str) -> object:
216
+ flat_column = "_geo_lat" if axis == "lat" else "_geo_lng"
217
+ dotted_column = "_geo.lat" if axis == "lat" else "_geo.lng"
218
+ value = row.get(flat_column)
219
+ if self._is_blank(value):
220
+ value = row.get(dotted_column)
221
+ if self._is_blank(value):
222
+ geo = self._maybe_mapping(row.get("_geo"))
223
+ if isinstance(geo, dict):
224
+ value = geo.get("lat" if axis == "lat" else "lng")
225
+ numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0]
226
+ return numeric if pd.notna(numeric) else pd.NA
227
+
228
+ def _validate_coordinates(self, processed: pd.DataFrame) -> None:
229
+ latitude = processed["latitude"].dropna()
230
+ longitude = processed["longitude"].dropna()
231
+ if ((latitude < -90) | (latitude > 90)).any():
232
+ raise EventSchemaValidationError("MFRR Event latitude outside [-90, 90]")
233
+ if ((longitude < -180) | (longitude > 180)).any():
234
+ raise EventSchemaValidationError("MFRR Event longitude outside [-180, 180]")
235
+
236
+ def _semicolon_join(self, value: object) -> object:
237
+ values = [self._clean_text(item) for item in self._list_values(value)]
238
+ values = [item for item in values if item and not self._is_unknownish(item)]
239
+ return "; ".join(self._dedupe(values)) if values else pd.NA
240
+
241
+ def _url_values(self, value: object) -> list[object]:
242
+ values: list[object] = []
243
+ for item in self._list_values(value):
244
+ if isinstance(item, str):
245
+ values.extend(part for part in re.split(r"[\r\n]+", item) if part.strip())
246
+ else:
247
+ values.append(item)
248
+ return values
249
+
250
+ def _list_values(self, value: object) -> list[object]:
251
+ if self._is_blank(value):
252
+ return []
253
+ if isinstance(value, (list, tuple, set)):
254
+ return list(value)
255
+ if isinstance(value, str):
256
+ text = value.strip()
257
+ if not text:
258
+ return []
259
+ if text[:1] in "[({":
260
+ try:
261
+ parsed = ast.literal_eval(text)
262
+ except (SyntaxError, ValueError):
263
+ try:
264
+ parsed = json.loads(text)
265
+ except json.JSONDecodeError:
266
+ return [text]
267
+ if isinstance(parsed, (list, tuple, set)):
268
+ return list(parsed)
269
+ return [text]
270
+ if isinstance(value, Iterable) and not isinstance(value, (str, bytes, dict)):
271
+ return list(value)
272
+ return [value]
273
+
274
+ def _maybe_mapping(self, value: object) -> object:
275
+ if isinstance(value, dict) or self._is_blank(value):
276
+ return value
277
+ if isinstance(value, str) and value.strip()[:1] in "{":
278
+ for loader in (json.loads, ast.literal_eval):
279
+ try:
280
+ parsed = loader(value)
281
+ except (json.JSONDecodeError, SyntaxError, ValueError):
282
+ continue
283
+ if isinstance(parsed, dict):
284
+ return parsed
285
+ return value
286
+
287
+ def _clean_text(self, value: object) -> str | None:
288
+ if self._is_blank(value):
289
+ return None
290
+ text = str(value)
291
+ text = re.sub(r"<[^>]+>", " ", text)
292
+ text = html.unescape(text)
293
+ text = re.sub(r"\s+", " ", text).strip()
294
+ text = re.sub(r"\s+([.,;:!?])", r"\1", text)
295
+ return text or None
296
+
297
+ def _is_unknownish(self, value: str) -> bool:
298
+ text = value.strip().lower()
299
+ return (
300
+ text in {"unknown", "nan", "none", "null", "not applicable", "n/a", "na"}
301
+ or text.startswith("unknown ")
302
+ or text.endswith(" unknown")
303
+ )
304
+
305
+ def _truthy(self, value: object) -> bool:
306
+ if self._is_blank(value):
307
+ return False
308
+ if isinstance(value, str):
309
+ return value.strip().lower() in {"1", "true", "yes", "y"}
310
+ return bool(value)
311
+
312
+ def _is_blank(self, value: object) -> bool:
313
+ if value is None:
314
+ return True
315
+ if isinstance(value, str):
316
+ return not value.strip() or value.strip().lower() in {"nan", "none", "null"}
317
+ try:
318
+ return bool(pd.isna(value))
319
+ except (TypeError, ValueError):
320
+ return False
321
+
322
+ def _dedupe(self, values: Iterable) -> list:
323
+ result = []
324
+ for value in values:
325
+ if value not in result:
326
+ result.append(value)
327
+ return result