fpu-barometer-admin 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fpu_barometer_admin/__init__.py +6 -0
- fpu_barometer_admin/cli/__init__.py +5 -0
- fpu_barometer_admin/cli/commands.py +199 -0
- fpu_barometer_admin/cli/deploy.py +719 -0
- fpu_barometer_admin/connectors/__init__.py +56 -0
- fpu_barometer_admin/connectors/acled_connector.py +77 -0
- fpu_barometer_admin/connectors/base_connector.py +60 -0
- fpu_barometer_admin/connectors/cpj_connector.py +92 -0
- fpu_barometer_admin/connectors/ert_connector.py +134 -0
- fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
- fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
- fpu_barometer_admin/connectors/rr_connector.py +84 -0
- fpu_barometer_admin/connectors/static_sources.py +41 -0
- fpu_barometer_admin/connectors/vdem_connector.py +165 -0
- fpu_barometer_admin/handlers/__init__.py +6 -0
- fpu_barometer_admin/handlers/function_app.py +543 -0
- fpu_barometer_admin/processors/__init__.py +46 -0
- fpu_barometer_admin/processors/acled_processor.py +263 -0
- fpu_barometer_admin/processors/base_processor.py +23 -0
- fpu_barometer_admin/processors/cpj_processor.py +147 -0
- fpu_barometer_admin/processors/ert_processor.py +72 -0
- fpu_barometer_admin/processors/gdelt_processor.py +260 -0
- fpu_barometer_admin/processors/mfrr_processor.py +327 -0
- fpu_barometer_admin/processors/rr_processor.py +208 -0
- fpu_barometer_admin/processors/vdem_processor.py +70 -0
- fpu_barometer_admin/runners/__init__.py +19 -0
- fpu_barometer_admin/runners/definitions.py +159 -0
- fpu_barometer_admin/runners/runners.py +291 -0
- fpu_barometer_admin/runners/scheduler.py +148 -0
- fpu_barometer_admin/runners/seed.py +399 -0
- fpu_barometer_admin/schemas/__init__.py +1 -0
- fpu_barometer_admin/schemas/event.py +362 -0
- fpu_barometer_admin/schemas/predictor.py +418 -0
- fpu_barometer_admin/storage/__init__.py +39 -0
- fpu_barometer_admin/storage/catalog.py +359 -0
- fpu_barometer_admin/storage/factory.py +165 -0
- fpu_barometer_admin/storage/objects.py +463 -0
- fpu_barometer_admin/storage/reader.py +410 -0
- fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
- fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
- fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
- fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
- fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
"""GDELT connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import zipfile
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
from fpu_barometer_admin.connectors.base_connector import (
|
|
18
|
+
NoNewSourceArtifact,
|
|
19
|
+
SourceArtifactSet,
|
|
20
|
+
SourceValidationError,
|
|
21
|
+
)
|
|
22
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Manifest listing every GDELT v2 file; the connector scans it for export archives.
GDELT_MASTER_FILE_LIST = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
# Timeout, in seconds, applied to each HTTP request made by the connector.
GDELT_TIMEOUT_SECONDS = 60
# EventCode values kept by the connector's row filter, grouped here by their
# leading two digits purely for readability (membership is what matters).
GDELT_EVENT_CODES = {
    # 09x
    "091", "092",
    # 11x
    "110", "111", "112", "1121", "1122", "1123", "1125", "113", "114", "115", "116",
    # 12x
    "1233", "1234", "124", "1245",
    # 13x
    "130", "131", "1321", "138", "1384", "139",
    # 17x
    "170", "171", "1711", "1712", "172", "1721", "173", "174", "176",
    # 18x
    "180", "181", "182", "1821", "1822", "1823",
    "183", "1831", "1832", "1833", "1834", "185", "186",
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass(frozen=True)
|
|
77
|
+
class _ManifestEntry:
|
|
78
|
+
timestamp: str
|
|
79
|
+
url: str
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class GdeltConnector:
    """Connector for bounded incremental GDELT Event export refreshes.

    ``fetch`` reads the manifest, downloads the export archives selected for
    this run, keeps the rows whose ``EventCode`` is in ``GDELT_EVENT_CODES`` and
    that match the media-relevance mask, and writes them as one Parquet source
    artifact through ``object_storage``.
    """

    dataset = "gdelt"
    # Names assigned positionally to the header-less, tab-separated export CSV.
    source_columns = (
        "GlobalEventID",
        "SQLDATE",
        "MonthYear",
        "Year",
        "FractionDate",
        "Actor1Code",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor1KnownGroupCode",
        "Actor1EthnicCode",
        "Actor1Religion1Code",
        "Actor1Religion2Code",
        "Actor1Type1Code",
        "Actor1Type2Code",
        "Actor1Type3Code",
        "Actor2Code",
        "Actor2Name",
        "Actor2CountryCode",
        "Actor2KnownGroupCode",
        "Actor2EthnicCode",
        "Actor2Religion1Code",
        "Actor2Religion2Code",
        "Actor2Type1Code",
        "Actor2Type2Code",
        "Actor2Type3Code",
        "IsRootEvent",
        "EventCode",
        "EventBaseCode",
        "EventRootCode",
        "QuadClass",
        "GoldsteinScale",
        "NumMentions",
        "NumSources",
        "NumArticles",
        "AvgTone",
        "Actor1Geo_Type",
        "Actor1Geo_FullName",
        "Actor1Geo_CountryCode",
        "Actor1Geo_ADM1Code",
        "Actor1Geo_ADM2Code",
        "Actor1Geo_Lat",
        "Actor1Geo_Long",
        "Actor1Geo_FeatureID",
        "Actor2Geo_Type",
        "Actor2Geo_FullName",
        "Actor2Geo_CountryCode",
        "Actor2Geo_ADM1Code",
        "Actor2Geo_ADM2Code",
        "Actor2Geo_Lat",
        "Actor2Geo_Long",
        "Actor2Geo_FeatureID",
        "ActionGeo_Type",
        "ActionGeo_FullName",
        "ActionGeo_CountryCode",
        "ActionGeo_ADM1Code",
        "ActionGeo_ADM2Code",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "ActionGeo_FeatureID",
        "DATEADDED",
        "SOURCEURL",
    )
    # Columns that must exist in the combined frame before it is filtered.
    required_columns = {
        "GlobalEventID",
        "SQLDATE",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor1Type1Code",
        "Actor2Name",
        "Actor2CountryCode",
        "Actor2Type1Code",
        "EventCode",
        "ActionGeo_CountryCode",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "DATEADDED",
        "SOURCEURL",
    }
    # Substrings searched (case-insensitively) in the actor name/code columns.
    media_terms = ("media", "journalist", "reporter", "press", "news")
    # Actor type codes that mark a row as media-related.
    media_type_codes = {"MED", "JRN"}

    def __init__(
        self,
        object_storage: ObjectStorage,
        *,
        manifest_url: str = GDELT_MASTER_FILE_LIST,
        timeout_seconds: int = GDELT_TIMEOUT_SECONDS,
        max_files_per_run: int | None = None,
        bootstrap_file_limit: int | None = None,
    ):
        """Store the storage handle and the per-run limits.

        Args:
            object_storage: Storage used to write the resulting artifact.
            manifest_url: URL of the manifest listing export archives.
            timeout_seconds: Timeout passed to every ``requests.get`` call.
            max_files_per_run: Upper bound on archives fetched per run. A falsy
                value (``None`` or ``0``) falls back to the
                ``GDELT_MAX_FILES_PER_RUN`` environment variable; when that is
                unset too, no bound is applied.
            bootstrap_file_limit: Number of most recent archives to fetch when
                no watermark is supplied. ``None`` falls back to the
                ``GDELT_BOOTSTRAP_FILE_LIMIT`` environment variable, then to 1.
        """
        self.object_storage = object_storage
        self.manifest_url = manifest_url
        self.timeout_seconds = timeout_seconds
        self.max_files_per_run = max_files_per_run or _positive_int_env(
            "GDELT_MAX_FILES_PER_RUN"
        )
        self.bootstrap_file_limit = (
            bootstrap_file_limit
            if bootstrap_file_limit is not None
            else _positive_int_env("GDELT_BOOTSTRAP_FILE_LIMIT", default=1)
        )

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Download the selected export archives and write one source artifact.

        Args:
            run_id: Accepted for interface compatibility; not used here.
            artifact_id: Identifier embedded in the artifact's logical path.
            since_watermark: Only manifest entries with a strictly greater
                timestamp are considered; when falsy the bootstrap selection
                is used instead.

        Returns:
            The ``SourceArtifactSet`` describing the written Parquet file.

        Raises:
            NoNewSourceArtifact: Nothing was selected, every selected archive
                returned HTTP 404, or no relevant rows remained after filtering.
            requests.exceptions.HTTPError: Any non-404 HTTP failure.
        """
        entries = self._manifest_entries()
        selected = self._select_entries(entries, since_watermark=since_watermark)
        if not selected:
            raise NoNewSourceArtifact("GDELT manifest has no newer event export files")

        frames: list[pd.DataFrame] = []
        earliest_skipped_timestamp: str | None = None
        for entry in selected:
            try:
                frame = self._read_remote_zip_csv(entry.url)
            except requests.exceptions.HTTPError as exc:
                # A 404 means the manifest lists a file that is not served yet;
                # remember the first such gap and keep going. Anything else is
                # a real failure.
                if exc.response is not None and exc.response.status_code == 404:
                    logger.warning(
                        "GDELT export file not yet available, skipping: %s",
                        entry.url,
                    )
                    if earliest_skipped_timestamp is None:
                        earliest_skipped_timestamp = entry.timestamp
                    continue
                raise
            frames.append(frame)

        if not frames:
            raise NoNewSourceArtifact(
                "GDELT export files listed in manifest are not yet available",
                watermark_after=since_watermark,
            )

        # Determine watermark so we never skip data due to transient 404s.
        # If any file returned 404, only advance the watermark to the last
        # selected entry BEFORE the first gap. Files at and after the gap
        # will be retried on the next run (reprocessing already-seen files
        # is safe because the processor deduplicates by event_id against
        # the baseline processed dataset).
        if earliest_skipped_timestamp is not None:
            watermark_after = since_watermark or ""
            for entry in selected:
                if entry.timestamp >= earliest_skipped_timestamp:
                    break
                watermark_after = entry.timestamp
        else:
            watermark_after = selected[-1].timestamp

        return self._write_source_artifact(
            frames,
            artifact_id=artifact_id,
            watermark_after=watermark_after,
            empty_message="GDELT export files contained no Barometer-relevant Event rows",
        )

    def _manifest_entries(self) -> list[_ManifestEntry]:
        """Download the manifest and return its export entries sorted by timestamp.

        Entries are de-duplicated by URL. Raises ``requests`` HTTP errors via
        ``raise_for_status`` when the manifest cannot be fetched.
        """
        response = requests.get(self.manifest_url, timeout=self.timeout_seconds)
        response.raise_for_status()
        # Group 1 is the full archive URL, group 2 its 14-digit timestamp.
        pattern = re.compile(r"(https?://\S+/(\d{14})\.export\.CSV\.zip)")
        entries: list[_ManifestEntry] = []
        for line in response.text.splitlines():
            match = pattern.search(line)
            if match:
                entries.append(
                    _ManifestEntry(timestamp=match.group(2), url=match.group(1))
                )
        return sorted(
            {entry.url: entry for entry in entries}.values(),
            key=lambda entry: entry.timestamp,
        )

    def _select_entries(
        self, entries: list[_ManifestEntry], *, since_watermark: str | None
    ) -> list[_ManifestEntry]:
        """Pick the manifest entries to fetch in this run, oldest first.

        With a watermark, every entry with a strictly greater timestamp is a
        candidate. Without one, the last ``bootstrap_file_limit`` entries are
        used. The result is then truncated to ``max_files_per_run`` when set.
        """
        if since_watermark:
            candidates = [
                entry for entry in entries if entry.timestamp > since_watermark
            ]
        elif self.bootstrap_file_limit > 0:
            candidates = entries[-self.bootstrap_file_limit :]
        else:
            # ``entries[-0:]`` would be the WHOLE manifest and a negative limit
            # would slice an arbitrary tail; a non-positive limit selects nothing.
            candidates = []
        candidates = sorted(candidates, key=lambda entry: entry.timestamp)
        if self.max_files_per_run is not None:
            candidates = candidates[: self.max_files_per_run]
        return candidates

    def _write_source_artifact(
        self,
        frames: list[pd.DataFrame],
        *,
        artifact_id: str,
        watermark_after: str,
        empty_message: str,
    ) -> SourceArtifactSet:
        """Concatenate, validate, filter and persist the downloaded frames.

        Raises:
            SourceValidationError: A required column is missing.
            NoNewSourceArtifact: No rows survive filtering; carries
                ``watermark_after`` and ``empty_message``.
        """
        source = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        self._validate_source_columns(source)
        source = self._filter_relevant_rows(source)
        if source.empty:
            raise NoNewSourceArtifact(empty_message, watermark_after=watermark_after)

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/gdelt.parquet"
        self.object_storage.write_dataframe(logical_path, source, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
            watermark_after=watermark_after,
        )

    def _read_remote_zip_csv(self, url: str) -> pd.DataFrame:
        """Download one export archive and parse its CSV member."""
        response = requests.get(url, timeout=self.timeout_seconds)
        response.raise_for_status()
        return self._read_zip_csv_bytes(response.content, source_label=url)

    def _read_zip_csv_bytes(self, payload: bytes, *, source_label: str) -> pd.DataFrame:
        """Parse the first CSV member of an in-memory zip archive.

        Raises:
            SourceValidationError: The payload is not a zip file or has no
                member whose upper-cased name ends with ``CSV``.
        """
        try:
            with zipfile.ZipFile(io.BytesIO(payload)) as archive:
                members = [
                    name for name in archive.namelist() if name.upper().endswith("CSV")
                ]
                if not members:
                    raise SourceValidationError(
                        f"GDELT zip contains no CSV member: {source_label}",
                        metadata={
                            "dataset": self.dataset,
                            "source_url": source_label,
                            "expected_shape": "zip_csv_member",
                        },
                    )
                with archive.open(members[0]) as handle:
                    frame = self._read_gdelt_csv(handle, source_label=source_label)
        except zipfile.BadZipFile as exc:
            raise SourceValidationError(
                f"GDELT source is not a valid zip: {source_label}",
                metadata={
                    "dataset": self.dataset,
                    "source_url": source_label,
                    "expected_shape": "zip_file",
                },
            ) from exc
        return frame

    def _read_gdelt_csv(self, source, *, source_label: str) -> pd.DataFrame:
        """Read a header-less, tab-separated export and label its columns.

        Every value is read as ``str`` and empty fields stay empty strings.

        Raises:
            SourceValidationError: The column count differs from
                ``source_columns``.
        """
        import csv

        frame = pd.read_csv(
            source,
            sep="\t",
            header=None,
            dtype=str,
            keep_default_na=False,
            # The export is plain tab-delimited text with no quoting convention;
            # with default quote handling a field that merely starts with '"'
            # would swallow the following tabs and newlines.
            quoting=csv.QUOTE_NONE,
        )
        if frame.shape[1] != len(self.source_columns):
            raise SourceValidationError(
                f"GDELT source has {frame.shape[1]} columns; expected {len(self.source_columns)}",
                metadata={
                    "dataset": self.dataset,
                    "source_url": source_label,
                    "actual_columns": frame.shape[1],
                    "expected_columns": len(self.source_columns),
                },
            )
        frame.columns = self.source_columns
        return frame

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``SourceValidationError`` when a required column is absent."""
        missing = self.required_columns - set(source.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"GDELT source missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )

    def _filter_relevant_rows(self, source: pd.DataFrame) -> pd.DataFrame:
        """Keep rows with a tracked ``EventCode`` that also match the media mask."""
        filtered = source[
            source["EventCode"].astype(str).isin(GDELT_EVENT_CODES)
        ].copy()
        if filtered.empty:
            return filtered
        return filtered.loc[self._media_relevance_mask(filtered)].reset_index(drop=True)

    def _media_relevance_mask(self, source: pd.DataFrame) -> pd.Series:
        """Return a boolean mask that is True where a row looks media-related.

        A row matches when any actor type column equals one of
        ``media_type_codes`` (case-insensitive) or any actor name/code column
        contains one of ``media_terms`` as a substring (case-insensitive).
        Columns absent from ``source`` are ignored.
        """
        mask = pd.Series(False, index=source.index)
        for column in (
            "Actor1Type1Code",
            "Actor1Type2Code",
            "Actor1Type3Code",
            "Actor2Type1Code",
            "Actor2Type2Code",
            "Actor2Type3Code",
        ):
            if column in source:
                mask = mask | source[column].astype(str).str.upper().isin(
                    self.media_type_codes
                )
        for column in ("Actor1Name", "Actor2Name", "Actor1Code", "Actor2Code"):
            if column in source:
                lowered = source[column].astype(str).str.lower()
                for term in self.media_terms:
                    mask = mask | lowered.str.contains(term, regex=False)
        return mask
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _positive_int_env(name: str, default: int | None = None) -> int | None:
|
|
394
|
+
raw = os.environ.get(name)
|
|
395
|
+
if raw is None or raw == "":
|
|
396
|
+
return default
|
|
397
|
+
value = int(raw)
|
|
398
|
+
if value <= 0:
|
|
399
|
+
raise ValueError(f"{name} must be a positive integer")
|
|
400
|
+
return value
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
__all__ = ["GDELT_EVENT_CODES", "GDELT_MASTER_FILE_LIST", "GdeltConnector"]
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""MFRR connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
|
|
11
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Endpoint that the connector POSTs its search query to.
MFRR_API_ENDPOINT = "https://www.mapmf.org/meili/multi-search"
# Bearer token sent in the Authorization header.
# NOTE(review): this credential is hard-coded in source; confirm it is meant to
# be public before shipping it in a published package.
MFRR_API_TOKEN = "c129ca42527c52965c80099ab1a869f40de8ec3b698d1e361b0cf7402c6d48a1"
# NOTE(review): not referenced by the connector code visible here; the query
# uses MFRR_LIMIT instead.
MFRR_PAGE_SIZE = 1000
# Timeout, in seconds, for the API request.
MFRR_TIMEOUT_SECONDS = 60
# "limit" value sent with the query.
MFRR_LIMIT = 999999999999

# Fields requested for each hit via "attributesToRetrieve" (order preserved).
MFRR_ATTRIBUTES_TO_RETRIEVE = [
    "id", "title", "content", "country",
    "date", "year", "published_at", "published_at_date",
    "has_date_of_incidence",
    "type_of_incident", "top_type_of_incident", "type_of_incident_leaves",
    "attacked_count",
    "coe_link", "news_source_links", "internet_source_links",
    "gender", "type_of_journalist_or_media_actor",
    "source_of_incident", "context_of_incident",
    "subjects", "who_was_attacked",
    "region_names", "region_ids",
    "consolidated_count", "specific_topic", "project",
    "_geo_lat", "_geo_lng", "_geo.lat", "_geo.lng", "_geo",
]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class MFRRConnector:
    """Connector for the MFRR API Event source.

    ``fetch`` issues a single search request, validates the response shape and
    required columns, normalizes object-typed values to strings, and writes the
    result as a Parquet source artifact.
    """

    dataset = "mfrr"
    # Columns that must be present in the frame built from the API hits.
    required_columns = {
        "id",
        "country",
        "date",
        "year",
        "published_at",
        "published_at_date",
        "type_of_incident",
        "top_type_of_incident",
        "type_of_incident_leaves",
        "attacked_count",
    }

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage handle used to write the artifact."""
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Query the API and persist all returned hits as one artifact.

        Args:
            run_id: Accepted for interface compatibility; not used here.
            artifact_id: Identifier embedded in the artifact's logical path.
            since_watermark: Accepted for interface compatibility; not used
                here (every call requests the full result set).

        Returns:
            The ``SourceArtifactSet`` describing the written Parquet file.

        Raises:
            requests.exceptions.HTTPError: The API answered with an error status.
            SourceValidationError: The response shape is unexpected or a
                required column is missing.
        """
        response = requests.post(
            MFRR_API_ENDPOINT,
            headers={"Authorization": f"Bearer {MFRR_API_TOKEN}"},
            json={
                "queries": [
                    {
                        "indexUid": "alerts",
                        "q": "",
                        "sort": ["timestamp:desc"],
                        "limit": MFRR_LIMIT,
                        "offset": 0,
                        "attributesToRetrieve": MFRR_ATTRIBUTES_TO_RETRIEVE,
                    }
                ]
            },
            timeout=MFRR_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        payload = response.json()
        hits = self._hits_from_response(payload)
        df = pd.DataFrame(hits)
        missing = self.required_columns - set(df.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"MFRR API source missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )
        df = self._storage_normalized(df)

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/mfrr.parquet"
        self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
        )

    def _storage_normalized(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize source-shaped API values so mixed JSON-ish columns survive Parquet.

        Works on a copy; only object-dtype columns are rewritten, each value
        going through ``_source_value``.
        """
        normalized = df.copy()
        for column in normalized.select_dtypes(include="object").columns:
            normalized[column] = normalized[column].map(self._source_value)
        return normalized

    def _source_value(self, value: object) -> object:
        """Convert one cell to ``None`` or a string.

        Lists, tuples and dicts become their JSON text, missing scalars become
        ``None``, and everything else is passed through ``str``.
        """
        # Containers are handled first: ``pd.isna`` on a one-element list such
        # as ``[None]`` yields a one-element array that evaluates truthy, which
        # would wrongly collapse the whole list to ``None``.
        if isinstance(value, (list, tuple, dict)):
            return json.dumps(value, ensure_ascii=False)
        if value is None:
            return None
        try:
            if pd.isna(value):
                return None
        except (TypeError, ValueError):
            # Array-like values have no single truth value; treat as present.
            pass
        return str(value)

    def _hits_from_response(self, payload: object) -> list[dict]:
        """Extract ``results[0]["hits"]`` from the decoded response.

        Raises:
            SourceValidationError: The payload is not a dict, ``results`` is
                not a non-empty list, the first result lacks ``hits``, or
                ``hits`` is not a list.
        """
        if not isinstance(payload, dict):
            raise SourceValidationError(
                "MFRR API response must be a JSON object",
                metadata={"dataset": self.dataset, "expected_shape": "json_object"},
            )
        results = payload.get("results")
        if not isinstance(results, list) or not results:
            raise SourceValidationError(
                "MFRR API response results must be a non-empty list",
                metadata={
                    "dataset": self.dataset,
                    "expected_shape": "non_empty_results_list",
                },
            )
        first = results[0]
        if not isinstance(first, dict) or "hits" not in first:
            raise SourceValidationError(
                "MFRR API first result must contain hits",
                metadata={
                    "dataset": self.dataset,
                    "expected_shape": "first_result_with_hits",
                },
            )
        hits = first["hits"]
        if not isinstance(hits, list):
            raise SourceValidationError(
                "MFRR API hits must be a list",
                metadata={"dataset": self.dataset, "expected_shape": "hits_list"},
            )
        return hits
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""RR (Reporters Respond) connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
|
|
11
|
+
from fpu_barometer_admin.connectors.static_sources import static_source_for_dataset
|
|
12
|
+
from fpu_barometer_admin.storage.objects import ObjectStorage
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RrConnector:
    """Connector for the static RR (Reporters Respond) source file.

    The CSV is read from logical storage, checked for the required columns and
    re-written unchanged as a Parquet source artifact.
    """

    dataset = "rr"
    # Column headers the static CSV must contain.
    required_columns = {
        "Submission Date",
        "Nationality:",
        "Current location:",
        "Region",
        "Year",
        "Decision",
        "Considered under",
        "Prevention or Protection",
        "Type of Assistance",
        "Legal Threat",
        "EUR Amount",
        "Number of Journalists Supported",
        "I am applying to Reporters Respond as:",
        "How do you identify as?",
        "Submission ID",
    }

    def __init__(self, object_storage: ObjectStorage):
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Fetch the deployed RR static source from logical storage.

        ``since_watermark`` is accepted but not consulted; the whole file is
        read on every call.
        """
        static_file = static_source_for_dataset(self.dataset)
        raw_bytes = self.object_storage.read_bytes(static_file.logical_path)
        table = pd.read_csv(BytesIO(raw_bytes), low_memory=False)
        # The artifact is named after the static file, minus its extension.
        stem = Path(static_file.logical_path).stem
        return self._write_source_artifact(
            table,
            source_name=stem,
            run_id=run_id,
            artifact_id=artifact_id,
        )

    def _write_source_artifact(
        self,
        df: pd.DataFrame,
        *,
        source_name: str,
        run_id: str,
        artifact_id: str,
    ) -> SourceArtifactSet:
        """Validate *df* and write it as ``<source_name>.parquet``.

        Raises ``SourceValidationError`` listing the absent columns (sorted)
        when any required column is missing. ``run_id`` is not used here.
        """
        absent = sorted(self.required_columns - set(df.columns))
        if absent:
            raise SourceValidationError(
                f"RR static source missing required columns: {absent}",
                metadata={"dataset": self.dataset, "missing_columns": absent},
            )

        target = f"source_artifacts/{self.dataset}/{artifact_id}/{source_name}.parquet"
        self.object_storage.write_dataframe(target, df, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=target,
        )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Static source file declarations for dataset Connectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class StaticSourceFile:
    """A deploy-time static source file copied into logical storage.

    Instances are immutable.
    """

    # Dataset name used to look the file up.
    dataset: str
    # Location of the file relative to the local project tree.
    local_path: Path
    # Key under which the file is stored in logical storage.
    logical_path: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Every static source file known to the connectors, one entry per dataset.
STATIC_SOURCE_FILES: tuple[StaticSourceFile, ...] = (
    StaticSourceFile(
        dataset="acled",
        local_path=Path("data", "static", "acled.csv"),
        logical_path="source_artifacts/acled/acled.csv",
    ),
    StaticSourceFile(
        dataset="rr",
        local_path=Path("data", "static", "rr.csv"),
        logical_path="source_artifacts/rr/rr.csv",
    ),
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def static_source_for_dataset(dataset: str) -> StaticSourceFile:
    """Look up the static source file registered for *dataset*.

    Returns the first entry of ``STATIC_SOURCE_FILES`` whose ``dataset``
    matches; raises ``ValueError`` when there is none.
    """
    found = next(
        (candidate for candidate in STATIC_SOURCE_FILES if candidate.dataset == dataset),
        None,
    )
    if found is None:
        raise ValueError(f"Dataset {dataset!r} has no configured static source file")
    return found
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
__all__ = ["STATIC_SOURCE_FILES", "StaticSourceFile", "static_source_for_dataset"]
|