fpu-barometer-admin 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. fpu_barometer_admin/__init__.py +6 -0
  2. fpu_barometer_admin/cli/__init__.py +5 -0
  3. fpu_barometer_admin/cli/commands.py +199 -0
  4. fpu_barometer_admin/cli/deploy.py +719 -0
  5. fpu_barometer_admin/connectors/__init__.py +56 -0
  6. fpu_barometer_admin/connectors/acled_connector.py +77 -0
  7. fpu_barometer_admin/connectors/base_connector.py +60 -0
  8. fpu_barometer_admin/connectors/cpj_connector.py +92 -0
  9. fpu_barometer_admin/connectors/ert_connector.py +134 -0
  10. fpu_barometer_admin/connectors/gdelt_connector.py +403 -0
  11. fpu_barometer_admin/connectors/mfrr_connector.py +171 -0
  12. fpu_barometer_admin/connectors/rr_connector.py +84 -0
  13. fpu_barometer_admin/connectors/static_sources.py +41 -0
  14. fpu_barometer_admin/connectors/vdem_connector.py +165 -0
  15. fpu_barometer_admin/handlers/__init__.py +6 -0
  16. fpu_barometer_admin/handlers/function_app.py +543 -0
  17. fpu_barometer_admin/processors/__init__.py +46 -0
  18. fpu_barometer_admin/processors/acled_processor.py +263 -0
  19. fpu_barometer_admin/processors/base_processor.py +23 -0
  20. fpu_barometer_admin/processors/cpj_processor.py +147 -0
  21. fpu_barometer_admin/processors/ert_processor.py +72 -0
  22. fpu_barometer_admin/processors/gdelt_processor.py +260 -0
  23. fpu_barometer_admin/processors/mfrr_processor.py +327 -0
  24. fpu_barometer_admin/processors/rr_processor.py +208 -0
  25. fpu_barometer_admin/processors/vdem_processor.py +70 -0
  26. fpu_barometer_admin/runners/__init__.py +19 -0
  27. fpu_barometer_admin/runners/definitions.py +159 -0
  28. fpu_barometer_admin/runners/runners.py +291 -0
  29. fpu_barometer_admin/runners/scheduler.py +148 -0
  30. fpu_barometer_admin/runners/seed.py +399 -0
  31. fpu_barometer_admin/schemas/__init__.py +1 -0
  32. fpu_barometer_admin/schemas/event.py +362 -0
  33. fpu_barometer_admin/schemas/predictor.py +418 -0
  34. fpu_barometer_admin/storage/__init__.py +39 -0
  35. fpu_barometer_admin/storage/catalog.py +359 -0
  36. fpu_barometer_admin/storage/factory.py +165 -0
  37. fpu_barometer_admin/storage/objects.py +463 -0
  38. fpu_barometer_admin/storage/reader.py +410 -0
  39. fpu_barometer_admin-0.3.0.dist-info/METADATA +27 -0
  40. fpu_barometer_admin-0.3.0.dist-info/RECORD +43 -0
  41. fpu_barometer_admin-0.3.0.dist-info/WHEEL +4 -0
  42. fpu_barometer_admin-0.3.0.dist-info/entry_points.txt +2 -0
  43. fpu_barometer_admin-0.3.0.dist-info/licenses/LICENSE.md +7 -0
@@ -0,0 +1,403 @@
1
+ """GDELT connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+ import os
8
+ import re
9
+ import zipfile
10
+ from dataclasses import dataclass
11
+ import pandas as pd
12
+ import requests
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ from fpu_barometer_admin.connectors.base_connector import (
18
+ NoNewSourceArtifact,
19
+ SourceArtifactSet,
20
+ SourceValidationError,
21
+ )
22
+ from fpu_barometer_admin.storage.objects import ObjectStorage
23
+
24
+
25
# Default manifest URL; the connector scans it for "*.export.CSV.zip" links.
GDELT_MASTER_FILE_LIST = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Default timeout (seconds) handed to every ``requests.get`` the connector makes.
GDELT_TIMEOUT_SECONDS = 60

# EventCode values the connector keeps; rows carrying any other code are
# dropped. Entries are grouped by their two-digit prefix.
# NOTE(review): the prefixes look like CAMEO root codes (09, 11, 12, 13, 17,
# 18) - confirm the intended categories against the CAMEO codebook.
GDELT_EVENT_CODES = {
    # 09x
    "091", "092",
    # 11x
    "110", "111", "112", "1121", "1122", "1123", "1125",
    "113", "114", "115", "116",
    # 12x
    "1233", "124", "1234", "1245",
    # 13x
    "130", "131", "1321", "138", "1384", "139",
    # 17x
    "170", "171", "1711", "1712", "172", "1721", "173", "174", "176",
    # 18x
    "180", "181", "182", "1821", "1822", "1823",
    "183", "1831", "1832", "1833", "1834",
    "185", "186",
}
74
+
75
+
76
@dataclass(frozen=True)
class _ManifestEntry:
    """Immutable record for one export file found in the GDELT manifest."""

    # The 14-digit timestamp captured from the export file name.
    timestamp: str
    # Full URL of the matching ``.export.CSV.zip`` file.
    url: str
80
+
81
+
82
class GdeltConnector:
    """Connector for bounded incremental GDELT Event export refreshes.

    A run reads the manifest, selects export files newer than the previous
    watermark (or the newest ``bootstrap_file_limit`` files when there is no
    watermark), downloads them, keeps rows whose ``EventCode`` is in
    ``GDELT_EVENT_CODES`` and that look media-related, and writes the result
    as one Parquet source artifact.
    """

    dataset = "gdelt"
    # Positional column names applied to the headerless, tab-separated export
    # files; a file with a different column count is rejected.
    source_columns = (
        "GlobalEventID",
        "SQLDATE",
        "MonthYear",
        "Year",
        "FractionDate",
        "Actor1Code",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor1KnownGroupCode",
        "Actor1EthnicCode",
        "Actor1Religion1Code",
        "Actor1Religion2Code",
        "Actor1Type1Code",
        "Actor1Type2Code",
        "Actor1Type3Code",
        "Actor2Code",
        "Actor2Name",
        "Actor2CountryCode",
        "Actor2KnownGroupCode",
        "Actor2EthnicCode",
        "Actor2Religion1Code",
        "Actor2Religion2Code",
        "Actor2Type1Code",
        "Actor2Type2Code",
        "Actor2Type3Code",
        "IsRootEvent",
        "EventCode",
        "EventBaseCode",
        "EventRootCode",
        "QuadClass",
        "GoldsteinScale",
        "NumMentions",
        "NumSources",
        "NumArticles",
        "AvgTone",
        "Actor1Geo_Type",
        "Actor1Geo_FullName",
        "Actor1Geo_CountryCode",
        "Actor1Geo_ADM1Code",
        "Actor1Geo_ADM2Code",
        "Actor1Geo_Lat",
        "Actor1Geo_Long",
        "Actor1Geo_FeatureID",
        "Actor2Geo_Type",
        "Actor2Geo_FullName",
        "Actor2Geo_CountryCode",
        "Actor2Geo_ADM1Code",
        "Actor2Geo_ADM2Code",
        "Actor2Geo_Lat",
        "Actor2Geo_Long",
        "Actor2Geo_FeatureID",
        "ActionGeo_Type",
        "ActionGeo_FullName",
        "ActionGeo_CountryCode",
        "ActionGeo_ADM1Code",
        "ActionGeo_ADM2Code",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "ActionGeo_FeatureID",
        "DATEADDED",
        "SOURCEURL",
    )
    # Columns that must exist in the combined frame before it is filtered.
    required_columns = {
        "GlobalEventID",
        "SQLDATE",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor1Type1Code",
        "Actor2Name",
        "Actor2CountryCode",
        "Actor2Type1Code",
        "EventCode",
        "ActionGeo_CountryCode",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "DATEADDED",
        "SOURCEURL",
    }
    # Lower-case substrings searched for in actor names/codes.
    media_terms = ("media", "journalist", "reporter", "press", "news")
    # Actor type codes that mark a row as media-related.
    media_type_codes = {"MED", "JRN"}

    def __init__(
        self,
        object_storage: ObjectStorage,
        *,
        manifest_url: str = GDELT_MASTER_FILE_LIST,
        timeout_seconds: int = GDELT_TIMEOUT_SECONDS,
        max_files_per_run: int | None = None,
        bootstrap_file_limit: int | None = None,
    ):
        """Configure the connector.

        Args:
            object_storage: Storage used to write the Parquet artifact.
            manifest_url: URL of the manifest listing the export files.
            timeout_seconds: Timeout for every HTTP request.
            max_files_per_run: Cap on files fetched per run. A falsy value
                (``None`` or ``0``) falls back to the
                ``GDELT_MAX_FILES_PER_RUN`` environment variable; when that
                is unset too, no cap is applied.
            bootstrap_file_limit: Number of newest files taken when there is
                no watermark. ``None`` falls back to
                ``GDELT_BOOTSTRAP_FILE_LIMIT`` (default 1).

        Raises:
            ValueError: If one of the environment variables is set to
                something other than a positive integer.
        """
        self.object_storage = object_storage
        self.manifest_url = manifest_url
        self.timeout_seconds = timeout_seconds
        self.max_files_per_run = max_files_per_run or _positive_int_env(
            "GDELT_MAX_FILES_PER_RUN"
        )
        self.bootstrap_file_limit = (
            bootstrap_file_limit
            if bootstrap_file_limit is not None
            else _positive_int_env("GDELT_BOOTSTRAP_FILE_LIMIT", default=1)
        )

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Download new export files and write them as one source artifact.

        Args:
            run_id: Accepted for interface compatibility; not used here.
            artifact_id: Path component of the written artifact.
            since_watermark: Timestamp of the last processed export file, or
                ``None``/empty for a bootstrap run.

        Returns:
            The ``SourceArtifactSet`` describing the written Parquet file and
            the watermark to persist.

        Raises:
            NoNewSourceArtifact: Nothing newer is listed, none of the listed
                files is downloadable yet, or no relevant rows remain.
            requests.exceptions.HTTPError: Any HTTP failure other than a 404
                on an export file.
            SourceValidationError: A downloaded file is malformed.
        """
        selected = self._select_entries(
            self._manifest_entries(), since_watermark=since_watermark
        )
        if not selected:
            raise NoNewSourceArtifact("GDELT manifest has no newer event export files")

        frames: list[pd.DataFrame] = []
        earliest_skipped_timestamp: str | None = None
        for entry in selected:
            try:
                frames.append(self._read_remote_zip_csv(entry.url))
            except requests.exceptions.HTTPError as exc:
                is_not_found = (
                    exc.response is not None and exc.response.status_code == 404
                )
                if not is_not_found:
                    raise
                # The manifest can list a file before it is downloadable;
                # remember the first such gap and keep going.
                logger.warning(
                    "GDELT export file not yet available, skipping: %s",
                    entry.url,
                )
                if earliest_skipped_timestamp is None:
                    earliest_skipped_timestamp = entry.timestamp

        if not frames:
            raise NoNewSourceArtifact(
                "GDELT export files listed in manifest are not yet available",
                watermark_after=since_watermark,
            )

        # Never skip data because of a transient 404: when a file was missing,
        # advance the watermark only to the last selected entry BEFORE the
        # first gap. Files at and after the gap are retried on the next run;
        # per the original author's note, reprocessing is safe because the
        # processor deduplicates by event_id.
        if earliest_skipped_timestamp is None:
            watermark_after = selected[-1].timestamp
        else:
            watermark_after = since_watermark or ""
            for entry in selected:
                if entry.timestamp >= earliest_skipped_timestamp:
                    break
                watermark_after = entry.timestamp

        return self._write_source_artifact(
            frames,
            artifact_id=artifact_id,
            watermark_after=watermark_after,
            empty_message="GDELT export files contained no Barometer-relevant Event rows",
        )

    def _manifest_entries(self) -> list[_ManifestEntry]:
        """Return the manifest's export entries, de-duplicated by URL and
        sorted by ascending timestamp.

        Raises:
            requests.exceptions.HTTPError: The manifest request failed.
        """
        response = requests.get(self.manifest_url, timeout=self.timeout_seconds)
        response.raise_for_status()
        # Hoisted out of the loop: the manifest can be very long.
        export_pattern = re.compile(r"(https?://\S+/(\d{14})\.export\.CSV\.zip)")
        by_url: dict[str, _ManifestEntry] = {}
        for line in response.text.splitlines():
            match = export_pattern.search(line)
            if match:
                # A repeated URL keeps its first position but the last value.
                by_url[match.group(1)] = _ManifestEntry(
                    timestamp=match.group(2), url=match.group(1)
                )
        return sorted(by_url.values(), key=lambda entry: entry.timestamp)

    def _select_entries(
        self, entries: list[_ManifestEntry], *, since_watermark: str | None
    ) -> list[_ManifestEntry]:
        """Pick the entries to download this run, oldest first.

        With a watermark, every entry strictly newer than it is a candidate.
        Without one, only the last ``bootstrap_file_limit`` entries are.
        ``max_files_per_run`` then truncates the candidates from the old end.
        """
        if since_watermark:
            candidates = [
                entry for entry in entries if entry.timestamp > since_watermark
            ]
        else:
            # ``entries[-limit:]`` would return the WHOLE manifest for a limit
            # of 0 (because -0 == 0); compute the start index explicitly so a
            # non-positive limit selects nothing.
            limit = max(self.bootstrap_file_limit, 0)
            candidates = entries[max(len(entries) - limit, 0):]
        candidates = sorted(candidates, key=lambda entry: entry.timestamp)
        if self.max_files_per_run is not None:
            candidates = candidates[: self.max_files_per_run]
        return candidates

    def _write_source_artifact(
        self,
        frames: list[pd.DataFrame],
        *,
        artifact_id: str,
        watermark_after: str,
        empty_message: str,
    ) -> SourceArtifactSet:
        """Combine, validate, filter and persist the downloaded frames.

        Raises:
            SourceValidationError: A required column is missing.
            NoNewSourceArtifact: No relevant rows remain; raised with
                ``empty_message`` and still carrying ``watermark_after``.
        """
        source = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        self._validate_source_columns(source)
        source = self._filter_relevant_rows(source)
        if source.empty:
            raise NoNewSourceArtifact(empty_message, watermark_after=watermark_after)

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/gdelt.parquet"
        self.object_storage.write_dataframe(logical_path, source, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
            watermark_after=watermark_after,
        )

    def _read_remote_zip_csv(self, url: str) -> pd.DataFrame:
        """Download one export zip and parse its CSV member.

        Raises:
            requests.exceptions.HTTPError: The download returned an error
                status (``fetch`` treats 404 as "not yet available").
        """
        response = requests.get(url, timeout=self.timeout_seconds)
        response.raise_for_status()
        return self._read_zip_csv_bytes(response.content, source_label=url)

    def _read_zip_csv_bytes(self, payload: bytes, *, source_label: str) -> pd.DataFrame:
        """Parse the first CSV member of an in-memory zip archive.

        Raises:
            SourceValidationError: The payload is not a valid zip, has no
                member whose name ends in "CSV", or the member has the wrong
                column count.
        """
        try:
            with zipfile.ZipFile(io.BytesIO(payload)) as archive:
                members = [
                    name for name in archive.namelist() if name.upper().endswith("CSV")
                ]
                if not members:
                    raise SourceValidationError(
                        f"GDELT zip contains no CSV member: {source_label}",
                        metadata={
                            "dataset": self.dataset,
                            "source_url": source_label,
                            "expected_shape": "zip_csv_member",
                        },
                    )
                with archive.open(members[0]) as handle:
                    frame = self._read_gdelt_csv(handle, source_label=source_label)
        except zipfile.BadZipFile as exc:
            raise SourceValidationError(
                f"GDELT source is not a valid zip: {source_label}",
                metadata={
                    "dataset": self.dataset,
                    "source_url": source_label,
                    "expected_shape": "zip_file",
                },
            ) from exc
        return frame

    def _read_gdelt_csv(self, source, *, source_label: str) -> pd.DataFrame:
        """Read one headerless tab-separated export into an all-string frame.

        An empty file yields an empty frame that already carries
        ``source_columns``, so one empty export does not abort the run.

        Raises:
            SourceValidationError: The column count differs from
                ``source_columns``.
        """
        import csv  # local import: only this method needs it

        try:
            frame = pd.read_csv(
                source,
                sep="\t",
                header=None,
                dtype=str,
                keep_default_na=False,
                # The feed is plain tab-delimited text with no quoting
                # convention; with default quoting a field that merely starts
                # with '"' would swallow following tabs and newlines.
                quoting=csv.QUOTE_NONE,
            )
        except pd.errors.EmptyDataError:
            return pd.DataFrame(columns=list(self.source_columns), dtype=str)
        if frame.shape[1] != len(self.source_columns):
            raise SourceValidationError(
                f"GDELT source has {frame.shape[1]} columns; expected {len(self.source_columns)}",
                metadata={
                    "dataset": self.dataset,
                    "source_url": source_label,
                    "actual_columns": frame.shape[1],
                    "expected_columns": len(self.source_columns),
                },
            )
        frame.columns = self.source_columns
        return frame

    def _validate_source_columns(self, source: pd.DataFrame) -> None:
        """Raise ``SourceValidationError`` when a required column is absent."""
        missing = self.required_columns - set(source.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"GDELT source missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )

    def _filter_relevant_rows(self, source: pd.DataFrame) -> pd.DataFrame:
        """Keep rows with a tracked ``EventCode`` that also look media-related.

        The returned frame has a fresh 0..n-1 index unless the event-code
        filter alone already left it empty.
        """
        filtered = source[
            source["EventCode"].astype(str).isin(GDELT_EVENT_CODES)
        ].copy()
        if filtered.empty:
            return filtered
        return filtered.loc[self._media_relevance_mask(filtered)].reset_index(drop=True)

    def _media_relevance_mask(self, source: pd.DataFrame) -> pd.Series:
        """Return a boolean mask of rows that mention media actors.

        A row matches when any actor type column equals one of
        ``media_type_codes`` (case-insensitive), or any actor name/code
        column contains one of ``media_terms`` as a substring.
        """
        mask = pd.Series(False, index=source.index)
        for column in (
            "Actor1Type1Code",
            "Actor1Type2Code",
            "Actor1Type3Code",
            "Actor2Type1Code",
            "Actor2Type2Code",
            "Actor2Type3Code",
        ):
            if column in source:
                mask = mask | source[column].astype(str).str.upper().isin(
                    self.media_type_codes
                )
        for column in ("Actor1Name", "Actor2Name", "Actor1Code", "Actor2Code"):
            if column in source:
                lowered = source[column].astype(str).str.lower()
                for term in self.media_terms:
                    # Plain substring match, so e.g. "press" also hits "express".
                    mask = mask | lowered.str.contains(term, regex=False)
        return mask
391
+
392
+
393
def _positive_int_env(name: str, default: int | None = None) -> int | None:
    """Read environment variable *name* as a strictly positive integer.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or empty.

    Returns:
        The parsed integer, or ``default`` when the variable is unset/empty.

    Raises:
        ValueError: The variable is set but is not an integer, or is zero or
            negative. The message always names the offending variable.
    """
    raw = os.environ.get(name)
    if raw is None or raw == "":
        return default
    try:
        value = int(raw)
    except ValueError as exc:
        # Re-raise with the variable name; a bare int() error only shows the
        # bad literal and gives no hint which setting is wrong.
        raise ValueError(f"{name} must be a positive integer") from exc
    if value <= 0:
        raise ValueError(f"{name} must be a positive integer")
    return value
401
+
402
+
403
+ __all__ = ["GDELT_EVENT_CODES", "GDELT_MASTER_FILE_LIST", "GdeltConnector"]
@@ -0,0 +1,171 @@
1
+ """MFRR connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ import requests
8
+ import pandas as pd
9
+
10
+ from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
11
+ from fpu_barometer_admin.storage.objects import ObjectStorage
12
+
13
+
14
# Multi-search endpoint the connector POSTs its single query to.
MFRR_API_ENDPOINT = "https://www.mapmf.org/meili/multi-search"
# NOTE(review): bearer token is hard-coded in source and sent with every
# request - confirm it is a public, read-only search key.
MFRR_API_TOKEN = "c129ca42527c52965c80099ab1a869f40de8ec3b698d1e361b0cf7402c6d48a1"
MFRR_PAGE_SIZE = 1000
# Timeout (seconds) for the single API request.
MFRR_TIMEOUT_SECONDS = 60
# Sent as the query "limit" together with offset 0.
MFRR_LIMIT = 999999999999

# Fields requested for every hit, in request order.
MFRR_ATTRIBUTES_TO_RETRIEVE = [
    # identity and text
    "id", "title", "content",
    # place and time
    "country", "date", "year",
    "published_at", "published_at_date", "has_date_of_incidence",
    # incident classification
    "type_of_incident", "top_type_of_incident", "type_of_incident_leaves",
    "attacked_count",
    # links
    "coe_link", "news_source_links", "internet_source_links",
    # people and context
    "gender", "type_of_journalist_or_media_actor",
    "source_of_incident", "context_of_incident",
    "subjects", "who_was_attacked",
    # regions and bookkeeping
    "region_names", "region_ids",
    "consolidated_count", "specific_topic", "project",
    # geo (several spellings are requested)
    "_geo_lat", "_geo_lng", "_geo.lat", "_geo.lng", "_geo",
]
54
+
55
+
56
class MFRRConnector:
    """Connector for the MFRR API Event source.

    Sends one multi-search query against the ``alerts`` index, validates the
    returned hits and stores them as a single Parquet source artifact.
    """

    dataset = "mfrr"
    # Columns that must be present in the hits before anything is written.
    required_columns = {
        "id",
        "country",
        "date",
        "year",
        "published_at",
        "published_at_date",
        "type_of_incident",
        "top_type_of_incident",
        "type_of_incident_leaves",
        "attacked_count",
    }

    def __init__(self, object_storage: ObjectStorage):
        """Remember the storage used to write the Parquet artifact."""
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Query the API and write the hits as one source artifact.

        Args:
            run_id: Accepted for interface compatibility; not used here.
            artifact_id: Path component of the written artifact.
            since_watermark: Accepted for interface compatibility; the query
                is not filtered by it.

        Returns:
            The ``SourceArtifactSet`` describing the written Parquet file.

        Raises:
            requests.exceptions.HTTPError: The API returned an error status.
            SourceValidationError: The response has an unexpected shape or a
                required column is missing.
        """
        response = requests.post(
            MFRR_API_ENDPOINT,
            headers={"Authorization": f"Bearer {MFRR_API_TOKEN}"},
            json={
                "queries": [
                    {
                        "indexUid": "alerts",
                        "q": "",
                        "sort": ["timestamp:desc"],
                        "limit": MFRR_LIMIT,
                        "offset": 0,
                        "attributesToRetrieve": MFRR_ATTRIBUTES_TO_RETRIEVE,
                    }
                ]
            },
            timeout=MFRR_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        payload = response.json()
        hits = self._hits_from_response(payload)
        df = pd.DataFrame(hits)
        missing = self.required_columns - set(df.columns)
        if missing:
            missing_columns = sorted(missing)
            raise SourceValidationError(
                f"MFRR API source missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )
        df = self._storage_normalized(df)

        logical_path = f"source_artifacts/{self.dataset}/{artifact_id}/mfrr.parquet"
        self.object_storage.write_dataframe(logical_path, df, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=logical_path,
        )

    def _storage_normalized(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize source-shaped API values so mixed JSON-ish columns survive Parquet.

        Works on a copy; only object-dtype columns are rewritten, value by
        value, through ``_source_value``.
        """
        normalized = df.copy()
        for column in normalized.select_dtypes(include="object").columns:
            normalized[column] = normalized[column].map(self._source_value)
        return normalized

    def _source_value(self, value: object) -> object:
        """Convert one cell to ``None``, a JSON string, or ``str(value)``.

        Lists, tuples and dicts become JSON text; missing scalars (``None``,
        NaN, ...) become ``None``; everything else is stringified.
        """
        if value is None:
            return None
        # Containers are serialised BEFORE the missing-value check.
        # ``pd.isna`` on a list returns an array, so a one-element list such
        # as ``[None]`` used to evaluate truthy and was collapsed to ``None``
        # instead of being kept as "[null]".
        if isinstance(value, (list, tuple, dict)):
            return json.dumps(value, ensure_ascii=False)
        try:
            if pd.isna(value):
                return None
        except (TypeError, ValueError):
            # Not a scalar pandas can classify; fall through to str().
            pass
        return str(value)

    def _hits_from_response(self, payload: object) -> list[dict]:
        """Extract ``results[0]["hits"]`` from the decoded API response.

        Raises:
            SourceValidationError: The payload is not an object, ``results``
                is not a non-empty list, the first result lacks ``hits``, or
                ``hits`` is not a list.
        """
        if not isinstance(payload, dict):
            raise SourceValidationError(
                "MFRR API response must be a JSON object",
                metadata={"dataset": self.dataset, "expected_shape": "json_object"},
            )
        results = payload.get("results")
        if not isinstance(results, list) or not results:
            raise SourceValidationError(
                "MFRR API response results must be a non-empty list",
                metadata={
                    "dataset": self.dataset,
                    "expected_shape": "non_empty_results_list",
                },
            )
        first = results[0]
        if not isinstance(first, dict) or "hits" not in first:
            raise SourceValidationError(
                "MFRR API first result must contain hits",
                metadata={
                    "dataset": self.dataset,
                    "expected_shape": "first_result_with_hits",
                },
            )
        hits = first["hits"]
        if not isinstance(hits, list):
            raise SourceValidationError(
                "MFRR API hits must be a list",
                metadata={"dataset": self.dataset, "expected_shape": "hits_list"},
            )
        return hits
@@ -0,0 +1,84 @@
1
+ """RR (Reporters Respond) connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+
10
+ from fpu_barometer_admin.connectors.base_connector import SourceArtifactSet, SourceValidationError
11
+ from fpu_barometer_admin.connectors.static_sources import static_source_for_dataset
12
+ from fpu_barometer_admin.storage.objects import ObjectStorage
13
+
14
+
15
class RrConnector:
    """Connector for the static RR (Reporters Respond) source file."""

    dataset = "rr"
    # Header names the static CSV must contain before it is accepted.
    required_columns = {
        "Submission Date",
        "Nationality:",
        "Current location:",
        "Region",
        "Year",
        "Decision",
        "Considered under",
        "Prevention or Protection",
        "Type of Assistance",
        "Legal Threat",
        "EUR Amount",
        "Number of Journalists Supported",
        "I am applying to Reporters Respond as:",
        "How do you identify as?",
        "Submission ID",
    }

    def __init__(self, object_storage: ObjectStorage):
        """Keep the storage used for both reading the CSV and writing Parquet."""
        self.object_storage = object_storage

    def fetch(
        self,
        *,
        run_id: str,
        artifact_id: str,
        since_watermark: str | None = None,
    ) -> SourceArtifactSet:
        """Load the deployed RR CSV from logical storage and re-publish it.

        ``since_watermark`` is accepted but not consulted; the whole static
        file is read on every call.
        """
        static_file = static_source_for_dataset(self.dataset)
        raw_csv = self.object_storage.read_bytes(static_file.logical_path)
        frame = pd.read_csv(BytesIO(raw_csv), low_memory=False)
        # The artifact file is named after the static file's stem.
        stem = Path(static_file.logical_path).stem
        return self._write_source_artifact(
            frame,
            source_name=stem,
            run_id=run_id,
            artifact_id=artifact_id,
        )

    def _write_source_artifact(
        self,
        df: pd.DataFrame,
        *,
        source_name: str,
        run_id: str,
        artifact_id: str,
    ) -> SourceArtifactSet:
        """Validate the frame's columns and write it as a Parquet artifact.

        Raises:
            SourceValidationError: One or more required columns are absent.
        """
        missing_columns = sorted(self.required_columns - set(df.columns))
        if missing_columns:
            raise SourceValidationError(
                f"RR static source missing required columns: {missing_columns}",
                metadata={"dataset": self.dataset, "missing_columns": missing_columns},
            )

        target = f"source_artifacts/{self.dataset}/{artifact_id}/{source_name}.parquet"
        self.object_storage.write_dataframe(target, df, fail_if_exists=True)
        return SourceArtifactSet(
            dataset=self.dataset,
            artifact_id=artifact_id,
            logical_path=target,
        )
@@ -0,0 +1,41 @@
1
+ """Static source file declarations for dataset Connectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
@dataclass(frozen=True)
class StaticSourceFile:
    """Immutable description of a static source file shipped at deploy time."""

    # Dataset key used to look the file up.
    dataset: str
    # Location of the file on the local filesystem.
    local_path: Path
    # Location of the file inside logical storage.
    logical_path: str
16
+
17
+
18
# Every static source file known to the connectors, one entry per dataset.
STATIC_SOURCE_FILES: tuple[StaticSourceFile, ...] = (
    StaticSourceFile(
        dataset="acled",
        local_path=Path("data", "static", "acled.csv"),
        logical_path="source_artifacts/acled/acled.csv",
    ),
    StaticSourceFile(
        dataset="rr",
        local_path=Path("data", "static", "rr.csv"),
        logical_path="source_artifacts/rr/rr.csv",
    ),
)
30
+
31
+
32
def static_source_for_dataset(dataset: str) -> StaticSourceFile:
    """Look up the static source file registered for *dataset*.

    Returns the first entry of ``STATIC_SOURCE_FILES`` whose ``dataset``
    matches.

    Raises:
        ValueError: No entry is registered for *dataset*.
    """
    found = next(
        (candidate for candidate in STATIC_SOURCE_FILES if candidate.dataset == dataset),
        None,
    )
    if found is None:
        raise ValueError(f"Dataset {dataset!r} has no configured static source file")
    return found
39
+
40
+
41
+ __all__ = ["STATIC_SOURCE_FILES", "StaticSourceFile", "static_source_for_dataset"]