groundsource 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundsource/__init__.py +6 -0
- groundsource/cache.py +63 -0
- groundsource/charts.py +206 -0
- groundsource/data/__init__.py +1 -0
- groundsource/data/ne_110m_admin_0_countries.cpg +1 -0
- groundsource/data/ne_110m_admin_0_countries.dbf +0 -0
- groundsource/data/ne_110m_admin_0_countries.prj +1 -0
- groundsource/data/ne_110m_admin_0_countries.shp +0 -0
- groundsource/data/ne_110m_admin_0_countries.shx +0 -0
- groundsource/db.py +322 -0
- groundsource/spatial.py +131 -0
- groundsource/trends.py +113 -0
- groundsource-0.1.0.dist-info/METADATA +150 -0
- groundsource-0.1.0.dist-info/RECORD +17 -0
- groundsource-0.1.0.dist-info/WHEEL +5 -0
- groundsource-0.1.0.dist-info/licenses/LICENSE +21 -0
- groundsource-0.1.0.dist-info/top_level.txt +1 -0
groundsource/__init__.py
ADDED
groundsource/cache.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Download and cache management for the Groundsource dataset."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import urllib.request
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Zenodo record hosting the 2026 snapshot of the dataset (direct-download URL).
ZENODO_URL = "https://zenodo.org/records/18647054/files/groundsource_2026.parquet?download=1"
# Local filename for the raw Parquet exactly as published on Zenodo.
PARQUET_FILENAME = "groundsource_2026.parquet"
# Local filename for the spatially-enriched Parquet built on first load.
ENRICHED_FILENAME = "groundsource_enriched.parquet"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_cache_dir() -> Path:
    """Return the platform-appropriate cache directory, creating it if needed.

    Windows uses %LOCALAPPDATA%, macOS uses ~/Library/Caches, and everything
    else follows the XDG convention ($XDG_CACHE_HOME, defaulting to ~/.cache).
    """
    if sys.platform == "win32":
        fallback = Path.home() / "AppData" / "Local"
        root = Path(os.environ.get("LOCALAPPDATA", fallback))
    elif sys.platform == "darwin":
        root = Path.home() / "Library" / "Caches"
    else:
        root = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache"))

    target = root / "groundsource"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_raw_parquet_path() -> Path:
    """Full path where the raw Zenodo Parquet is (or will be) cached."""
    return get_cache_dir() / PARQUET_FILENAME
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_enriched_parquet_path() -> Path:
    """Full path where the locally-enriched Parquet is (or will be) cached."""
    return get_cache_dir() / ENRICHED_FILENAME
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def download_parquet(force: bool = False) -> Path:
    """Download the raw Parquet from Zenodo if not already cached.

    Args:
        force: Re-download even when a cached copy already exists.

    Returns:
        Path to the complete cached Parquet file.

    Raises:
        urllib.error.URLError: If the transfer fails. The download is written
            to a temporary ``.part`` file and only renamed into place on
            success, so a failed or interrupted transfer never leaves a
            partial file that a later call would mistake for a valid cache.
    """
    path = get_raw_parquet_path()
    if path.exists() and not force:
        return path

    print(f"Downloading Groundsource dataset ({PARQUET_FILENAME})...")
    print(f"Source: {ZENODO_URL}")
    print(f"Destination: {path}")

    def _progress(block_num, block_size, total_size):
        # reporthook for urlretrieve: invoked after each chunk is received.
        downloaded = block_num * block_size
        if total_size > 0:
            pct = min(100, downloaded * 100 / total_size)
            mb = downloaded / (1024 * 1024)
            total_mb = total_size / (1024 * 1024)
            sys.stdout.write(f"\r {mb:.0f}/{total_mb:.0f} MB ({pct:.1f}%)")
            sys.stdout.flush()

    # Download to a sibling temp file first; promote atomically on success.
    tmp_path = path.with_suffix(path.suffix + ".part")
    try:
        urllib.request.urlretrieve(ZENODO_URL, str(tmp_path), reporthook=_progress)
    except BaseException:
        # Remove the partial transfer (covers KeyboardInterrupt too).
        tmp_path.unlink(missing_ok=True)
        raise
    tmp_path.replace(path)
    print("\n Download complete.")
    return path
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_from_local(path: str) -> Path:
    """Use a local Parquet file instead of downloading."""
    candidate = Path(path)
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Local parquet not found: {path}")
|
groundsource/charts.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Chart generators for Groundsource analysis. LinkedIn-worthy matplotlib charts."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
import matplotlib.ticker as ticker
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Consistent style for all charts.
# These are matplotlib rcParams; _apply_style() installs them via
# plt.rcParams.update(STYLE) before each figure is drawn.
STYLE = {
    "figure.facecolor": "white",
    "axes.facecolor": "#f8f9fa",   # light grey plot background
    "axes.grid": True,
    "grid.alpha": 0.3,             # subtle gridlines
    "font.family": "sans-serif",
    "font.size": 11,
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _apply_style() -> None:
    """Install the shared STYLE rcParams (mutates global matplotlib state)."""
    plt.rcParams.update(STYLE)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def plot_hockey_stick(yearly_counts: pd.DataFrame, save_path: str = None) -> plt.Figure:
    """Chart 1: Total events per year — the 807x hockey stick.

    yearly_counts: DataFrame with columns [year, count]
    save_path: if given, the figure is also written to disk at 150 dpi.
    Returns the matplotlib Figure.
    """
    _apply_style()
    fig, ax = plt.subplots(figsize=(12, 6))

    ax.bar(yearly_counts["year"], yearly_counts["count"],
           color="#1a73e8", alpha=0.85, width=0.8)

    # Annotate the growth.
    # NOTE(review): iloc[0] assumes yearly_counts is sorted ascending by
    # year so row 0 is the earliest year — confirm at the call site.
    first_year = yearly_counts.iloc[0]
    peak_year = yearly_counts.loc[yearly_counts["count"].idxmax()]
    # Guard against division by zero when the first year has no events.
    growth = peak_year["count"] / first_year["count"] if first_year["count"] > 0 else 0

    ax.annotate(
        f'{first_year["count"]:,.0f} events',
        xy=(first_year["year"], first_year["count"]),
        xytext=(first_year["year"] + 3, peak_year["count"] * 0.3),
        fontsize=10, color="#666",
        arrowprops=dict(arrowstyle="->", color="#999"),
    )
    ax.annotate(
        f'{peak_year["count"]:,.0f} events\n({growth:,.0f}x growth)',
        xy=(peak_year["year"], peak_year["count"]),
        # Label placed to the left of and slightly above the peak bar.
        xytext=(peak_year["year"] - 6, peak_year["count"] * 1.05),
        fontsize=10, fontweight="bold", color="#d93025",
        arrowprops=dict(arrowstyle="->", color="#d93025"),
    )

    ax.set_title("Flash Flood Events Detected Per Year\n— or is it news coverage?",
                 fontsize=16, fontweight="bold", pad=15)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Events Detected", fontsize=12)
    # Thousands separators on the y axis.
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:,.0f}"))

    ax.text(0.02, 0.95,
            "Source: Google Groundsource dataset (2.6M events from news articles, 2000–2026)",
            transform=ax.transAxes, fontsize=8, color="#999", va="top")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def plot_bias_normalized(yearly_counts: pd.DataFrame, save_path: str = None) -> plt.Figure:
    """Chart 2: Overlay event growth vs estimated digital news growth.

    Uses a simple exponential proxy for global digital news volume.
    Internet users: ~400M (2000) → ~5.5B (2025) ≈ 13.75x
    Online news output grew even faster due to digital-native outlets.
    We use a conservative 15-20x estimate for indexed news articles.

    yearly_counts: DataFrame with columns [year, count]
    save_path: if given, the figure is also written to disk at 150 dpi.
    Returns the matplotlib Figure.
    """
    _apply_style()
    fig, ax = plt.subplots(figsize=(12, 6))

    years = yearly_counts["year"].values
    counts = yearly_counts["count"].values

    # Normalize both to year 2007 (when dataset has enough events to be meaningful)
    ref_idx = np.where(years == 2007)[0]
    if len(ref_idx) == 0:
        # Fallback when 2007 is absent. Clamp to the last row so a series
        # shorter than 8 rows cannot raise IndexError (the old hard-coded
        # index 7 assumed data starting at 2000 with >= 8 years present).
        ref_idx = [min(7, len(years) - 1)]
    ref_idx = ref_idx[0]

    norm_events = counts / counts[ref_idx]

    # Conservative proxy: internet users grew ~14x from 2000 to 2025
    # Online news articles grew faster. Use a simple logistic-like growth curve.
    internet_users_billions = {
        2000: 0.41, 2001: 0.50, 2002: 0.63, 2003: 0.72, 2004: 0.82,
        2005: 1.02, 2006: 1.15, 2007: 1.37, 2008: 1.57, 2009: 1.77,
        2010: 2.02, 2011: 2.23, 2012: 2.49, 2013: 2.73, 2014: 2.96,
        2015: 3.19, 2016: 3.42, 2017: 3.65, 2018: 3.90, 2019: 4.13,
        2020: 4.59, 2021: 4.90, 2022: 5.16, 2023: 5.35, 2024: 5.52,
        2025: 5.56, 2026: 5.60,
    }
    # Years outside the table default to the 2026-era plateau value.
    news_proxy = np.array([internet_users_billions.get(y, 5.6) for y in years])
    norm_news = news_proxy / news_proxy[ref_idx]

    ax.plot(years, norm_events, "o-", color="#d93025", linewidth=2.5,
            markersize=5, label="Detected flood events", zorder=3)
    ax.plot(years, norm_news, "s--", color="#1a73e8", linewidth=2,
            markersize=4, label="Internet users (proxy for digital news)", zorder=2)

    # Shade the gap only where events outpace the news proxy.
    ax.fill_between(years, norm_events, norm_news,
                    where=(norm_events > norm_news),
                    alpha=0.1, color="#d93025")

    ax.set_title("Are Floods Increasing — or Is News Coverage?\nBoth curves normalized to 2007 = 1.0",
                 fontsize=15, fontweight="bold", pad=15)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Growth relative to 2007", fontsize=12)
    ax.legend(fontsize=11, loc="upper left")
    # Log scale so both multi-order-of-magnitude curves stay readable.
    ax.set_yscale("log")
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:.0f}x"))

    ax.text(0.02, 0.02,
            "Event detection grows faster than internet adoption — likely because\n"
            "news digitization (articles going online) grew faster than raw user count.",
            transform=ax.transAxes, fontsize=9, color="#666", va="bottom",
            style="italic")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def plot_country_growth(growth_df: pd.DataFrame, n: int = 20,
                        save_path: str = None) -> plt.Figure:
    """Chart 3: Countries with highest growth factor — what might be real.

    growth_df: from trends.country_growth_ranking()
    n: number of countries to show.
    save_path: if given, the figure is also written to disk at 150 dpi.
    Returns the matplotlib Figure.
    """
    _apply_style()
    top = growth_df.head(n).copy()
    top = top.sort_values("growth_factor", ascending=True)  # horizontal bar, ascending

    fig, ax = plt.subplots(figsize=(10, 8))

    # Color-code by severity of the growth factor: red > 50x, orange > 20x,
    # blue otherwise.
    colors = ["#d93025" if g > 50 else "#ea8600" if g > 20 else "#1a73e8"
              for g in top["growth_factor"]]

    ax.barh(top["country"], top["growth_factor"], color=colors, alpha=0.85)

    # Numeric label just past the end of each bar (fixed 0.5 data-unit offset).
    for i, (_, row) in enumerate(top.iterrows()):
        ax.text(row["growth_factor"] + 0.5, i,
                f'{row["growth_factor"]:.0f}x',
                va="center", fontsize=9, fontweight="bold")

    ax.set_title("Which Countries Show the Fastest Growth in Detected Flood Events?\n"
                 "Growth factor: avg events/year (2018–2025) vs (2005–2012)",
                 fontsize=14, fontweight="bold", pad=15)
    ax.set_xlabel("Growth Factor (higher = faster acceleration)", fontsize=11)

    # Methodological caveat rendered directly on the chart.
    ax.text(0.98, 0.02,
            "⚠ High growth may reflect news digitization, not actual flood increase.\n"
            "Countries with low baseline coverage will show inflated growth.",
            transform=ax.transAxes, fontsize=8, color="#999",
            ha="right", va="bottom", style="italic")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def plot_top_countries(country_counts: pd.DataFrame, n: int = 20,
                       save_path: str = None) -> plt.Figure:
    """Chart 4: Top N countries by total event count — the LinkedIn chart."""
    _apply_style()

    # Take the leading n rows, then flip to ascending so the largest bar
    # ends up at the top of the horizontal chart.
    ranked = country_counts.head(n).copy()
    ranked = ranked.sort_values("count", ascending=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(ranked["country"], ranked["count"], color="#1a73e8", alpha=0.85)

    # Put the exact count just past the end of each bar.
    for pos, (_, record) in enumerate(ranked.iterrows()):
        ax.text(record["count"] + 500, pos,
                f'{record["count"]:,.0f}',
                va="center", fontsize=9)

    ax.set_title("Top 20 Countries by Total Flash Flood Events Detected (2000–2026)",
                 fontsize=14, fontweight="bold", pad=15)
    ax.set_xlabel("Total Events Detected", fontsize=11)
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:,.0f}"))

    ax.text(0.98, 0.02,
            "Source: Google Groundsource — 2.6M events extracted by Gemini from news articles",
            transform=ax.transAxes, fontsize=8, color="#999", ha="right", va="bottom")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Data directory — contains Natural Earth shapefiles bundled with the package.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
UTF-8
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.017453292519943295]]
|
|
Binary file
|
|
Binary file
|
groundsource/db.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""FloodDB — Main interface to the Groundsource flash flood dataset."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from groundsource.cache import (
|
|
7
|
+
download_parquet,
|
|
8
|
+
get_enriched_parquet_path,
|
|
9
|
+
load_from_local,
|
|
10
|
+
)
|
|
11
|
+
from groundsource.spatial import enrich_dataframe, search_by_radius, search_by_bbox
|
|
12
|
+
from groundsource import trends as _trends
|
|
13
|
+
from groundsource import charts as _charts
|
|
14
|
+
|
|
15
|
+
# Common country name aliases -> Natural Earth canonical name.
# Keys must be lowercase; lookups in _resolve_country() lowercase the input
# before checking this table.
# NOTE(review): "korea" resolves to South Korea and "congo" to the DRC —
# both are ambiguous inputs; confirm this is the intended default.
_COUNTRY_ALIASES = {
    "usa": "United States of America",
    "us": "United States of America",
    "united states": "United States of America",
    "america": "United States of America",
    "uk": "United Kingdom",
    "britain": "United Kingdom",
    "great britain": "United Kingdom",
    "england": "United Kingdom",
    "uae": "United Arab Emirates",
    "south korea": "South Korea",
    "korea": "South Korea",
    "north korea": "North Korea",
    "dr congo": "Dem. Rep. Congo",
    "democratic republic of congo": "Dem. Rep. Congo",
    "drc": "Dem. Rep. Congo",
    "congo": "Dem. Rep. Congo",
    "ivory coast": "Ivory Coast",
    "cote d'ivoire": "Ivory Coast",
    "czech republic": "Czechia",
    "czechia": "Czechia",
    "bosnia": "Bosnia and Herz.",
    "dominican republic": "Dominican Rep.",
    "central african republic": "Central African Rep.",
}
|
|
41
|
+
|
|
42
|
+
# Simple built-in city geocoding (major cities only, no external API needed)
# Format: city_name -> (lon, lat)
# Keys are lowercase; FloodDB.search() lowercases city input before lookup.
# Coordinates are approximate city centers to two decimals (~1 km precision),
# which is adequate for the default 100 km radius search.
_CITY_COORDS = {
    "houston": (-95.37, 29.76), "mumbai": (72.88, 19.08),
    "jakarta": (106.85, -6.21), "lagos": (3.39, 6.52),
    "dhaka": (90.41, 23.81), "bangkok": (100.50, 13.76),
    "manila": (120.98, 14.60), "kolkata": (88.36, 22.57),
    "chennai": (80.27, 13.08), "delhi": (77.21, 28.61),
    "new york": (-74.01, 40.71), "los angeles": (-118.24, 34.05),
    "chicago": (-87.63, 41.88), "miami": (-80.19, 25.76),
    "london": (-0.13, 51.51), "paris": (2.35, 48.86),
    "berlin": (13.40, 52.52), "rome": (12.50, 41.90),
    "madrid": (-3.70, 40.42), "tokyo": (139.69, 35.69),
    "beijing": (116.41, 39.90), "shanghai": (121.47, 31.23),
    "sydney": (151.21, -33.87), "melbourne": (144.96, -37.81),
    "sao paulo": (-46.63, -23.55), "rio de janeiro": (-43.17, -22.91),
    "buenos aires": (-58.38, -34.60), "mexico city": (-99.13, 19.43),
    "cairo": (31.24, 30.04), "nairobi": (36.82, -1.29),
    "johannesburg": (28.05, -26.20), "cape town": (18.42, -33.93),
    "istanbul": (28.98, 41.01), "moscow": (37.62, 55.76),
    "dubai": (55.30, 25.20), "singapore": (103.85, 1.29),
    "kuala lumpur": (101.69, 3.14), "hanoi": (105.85, 21.03),
    "ho chi minh city": (106.63, 10.82), "seoul": (126.98, 37.57),
    "osaka": (135.50, 34.69), "lima": (-77.04, -12.05),
    "bogota": (-74.07, 4.71), "santiago": (-70.65, -33.45),
    "accra": (-0.19, 5.56), "kinshasa": (15.31, -4.32),
    "addis ababa": (38.75, 9.02), "dar es salaam": (39.27, -6.79),
    "karachi": (67.01, 24.86), "lahore": (74.35, 31.56),
    "islamabad": (73.05, 33.69), "kabul": (69.17, 34.53),
    "kathmandu": (85.32, 27.72), "colombo": (79.86, 6.93),
    "yangon": (96.15, 16.87), "phnom penh": (104.92, 11.56),
    "taipei": (121.57, 25.03), "hong kong": (114.17, 22.32),
    "amsterdam": (4.90, 52.37), "brussels": (4.35, 50.85),
    "vienna": (16.37, 48.21), "zurich": (8.54, 47.38),
    "munich": (11.58, 48.14), "milan": (9.19, 45.46),
    "barcelona": (2.17, 41.39), "lisbon": (-9.14, 38.74),
    "athens": (23.73, 37.98), "bucharest": (26.10, 44.43),
    "warsaw": (21.01, 52.23), "prague": (14.42, 50.08),
    "budapest": (19.04, 47.50), "stockholm": (18.07, 59.33),
    "oslo": (10.75, 59.91), "copenhagen": (12.57, 55.68),
    "helsinki": (24.94, 60.17), "dublin": (-6.26, 53.35),
    "toronto": (-79.38, 43.65), "vancouver": (-123.12, 49.28),
    "montreal": (-73.57, 45.50), "abuja": (7.49, 9.06),
    "new orleans": (-90.07, 29.95), "denver": (-104.99, 39.74),
    "atlanta": (-84.39, 33.75), "dallas": (-96.80, 32.78),
    "seattle": (-122.33, 47.61), "san francisco": (-122.42, 37.77),
    "phoenix": (-112.07, 33.45), "las vegas": (-115.14, 36.17),
    "washington": (-77.04, 38.91), "boston": (-71.06, 42.36),
    "philadelphia": (-75.17, 39.95), "detroit": (-83.05, 42.33),
    "minneapolis": (-93.27, 44.98), "portland": (-122.68, 45.52),
    "pittsburgh": (-79.99, 40.44), "st louis": (-90.20, 38.63),
    "nashville": (-86.78, 36.16), "charlotte": (-80.84, 35.23),
}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _resolve_country(name: str, df: pd.DataFrame) -> pd.Series:
    """Resolve a country name to a boolean mask, handling aliases and fuzzy matching.

    Resolution order:
      1. alias table (e.g. "usa" -> "United States of America"),
      2. exact case-insensitive match against the "country" column,
      3. literal (non-regex) substring match as a last resort.

    Args:
        name: User-supplied country name.
        df: DataFrame with a "country" column.

    Returns:
        Boolean Series aligned with df's index.
    """
    lower = name.lower()

    # Check aliases first
    if lower in _COUNTRY_ALIASES:
        canonical = _COUNTRY_ALIASES[lower]
        return df["country"] == canonical

    # Exact case-insensitive match
    mask = df["country"].str.lower() == lower
    if mask.any():
        return mask

    # Substring match as fallback. regex=False treats the input literally:
    # without it, user text containing regex metacharacters (e.g.
    # "Korea (South)", "C.A.R.") could raise re.error or match wrong rows.
    return df["country"].str.contains(name, case=False, na=False, regex=False)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class FloodDB:
    """Interface to Google's Groundsource flash flood dataset (2.6M events, 2000–2026).

    Usage:
        from groundsource import FloodDB
        db = FloodDB()  # auto-downloads from Zenodo if not cached
        floods = db.search(country="India")
        trend = db.trend("India")
    """

    def __init__(self, local_path: str = None):
        """Load the Groundsource dataset.

        Args:
            local_path: Path to a local copy of groundsource_2026.parquet.
                If None, downloads from Zenodo and caches automatically.
        """
        enriched_path = get_enriched_parquet_path()

        if enriched_path.exists():
            # Fast path: a previously enriched copy exists; skip download
            # and spatial enrichment entirely.
            print(f"Loading enriched dataset from cache...")
            self._df = pd.read_parquet(enriched_path)
            print(f" Loaded {len(self._df):,} events.")
        else:
            # Load raw parquet
            if local_path:
                raw_path = load_from_local(local_path)
            else:
                raw_path = download_parquet()

            print("Loading raw dataset...")
            raw_df = pd.read_parquet(raw_path)
            print(f" Loaded {len(raw_df):,} raw events.")

            # Enrich with spatial data (adds centroid, country, continent,
            # year columns — see spatial.enrich_dataframe).
            print("Enriching with country/continent data (one-time, ~2 min)...")
            self._df = enrich_dataframe(raw_df)

            # Cache enriched version so subsequent constructions take the
            # fast path above.
            print(f" Caching enriched dataset to {enriched_path}...")
            self._df.to_parquet(enriched_path, index=False)
            print(" Cached. Future loads will be instant.")

    @property
    def df(self) -> pd.DataFrame:
        """Access the full enriched DataFrame."""
        return self._df

    def __len__(self) -> int:
        # Number of events in the loaded dataset.
        return len(self._df)

    def __repr__(self) -> str:
        years = f"{self._df['year'].min()}–{self._df['year'].max()}"
        countries = self._df["country"].nunique()
        return f"FloodDB({len(self._df):,} events, {years}, {countries} countries)"

    def info(self) -> dict:
        """Summary statistics about the dataset."""
        df = self._df
        return {
            "total_events": len(df),
            "date_range": (df["start_date"].min(), df["start_date"].max()),
            "countries": df["country"].nunique(),
            "continents": sorted(df["continent"].dropna().unique().tolist()),
            # Events whose centroid fell outside any country polygon.
            "untagged_events": int(df["country"].isna().sum()),
            "columns": list(df.columns),
        }

    def to_dataframe(self) -> pd.DataFrame:
        """Return the full enriched dataset as a pandas DataFrame."""
        return self._df.copy()

    @staticmethod
    def available_cities() -> list:
        """List all city names available for city search."""
        return sorted(_CITY_COORDS.keys())

    def available_countries(self) -> list:
        """List all country names in the dataset."""
        return sorted(self._df["country"].dropna().unique().tolist())

    # ── Search ──────────────────────────────────────────────────────────

    def search(self, country: str = None, continent: str = None,
               city: str = None, radius_km: float = 100,
               bbox: list = None,
               year_range: tuple = None) -> pd.DataFrame:
        """Search for flood events by location and/or time.

        Args:
            country: Country name (e.g., "India", "United States of America")
            continent: Continent name (e.g., "Asia", "Africa")
            city: City name for radius search (e.g., "Houston")
            radius_km: Radius in km for city search (default 100)
            bbox: Bounding box [min_lat, min_lon, max_lat, max_lon]
            year_range: Tuple of (start_year, end_year) inclusive

        Returns:
            Filtered DataFrame of matching events.

        Raises:
            ValueError: If `city` is not in the built-in gazetteer.
        """
        # Filters are applied cumulatively in sequence; each narrows the
        # previous result.
        result = self._df

        if year_range:
            result = result[(result["year"] >= year_range[0]) & (result["year"] <= year_range[1])]

        if continent:
            # fillna("") so rows with missing continent never match.
            result = result[result["continent"].str.lower().fillna("") == continent.lower()]

        if country:
            mask = _resolve_country(country, result)
            result = result[mask]

        if city:
            city_lower = city.lower()
            if city_lower in _CITY_COORDS:
                lon, lat = _CITY_COORDS[city_lower]
                result = search_by_radius(result, lon, lat, radius_km)
            else:
                raise ValueError(
                    f"City '{city}' not in built-in gazetteer. "
                    f"Use bbox=[min_lat, min_lon, max_lat, max_lon] or "
                    f"search(country=...) instead. "
                    f"Available cities: {len(_CITY_COORDS)}"
                )

        if bbox:
            result = search_by_bbox(result, bbox)

        # Copy so callers can mutate the result without touching self._df.
        return result.copy()

    # ── Trend Analysis ──────────────────────────────────────────────────

    def trend(self, country: str = None, continent: str = None) -> pd.DataFrame:
        """Yearly event counts, optionally filtered by country/continent.

        Returns DataFrame with columns: year, count
        """
        df = self._df
        if country:
            df = df[_resolve_country(country, df)]
        if continent:
            df = df[df["continent"].str.lower().fillna("") == continent.lower()]
        return _trends.yearly_counts(df)

    def growth(self, country: str = None, continent: str = None,
               early: tuple = (2005, 2012), recent: tuple = (2018, 2025)) -> dict:
        """Compare average yearly events between two periods."""
        counts = self.trend(country=country, continent=continent)
        return _trends.growth_rate(counts, early=early, recent=recent)

    def compare(self, countries: list) -> pd.DataFrame:
        """Side-by-side yearly counts for multiple countries.

        Accepts aliases (e.g., "USA", "UK") and case-insensitive names.
        """
        # Resolve each country name to its canonical form
        resolved = []
        for c in countries:
            lower = c.lower()
            if lower in _COUNTRY_ALIASES:
                resolved.append(_COUNTRY_ALIASES[lower])
            else:
                # Find best match in dataset (first exact case-insensitive hit)
                mask = self._df["country"].str.lower() == lower
                if mask.sum() > 0:
                    resolved.append(self._df.loc[mask, "country"].iloc[0])
                else:
                    resolved.append(c)  # pass through, will show zeros
        return _trends.compare_countries(self._df, resolved)

    def top_countries(self, n: int = 20) -> pd.DataFrame:
        """Top N countries by total event count."""
        return _trends.top_countries(self._df, n=n)

    def country_growth_ranking(self, n: int = 20, **kwargs) -> pd.DataFrame:
        """Rank countries by flood event growth rate.

        Extra keyword arguments are forwarded to
        trends.country_growth_ranking.
        """
        return _trends.country_growth_ranking(self._df, n=n, **kwargs)

    def bias_check(self) -> pd.DataFrame:
        """Global yearly event counts for bias analysis.

        Returns the hockey-stick curve that should be compared against
        digital news growth. If the curves track together, the dataset's
        growth reflects news digitization, not increasing floods.
        """
        return _trends.bias_check(self._df)

    # ── Charts ──────────────────────────────────────────────────────────

    def plot_hockey_stick(self, save_path: str = None):
        """Chart 1: The 807x hockey stick — events per year."""
        data = self.bias_check()
        return _charts.plot_hockey_stick(data, save_path=save_path)

    def plot_bias(self, save_path: str = None):
        """Chart 2: Event growth vs internet/news growth."""
        data = self.bias_check()
        return _charts.plot_bias_normalized(data, save_path=save_path)

    def plot_country_growth(self, n: int = 20, save_path: str = None, **kwargs):
        """Chart 3: Countries with fastest flood event growth."""
        data = self.country_growth_ranking(n=n, **kwargs)
        return _charts.plot_country_growth(data, n=n, save_path=save_path)

    def plot_top_countries(self, n: int = 20, save_path: str = None):
        """Chart 4: Top N countries by total events — the LinkedIn chart."""
        data = self.top_countries(n=n)
        return _charts.plot_top_countries(data, n=n, save_path=save_path)
|
groundsource/spatial.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Spatial operations: WKB decoding, country tagging, geocoding, search."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from math import radians, cos, sin, asin, sqrt
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import geopandas as gpd
|
|
9
|
+
from shapely import wkb
|
|
10
|
+
from shapely.geometry import Point
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Path to bundled Natural Earth shapefile.
# The data/ directory ships inside the installed package next to this module.
_DATA_DIR = Path(__file__).parent / "data"
# 1:110m admin-0 countries layer used by tag_countries() for the spatial join.
_NE_SHP = _DATA_DIR / "ne_110m_admin_0_countries.shp"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def decode_centroids(geometry_series: pd.Series) -> pd.DataFrame:
    """Decode WKB geometry bytes to centroid lon/lat columns.

    Returns DataFrame with columns: centroid_lon, centroid_lat
    """
    total = len(geometry_series)
    xs = np.empty(total, dtype=np.float64)
    ys = np.empty(total, dtype=np.float64)

    # Decode each WKB blob and record its centroid coordinates positionally.
    for pos, raw in enumerate(geometry_series):
        centroid = wkb.loads(raw).centroid
        xs[pos] = centroid.x
        ys[pos] = centroid.y

    return pd.DataFrame({"centroid_lon": xs, "centroid_lat": ys})
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def tag_countries(lons: np.ndarray, lats: np.ndarray) -> pd.DataFrame:
    """Spatial join centroids against Natural Earth to get country/continent.

    lons/lats: parallel arrays of WGS84 coordinates.

    Returns DataFrame with columns: country, iso_a3, continent — one row per
    input point, in input order. Points outside every country polygon
    (ocean, borders) come back with NaN values.
    """
    if not _NE_SHP.exists():
        raise FileNotFoundError(
            f"Natural Earth shapefile not found at {_NE_SHP}. "
            "Please reinstall the groundsource package."
        )

    world = gpd.read_file(_NE_SHP)[["NAME", "ISO_A3", "CONTINENT", "geometry"]]

    # "idx" records the original input position so order can be restored
    # after the join.
    points = gpd.GeoDataFrame(
        {"idx": np.arange(len(lons))},
        geometry=gpd.points_from_xy(lons, lats),
        crs="EPSG:4326",
    )

    # how="left" keeps unmatched points (their country fields become NaN).
    joined = gpd.sjoin(points, world, how="left", predicate="within")

    # Handle duplicates from sjoin (point on border matched to multiple countries)
    joined = joined.drop_duplicates(subset="idx", keep="first")
    # Restore input order so .values below aligns positionally with lons/lats.
    joined = joined.sort_values("idx").reset_index(drop=True)

    return pd.DataFrame({
        "country": joined["NAME"].values,
        "iso_a3": joined["ISO_A3"].values,
        "continent": joined["CONTINENT"].values,
    })
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Add centroid_lon, centroid_lat, country, iso_a3, continent to DataFrame."""
    print(" Decoding WKB geometries (2.6M polygons)...")
    centroids = decode_centroids(df["geometry"])

    print(" Tagging countries via spatial join...")
    countries = tag_countries(centroids["centroid_lon"].values, centroids["centroid_lat"].values)

    enriched = df.copy()
    for col in ("centroid_lon", "centroid_lat"):
        enriched[col] = centroids[col].values
    for col in ("country", "iso_a3", "continent"):
        enriched[col] = countries[col].values

    # Parse the date strings and derive the year used by trend analysis.
    for col in ("start_date", "end_date"):
        enriched[col] = pd.to_datetime(enriched[col])
    enriched["year"] = enriched["start_date"].dt.year

    # Raw WKB bytes are no longer needed once centroids are extracted.
    enriched = enriched.drop(columns=["geometry"])

    missing = enriched["country"].isna().sum()
    share = missing / len(enriched) * 100
    print(f" Enrichment complete. {missing:,} events ({share:.1f}%) "
          f"fell outside country boundaries (ocean/border).")

    return enriched
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def haversine_km(lon1, lat1, lon2, lat2):
    """Great-circle (haversine) distance in km between two lon/lat points."""
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = phi2 - phi1
    dlam = radians(lon2) - radians(lon1)
    h = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlam / 2) ** 2
    # 6371 km is the mean Earth radius.
    return 2 * 6371 * asin(sqrt(h))
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def search_by_radius(df: pd.DataFrame, lon: float, lat: float, radius_km: float) -> pd.DataFrame:
    """Filter events within radius_km of a point using vectorized haversine."""
    # Event centroids and query point, all in radians.
    lam = np.radians(df["centroid_lon"].to_numpy())
    phi = np.radians(df["centroid_lat"].to_numpy())
    lam0 = np.radians(lon)
    phi0 = np.radians(lat)

    # Vectorized haversine formula (6371 km mean Earth radius).
    h = np.sin((phi0 - phi) / 2) ** 2 + np.cos(phi) * np.cos(phi0) * np.sin((lam0 - lam) / 2) ** 2
    dist_km = 6371 * 2 * np.arcsin(np.sqrt(h))

    return df[dist_km <= radius_km].copy()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def search_by_bbox(df: pd.DataFrame, bbox: list) -> pd.DataFrame:
    """Filter events within bounding box [min_lat, min_lon, max_lat, max_lon]."""
    lat_lo, lon_lo, lat_hi, lon_hi = bbox
    # Series.between is inclusive on both ends, matching >= / <=.
    in_lat = df["centroid_lat"].between(lat_lo, lat_hi)
    in_lon = df["centroid_lon"].between(lon_lo, lon_hi)
    return df[in_lat & in_lon].copy()
|
groundsource/trends.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Trend analysis and bias detection for flood event data."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def yearly_counts(df: pd.DataFrame, group_col: str = None, group_val: str = None) -> pd.DataFrame:
    """Count events per year, optionally filtered by a group column.

    Returns DataFrame with columns: year, count
    """
    # Filter only when both the column and the value are truthy,
    # otherwise count across the whole frame.
    subset = df[df[group_col] == group_val] if (group_col and group_val) else df
    return subset.groupby("year").size().reset_index(name="count")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def growth_rate(counts: pd.DataFrame, early=(2000, 2012), recent=(2015, 2025)) -> dict:
    """Compare average yearly counts between two periods.

    Args:
        counts: DataFrame with columns ``year`` and ``count``
            (as produced by :func:`yearly_counts`).
        early: (start_year, end_year) of the baseline period, inclusive.
        recent: (start_year, end_year) of the comparison period, inclusive.

    Returns:
        dict with early_period, recent_period, early_avg, recent_avg,
        growth_factor. A period with no data contributes an average of
        0.0; the growth factor is inf when only the early period is
        empty, and 0 when both are.
    """
    def _period_avg(lo, hi):
        # Mean yearly count over [lo, hi]; 0.0 when no years fall in
        # range (pandas would otherwise return NaN and poison the
        # growth factor downstream).
        avg = counts[(counts["year"] >= lo) & (counts["year"] <= hi)]["count"].mean()
        return 0.0 if pd.isna(avg) else float(avg)

    early_avg = _period_avg(*early)
    recent_avg = _period_avg(*recent)

    if early_avg == 0:
        factor = float("inf") if recent_avg > 0 else 0.0
    else:
        factor = recent_avg / early_avg

    return {
        "early_period": early,
        "recent_period": recent,
        "early_avg": round(early_avg, 1),
        "recent_avg": round(recent_avg, 1),
        "growth_factor": round(factor, 1),
    }
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def top_countries(df: pd.DataFrame, n: int = 20) -> pd.DataFrame:
    """Return the n countries with the most recorded events, descending."""
    per_country = df.groupby("country").size().reset_index(name="count")
    ranked = per_country.sort_values("count", ascending=False)
    return ranked.head(n).reset_index(drop=True)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def country_growth_ranking(df: pd.DataFrame, n: int = 20,
                           early=(2005, 2012), recent=(2018, 2025),
                           min_early_events: int = 10) -> pd.DataFrame:
    """Rank countries by flood event growth rate between two periods.

    Filters to countries with at least min_early_events in the early period
    to avoid divide-by-zero / noise from tiny samples.
    """
    def _period_counts(period, colname):
        # Per-country event totals inside the inclusive year window.
        lo, hi = period
        window = df[(df["year"] >= lo) & (df["year"] <= hi)]
        return window.groupby("country").size().reset_index(name=colname)

    # Inner join: only countries present in BOTH periods are rankable.
    merged = _period_counts(early, "early_count").merge(
        _period_counts(recent, "recent_count"), on="country", how="inner"
    )
    merged = merged[merged["early_count"] >= min_early_events]

    merged["early_avg"] = merged["early_count"] / (early[1] - early[0] + 1)
    merged["recent_avg"] = merged["recent_count"] / (recent[1] - recent[0] + 1)
    merged["growth_factor"] = merged["recent_avg"] / merged["early_avg"]

    return merged.sort_values("growth_factor", ascending=False).head(n).reset_index(drop=True)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def bias_check(df: pd.DataFrame) -> pd.DataFrame:
    """Return global yearly event counts for bias analysis.

    The returned DataFrame has columns: year, count, cumulative_growth
    (each year's count relative to the earliest year's count).
    Users should plot this and compare against digital news growth curves.
    A hockey-stick shape matching internet/news growth indicates
    reporting bias, not necessarily increasing flood frequency.
    """
    counts = df.groupby("year").size().reset_index(name="count")

    # Guard the empty case: idxmin() on an empty Series raises ValueError.
    if counts.empty:
        counts["cumulative_growth"] = pd.Series(dtype=float)
        return counts

    # Count in the earliest year serves as the growth baseline.
    baseline = counts.loc[counts["year"].idxmin(), "count"]
    counts["cumulative_growth"] = counts["count"] / baseline if baseline > 0 else 0

    return counts
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def compare_countries(df: pd.DataFrame, countries: list) -> pd.DataFrame:
    """Side-by-side yearly counts for multiple countries.

    Returns DataFrame with columns: year, country1, country2, ...
    (None when *countries* is empty).
    """
    result = None
    for name in countries:
        # Truthy name filters to that country; otherwise count everything.
        subset = df[df["country"] == name] if name else df
        per_year = subset.groupby("year").size().reset_index(name="count")
        per_year = per_year.rename(columns={"count": name})
        result = per_year if result is None else result.merge(per_year, on="year", how="outer")

    if result is not None:
        # Outer merge leaves NaN where a country had no events that year.
        result = result.sort_values("year").fillna(0)
        for name in countries:
            result[name] = result[name].astype(int)

    return result
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groundsource
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python package for Google's Groundsource flash flood dataset — 2.6M events, 150+ countries, 2000–2026
|
|
5
|
+
Author: Shara
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sharanry/groundsource
|
|
8
|
+
Project-URL: Repository, https://github.com/sharanry/groundsource
|
|
9
|
+
Project-URL: Issues, https://github.com/sharanry/groundsource/issues
|
|
10
|
+
Keywords: flood,flash-flood,climate,groundsource,google,geospatial,dataset,gemini,natural-disaster,news-mining
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pandas>=1.5
|
|
26
|
+
Requires-Dist: pyarrow>=10.0
|
|
27
|
+
Requires-Dist: geopandas>=0.13
|
|
28
|
+
Requires-Dist: shapely>=2.0
|
|
29
|
+
Requires-Dist: matplotlib>=3.6
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# groundsource
|
|
35
|
+
|
|
36
|
+
**Python package for Google's Groundsource flash flood dataset.**
|
|
37
|
+
|
|
38
|
+
Google used Gemini to extract 2.6 million flash flood events from news articles across 150+ countries (2000-2026). The raw data is a 667MB Parquet file with undocumented WKB geometries and no location labels. This package decodes the geometries, tags every event with country and continent, and provides a clean search and analysis API.
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from groundsource import FloodDB
|
|
42
|
+
|
|
43
|
+
db = FloodDB() # auto-downloads + enriches on first run
|
|
44
|
+
floods = db.search(country="India", year_range=(2020, 2025))
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install groundsource
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Requirements:** Python 3.9+, pandas, pyarrow, geopandas, shapely, matplotlib
|
|
54
|
+
|
|
55
|
+
On first run, the package downloads the dataset from Zenodo (~667MB), decodes 2.6M WKB polygons, and performs a spatial join against Natural Earth boundaries. This takes 2-3 minutes and is cached locally for instant subsequent loads.
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
### Search
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from groundsource import FloodDB
|
|
63
|
+
db = FloodDB()
|
|
64
|
+
|
|
65
|
+
# By country (supports common aliases: "USA", "UK", "UAE", etc.)
|
|
66
|
+
db.search(country="India")
|
|
67
|
+
db.search(country="USA", year_range=(2020, 2025))
|
|
68
|
+
|
|
69
|
+
# By city (98 major cities built-in, default 100km radius)
|
|
70
|
+
db.search(city="Houston", radius_km=50)
|
|
71
|
+
|
|
72
|
+
# By continent or bounding box
|
|
73
|
+
db.search(continent="Asia")
|
|
74
|
+
db.search(bbox=[0, 95, 25, 120]) # [min_lat, min_lon, max_lat, max_lon]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Trend Analysis
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
db.trend(country="India") # yearly event counts
|
|
81
|
+
db.growth(country="India") # growth rate between two periods
|
|
82
|
+
db.compare(["USA", "UK", "India", "Indonesia"]) # side-by-side comparison
|
|
83
|
+
db.top_countries(20) # ranked by total events
|
|
84
|
+
db.country_growth_ranking(20) # ranked by growth acceleration
|
|
85
|
+
db.bias_check() # global yearly counts for bias analysis
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Built-in Charts
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
db.plot_hockey_stick(save_path="hockey_stick.png")
|
|
92
|
+
db.plot_bias(save_path="bias.png")
|
|
93
|
+
db.plot_top_countries(save_path="top_countries.png")
|
|
94
|
+
db.plot_country_growth(save_path="growth.png")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Raw DataFrame Access
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
df = db.to_dataframe()
|
|
101
|
+
# Columns: uuid, area_km2, start_date, end_date, centroid_lon, centroid_lat,
|
|
102
|
+
# country, iso_a3, continent, year
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## What This Package Does
|
|
106
|
+
|
|
107
|
+
The raw Parquet from Zenodo has 5 columns with no documentation:
|
|
108
|
+
|
|
109
|
+
| Raw Column | Type | Issue |
|
|
110
|
+
|-----------|------|-------|
|
|
111
|
+
| `uuid` | string | ID only |
|
|
112
|
+
| `area_km2` | float | Usable as-is |
|
|
113
|
+
| `geometry` | WKB binary | Requires `shapely` to decode |
|
|
114
|
+
| `start_date` | string | Not parsed as datetime |
|
|
115
|
+
| `end_date` | string | Not parsed as datetime |
|
|
116
|
+
|
|
117
|
+
This package enriches each event with:
|
|
118
|
+
|
|
119
|
+
| Added Column | Source |
|
|
120
|
+
|-------------|--------|
|
|
121
|
+
| `centroid_lon`, `centroid_lat` | Decoded from WKB polygons |
|
|
122
|
+
| `country`, `iso_a3` | Spatial join against Natural Earth |
|
|
123
|
+
| `continent` | Natural Earth |
|
|
124
|
+
| `year` | Extracted from `start_date` |
|
|
125
|
+
|
|
126
|
+
## Reporting Bias
|
|
127
|
+
|
|
128
|
+
The dataset shows 498 events in 2000 and 402,012 in 2024. This does not mean floods increased 807x. The data is extracted from news articles, and digital news coverage grew dramatically over this period. Any trend analysis should account for this reporting bias. Use `db.bias_check()` and `db.plot_bias()` to visualize this.
|
|
129
|
+
|
|
130
|
+

|
|
131
|
+
|
|
132
|
+
## Top Countries by Events Detected
|
|
133
|
+
|
|
134
|
+

|
|
135
|
+
|
|
136
|
+
## Dataset
|
|
137
|
+
|
|
138
|
+
- **Source:** [Google Groundsource](https://research.google/blog/introducing-groundsource-turning-news-reports-into-data-with-gemini/)
|
|
139
|
+
- **Download:** [Zenodo](https://zenodo.org/records/18647054) (CC BY 4.0)
|
|
140
|
+
- **Records:** 2,646,302 events across 175 countries, 2000-2026
|
|
141
|
+
- **Method:** Gemini parsed ~5M news articles
|
|
142
|
+
- **Accuracy:** 60% location+timing, 82% practically useful (per Google)
|
|
143
|
+
|
|
144
|
+
## License
|
|
145
|
+
|
|
146
|
+
MIT. The underlying dataset is licensed CC BY 4.0 by Google.
|
|
147
|
+
|
|
148
|
+
## Citation
|
|
149
|
+
|
|
150
|
+
> Google Research. *Groundsource: Turning News Reports into Data with Gemini.* Zenodo, 2026. DOI: [10.5281/zenodo.18647054](https://zenodo.org/records/18647054)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
groundsource/__init__.py,sha256=inQIRcjV65ACtxIrYFrP5o8Mi5YpuV2ce6e96i983_c,167
|
|
2
|
+
groundsource/cache.py,sha256=tbvKJSHBGrERVri92WZYGyVczya-i2vqBWwQF2C05LQ,2082
|
|
3
|
+
groundsource/charts.py,sha256=0g8KhGwi7YnQHSVvZvJxxcpWWT-P_DvlwZDsFJjOcPg,8084
|
|
4
|
+
groundsource/db.py,sha256=8QWs4Ld-hzGJpALWhCG-BTK_XNfHHVlI3qve09GLuec,13644
|
|
5
|
+
groundsource/spatial.py,sha256=QfjCUi2ANavv2hyoEhX2TQAL9tckyiVAoCuDgi5Whr4,4591
|
|
6
|
+
groundsource/trends.py,sha256=8olS2F0epwZE1KWDLyb7MUdlNgeXvsso6yz2jkqjGDY,4260
|
|
7
|
+
groundsource/data/__init__.py,sha256=fMw6NvHh0R15QoK3Nvsn6UxML3TEmfqc38TWzQDpED4,81
|
|
8
|
+
groundsource/data/ne_110m_admin_0_countries.cpg,sha256=OtMDH1UDpEBK-CUmLugjLMBNTqZoPULF3QovKiesmCQ,5
|
|
9
|
+
groundsource/data/ne_110m_admin_0_countries.dbf,sha256=H-5nfNTgOzZ4duA4YesQGX5AIqhGv5IGDgMTQyhjeFs,531808
|
|
10
|
+
groundsource/data/ne_110m_admin_0_countries.prj,sha256=Mlnw5VKQqCsTUGRvYE6KfuHiE2wDIKQPrYOKtAgZ__g,147
|
|
11
|
+
groundsource/data/ne_110m_admin_0_countries.shp,sha256=CONBYG6DkeRYw_CN6zEt5mS1a_rjdgZMWqCu5mgaX1U,180924
|
|
12
|
+
groundsource/data/ne_110m_admin_0_countries.shx,sha256=iwvirZfdSEruXC68mGl9U3LoMriuWKNaZhru9rmFZo0,1516
|
|
13
|
+
groundsource-0.1.0.dist-info/licenses/LICENSE,sha256=J_TEk_POk6Le3dSCQUYuAbyVwpqGP-4RTjT6cQMF16k,1078
|
|
14
|
+
groundsource-0.1.0.dist-info/METADATA,sha256=bYwQCMtjuaE9q17dvIeqRZ3ACZCyXW7NSRkkibUP5C4,5680
|
|
15
|
+
groundsource-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
16
|
+
groundsource-0.1.0.dist-info/top_level.txt,sha256=x8jiZoCtnyZzmlaeNj9l_KeNusbMFG6ZFYTDcPoH1Bk,13
|
|
17
|
+
groundsource-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sharath Sivamalaisamy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
groundsource
|