groundsource 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundsource-0.1.0/LICENSE +21 -0
- groundsource-0.1.0/MANIFEST.in +3 -0
- groundsource-0.1.0/PKG-INFO +150 -0
- groundsource-0.1.0/README.md +117 -0
- groundsource-0.1.0/groundsource/__init__.py +6 -0
- groundsource-0.1.0/groundsource/cache.py +63 -0
- groundsource-0.1.0/groundsource/charts.py +206 -0
- groundsource-0.1.0/groundsource/data/__init__.py +1 -0
- groundsource-0.1.0/groundsource/data/ne_110m_admin_0_countries.cpg +1 -0
- groundsource-0.1.0/groundsource/data/ne_110m_admin_0_countries.dbf +0 -0
- groundsource-0.1.0/groundsource/data/ne_110m_admin_0_countries.prj +1 -0
- groundsource-0.1.0/groundsource/data/ne_110m_admin_0_countries.shp +0 -0
- groundsource-0.1.0/groundsource/data/ne_110m_admin_0_countries.shx +0 -0
- groundsource-0.1.0/groundsource/db.py +322 -0
- groundsource-0.1.0/groundsource/spatial.py +131 -0
- groundsource-0.1.0/groundsource/trends.py +113 -0
- groundsource-0.1.0/groundsource.egg-info/PKG-INFO +150 -0
- groundsource-0.1.0/groundsource.egg-info/SOURCES.txt +22 -0
- groundsource-0.1.0/groundsource.egg-info/dependency_links.txt +1 -0
- groundsource-0.1.0/groundsource.egg-info/requires.txt +8 -0
- groundsource-0.1.0/groundsource.egg-info/top_level.txt +1 -0
- groundsource-0.1.0/pyproject.toml +52 -0
- groundsource-0.1.0/setup.cfg +4 -0
- groundsource-0.1.0/tests/test_groundsource.py +249 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sharath Sivamalaisamy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groundsource
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python package for Google's Groundsource flash flood dataset — 2.6M events, 150+ countries, 2000–2026
|
|
5
|
+
Author: Shara
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sharanry/groundsource
|
|
8
|
+
Project-URL: Repository, https://github.com/sharanry/groundsource
|
|
9
|
+
Project-URL: Issues, https://github.com/sharanry/groundsource/issues
|
|
10
|
+
Keywords: flood,flash-flood,climate,groundsource,google,geospatial,dataset,gemini,natural-disaster,news-mining
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pandas>=1.5
|
|
26
|
+
Requires-Dist: pyarrow>=10.0
|
|
27
|
+
Requires-Dist: geopandas>=0.13
|
|
28
|
+
Requires-Dist: shapely>=2.0
|
|
29
|
+
Requires-Dist: matplotlib>=3.6
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# groundsource
|
|
35
|
+
|
|
36
|
+
**Python package for Google's Groundsource flash flood dataset.**
|
|
37
|
+
|
|
38
|
+
Google used Gemini to extract 2.6 million flash flood events from news articles across 150+ countries (2000-2026). The raw data is a 667MB Parquet file with undocumented WKB geometries and no location labels. This package decodes the geometries, tags every event with country and continent, and provides a clean search and analysis API.
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from groundsource import FloodDB
|
|
42
|
+
|
|
43
|
+
db = FloodDB() # auto-downloads + enriches on first run
|
|
44
|
+
floods = db.search(country="India", year_range=(2020, 2025))
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install groundsource
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Requirements:** Python 3.9+, pandas, pyarrow, geopandas, shapely, matplotlib
|
|
54
|
+
|
|
55
|
+
On first run, the package downloads the dataset from Zenodo (~667MB), decodes 2.6M WKB polygons, and performs a spatial join against Natural Earth boundaries. This takes 2-3 minutes and is cached locally for instant subsequent loads.
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
### Search
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from groundsource import FloodDB
|
|
63
|
+
db = FloodDB()
|
|
64
|
+
|
|
65
|
+
# By country (supports common aliases: "USA", "UK", "UAE", etc.)
|
|
66
|
+
db.search(country="India")
|
|
67
|
+
db.search(country="USA", year_range=(2020, 2025))
|
|
68
|
+
|
|
69
|
+
# By city (98 major cities built-in, default 100km radius)
|
|
70
|
+
db.search(city="Houston", radius_km=50)
|
|
71
|
+
|
|
72
|
+
# By continent or bounding box
|
|
73
|
+
db.search(continent="Asia")
|
|
74
|
+
db.search(bbox=[0, 95, 25, 120]) # [min_lat, min_lon, max_lat, max_lon]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Trend Analysis
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
db.trend(country="India") # yearly event counts
|
|
81
|
+
db.growth(country="India") # growth rate between two periods
|
|
82
|
+
db.compare(["USA", "UK", "India", "Indonesia"]) # side-by-side comparison
|
|
83
|
+
db.top_countries(20) # ranked by total events
|
|
84
|
+
db.country_growth_ranking(20) # ranked by growth acceleration
|
|
85
|
+
db.bias_check() # global yearly counts for bias analysis
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Built-in Charts
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
db.plot_hockey_stick(save_path="hockey_stick.png")
|
|
92
|
+
db.plot_bias(save_path="bias.png")
|
|
93
|
+
db.plot_top_countries(save_path="top_countries.png")
|
|
94
|
+
db.plot_country_growth(save_path="growth.png")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Raw DataFrame Access
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
df = db.to_dataframe()
|
|
101
|
+
# Columns: uuid, area_km2, start_date, end_date, centroid_lon, centroid_lat,
|
|
102
|
+
# country, iso_a3, continent, year
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## What This Package Does
|
|
106
|
+
|
|
107
|
+
The raw Parquet from Zenodo has 5 columns with no documentation:
|
|
108
|
+
|
|
109
|
+
| Raw Column | Type | Issue |
|
|
110
|
+
|-----------|------|-------|
|
|
111
|
+
| `uuid` | string | ID only |
|
|
112
|
+
| `area_km2` | float | Usable as-is |
|
|
113
|
+
| `geometry` | WKB binary | Requires `shapely` to decode |
|
|
114
|
+
| `start_date` | string | Not parsed as datetime |
|
|
115
|
+
| `end_date` | string | Not parsed as datetime |
|
|
116
|
+
|
|
117
|
+
This package enriches each event with:
|
|
118
|
+
|
|
119
|
+
| Added Column | Source |
|
|
120
|
+
|-------------|--------|
|
|
121
|
+
| `centroid_lon`, `centroid_lat` | Decoded from WKB polygons |
|
|
122
|
+
| `country`, `iso_a3` | Spatial join against Natural Earth |
|
|
123
|
+
| `continent` | Natural Earth |
|
|
124
|
+
| `year` | Extracted from `start_date` |
|
|
125
|
+
|
|
126
|
+
## Reporting Bias
|
|
127
|
+
|
|
128
|
+
The dataset shows 498 events in 2000 and 402,012 in 2024. This does not mean floods increased 807x. The data is extracted from news articles, and digital news coverage grew dramatically over this period. Any trend analysis should account for this reporting bias. Use `db.bias_check()` and `db.plot_bias()` to visualize this.
|
|
129
|
+
|
|
130
|
+

|
|
131
|
+
|
|
132
|
+
## Top Countries by Events Detected
|
|
133
|
+
|
|
134
|
+

|
|
135
|
+
|
|
136
|
+
## Dataset
|
|
137
|
+
|
|
138
|
+
- **Source:** [Google Groundsource](https://research.google/blog/introducing-groundsource-turning-news-reports-into-data-with-gemini/)
|
|
139
|
+
- **Download:** [Zenodo](https://zenodo.org/records/18647054) (CC BY 4.0)
|
|
140
|
+
- **Records:** 2,646,302 events across 175 countries, 2000-2026
|
|
141
|
+
- **Method:** Gemini parsed ~5M news articles
|
|
142
|
+
- **Accuracy:** 60% location+timing, 82% practically useful (per Google)
|
|
143
|
+
|
|
144
|
+
## License
|
|
145
|
+
|
|
146
|
+
MIT. The underlying dataset is licensed CC BY 4.0 by Google.
|
|
147
|
+
|
|
148
|
+
## Citation
|
|
149
|
+
|
|
150
|
+
> Google Research. *Groundsource: Turning News Reports into Data with Gemini.* Zenodo, 2026. DOI: [10.5281/zenodo.18647054](https://zenodo.org/records/18647054)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# groundsource
|
|
2
|
+
|
|
3
|
+
**Python package for Google's Groundsource flash flood dataset.**
|
|
4
|
+
|
|
5
|
+
Google used Gemini to extract 2.6 million flash flood events from news articles across 150+ countries (2000-2026). The raw data is a 667MB Parquet file with undocumented WKB geometries and no location labels. This package decodes the geometries, tags every event with country and continent, and provides a clean search and analysis API.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from groundsource import FloodDB
|
|
9
|
+
|
|
10
|
+
db = FloodDB() # auto-downloads + enriches on first run
|
|
11
|
+
floods = db.search(country="India", year_range=(2020, 2025))
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install groundsource
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
**Requirements:** Python 3.9+, pandas, pyarrow, geopandas, shapely, matplotlib
|
|
21
|
+
|
|
22
|
+
On first run, the package downloads the dataset from Zenodo (~667MB), decodes 2.6M WKB polygons, and performs a spatial join against Natural Earth boundaries. This takes 2-3 minutes and is cached locally for instant subsequent loads.
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
### Search
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from groundsource import FloodDB
|
|
30
|
+
db = FloodDB()
|
|
31
|
+
|
|
32
|
+
# By country (supports common aliases: "USA", "UK", "UAE", etc.)
|
|
33
|
+
db.search(country="India")
|
|
34
|
+
db.search(country="USA", year_range=(2020, 2025))
|
|
35
|
+
|
|
36
|
+
# By city (98 major cities built-in, default 100km radius)
|
|
37
|
+
db.search(city="Houston", radius_km=50)
|
|
38
|
+
|
|
39
|
+
# By continent or bounding box
|
|
40
|
+
db.search(continent="Asia")
|
|
41
|
+
db.search(bbox=[0, 95, 25, 120]) # [min_lat, min_lon, max_lat, max_lon]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Trend Analysis
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
db.trend(country="India") # yearly event counts
|
|
48
|
+
db.growth(country="India") # growth rate between two periods
|
|
49
|
+
db.compare(["USA", "UK", "India", "Indonesia"]) # side-by-side comparison
|
|
50
|
+
db.top_countries(20) # ranked by total events
|
|
51
|
+
db.country_growth_ranking(20) # ranked by growth acceleration
|
|
52
|
+
db.bias_check() # global yearly counts for bias analysis
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Built-in Charts
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
db.plot_hockey_stick(save_path="hockey_stick.png")
|
|
59
|
+
db.plot_bias(save_path="bias.png")
|
|
60
|
+
db.plot_top_countries(save_path="top_countries.png")
|
|
61
|
+
db.plot_country_growth(save_path="growth.png")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Raw DataFrame Access
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
df = db.to_dataframe()
|
|
68
|
+
# Columns: uuid, area_km2, start_date, end_date, centroid_lon, centroid_lat,
|
|
69
|
+
# country, iso_a3, continent, year
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## What This Package Does
|
|
73
|
+
|
|
74
|
+
The raw Parquet from Zenodo has 5 columns with no documentation:
|
|
75
|
+
|
|
76
|
+
| Raw Column | Type | Issue |
|
|
77
|
+
|-----------|------|-------|
|
|
78
|
+
| `uuid` | string | ID only |
|
|
79
|
+
| `area_km2` | float | Usable as-is |
|
|
80
|
+
| `geometry` | WKB binary | Requires `shapely` to decode |
|
|
81
|
+
| `start_date` | string | Not parsed as datetime |
|
|
82
|
+
| `end_date` | string | Not parsed as datetime |
|
|
83
|
+
|
|
84
|
+
This package enriches each event with:
|
|
85
|
+
|
|
86
|
+
| Added Column | Source |
|
|
87
|
+
|-------------|--------|
|
|
88
|
+
| `centroid_lon`, `centroid_lat` | Decoded from WKB polygons |
|
|
89
|
+
| `country`, `iso_a3` | Spatial join against Natural Earth |
|
|
90
|
+
| `continent` | Natural Earth |
|
|
91
|
+
| `year` | Extracted from `start_date` |
|
|
92
|
+
|
|
93
|
+
## Reporting Bias
|
|
94
|
+
|
|
95
|
+
The dataset shows 498 events in 2000 and 402,012 in 2024. This does not mean floods increased 807x. The data is extracted from news articles, and digital news coverage grew dramatically over this period. Any trend analysis should account for this reporting bias. Use `db.bias_check()` and `db.plot_bias()` to visualize this.
|
|
96
|
+
|
|
97
|
+

|
|
98
|
+
|
|
99
|
+
## Top Countries by Events Detected
|
|
100
|
+
|
|
101
|
+

|
|
102
|
+
|
|
103
|
+
## Dataset
|
|
104
|
+
|
|
105
|
+
- **Source:** [Google Groundsource](https://research.google/blog/introducing-groundsource-turning-news-reports-into-data-with-gemini/)
|
|
106
|
+
- **Download:** [Zenodo](https://zenodo.org/records/18647054) (CC BY 4.0)
|
|
107
|
+
- **Records:** 2,646,302 events across 175 countries, 2000-2026
|
|
108
|
+
- **Method:** Gemini parsed ~5M news articles
|
|
109
|
+
- **Accuracy:** 60% location+timing, 82% practically useful (per Google)
|
|
110
|
+
|
|
111
|
+
## License
|
|
112
|
+
|
|
113
|
+
MIT. The underlying dataset is licensed CC BY 4.0 by Google.
|
|
114
|
+
|
|
115
|
+
## Citation
|
|
116
|
+
|
|
117
|
+
> Google Research. *Groundsource: Turning News Reports into Data with Gemini.* Zenodo, 2026. DOI: [10.5281/zenodo.18647054](https://zenodo.org/records/18647054)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Download and cache management for the Groundsource dataset."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import urllib.request
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Direct-download URL for the raw Groundsource parquet hosted on Zenodo
# (~667MB per the README; fetched once and cached locally).
ZENODO_URL = "https://zenodo.org/records/18647054/files/groundsource_2026.parquet?download=1"
# Filename used for the raw download inside the cache directory.
PARQUET_FILENAME = "groundsource_2026.parquet"
# Filename used for the locally enriched parquet inside the cache directory.
ENRICHED_FILENAME = "groundsource_enriched.parquet"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_cache_dir() -> Path:
    """Return the platform-appropriate cache directory, creating it if missing.

    Windows uses ``%LOCALAPPDATA%``, macOS uses ``~/Library/Caches``, and
    everything else follows the XDG convention (``$XDG_CACHE_HOME`` or
    ``~/.cache``). A ``groundsource`` subdirectory is created on first use.
    """
    home = Path.home()
    if sys.platform == "win32":
        base = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
    elif sys.platform == "darwin":
        base = home / "Library" / "Caches"
    else:
        base = Path(os.environ.get("XDG_CACHE_HOME", home / ".cache"))
    target = base / "groundsource"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_raw_parquet_path() -> Path:
    """Full path of the cached raw Zenodo parquet inside the cache directory."""
    return get_cache_dir().joinpath(PARQUET_FILENAME)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_enriched_parquet_path() -> Path:
    """Full path of the locally enriched parquet inside the cache directory."""
    return get_cache_dir().joinpath(ENRICHED_FILENAME)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def download_parquet(force: bool = False) -> Path:
    """Download the raw Parquet from Zenodo if not already cached.

    Args:
        force: Re-download even when a cached copy already exists.

    Returns:
        Path to the cached raw parquet file.

    The file is first fetched to a sibling ``.part`` temporary path and
    renamed into place only on success. Previously an interrupted or failed
    download left a truncated file at the final path, which every later call
    mistook for a complete cache hit.
    """
    path = get_raw_parquet_path()
    if path.exists() and not force:
        return path

    print(f"Downloading Groundsource dataset ({PARQUET_FILENAME})...")
    print(f"Source: {ZENODO_URL}")
    print(f"Destination: {path}")

    def _progress(block_num, block_size, total_size):
        # urlretrieve reporthook: chunk count, chunk size, and total bytes
        # (total is <= 0 when the server omits Content-Length, in which
        # case no progress line is printed).
        downloaded = block_num * block_size
        if total_size > 0:
            pct = min(100, downloaded * 100 / total_size)
            mb = downloaded / (1024 * 1024)
            total_mb = total_size / (1024 * 1024)
            sys.stdout.write(f"\r {mb:.0f}/{total_mb:.0f} MB ({pct:.1f}%)")
            sys.stdout.flush()

    # Download to a temp file next to the target, then atomically move it
    # into place so a crash mid-transfer never poisons the cache.
    tmp_path = path.with_suffix(path.suffix + ".part")
    try:
        urllib.request.urlretrieve(ZENODO_URL, str(tmp_path), reporthook=_progress)
    except BaseException:
        # Remove the partial file so the next attempt starts clean.
        tmp_path.unlink(missing_ok=True)
        raise
    tmp_path.replace(path)
    print("\n Download complete.")
    return path
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_from_local(path: str) -> Path:
    """Use a local Parquet file instead of downloading.

    Raises FileNotFoundError when *path* does not exist on disk.
    """
    candidate = Path(path)
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Local parquet not found: {path}")
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Chart generators for Groundsource analysis. LinkedIn-worthy matplotlib charts."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
import matplotlib.ticker as ticker
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Consistent style for all charts: matplotlib rcParams that _apply_style()
# pushes into plt.rcParams before each figure is drawn.
STYLE = {
    "figure.facecolor": "white",   # white canvas around the axes
    "axes.facecolor": "#f8f9fa",   # light-grey plotting area
    "axes.grid": True,             # grid lines on by default
    "grid.alpha": 0.3,             # keep the grid faint
    "font.family": "sans-serif",
    "font.size": 11,
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _apply_style():
    """Push every shared STYLE setting into matplotlib's global rcParams."""
    for option, value in STYLE.items():
        plt.rcParams[option] = value
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def plot_hockey_stick(yearly_counts: pd.DataFrame, save_path: str = None) -> plt.Figure:
    """Chart 1: Total events per year — the 807x hockey stick.

    yearly_counts: DataFrame with columns [year, count]
        (assumed sorted ascending by year — the first row is treated as the
        earliest year; TODO confirm against the caller)
    save_path: optional file path; when given, the figure is also saved.
    Returns the matplotlib Figure.
    """
    _apply_style()
    fig, ax = plt.subplots(figsize=(12, 6))

    ax.bar(yearly_counts["year"], yearly_counts["count"],
           color="#1a73e8", alpha=0.85, width=0.8)

    # Annotate the growth: earliest row vs the peak year, with the ratio
    # between them. Guard against a zero first-year count.
    first_year = yearly_counts.iloc[0]
    peak_year = yearly_counts.loc[yearly_counts["count"].idxmax()]
    growth = peak_year["count"] / first_year["count"] if first_year["count"] > 0 else 0

    # Arrow from a label near mid-height down to the (tiny) first-year bar.
    ax.annotate(
        f'{first_year["count"]:,.0f} events',
        xy=(first_year["year"], first_year["count"]),
        xytext=(first_year["year"] + 3, peak_year["count"] * 0.3),
        fontsize=10, color="#666",
        arrowprops=dict(arrowstyle="->", color="#999"),
    )
    # Bold red callout on the peak, placed left of and slightly above it.
    ax.annotate(
        f'{peak_year["count"]:,.0f} events\n({growth:,.0f}x growth)',
        xy=(peak_year["year"], peak_year["count"]),
        xytext=(peak_year["year"] - 6, peak_year["count"] * 1.05),
        fontsize=10, fontweight="bold", color="#d93025",
        arrowprops=dict(arrowstyle="->", color="#d93025"),
    )

    ax.set_title("Flash Flood Events Detected Per Year\n— or is it news coverage?",
                 fontsize=16, fontweight="bold", pad=15)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Events Detected", fontsize=12)
    # Thousands separators on the y axis.
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:,.0f}"))

    # Small source attribution in the top-left corner (axes coordinates).
    ax.text(0.02, 0.95,
            "Source: Google Groundsource dataset (2.6M events from news articles, 2000–2026)",
            transform=ax.transAxes, fontsize=8, color="#999", va="top")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def plot_bias_normalized(yearly_counts: pd.DataFrame, save_path: str = None) -> plt.Figure:
    """Chart 2: Overlay event growth vs estimated digital news growth.

    Uses a simple proxy for global digital news volume:
    internet users ~400M (2000) → ~5.5B (2025) ≈ 13.75x. Online news output
    grew even faster due to digital-native outlets, so this is conservative.

    Args:
        yearly_counts: DataFrame with columns [year, count], one row per year.
        save_path: optional file path; when given, the figure is also saved.

    Returns:
        The matplotlib Figure.

    Raises:
        ValueError: if ``yearly_counts`` is empty.
    """
    _apply_style()
    fig, ax = plt.subplots(figsize=(12, 6))

    years = yearly_counts["year"].values
    counts = yearly_counts["count"].values
    if len(years) == 0:
        raise ValueError("yearly_counts is empty; nothing to plot")

    # Normalize both curves to 2007 (when the dataset has enough events to
    # be meaningful). If 2007 is absent, fall back to the 8th year — clamped
    # to the last index so short inputs cannot raise (the previous
    # hard-coded fallback index 7 crashed on fewer than 8 rows).
    ref = np.where(years == 2007)[0]
    ref_idx = int(ref[0]) if len(ref) > 0 else min(7, len(years) - 1)

    norm_events = counts / counts[ref_idx]

    # Conservative proxy: internet users grew ~14x from 2000 to 2025.
    # Online news articles grew faster; years outside the table use the
    # 2026 plateau value (5.6B).
    internet_users_billions = {
        2000: 0.41, 2001: 0.50, 2002: 0.63, 2003: 0.72, 2004: 0.82,
        2005: 1.02, 2006: 1.15, 2007: 1.37, 2008: 1.57, 2009: 1.77,
        2010: 2.02, 2011: 2.23, 2012: 2.49, 2013: 2.73, 2014: 2.96,
        2015: 3.19, 2016: 3.42, 2017: 3.65, 2018: 3.90, 2019: 4.13,
        2020: 4.59, 2021: 4.90, 2022: 5.16, 2023: 5.35, 2024: 5.52,
        2025: 5.56, 2026: 5.60,
    }
    news_proxy = np.array([internet_users_billions.get(y, 5.6) for y in years])
    norm_news = news_proxy / news_proxy[ref_idx]

    ax.plot(years, norm_events, "o-", color="#d93025", linewidth=2.5,
            markersize=5, label="Detected flood events", zorder=3)
    ax.plot(years, norm_news, "s--", color="#1a73e8", linewidth=2,
            markersize=4, label="Internet users (proxy for digital news)", zorder=2)

    # Shade the gap where event growth outpaces the news proxy.
    ax.fill_between(years, norm_events, norm_news,
                    where=(norm_events > norm_news),
                    alpha=0.1, color="#d93025")

    ax.set_title("Are Floods Increasing — or Is News Coverage?\nBoth curves normalized to 2007 = 1.0",
                 fontsize=15, fontweight="bold", pad=15)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Growth relative to 2007", fontsize=12)
    ax.legend(fontsize=11, loc="upper left")
    # Log scale: both curves span orders of magnitude.
    ax.set_yscale("log")
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:.0f}x"))

    ax.text(0.02, 0.02,
            "Event detection grows faster than internet adoption — likely because\n"
            "news digitization (articles going online) grew faster than raw user count.",
            transform=ax.transAxes, fontsize=9, color="#666", va="bottom",
            style="italic")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def plot_country_growth(growth_df: pd.DataFrame, n: int = 20,
                        save_path: str = None) -> plt.Figure:
    """Chart 3: Countries with highest growth factor — what might be real.

    growth_df: from trends.country_growth_ranking()
        (assumed sorted descending by growth_factor so head(n) is the top n;
        TODO confirm against the producer)
    n: number of countries to display.
    save_path: optional file path; when given, the figure is also saved.
    Returns the matplotlib Figure.
    """
    _apply_style()
    top = growth_df.head(n).copy()
    top = top.sort_values("growth_factor", ascending=True)  # horizontal bar, ascending

    fig, ax = plt.subplots(figsize=(10, 8))

    # Traffic-light colors: red for >50x growth, orange for >20x, blue otherwise.
    colors = ["#d93025" if g > 50 else "#ea8600" if g > 20 else "#1a73e8"
              for g in top["growth_factor"]]

    ax.barh(top["country"], top["growth_factor"], color=colors, alpha=0.85)

    # Bold value label just past the end of each bar.
    for i, (_, row) in enumerate(top.iterrows()):
        ax.text(row["growth_factor"] + 0.5, i,
                f'{row["growth_factor"]:.0f}x',
                va="center", fontsize=9, fontweight="bold")

    ax.set_title("Which Countries Show the Fastest Growth in Detected Flood Events?\n"
                 "Growth factor: avg events/year (2018–2025) vs (2005–2012)",
                 fontsize=14, fontweight="bold", pad=15)
    ax.set_xlabel("Growth Factor (higher = faster acceleration)", fontsize=11)

    # Caveat footnote in the bottom-right corner (axes coordinates).
    ax.text(0.98, 0.02,
            "⚠ High growth may reflect news digitization, not actual flood increase.\n"
            "Countries with low baseline coverage will show inflated growth.",
            transform=ax.transAxes, fontsize=8, color="#999",
            ha="right", va="bottom", style="italic")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def plot_top_countries(country_counts: pd.DataFrame, n: int = 20,
                       save_path: str = None) -> plt.Figure:
    """Chart 4: Top N countries by total event count — the LinkedIn chart.

    Args:
        country_counts: DataFrame with columns [country, count]
            (assumed sorted descending by count so head(n) is the top n;
            TODO confirm against the producer).
        n: number of countries to display.
        save_path: optional file path; when given, the figure is also saved.

    Returns:
        The matplotlib Figure.
    """
    _apply_style()
    top = country_counts.head(n).copy()
    # barh draws bottom-up, so sort ascending to put the largest bar on top.
    top = top.sort_values("count", ascending=True)

    fig, ax = plt.subplots(figsize=(10, 8))

    ax.barh(top["country"], top["count"], color="#1a73e8", alpha=0.85)

    # Value label just past the end of each bar. NOTE(review): the +500
    # offset assumes counts in the thousands — verify for small datasets.
    for i, (_, row) in enumerate(top.iterrows()):
        ax.text(row["count"] + 500, i,
                f'{row["count"]:,.0f}',
                va="center", fontsize=9)

    # Title reflects the requested n (previously hard-coded to "Top 20",
    # which mislabeled the chart whenever callers passed a different n).
    ax.set_title(f"Top {n} Countries by Total Flash Flood Events Detected (2000–2026)",
                 fontsize=14, fontweight="bold", pad=15)
    ax.set_xlabel("Total Events Detected", fontsize=11)
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:,.0f}"))

    ax.text(0.98, 0.02,
            "Source: Google Groundsource — 2.6M events extracted by Gemini from news articles",
            transform=ax.transAxes, fontsize=8, color="#999", ha="right", va="bottom")

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        print(f" Saved: {save_path}")
    return fig
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Data directory — contains Natural Earth shapefiles bundled with the package.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
UTF-8
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.017453292519943295]]
|
|
Binary file
|
|
Binary file
|