geo_intel_offline-1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geo_intel_offline/__init__.py +15 -0
- geo_intel_offline/api.py +114 -0
- geo_intel_offline/compression.py +238 -0
- geo_intel_offline/confidence.py +89 -0
- geo_intel_offline/data/geohash_index.json.gz +0 -0
- geo_intel_offline/data/metadata.json.gz +0 -0
- geo_intel_offline/data/polygons.json.gz +0 -0
- geo_intel_offline/data_builder.py +528 -0
- geo_intel_offline/data_builder_minimal.py +173 -0
- geo_intel_offline/data_builder_modular.py +474 -0
- geo_intel_offline/data_loader.py +170 -0
- geo_intel_offline/geohash.py +150 -0
- geo_intel_offline/hierarchical_resolver.py +136 -0
- geo_intel_offline/migrate_to_modular.py +159 -0
- geo_intel_offline/modular_data_loader.py +212 -0
- geo_intel_offline/pip.py +150 -0
- geo_intel_offline/polygon_utils.py +104 -0
- geo_intel_offline/resolver.py +306 -0
- geo_intel_offline-1.0.1.dist-info/LICENSE +21 -0
- geo_intel_offline-1.0.1.dist-info/METADATA +784 -0
- geo_intel_offline-1.0.1.dist-info/RECORD +24 -0
- geo_intel_offline-1.0.1.dist-info/WHEEL +5 -0
- geo_intel_offline-1.0.1.dist-info/entry_points.txt +2 -0
- geo_intel_offline-1.0.1.dist-info/top_level.txt +1 -0

geo_intel_offline/geohash.py
@@ -0,0 +1,150 @@
```python
"""
Geohash encoding and decoding for spatial indexing.

Geohash is a geocoding system that encodes latitude/longitude into a string.
We use it to create a spatial index for fast candidate country filtering.

Design Decision: Using precision level 6 (30-bit geohash) as a balance:
- Precision ~1.2km × 0.6km (sufficient for country-level resolution)
- Small index size (~200 countries × few geohashes each)
- Fast encoding/decoding operations
"""

from typing import Tuple

# Base32 alphabet used by geohash
BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"

# Geohash precision for indexing (6 chars = ~1.2km precision)
GEOHASH_PRECISION = 6


def encode(lat: float, lon: float, precision: int = GEOHASH_PRECISION) -> str:
    """
    Encode latitude/longitude to a geohash string.

    Args:
        lat: Latitude (-90 to 90)
        lon: Longitude (-180 to 180)
        precision: Number of geohash characters (default 6)

    Returns:
        Geohash string
    """
    if not (-90 <= lat <= 90):
        raise ValueError(f"Latitude must be between -90 and 90, got {lat}")
    if not (-180 <= lon <= 180):
        raise ValueError(f"Longitude must be between -180 and 180, got {lon}")

    lat_range = (-90.0, 90.0)
    lon_range = (-180.0, 180.0)
    bits_per_char = 5
    geohash = []

    ch = 0

    for i in range(precision * bits_per_char):
        if i % 2 == 0:
            # Longitude bit
            mid = (lon_range[0] + lon_range[1]) / 2
            if lon >= mid:
                # Set the next bit, MSB-first, within the current 5-bit character
                ch |= 1 << (bits_per_char - 1 - (i % bits_per_char))
                lon_range = (mid, lon_range[1])
            else:
                lon_range = (lon_range[0], mid)
        else:
            # Latitude bit
            mid = (lat_range[0] + lat_range[1]) / 2
            if lat >= mid:
                ch |= 1 << (bits_per_char - 1 - (i % bits_per_char))
                lat_range = (mid, lat_range[1])
            else:
                lat_range = (lat_range[0], mid)

        if (i + 1) % bits_per_char == 0:
            geohash.append(BASE32[ch])
            ch = 0

    return ''.join(geohash)


def decode(geohash: str) -> Tuple[float, float, Tuple[float, float], Tuple[float, float]]:
    """
    Decode a geohash string to latitude/longitude with bounding box.

    Args:
        geohash: Geohash string

    Returns:
        Tuple of (lat, lon, lat_range, lon_range)
    """
    if not geohash:
        raise ValueError("Geohash cannot be empty")

    lat_range = (-90.0, 90.0)
    lon_range = (-180.0, 180.0)
    is_even = True

    for char in geohash:
        if char not in BASE32:
            raise ValueError(f"Invalid geohash character: {char}")

        idx = BASE32.index(char)

        for j in range(5):
            bit = (idx >> (4 - j)) & 1
            if is_even:
                mid = (lon_range[0] + lon_range[1]) / 2
                if bit:
                    lon_range = (mid, lon_range[1])
                else:
                    lon_range = (lon_range[0], mid)
            else:
                mid = (lat_range[0] + lat_range[1]) / 2
                if bit:
                    lat_range = (mid, lat_range[1])
                else:
                    lat_range = (lat_range[0], mid)
            is_even = not is_even

    lat = (lat_range[0] + lat_range[1]) / 2
    lon = (lon_range[0] + lon_range[1]) / 2

    return lat, lon, lat_range, lon_range


def get_neighbors(geohash: str) -> list[str]:
    """
    Get the 8 neighboring geohashes (for border cases).

    Design Decision: Check neighbors when point-in-polygon fails.
    This handles edge cases where a point is near geohash boundaries.

    Args:
        geohash: Geohash string

    Returns:
        List of the 8 neighboring geohashes (the original cell is excluded;
        duplicates are possible after clamping at the poles or antimeridian)
    """
    lat, lon, lat_range, lon_range = decode(geohash)

    # Cell height/width at this precision; stepping by one cell reaches a neighbor
    lat_step = lat_range[1] - lat_range[0]
    lon_step = lon_range[1] - lon_range[0]

    neighbors = []
    for dlat in [-lat_step, 0, lat_step]:
        for dlon in [-lon_step, 0, lon_step]:
            if dlat == 0 and dlon == 0:
                continue
            new_lat = lat + dlat
            new_lon = lon + dlon

            # Clamp to valid ranges
            new_lat = max(-90, min(90, new_lat))
            new_lon = max(-180, min(180, new_lon))

            neighbors.append(encode(new_lat, new_lon, len(geohash)))

    return neighbors
```
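
For orientation, a minimal usage sketch of the module above. It assumes the wheel is importable as `geo_intel_offline` (matching the file list); the coordinates are arbitrary example values.

```python
# Minimal sketch, assuming the wheel is installed as geo_intel_offline.
from geo_intel_offline.geohash import GEOHASH_PRECISION, decode, encode, get_neighbors

gh = encode(48.8566, 2.3522)  # 6-character cell containing central Paris

# decode() returns the cell centre plus the cell's lat/lon bounds,
# so re-encoding the centre at the same precision yields the same cell.
lat, lon, lat_range, lon_range = decode(gh)
assert encode(lat, lon, GEOHASH_PRECISION) == gh

# For border cases, a resolver can widen the candidate search to the
# 8 surrounding cells as well as the original one.
cells_to_check = [gh] + get_neighbors(gh)
```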

geo_intel_offline/hierarchical_resolver.py
@@ -0,0 +1,136 @@
```python
"""
Hierarchical resolver for country → state → city resolution.

Extends the base resolver to support multi-level geo-intelligence.
"""

from typing import Optional, Dict, List, Tuple
from .resolver import resolve as resolve_country, ResolutionResult
from .data_loader import get_loader


class HierarchicalResult:
    """Result with country, state/province, and city information."""

    def __init__(
        self,
        country: Optional[str] = None,
        country_iso2: Optional[str] = None,
        country_iso3: Optional[str] = None,
        state: Optional[str] = None,
        state_code: Optional[str] = None,
        city: Optional[str] = None,
        continent: Optional[str] = None,
        timezone: Optional[str] = None,
        confidence: float = 0.0
    ):
        self.country = country
        self.country_iso2 = country_iso2
        self.country_iso3 = country_iso3
        self.state = state
        self.state_code = state_code
        self.city = city
        self.continent = continent
        self.timezone = timezone
        self.confidence = confidence

    def to_dict(self) -> Dict:
        """Convert to dictionary."""
        return {
            "country": self.country,
            "country_iso2": self.country_iso2,
            "country_iso3": self.country_iso3,
            "state": self.state,
            "state_code": self.state_code,
            "city": self.city,
            "continent": self.continent,
            "timezone": self.timezone,
            "confidence": self.confidence
        }


def resolve_hierarchical(
    lat: float,
    lon: float,
    include_states: bool = False,
    include_cities: bool = False,
    data_dir: Optional[str] = None
) -> HierarchicalResult:
    """
    Resolve coordinates hierarchically: country → state → city.

    Args:
        lat: Latitude
        lon: Longitude
        include_states: Whether to resolve state/province (requires state data)
        include_cities: Whether to resolve city (requires city data)
        data_dir: Optional custom data directory

    Returns:
        HierarchicalResult with country, state, and city information
    """
    # First resolve country (always available)
    country_result = resolve_country(lat, lon, data_dir)

    if not country_result.is_valid():
        return HierarchicalResult()

    result = HierarchicalResult(
        country=country_result.country_name,
        country_iso2=country_result.iso2,
        country_iso3=country_result.iso3,
        continent=country_result.continent,
        timezone=country_result.timezone,
        confidence=country_result.confidence
    )

    # Resolve state/province if requested and data available
    if include_states:
        state_info = _resolve_state(lat, lon, country_result.country_id, data_dir)
        if state_info:
            result.state = state_info.get('name')
            result.state_code = state_info.get('code')
            # Adjust confidence (state resolution may be less accurate)
            result.confidence = min(result.confidence, state_info.get('confidence', result.confidence))

    # Resolve city if requested and data available
    if include_cities:
        city_info = _resolve_city(lat, lon, country_result.country_id, data_dir)
        if city_info:
            result.city = city_info.get('name')
            # Adjust confidence
            result.confidence = min(result.confidence, city_info.get('confidence', result.confidence))

    return result


def _resolve_state(
    lat: float,
    lon: float,
    country_id: int,
    data_dir: Optional[str]
) -> Optional[Dict]:
    """
    Resolve state/province within a country.

    Note: Requires state-level data files (states_index.json, states_polygons.json)
    """
    # This would load state data similar to country resolution
    # For now, return None (state data not yet implemented)
    return None


def _resolve_city(
    lat: float,
    lon: float,
    country_id: int,
    data_dir: Optional[str]
) -> Optional[Dict]:
    """
    Resolve city within a country.

    Note: Requires city-level data files (cities_index.json, cities_data.json)
    """
    # This would load city data (points or polygons)
    # For now, return None (city data not yet implemented)
    return None
```
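
A short sketch of calling this resolver. Note that with the stubs above, only the country-level fields come back populated; `state` and `city` stay `None` until state/city data ships.

```python
# Illustrative call, assuming the wheel is installed as geo_intel_offline.
from geo_intel_offline.hierarchical_resolver import resolve_hierarchical

res = resolve_hierarchical(51.5074, -0.1278, include_states=True, include_cities=True)
print(res.to_dict())
# Country name, ISO codes, continent, timezone and confidence are filled
# in by the base resolver; "state" and "city" remain None for now because
# _resolve_state/_resolve_city above are unimplemented stubs.
```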

geo_intel_offline/migrate_to_modular.py
@@ -0,0 +1,159 @@
```python
"""
Migration tool: Convert monolithic data format to modular format.

This tool reads existing monolithic data files and converts them to
the new modular country-wise format.
"""

import json
import sys
from pathlib import Path
from typing import Dict


def migrate_monolithic_to_modular(
    monolithic_dir: Path,
    output_dir: Path
) -> Dict:
    """
    Migrate monolithic data files to modular format.

    Args:
        monolithic_dir: Directory with monolithic files (geohash_index.json, etc.)
        output_dir: Output directory for modular format

    Returns:
        Dict with migration statistics
    """
    # Load monolithic files
    index_file = monolithic_dir / "geohash_index.json"
    polygons_file = monolithic_dir / "polygons.json"
    metadata_file = monolithic_dir / "metadata.json"

    if not all(f.exists() for f in [index_file, polygons_file, metadata_file]):
        raise FileNotFoundError(
            f"Monolithic data files not found in {monolithic_dir}. "
            "Expected: geohash_index.json, polygons.json, metadata.json"
        )

    with open(index_file, 'r') as f:
        geohash_index = json.load(f)

    with open(polygons_file, 'r') as f:
        polygons = json.load(f)

    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    # Create output structure
    output_dir.mkdir(parents=True, exist_ok=True)
    continents_dir = output_dir / "continents"
    continents_dir.mkdir(exist_ok=True)

    # Organize by country
    master_index = {
        'version': '1.0.0',
        'countries': {},
        'continents': {}
    }

    continent_countries = {}

    # Process each country
    for country_id_str, country_metadata in metadata.items():
        country_id = int(country_id_str)
        iso2 = country_metadata.get('iso2', '').upper()
        continent_raw = country_metadata.get('continent', 'unknown')
        continent = continent_raw.lower().replace(' ', '_')

        if not iso2:
            # Skip countries without ISO2 code
            continue

        # Get polygon for this country
        country_polygon = polygons.get(country_id_str)
        if not country_polygon:
            continue

        # Extract geohashes for this country
        country_geohashes = {}
        for geohash, country_ids in geohash_index.items():
            if country_id in (country_ids if isinstance(country_ids, list) else [country_ids]):
                country_geohashes[geohash] = [country_id]

        # Create country data structure
        country_data = {
            'country_id': country_id,
            'metadata': country_metadata,
            'geohashes': country_geohashes,
            'polygon': country_polygon
        }

        # Create continent directory
        continent_dir = continents_dir / continent
        continent_dir.mkdir(exist_ok=True)

        # Save country file
        country_file = continent_dir / f"{iso2}.json"
        with open(country_file, 'w', encoding='utf-8') as f:
            json.dump(country_data, f, separators=(',', ':'))

        # Update master index
        relative_path = f"continents/{continent}/{iso2}.json"
        master_index['countries'][iso2] = {
            'id': country_id,
            'name': country_metadata.get('name', ''),
            'iso2': iso2,
            'iso3': country_metadata.get('iso3', ''),
            'continent': continent_raw,
            'file': relative_path,
            'size_bytes': country_file.stat().st_size
        }

        # Track continent membership
        if continent not in continent_countries:
            continent_countries[continent] = []
        continent_countries[continent].append(iso2)

    # Update continent index
    master_index['continents'] = continent_countries

    # Save master index
    index_output = output_dir / "index.json"
    with open(index_output, 'w', encoding='utf-8') as f:
        json.dump(master_index, f, indent=2, ensure_ascii=False)

    return {
        'countries_migrated': len(master_index['countries']),
        'continents': list(continent_countries.keys())
    }


def main():
    """CLI entry point."""
    if len(sys.argv) < 3:
        print("Usage: python -m geo_intel_offline.migrate_to_modular <monolithic_dir> <output_dir>")
        print("\nExample:")
        print("  python -m geo_intel_offline.migrate_to_modular geo_intel_offline/data geo_intel_offline/data_modular")
        sys.exit(1)

    monolithic_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2])

    print(f"Migrating monolithic data from {monolithic_dir} to {output_dir}...")

    try:
        stats = migrate_monolithic_to_modular(monolithic_dir, output_dir)
        print("\n✓ Migration complete!")
        print(f"  Countries migrated: {stats['countries_migrated']}")
        print(f"  Continents: {', '.join(stats['continents'])}")
        print(f"\nModular data saved to: {output_dir}")
    except Exception as e:
        print(f"\n✗ Migration failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()
```
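
The migration can also be driven programmatically. The sketch below shows the call and the on-disk layout it produces (paths follow the code above; note the function reads the uncompressed `*.json` files, not the `*.json.gz` variants shipped in the wheel's data directory, and the directory arguments are examples):

```python
# Programmatic migration sketch; directory arguments are example values.
from pathlib import Path
from geo_intel_offline.migrate_to_modular import migrate_monolithic_to_modular

stats = migrate_monolithic_to_modular(
    Path("geo_intel_offline/data"),          # needs geohash_index.json, polygons.json, metadata.json
    Path("geo_intel_offline/data_modular"),
)
print(stats)  # {'countries_migrated': <n>, 'continents': ['europe', 'asia', ...]}

# Resulting layout (one compact file per country, keyed by ISO2):
#   data_modular/
#     index.json                    master index: version, countries, continents
#     continents/europe/FR.json     {'country_id', 'metadata', 'geohashes', 'polygon'}
#     continents/asia/JP.json
```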

geo_intel_offline/modular_data_loader.py
@@ -0,0 +1,212 @@
```python
"""
Modular data loader - supports selective country/continent loading.

Features:
- Load specific countries or continents
- Lazy loading of country files
- Efficient memory usage
- Backward compatible with monolithic format
- Supports gzip-compressed data files
"""

import json
import gzip
from pathlib import Path
from typing import Dict, List, Optional
from .data_loader import DataLoader as MonolithicLoader


class ModularDataLoader:
    """
    Modular data loader that supports selective loading.

    Can load:
    - All countries (backward compatible)
    - Specific countries (by ISO2 codes)
    - Specific continents
    - All except excluded countries
    """

    def __init__(
        self,
        data_dir: Optional[str] = None,
        countries: Optional[List[str]] = None,
        continents: Optional[List[str]] = None,
        exclude_countries: Optional[List[str]] = None
    ):
        """
        Initialize modular data loader.

        Args:
            data_dir: Data directory (defaults to package data)
            countries: List of ISO2 codes to load (None = all)
            continents: List of continent names to load (None = all)
            exclude_countries: List of ISO2 codes to exclude
        """
        if data_dir is None:
            package_dir = Path(__file__).parent
            data_dir = package_dir / "data"

        self.data_dir = Path(data_dir)
        self.countries_filter = set(c.upper() for c in (countries or []))
        self.continents_filter = set(c.lower().replace(" ", "_") for c in (continents or [])) if continents else None
        self.exclude_filter = set(c.upper() for c in (exclude_countries or []))

        # Check if modular format exists (plain or gzip-compressed index)
        self.index_path = self.data_dir / "index.json"
        index_gzip_path = self.index_path.with_suffix(self.index_path.suffix + '.gz')
        self.is_modular = self.index_path.exists() or index_gzip_path.exists()

        if not self.is_modular:
            # Fallback to monolithic loader
            self.monolithic_loader = MonolithicLoader(data_dir)
            return

        # Load master index (supports gzip compression)
        if index_gzip_path.exists():
            with gzip.open(index_gzip_path, 'rt', encoding='utf-8') as f:
                self.index = json.load(f)
        else:
            with open(self.index_path, 'r', encoding='utf-8') as f:
                self.index = json.load(f)

        # Determine which countries to load
        self._determine_countries_to_load()

        # Cache for loaded country data
        self._loaded_countries: Dict[str, Dict] = {}
        self._geohash_index: Dict[str, List[int]] = {}
        self._polygons: Dict[int, Dict] = {}
        self._metadata: Dict[int, Dict] = {}

    def _determine_countries_to_load(self):
        """Determine which countries should be loaded based on filters."""
        available_countries = set(self.index['countries'].keys())

        if self.countries_filter:
            # Specific countries requested
            to_load = available_countries & self.countries_filter
        elif self.continents_filter:
            # Specific continents requested
            to_load = set()
            for continent in self.continents_filter:
                if continent in self.index.get('continents', {}):
                    to_load.update(self.index['continents'][continent])
        else:
            # Load all countries
            to_load = available_countries

        # Apply exclusion filter
        to_load -= self.exclude_filter

        self.countries_to_load = to_load

    def _load_country(self, iso2: str) -> Optional[Dict]:
        """Load a single country file."""
        if iso2 in self._loaded_countries:
            return self._loaded_countries[iso2]

        if iso2 not in self.index['countries']:
            return None

        country_info = self.index['countries'][iso2]
        country_file = self.data_dir / country_info['file']

        # Try compressed version first, fallback to uncompressed
        country_gzip_file = country_file.with_suffix(country_file.suffix + '.gz')
        if country_gzip_file.exists():
            with gzip.open(country_gzip_file, 'rt', encoding='utf-8') as f:
                country_data = json.load(f)
        elif country_file.exists():
            with open(country_file, 'r', encoding='utf-8') as f:
                country_data = json.load(f)
        else:
            return None

        self._loaded_countries[iso2] = country_data

        # Update caches
        country_id = country_data['country_id']

        # Add to geohash index
        for geohash, ids in country_data['geohashes'].items():
            if geohash not in self._geohash_index:
                self._geohash_index[geohash] = []
            if country_id not in self._geohash_index[geohash]:
                self._geohash_index[geohash].append(country_id)

        # Add to polygons and metadata
        self._polygons[country_id] = country_data['polygon']
        self._metadata[country_id] = country_data['metadata']

        return country_data

    def _load_all_countries(self):
        """Load all countries that match filters."""
        for iso2 in self.countries_to_load:
            self._load_country(iso2)

    @property
    def geohash_index(self) -> Dict[str, List[int]]:
        """Get geohash index (lazy-loaded)."""
        if not self.is_modular:
            return self.monolithic_loader.geohash_index

        if not self._geohash_index:
            self._load_all_countries()
        return self._geohash_index

    @property
    def polygons(self) -> Dict[int, Dict]:
        """Get polygons (lazy-loaded)."""
        if not self.is_modular:
            return self.monolithic_loader.polygons

        if not self._polygons:
            self._load_all_countries()
        return self._polygons

    @property
    def metadata(self) -> Dict[int, Dict]:
        """Get metadata (lazy-loaded)."""
        if not self.is_modular:
            return self.monolithic_loader.metadata

        if not self._metadata:
            self._load_all_countries()
        return self._metadata

    def get_candidate_countries(self, geohash: str) -> List[int]:
        """Get candidate country IDs for a geohash."""
        index = self.geohash_index

        # Try full geohash first
        candidates = index.get(geohash, [])

        # If no exact match, try prefixes
        if not candidates:
            for prefix_len in range(len(geohash), 0, -1):
                prefix = geohash[:prefix_len]
                if prefix in index:
                    candidates.extend(index[prefix])
                    break

        return list(set(candidates))

    def get_polygon(self, country_id: int) -> Optional[Dict]:
        """Get polygon for a country."""
        return self.polygons.get(country_id)

    def get_metadata(self, country_id: int) -> Optional[Dict]:
        """Get metadata for a country."""
        return self.metadata.get(country_id)

    def get_loaded_countries(self) -> List[str]:
        """Get list of loaded country ISO2 codes."""
        if not self.is_modular:
            return list(self.monolithic_loader.metadata.keys())
        return list(self.countries_to_load)

    def get_loaded_count(self) -> int:
        """Get count of loaded countries."""
        return len(self.countries_to_load) if self.is_modular else len(self.monolithic_loader.metadata)
```
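
To close, a sketch of the selective-loading modes the constructor exposes. The data directory and the geohash string below are illustrative values, not shipped defaults.

```python
# Selective loading sketch; "geo_intel_offline/data_modular" and the geohash
# "u09tvw" are hypothetical example values.
from geo_intel_offline.modular_data_loader import ModularDataLoader

# Two specific countries; nothing is read from disk until a property is accessed.
loader = ModularDataLoader(data_dir="geo_intel_offline/data_modular",
                           countries=["FR", "DE"])

# One continent minus an exclusion; "Europe" is normalized to "europe" internally.
europe = ModularDataLoader(data_dir="geo_intel_offline/data_modular",
                           continents=["Europe"], exclude_countries=["RU"])
print(europe.get_loaded_count())  # number of countries selected by the filters

candidate_ids = loader.get_candidate_countries("u09tvw")
polygon = loader.get_polygon(candidate_ids[0]) if candidate_ids else None
```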