geo-intel-offline 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ """
2
+ Geohash encoding and decoding for spatial indexing.
3
+
4
+ Geohash is a geocoding system that encodes latitude/longitude into a string.
5
+ We use it to create a spatial index for fast candidate country filtering.
6
+
7
+ Design Decision: Using precision level 6 (32-bit geohash) as a balance:
8
+ - Precision ~1.2km × 0.6km (sufficient for country-level resolution)
9
+ - Small index size (~200 countries × few geohashes each)
10
+ - Fast encoding/decoding operations
11
+ """
12
+
13
+ from typing import Tuple
14
+
15
# Base32 alphabet used by geohash (digits plus lowercase letters,
# excluding a, i, l, o to avoid visual ambiguity)
BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"

# Geohash precision for indexing (6 chars = ~1.2km precision)
GEOHASH_PRECISION = 6


def encode(lat: float, lon: float, precision: int = GEOHASH_PRECISION) -> str:
    """
    Encode latitude/longitude to geohash string.

    Args:
        lat: Latitude (-90 to 90)
        lon: Longitude (-180 to 180)
        precision: Number of geohash characters (default 6)

    Returns:
        Geohash string

    Raises:
        ValueError: If lat/lon fall outside their valid ranges.
    """
    if not (-90 <= lat <= 90):
        raise ValueError(f"Latitude must be between -90 and 90, got {lat}")
    if not (-180 <= lon <= 180):
        raise ValueError(f"Longitude must be between -180 and 180, got {lon}")

    lat_range = (-90.0, 90.0)
    lon_range = (-180.0, 180.0)
    bits_per_char = 5
    geohash = []
    ch = 0

    # Geohash interleaves longitude and latitude bits: even bit indices
    # refine longitude, odd ones latitude.  Every 5 bits form one base32
    # character, most significant bit first.
    for i in range(precision * bits_per_char):
        if i % 2 == 0:
            # Longitude bit
            mid = (lon_range[0] + lon_range[1]) / 2
            if lon >= mid:
                # BUGFIX: the bit slot is the bit's index within the current
                # character, i % 5.  The previous (i // 2) % 5 assigned the
                # same slot to consecutive lon/lat bits, producing geohashes
                # that did not match the standard and did not round-trip
                # through decode().
                ch |= (1 << (bits_per_char - 1 - i % bits_per_char))
                lon_range = (mid, lon_range[1])
            else:
                lon_range = (lon_range[0], mid)
        else:
            # Latitude bit
            mid = (lat_range[0] + lat_range[1]) / 2
            if lat >= mid:
                ch |= (1 << (bits_per_char - 1 - i % bits_per_char))
                lat_range = (mid, lat_range[1])
            else:
                lat_range = (lat_range[0], mid)

        # A full 5-bit character has been accumulated: emit it and reset.
        if (i + 1) % bits_per_char == 0:
            geohash.append(BASE32[ch])
            ch = 0

    return ''.join(geohash)
70
+
71
+
72
def decode(geohash: str) -> Tuple[float, float, Tuple[float, float], Tuple[float, float]]:
    """
    Decode a geohash string to its center point and bounding box.

    Args:
        geohash: Geohash string

    Returns:
        Tuple of (lat, lon, lat_range, lon_range), where lat/lon is the
        center of the cell and each range is its (min, max) bounds.

    Raises:
        ValueError: If the string is empty or contains a non-base32 character.
    """
    if not geohash:
        raise ValueError("Geohash cannot be empty")

    lat_lo, lat_hi = -90.0, 90.0
    lon_lo, lon_hi = -180.0, 180.0
    # Bits alternate between longitude (even positions) and latitude (odd),
    # continuing across character boundaries.
    refine_lon = True

    for char in geohash:
        value = BASE32.find(char)
        if value < 0:
            raise ValueError(f"Invalid geohash character: {char}")

        # Consume the character's 5 bits, most significant first.
        for shift in (4, 3, 2, 1, 0):
            bit_set = (value >> shift) & 1
            if refine_lon:
                if bit_set:
                    lon_lo = (lon_lo + lon_hi) / 2
                else:
                    lon_hi = (lon_lo + lon_hi) / 2
            else:
                if bit_set:
                    lat_lo = (lat_lo + lat_hi) / 2
                else:
                    lat_hi = (lat_lo + lat_hi) / 2
            refine_lon = not refine_lon

    return (
        (lat_lo + lat_hi) / 2,
        (lon_lo + lon_hi) / 2,
        (lat_lo, lat_hi),
        (lon_lo, lon_hi),
    )
115
+
116
+
117
def get_neighbors(geohash: str) -> list[str]:
    """
    Get the 8 neighboring geohashes (for border cases).

    Design Decision: Check neighbors when point-in-polygon fails.
    This handles edge cases where a point is near geohash boundaries.

    Args:
        geohash: Geohash string

    Returns:
        List of the 8 neighboring geohashes at the same precision.
        The original geohash itself is NOT included.  Near the poles,
        clamped neighbors may repeat.
    """
    lat, lon, lat_range, lon_range = decode(geohash)

    # Cell dimensions at this precision, derived from the decoded bounding box.
    lat_step = lat_range[1] - lat_range[0]
    lon_step = lon_range[1] - lon_range[0]

    neighbors = []
    for dlat in (-lat_step, 0, lat_step):
        for dlon in (-lon_step, 0, lon_step):
            if dlat == 0 and dlon == 0:
                continue  # skip the cell itself
            new_lat = lat + dlat
            new_lon = lon + dlon

            # Latitude is clamped: there is no cell beyond the poles.
            new_lat = max(-90, min(90, new_lat))
            # BUGFIX: longitude wraps across the antimeridian instead of
            # clamping to ±180; clamping made east/west neighbors of cells
            # near the date line collapse onto the original cell instead of
            # reaching the adjacent cell on the other side.
            new_lon = ((new_lon + 180.0) % 360.0) - 180.0

            neighbors.append(encode(new_lat, new_lon, len(geohash)))

    return neighbors
@@ -0,0 +1,136 @@
1
+ """
2
+ Hierarchical resolver for country → state → city resolution.
3
+
4
+ Extends the base resolver to support multi-level geo-intelligence.
5
+ """
6
+
7
+ from typing import Optional, Dict, List, Tuple
8
+ from .resolver import resolve as resolve_country, ResolutionResult
9
+ from .data_loader import get_loader
10
+
11
+
12
class HierarchicalResult:
    """Result with country, state/province, and city information."""

    def __init__(
        self,
        country: Optional[str] = None,
        country_iso2: Optional[str] = None,
        country_iso3: Optional[str] = None,
        state: Optional[str] = None,
        state_code: Optional[str] = None,
        city: Optional[str] = None,
        continent: Optional[str] = None,
        timezone: Optional[str] = None,
        confidence: float = 0.0
    ):
        # Store every field verbatim on the instance; unknown levels stay None.
        for name, value in (
            ("country", country),
            ("country_iso2", country_iso2),
            ("country_iso3", country_iso3),
            ("state", state),
            ("state_code", state_code),
            ("city", city),
            ("continent", continent),
            ("timezone", timezone),
            ("confidence", confidence),
        ):
            setattr(self, name, value)

    def to_dict(self) -> Dict:
        """Convert to dictionary (keys in declaration order)."""
        keys = (
            "country",
            "country_iso2",
            "country_iso3",
            "state",
            "state_code",
            "city",
            "continent",
            "timezone",
            "confidence",
        )
        return {key: getattr(self, key) for key in keys}
50
+
51
+
52
def resolve_hierarchical(
    lat: float,
    lon: float,
    include_states: bool = False,
    include_cities: bool = False,
    data_dir: Optional[str] = None
) -> HierarchicalResult:
    """
    Resolve coordinates hierarchically: country → state → city.

    Args:
        lat: Latitude
        lon: Longitude
        include_states: Whether to resolve state/province (requires state data)
        include_cities: Whether to resolve city (requires city data)
        data_dir: Optional custom data directory

    Returns:
        HierarchicalResult with country, state, and city information.
        An empty HierarchicalResult is returned when country resolution fails.
    """
    # Country resolution is the mandatory first stage.
    base = resolve_country(lat, lon, data_dir)
    if not base.is_valid():
        return HierarchicalResult()

    result = HierarchicalResult(
        country=base.country_name,
        country_iso2=base.iso2,
        country_iso3=base.iso3,
        continent=base.continent,
        timezone=base.timezone,
        confidence=base.confidence,
    )

    # Optional state/province stage.  The overall confidence never rises
    # above the weakest resolved stage.
    if include_states:
        state_info = _resolve_state(lat, lon, base.country_id, data_dir)
        if state_info:
            result.state = state_info.get('name')
            result.state_code = state_info.get('code')
            result.confidence = min(
                result.confidence,
                state_info.get('confidence', result.confidence),
            )

    # Optional city stage, same confidence rule.
    if include_cities:
        city_info = _resolve_city(lat, lon, base.country_id, data_dir)
        if city_info:
            result.city = city_info.get('name')
            result.confidence = min(
                result.confidence,
                city_info.get('confidence', result.confidence),
            )

    return result
105
+
106
+
107
+ def _resolve_state(
108
+ lat: float,
109
+ lon: float,
110
+ country_id: int,
111
+ data_dir: Optional[str]
112
+ ) -> Optional[Dict]:
113
+ """
114
+ Resolve state/province within a country.
115
+
116
+ Note: Requires state-level data files (states_index.json, states_polygons.json)
117
+ """
118
+ # This would load state data similar to country resolution
119
+ # For now, return None (state data not yet implemented)
120
+ return None
121
+
122
+
123
+ def _resolve_city(
124
+ lat: float,
125
+ lon: float,
126
+ country_id: int,
127
+ data_dir: Optional[str]
128
+ ) -> Optional[Dict]:
129
+ """
130
+ Resolve city within a country.
131
+
132
+ Note: Requires city-level data files (cities_index.json, cities_data.json)
133
+ """
134
+ # This would load city data (points or polygons)
135
+ # For now, return None (city data not yet implemented)
136
+ return None
@@ -0,0 +1,159 @@
1
+ """
2
+ Migration tool: Convert monolithic data format to modular format.
3
+
4
+ This tool reads existing monolithic data files and converts them to
5
+ the new modular country-wise format.
6
+ """
7
+
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Dict
12
+
13
+
14
def migrate_monolithic_to_modular(
    monolithic_dir: Path,
    output_dir: Path
) -> Dict:
    """
    Migrate monolithic data files to modular format.

    Args:
        monolithic_dir: Directory with monolithic files (geohash_index.json, etc.)
        output_dir: Output directory for modular format

    Returns:
        Dict with migration statistics: ``countries_migrated`` (int) and
        ``continents`` (list of normalized continent names).

    Raises:
        FileNotFoundError: If any of the three monolithic files is missing.
    """
    index_file = monolithic_dir / "geohash_index.json"
    polygons_file = monolithic_dir / "polygons.json"
    metadata_file = monolithic_dir / "metadata.json"

    if not all(f.exists() for f in (index_file, polygons_file, metadata_file)):
        raise FileNotFoundError(
            f"Monolithic data files not found in {monolithic_dir}. "
            "Expected: geohash_index.json, polygons.json, metadata.json"
        )

    # BUGFIX: read with explicit UTF-8 so decoding does not depend on the
    # platform locale (the writes below already pass encoding='utf-8').
    with open(index_file, 'r', encoding='utf-8') as f:
        geohash_index = json.load(f)

    with open(polygons_file, 'r', encoding='utf-8') as f:
        polygons = json.load(f)

    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Create output structure
    output_dir.mkdir(parents=True, exist_ok=True)
    continents_dir = output_dir / "continents"
    continents_dir.mkdir(exist_ok=True)

    master_index = {
        'version': '1.0.0',
        'countries': {},
        'continents': {}
    }

    # Maps normalized continent name -> list of ISO2 codes it contains.
    continent_countries = {}

    # Process each country
    for country_id_str, country_metadata in metadata.items():
        country_id = int(country_id_str)
        iso2 = country_metadata.get('iso2', '').upper()
        continent_raw = country_metadata.get('continent', 'unknown')
        # Normalized form is used for directory and index-key names.
        continent = continent_raw.lower().replace(' ', '_')

        if not iso2:
            # Countries without an ISO2 code cannot be addressed in the
            # modular layout (files are keyed by ISO2), so skip them.
            continue

        # Get polygon for this country
        country_polygon = polygons.get(country_id_str)
        if not country_polygon:
            continue

        # Collect only the geohash cells referencing this country; index
        # values may be a single id or a list of ids.
        country_geohashes = {}
        for geohash, country_ids in geohash_index.items():
            ids = country_ids if isinstance(country_ids, list) else [country_ids]
            if country_id in ids:
                country_geohashes[geohash] = [country_id]

        # Create country data structure
        country_data = {
            'country_id': country_id,
            'metadata': country_metadata,
            'geohashes': country_geohashes,
            'polygon': country_polygon
        }

        # Create continent directory
        continent_dir = continents_dir / continent
        continent_dir.mkdir(exist_ok=True)

        # Save country file (compact separators keep files small)
        country_file = continent_dir / f"{iso2}.json"
        with open(country_file, 'w', encoding='utf-8') as f:
            json.dump(country_data, f, separators=(',', ':'))

        # Update master index
        relative_path = f"continents/{continent}/{iso2}.json"
        master_index['countries'][iso2] = {
            'id': country_id,
            'name': country_metadata.get('name', ''),
            'iso2': iso2,
            'iso3': country_metadata.get('iso3', ''),
            'continent': continent_raw,
            'file': relative_path,
            'size_bytes': country_file.stat().st_size
        }

        # Track continent membership
        continent_countries.setdefault(continent, []).append(iso2)

    # Update continent index
    master_index['continents'] = continent_countries

    # Save master index
    index_output = output_dir / "index.json"
    with open(index_output, 'w', encoding='utf-8') as f:
        json.dump(master_index, f, indent=2, ensure_ascii=False)

    return {
        'countries_migrated': len(master_index['countries']),
        'continents': list(continent_countries.keys())
    }
130
+
131
+
132
def main():
    """CLI entry point: migrate <monolithic_dir> to <output_dir>."""
    args = sys.argv[1:]
    if len(args) < 2:
        # Not enough arguments: show usage and bail out.
        print("Usage: python -m geo_intel_offline.migrate_to_modular <monolithic_dir> <output_dir>")
        print("\nExample:")
        print("  python -m geo_intel_offline.migrate_to_modular geo_intel_offline/data geo_intel_offline/data_modular")
        sys.exit(1)

    monolithic_dir = Path(args[0])
    output_dir = Path(args[1])

    print(f"Migrating monolithic data from {monolithic_dir} to {output_dir}...")

    try:
        stats = migrate_monolithic_to_modular(monolithic_dir, output_dir)
        print(f"\n✓ Migration complete!")
        print(f"  Countries migrated: {stats['countries_migrated']}")
        print(f"  Continents: {', '.join(stats['continents'])}")
        print(f"\nModular data saved to: {output_dir}")
    except Exception as e:
        # Top-level CLI boundary: report, dump the traceback, exit non-zero.
        print(f"\n✗ Migration failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()
@@ -0,0 +1,212 @@
1
+ """
2
+ Modular data loader - supports selective country/continent loading.
3
+
4
+ Features:
5
+ - Load specific countries or continents
6
+ - Lazy loading of country files
7
+ - Efficient memory usage
8
+ - Backward compatible with monolithic format
9
+ - Supports gzip-compressed data files
10
+ """
11
+
12
+ import json
13
+ import gzip
14
+ from pathlib import Path
15
+ from typing import Dict, List, Optional
16
+ from .data_loader import DataLoader as MonolithicLoader
17
+
18
+
19
class ModularDataLoader:
    """
    Modular data loader that supports selective loading.

    Can load:
    - All countries (backward compatible)
    - Specific countries (by ISO2 codes)
    - Specific continents
    - All except excluded countries
    """

    def __init__(
        self,
        data_dir: Optional[str] = None,
        countries: Optional[List[str]] = None,
        continents: Optional[List[str]] = None,
        exclude_countries: Optional[List[str]] = None
    ):
        """
        Initialize modular data loader.

        Args:
            data_dir: Data directory (defaults to package data)
            countries: List of ISO2 codes to load (None = all)
            continents: List of continent names to load (None = all)
            exclude_countries: List of ISO2 codes to exclude
        """
        if data_dir is None:
            package_dir = Path(__file__).parent
            data_dir = package_dir / "data"

        self.data_dir = Path(data_dir)
        # Normalize filters: ISO2 codes upper-case; continent names
        # lower-case with underscores (matching the index directory layout).
        self.countries_filter = set(c.upper() for c in (countries or []))
        self.continents_filter = set(c.lower().replace(" ", "_") for c in (continents or [])) if continents else None
        self.exclude_filter = set(c.upper() for c in (exclude_countries or []))

        # Detect the modular format: a master index, plain or gzipped.
        self.index_path = self.data_dir / "index.json"
        index_gzip_path = self.index_path.with_suffix(self.index_path.suffix + '.gz')
        # BUGFIX: previously only index.json was checked, so a dataset
        # shipping just index.json.gz was wrongly routed to the monolithic
        # fallback even though gzip loading is supported below.
        self.is_modular = self.index_path.exists() or index_gzip_path.exists()

        if not self.is_modular:
            # Fallback to monolithic loader
            self.monolithic_loader = MonolithicLoader(data_dir)
            return

        # Load master index (supports gzip compression; prefer compressed)
        if index_gzip_path.exists():
            with gzip.open(index_gzip_path, 'rt', encoding='utf-8') as f:
                self.index = json.load(f)
        else:
            with open(self.index_path, 'r', encoding='utf-8') as f:
                self.index = json.load(f)

        # Determine which countries to load
        self._determine_countries_to_load()

        # Caches, filled lazily by _load_country().
        self._loaded_countries: Dict[str, Dict] = {}
        self._geohash_index: Dict[str, List[int]] = {}
        self._polygons: Dict[int, Dict] = {}
        self._metadata: Dict[int, Dict] = {}

    def _determine_countries_to_load(self):
        """Determine which countries should be loaded based on filters."""
        available_countries = set(self.index['countries'].keys())

        if self.countries_filter:
            # Specific countries requested
            to_load = available_countries & self.countries_filter
        elif self.continents_filter:
            # Specific continents requested
            to_load = set()
            for continent in self.continents_filter:
                if continent in self.index.get('continents', {}):
                    to_load.update(self.index['continents'][continent])
        else:
            # Load all countries
            to_load = available_countries

        # Exclusions always apply, whatever selected the initial set.
        to_load -= self.exclude_filter

        self.countries_to_load = to_load

    def _load_country(self, iso2: str) -> Optional[Dict]:
        """
        Load a single country file into the caches.

        Prefers the gzip-compressed variant when both exist.  Returns the
        parsed country dict, or None when the country is unknown or its
        data file is missing.
        """
        if iso2 in self._loaded_countries:
            return self._loaded_countries[iso2]

        if iso2 not in self.index['countries']:
            return None

        country_info = self.index['countries'][iso2]
        country_file = self.data_dir / country_info['file']

        # Try compressed version first, fallback to uncompressed
        country_gzip_file = country_file.with_suffix(country_file.suffix + '.gz')
        if country_gzip_file.exists():
            with gzip.open(country_gzip_file, 'rt', encoding='utf-8') as f:
                country_data = json.load(f)
        elif country_file.exists():
            with open(country_file, 'r', encoding='utf-8') as f:
                country_data = json.load(f)
        else:
            return None

        self._loaded_countries[iso2] = country_data

        country_id = country_data['country_id']

        # Merge this country's geohash cells into the combined index.
        for geohash, ids in country_data['geohashes'].items():
            if geohash not in self._geohash_index:
                self._geohash_index[geohash] = []
            if country_id not in self._geohash_index[geohash]:
                self._geohash_index[geohash].append(country_id)

        # Add to polygons and metadata caches
        self._polygons[country_id] = country_data['polygon']
        self._metadata[country_id] = country_data['metadata']

        return country_data

    def _load_all_countries(self):
        """Load all countries that match filters."""
        for iso2 in self.countries_to_load:
            self._load_country(iso2)

    @property
    def geohash_index(self) -> Dict[str, List[int]]:
        """Get geohash index (lazy-loaded)."""
        if not self.is_modular:
            return self.monolithic_loader.geohash_index

        if not self._geohash_index:
            self._load_all_countries()
        return self._geohash_index

    @property
    def polygons(self) -> Dict[int, Dict]:
        """Get polygons (lazy-loaded)."""
        if not self.is_modular:
            return self.monolithic_loader.polygons

        if not self._polygons:
            self._load_all_countries()
        return self._polygons

    @property
    def metadata(self) -> Dict[int, Dict]:
        """Get metadata (lazy-loaded)."""
        if not self.is_modular:
            return self.monolithic_loader.metadata

        if not self._metadata:
            self._load_all_countries()
        return self._metadata

    def get_candidate_countries(self, geohash: str) -> List[int]:
        """Get candidate country IDs for a geohash (deduplicated)."""
        index = self.geohash_index

        # Copy the looked-up list so that extending below can never mutate
        # a list stored inside the index itself.
        candidates = list(index.get(geohash, []))

        # If no exact match, fall back to the longest matching prefix.
        if not candidates:
            for prefix_len in range(len(geohash), 0, -1):
                prefix = geohash[:prefix_len]
                if prefix in index:
                    candidates.extend(index[prefix])
                    break

        return list(set(candidates))

    def get_polygon(self, country_id: int) -> Optional[Dict]:
        """Get polygon for a country."""
        return self.polygons.get(country_id)

    def get_metadata(self, country_id: int) -> Optional[Dict]:
        """Get metadata for a country."""
        return self.metadata.get(country_id)

    def get_loaded_countries(self) -> List[str]:
        """Get list of loaded country ISO2 codes."""
        if not self.is_modular:
            # NOTE(review): monolithic metadata appears to be keyed by
            # numeric country id, so this branch likely returns ids rather
            # than ISO2 codes — confirm against MonolithicLoader's contract.
            return list(self.monolithic_loader.metadata.keys())
        return list(self.countries_to_load)

    def get_loaded_count(self) -> int:
        """Get count of loaded countries."""
        return len(self.countries_to_load) if self.is_modular else len(self.monolithic_loader.metadata)