geo-intel-offline 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,474 @@
+ """
+ Modular data builder - generates country-wise data files.
+
+ Design:
+ - Each country in its own file
+ - Organized by continent directories
+ - Master index file for lookup
+ - Supports selective country/continent building
+ - Uses point-in-polygon (PIP) validation for accurate geohash indexing
+ """
+
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Dict, List, Tuple, Optional, Set
+ from .geohash import encode
+ from .data_builder import simplify_polygon  # Uses improved simplify_polygon with validation
+ from .pip import point_in_polygon_with_holes
+ from .polygon_utils import (
+     calculate_bounding_box,
+     calculate_adaptive_step_size,
+     calculate_safe_iteration_limits,
+     get_polygon_centroid,
+     convert_geojson_coords_to_latlon
+ )
+
+
+ # Continent mapping (normalize various continent name formats)
+ CONTINENT_MAPPING = {
+     "Africa": "africa",
+     "africa": "africa",
+     "AFRICA": "africa",
+
+     "Asia": "asia",
+     "asia": "asia",
+     "ASIA": "asia",
+
+     "Europe": "europe",
+     "europe": "europe",
+     "EUROPE": "europe",
+
+     "North America": "north_america",
+     "north america": "north_america",
+     "NorthAmerica": "north_america",
+     "north_america": "north_america",
+
+     "South America": "south_america",
+     "south america": "south_america",
+     "SouthAmerica": "south_america",
+     "south_america": "south_america",
+
+     "Oceania": "oceania",
+     "oceania": "oceania",
+     "OCEANIA": "oceania",
+     "Australia": "oceania",
+
+     "Antarctica": "antarctica",
+     "antarctica": "antarctica",
+     "ANTARCTICA": "antarctica",
+ }
+
+
+ def normalize_continent(continent: str) -> str:
+     """Normalize continent name to directory format."""
+     return CONTINENT_MAPPING.get(continent, continent.lower().replace(" ", "_"))
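+ # Example (illustrative): normalize_continent("North America") -> "north_america";
+ # names missing from CONTINENT_MAPPING fall back to lowercase with underscores.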
+
+
+ def build_country_geohash_index(
+     polygon_exterior: List[Tuple[float, float]],
+     polygon_holes: List[List[Tuple[float, float]]],
+     country_id: int,
+     geohash_precision: int = 6,
+     validate_with_pip: bool = True
+ ) -> Dict[str, List[int]]:
+     """
+     Build geohash index for a single country with PIP validation.
+
+     Args:
+         polygon_exterior: Exterior polygon coordinates
+         polygon_holes: Interior rings (holes) if any
+         country_id: Country ID
+         geohash_precision: Geohash precision level
+         validate_with_pip: If True, validate points are in polygon before indexing
+
+     Returns:
+         Dict mapping geohash strings to [country_id]
+     """
+     if not polygon_exterior:
+         return {}
+
+     # Calculate bounding box using shared utility
+     min_lat, max_lat, min_lon, max_lon = calculate_bounding_box(polygon_exterior)
+
+     lat_range = max_lat - min_lat
+     lon_range = max_lon - min_lon
+
+     # Calculate adaptive step size using shared utility
+     step = calculate_adaptive_step_size(lat_range, lon_range)
+
+     # For very small polygons, ensure we sample multiple points
+     if lat_range < 0.001 or lon_range < 0.001:
+         # Sample the centroid and the four bounding-box corners
+         sample_points = []
+         sample_points.append(get_polygon_centroid(polygon_exterior))
+         sample_points.append((min_lat, min_lon))
+         sample_points.append((min_lat, max_lon))
+         sample_points.append((max_lat, min_lon))
+         sample_points.append((max_lat, max_lon))
+
+         index = {}
+         for point in sample_points:
+             if validate_with_pip:
+                 if not point_in_polygon_with_holes(point, polygon_exterior, polygon_holes):
+                     continue
+             geohash = encode(point[0], point[1], geohash_precision)
+             if geohash not in index:
+                 index[geohash] = []
+             if country_id not in index[geohash]:
+                 index[geohash].append(country_id)
+         return index
+
+     # Sample grid points across the bounding box, validating each when enabled
+     index = {}
+     geohashes_added = set()
+     lat = min_lat
+
+     while lat <= max_lat:
+         lon = min_lon
+         while lon <= max_lon:
+             point = (lat, lon)
+
+             # Validate point is in polygon if enabled
+             if validate_with_pip:
+                 if not point_in_polygon_with_holes(point, polygon_exterior, polygon_holes):
+                     lon += step
+                     continue
+
+             # Add to index
+             geohash = encode(lat, lon, geohash_precision)
+             geohash_key = (geohash, country_id)
+
+             if geohash_key not in geohashes_added:
+                 if geohash not in index:
+                     index[geohash] = []
+                 if country_id not in index[geohash]:
+                     index[geohash].append(country_id)
+                 geohashes_added.add(geohash_key)
+
+             lon += step
+         lat += step
+
+     # Ensure at least one geohash for very small countries
+     if len(geohashes_added) == 0 and validate_with_pip:
+         centroid_lat, centroid_lon = get_polygon_centroid(polygon_exterior)
+         centroid_point = (centroid_lat, centroid_lon)
+
+         if point_in_polygon_with_holes(centroid_point, polygon_exterior, polygon_holes):
+             geohash = encode(centroid_lat, centroid_lon, geohash_precision)
+             if geohash not in index:
+                 index[geohash] = []
+             if country_id not in index[geohash]:
+                 index[geohash].append(country_id)
+
+     return index
+
+
+ def process_country(
+     feature: Dict,
+     country_id: int,
+     geohash_precision: int = 6,
+     polygon_tolerance: float = 0.005
+ ) -> Optional[Dict]:
+     """
+     Process a single country feature into modular format.
+
+     Returns:
+         Dict with country data or None if invalid
+     """
+     geometry = feature.get('geometry', {})
+     properties = feature.get('properties', {})
+
+     if geometry.get('type') not in ('Polygon', 'MultiPolygon'):
+         return None
+
+     coords = geometry.get('coordinates', [])
+
+     # Extract metadata
+     metadata = {
+         'name': properties.get('NAME', properties.get('name', f'Country {country_id}')),
+         'iso2': properties.get('ISO_A2', properties.get('iso_a2', '')),
+         'iso3': properties.get('ISO_A3', properties.get('iso_a3', '')),
+         'continent': properties.get('CONTINENT', properties.get('continent', '')),
+         'timezone': properties.get('TIMEZONE', properties.get('timezone', ''))
+     }
+
+     # Process polygon
+     if geometry['type'] == 'Polygon':
+         exterior_coords = coords[0] if coords else []
+         hole_coords = coords[1:] if len(coords) > 1 else []
+
+         # Convert GeoJSON [lon, lat] to [lat, lon]
+         exterior = convert_geojson_coords_to_latlon(exterior_coords)
+         holes = [convert_geojson_coords_to_latlon(hole) for hole in hole_coords] if hole_coords else []
+
+         # Simplify with tighter tolerance for better accuracy
+         exterior_simplified = simplify_polygon(exterior, tolerance=polygon_tolerance)
+
+         # Ensure simplified polygon is still valid
+         if len(exterior_simplified) < 3 and len(exterior) >= 3:
+             exterior_simplified = exterior  # Use original if simplification made it invalid
+
+         holes_simplified = []
+         for hole in holes:
+             if len(hole) >= 3:
+                 hole_simpl = simplify_polygon(hole, tolerance=polygon_tolerance)
+                 if len(hole_simpl) >= 3:
+                     holes_simplified.append(hole_simpl)
+                 else:
+                     holes_simplified.append(hole)  # Fall back to the original hole
+
+         # Build geohash index for single polygon
+         geohashes = build_country_geohash_index(
+             exterior_simplified,
+             holes_simplified if holes_simplified else [],
+             country_id,
+             geohash_precision,
+             validate_with_pip=True
+         )
+
+         polygon_data = {
+             'exterior': exterior_simplified,
+             'holes': holes_simplified if holes_simplified else []
+         }
+     else:  # MultiPolygon - process ALL polygons
+         all_exteriors = []
+         all_holes = []
+         all_geohashes = {}
+
+         for poly_part in coords:
+             if not poly_part or not poly_part[0]:
+                 continue
+
+             exterior_coords = poly_part[0]
+             hole_coords = poly_part[1:] if len(poly_part) > 1 else []
+
+             exterior = convert_geojson_coords_to_latlon(exterior_coords)
+             holes = [convert_geojson_coords_to_latlon(hole) for hole in hole_coords] if hole_coords else []
+
+             exterior_simplified = simplify_polygon(exterior, tolerance=polygon_tolerance)
+
+             if len(exterior_simplified) >= 3:  # Valid polygon
+                 all_exteriors.append(exterior_simplified)
+
+                 holes_simplified = []
+                 if holes:
+                     # Keep only holes that remain valid (>= 3 points) after simplification
+                     holes_simplified = [
+                         simplified for simplified in
+                         (simplify_polygon(hole, tolerance=polygon_tolerance) for hole in holes)
+                         if len(simplified) >= 3
+                     ]
+                     all_holes.extend(holes_simplified)
+
+                 # Build geohash index for each polygon part
+                 part_geohashes = build_country_geohash_index(
+                     exterior_simplified,
+                     holes_simplified,
+                     country_id,
+                     geohash_precision,
+                     validate_with_pip=True
+                 )
+                 # Merge geohashes
+                 for gh, ids in part_geohashes.items():
+                     if gh not in all_geohashes:
+                         all_geohashes[gh] = []
+                     all_geohashes[gh].extend(ids)
+
+         if not all_exteriors:
+             return None
+
+         geohashes = all_geohashes
+
+         # Store MultiPolygon structure
+         polygon_data = {
+             'exterior': all_exteriors[0],  # Primary exterior
+             'holes': all_holes if all_holes else [],
+             'multi': True,
+             'exteriors': all_exteriors  # All exteriors for complete coverage
+         }
+
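+     # The record below is what build_modular_data() serializes to each
+     # per-country JSON file, e.g. (illustrative):
+     #   {"country_id": 42, "metadata": {...}, "geohashes": {"u09tv1": [42], ...},
+     #    "polygon": {"exterior": [...], "holes": [...]}}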
+     return {
+         'country_id': country_id,
+         'metadata': metadata,
+         'geohashes': geohashes,
+         'polygon': polygon_data
+     }
+
+
+ def build_modular_data(
+     geojson_path: Path,
+     output_dir: Path,
+     countries: Optional[List[str]] = None,
+     continents: Optional[List[str]] = None,
+     exclude_countries: Optional[List[str]] = None,
+     geohash_precision: int = 6,
+     polygon_tolerance: float = 0.005
+ ) -> Dict:
+     """
+     Build modular country-wise data files.
+
+     Args:
+         geojson_path: Path to source GeoJSON file
+         output_dir: Output directory for data files
+         countries: List of ISO2 codes to include (None = all)
+         continents: List of continent names to include (None = all)
+         exclude_countries: List of ISO2 codes to exclude
+         geohash_precision: Geohash precision level
+         polygon_tolerance: Polygon simplification tolerance (smaller values preserve more detail)
+
+     Returns:
+         Dict with build statistics
+     """
+     # Load GeoJSON
+     with open(geojson_path, 'r', encoding='utf-8') as f:
+         geojson = json.load(f)
+
+     features = geojson.get('features', [])
+
+     # Normalize filters
+     countries_set = set(c.upper() for c in (countries or []))
+     exclude_set = set(c.upper() for c in (exclude_countries or []))
+     continents_normalized = [normalize_continent(c) for c in (continents or [])]
+
+     # Create output structure
+     output_dir.mkdir(parents=True, exist_ok=True)
+     continents_dir = output_dir / "continents"
+     continents_dir.mkdir(exist_ok=True)
+
+     # Process countries
+     master_index = {
+         'version': '1.0.0',
+         'countries': {},
+         'continents': {}
+     }
+
+     continent_countries = {}  # continent -> [iso2 codes]
+
+     processed = 0
+     skipped = 0
+
+     print(f"Processing {len(features)} countries with PIP validation...")
+
+     for idx, feature in enumerate(features):
+         country_id = idx + 1
+         properties = feature.get('properties', {})
+
+         iso2 = properties.get('ISO_A2', properties.get('iso_a2', '')).upper()
+         continent_raw = properties.get('CONTINENT', properties.get('continent', ''))
+         continent = normalize_continent(continent_raw) if continent_raw else 'unknown'
+
+         # Apply filters
+         if countries_set and iso2 not in countries_set:
+             skipped += 1
+             continue
+
+         if exclude_set and iso2 in exclude_set:
+             skipped += 1
+             continue
+
+         if continents_normalized and continent not in continents_normalized:
+             skipped += 1
+             continue
+
+         # Process country
+         country_data = process_country(feature, country_id, geohash_precision, polygon_tolerance)
+         if not country_data:
+             skipped += 1
+             continue
+
+         # Create continent directory
+         continent_dir = continents_dir / continent
+         continent_dir.mkdir(exist_ok=True)
+
+         # Save country file
+         country_file = continent_dir / f"{iso2}.json"
+         with open(country_file, 'w', encoding='utf-8') as f:
+             json.dump(country_data, f, separators=(',', ':'))
+
+         # Update master index
+         relative_path = f"continents/{continent}/{iso2}.json"
+         master_index['countries'][iso2] = {
+             'id': country_id,
+             'name': country_data['metadata']['name'],
+             'iso2': iso2,
+             'iso3': country_data['metadata'].get('iso3', ''),
+             'continent': continent_raw,
+             'file': relative_path,
+             'size_bytes': country_file.stat().st_size
+         }
+
+         # Track continent membership
+         if continent not in continent_countries:
+             continent_countries[continent] = []
+         continent_countries[continent].append(iso2)
+
+         processed += 1
+
+         # Progress indicator
+         if processed % 20 == 0:
+             print(f"  Processed {processed} countries...")
+
+     # Update continent index
+     master_index['continents'] = continent_countries
+
+     # Save master index
+     index_file = output_dir / "index.json"
+     with open(index_file, 'w', encoding='utf-8') as f:
+         json.dump(master_index, f, indent=2, ensure_ascii=False)
+
+     return {
+         'processed': processed,
+         'skipped': skipped,
+         'countries': list(master_index['countries'].keys()),
+         'continents': list(continent_countries.keys())
+     }
+
+
+ def main():
+     """CLI entry point."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description='Build modular country-wise data files with high accuracy'
+     )
+     parser.add_argument('source', type=Path, help='Source GeoJSON file')
+     parser.add_argument('output', type=Path, help='Output directory')
+     parser.add_argument('--countries', help='Comma-separated ISO2 codes (e.g., US,CA,MX)')
+     parser.add_argument('--continents', help='Comma-separated continent names')
+     parser.add_argument('--exclude', help='Comma-separated ISO2 codes to exclude')
+     parser.add_argument('--precision', type=int, default=6, help='Geohash precision (default: 6)')
+     parser.add_argument('--tolerance', type=float, default=0.005, help='Polygon tolerance (default: 0.005 for high accuracy)')
+
+     args = parser.parse_args()
+
+     # Parse filters
+     countries = [c.strip().upper() for c in args.countries.split(',')] if args.countries else None
+     continents = [c.strip() for c in args.continents.split(',')] if args.continents else None
+     exclude = [c.strip().upper() for c in args.exclude.split(',')] if args.exclude else None
+
+     print(f"Building modular data from {args.source}...")
+     print(f"  Polygon tolerance: {args.tolerance}° (smaller = more accurate)")
+     print(f"  Geohash precision: {args.precision}")
+     print("  PIP validation: Enabled")
+     if countries:
+         print(f"  Countries: {', '.join(countries)}")
+     if continents:
+         print(f"  Continents: {', '.join(continents)}")
+     if exclude:
+         print(f"  Exclude: {', '.join(exclude)}")
+     print()
+
+     stats = build_modular_data(
+         args.source,
+         args.output,
+         countries=countries,
+         continents=continents,
+         exclude_countries=exclude,
+         geohash_precision=args.precision,
+         polygon_tolerance=args.tolerance
+     )
+
+     print(f"\n✓ Processed: {stats['processed']} countries")
+     print(f"  Skipped: {stats['skipped']} countries")
+     print(f"  Continents: {', '.join(stats['continents'])}")
+     print(f"\nData saved to: {args.output}")
+     print(f"Master index: {args.output / 'index.json'}")
+
+
+ if __name__ == '__main__':
+     main()
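
For reference, a minimal programmatic invocation of the builder above might look like this sketch (the geo_intel_offline.modular_builder import path is an assumption; the diff does not name the module file):

    from pathlib import Path

    # Hypothetical import path; adjust to the package's actual module name.
    from geo_intel_offline.modular_builder import build_modular_data

    stats = build_modular_data(
        Path("countries.geojson"),   # any GeoJSON FeatureCollection of countries
        Path("data_modular"),        # output directory
        continents=["Europe"],       # optional filter; None processes everything
        geohash_precision=6,
        polygon_tolerance=0.005,
    )
    print(f"{stats['processed']} countries written, {stats['skipped']} skipped")

The same build is exposed on the command line through main(), with source and output as positional arguments and --countries, --continents, --exclude, --precision, and --tolerance as options.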
@@ -0,0 +1,170 @@
+ """
+ Data loading and on-disk format handling.
+
+ File Layout:
+ 1. geohash_index.json(.gz) - Geohash → country_id mappings (compressed)
+ 2. polygons.json(.gz) - Country polygons (simplified, coordinate arrays)
+ 3. metadata.json(.gz) - Country metadata (ISO codes, continent, timezone)
+
+ Design Decisions:
+ - JSON for simplicity (can be gzip-compressed in production)
+ - Supports gzip compression for reduced file size
+ - Alternative: msgpack/binary for even smaller size
+ - Lazy loading: load only when needed
+ - In-memory caching: keep in memory after first load
+
+ For production with a < 15 MB size constraint:
+ - Simplify polygons (reduce vertices using Douglas-Peucker)
+ - Compress the geohash index (sparse representation)
+ - Use efficient coordinate storage (fixed-point integers)
+ - Gzip compression reduces file size by ~60-80%
+ """
+
+ import json
+ import gzip
+ from typing import Dict, List, Tuple, Optional
+ from pathlib import Path
+
+
+ class DataLoader:
+     """Loads and caches geo-intelligence data."""
+
+     def __init__(self, data_dir: Optional[str] = None):
+         """
+         Initialize data loader.
+
+         Args:
+             data_dir: Directory containing data files. If None, uses package data directory.
+         """
+         if data_dir is None:
+             # Default to package data directory
+             package_dir = Path(__file__).parent
+             data_dir = package_dir / "data"
+
+         self.data_dir = Path(data_dir)
+         self._geohash_index: Optional[Dict[str, List[int]]] = None
+         self._polygons: Optional[Dict[int, Dict]] = None
+         self._metadata: Optional[Dict[int, Dict]] = None
+
+     def _load_json(self, filename: str) -> dict:
+         """
+         Load a JSON file from the data directory.
+
+         Automatically detects and handles gzip-compressed files (.json.gz),
+         then falls back to uncompressed .json, then to data_dev/ for development.
+         """
+         filepath = self.data_dir / filename
+         gzip_filepath = self.data_dir / f"{filename}.gz"
+         dev_data_dir = self.data_dir.parent / "data_dev"
+         dev_filepath = dev_data_dir / filename
+
+         # Try compressed version first (smaller, preferred for distribution)
+         if gzip_filepath.exists():
+             with gzip.open(gzip_filepath, 'rt', encoding='utf-8') as f:
+                 return json.load(f)
+
+         # Fall back to uncompressed file in the data directory (development)
+         if filepath.exists():
+             with open(filepath, 'r', encoding='utf-8') as f:
+                 return json.load(f)
+
+         # Fall back to the dev directory (uncompressed files moved here)
+         if dev_filepath.exists():
+             with open(dev_filepath, 'r', encoding='utf-8') as f:
+                 return json.load(f)
+
+         # None found
+         raise FileNotFoundError(
+             f"Data file not found: {gzip_filepath} or {filepath} or {dev_filepath}\n"
+             "Please run data_builder.py to generate data files."
+         )
+
+     @property
+     def geohash_index(self) -> Dict[str, List[int]]:
+         """Get geohash index (lazy-loaded)."""
+         if self._geohash_index is None:
+             data = self._load_json("geohash_index.json")
+             # Normalize values to lists (entries may be stored as bare ints)
+             self._geohash_index = {
+                 k: v if isinstance(v, list) else [v]
+                 for k, v in data.items()
+             }
+         return self._geohash_index
+
+     @property
+     def polygons(self) -> Dict[int, Dict]:
+         """Get country polygons (lazy-loaded)."""
+         if self._polygons is None:
+             self._polygons = self._load_json("polygons.json")
+             # Convert string keys to int (JSON object keys are always strings)
+             self._polygons = {
+                 int(k): v for k, v in self._polygons.items()
+             }
+         return self._polygons
+
+     @property
+     def metadata(self) -> Dict[int, Dict]:
+         """Get country metadata (lazy-loaded)."""
+         if self._metadata is None:
+             self._metadata = self._load_json("metadata.json")
+             # Convert string keys to int
+             self._metadata = {
+                 int(k): v for k, v in self._metadata.items()
+             }
+         return self._metadata
+
+     def get_candidate_countries(self, geohash: str) -> List[int]:
+         """
+         Get candidate country IDs for a geohash.
+
+         Args:
+             geohash: Geohash string
+
+         Returns:
+             List of country IDs that may contain this geohash
+         """
+         index = self.geohash_index
+
+         # Try the full geohash first
+         candidates = index.get(geohash, [])
+
+         # If no exact match, fall back to shorter prefixes (coarser cells that
+         # may span borders)
+         if not candidates:
+             for prefix_len in range(len(geohash) - 1, 0, -1):
+                 prefix = geohash[:prefix_len]
+                 if prefix in index:
+                     candidates.extend(index[prefix])
+                     break
+
+         return list(set(candidates))  # Deduplicate
+
+     def get_polygon(self, country_id: int) -> Optional[Dict]:
+         """
+         Get polygon data for a country.
+
+         Returns:
+             Dict with 'exterior' and optionally 'holes' keys,
+             or None if country not found
+         """
+         return self.polygons.get(country_id)
+
+     def get_metadata(self, country_id: int) -> Optional[Dict]:
+         """
+         Get metadata for a country.
+
+         Returns:
+             Dict with 'name', 'iso2', 'iso3', 'continent', 'timezone',
+             or None if country not found
+         """
+         return self.metadata.get(country_id)
+
+
+ # Global instance (lazy-loaded)
+ _loader: Optional[DataLoader] = None
+
+
+ def get_loader(data_dir: Optional[str] = None) -> DataLoader:
+     """Get or create global data loader instance."""
+     global _loader
+     if _loader is None:
+         _loader = DataLoader(data_dir)
+     # Note: data_dir only takes effect on first creation; subsequent calls
+     # return the cached instance regardless of the argument.
+     return _loader
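
To close, a minimal usage sketch for the loader (the geo_intel_offline.loader import path is an assumption; the diff does not name the module file):

    # Hypothetical import path; adjust to the package's actual module name.
    from geo_intel_offline.loader import get_loader

    loader = get_loader()  # defaults to the package's bundled data/ directory
    candidates = loader.get_candidate_countries("u09tvw")  # sample geohash
    for country_id in candidates:
        meta = loader.get_metadata(country_id)
        if meta:
            print(country_id, meta.get("iso2"), meta.get("name"))

A point-in-polygon test against loader.get_polygon(country_id) ('exterior'/'holes', plus 'exteriors' when 'multi' is set) would then resolve which candidate actually contains a given coordinate.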