geo_intel_offline-1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geo_intel_offline/__init__.py +15 -0
- geo_intel_offline/api.py +114 -0
- geo_intel_offline/compression.py +238 -0
- geo_intel_offline/confidence.py +89 -0
- geo_intel_offline/data/geohash_index.json.gz +0 -0
- geo_intel_offline/data/metadata.json.gz +0 -0
- geo_intel_offline/data/polygons.json.gz +0 -0
- geo_intel_offline/data_builder.py +528 -0
- geo_intel_offline/data_builder_minimal.py +173 -0
- geo_intel_offline/data_builder_modular.py +474 -0
- geo_intel_offline/data_loader.py +170 -0
- geo_intel_offline/geohash.py +150 -0
- geo_intel_offline/hierarchical_resolver.py +136 -0
- geo_intel_offline/migrate_to_modular.py +159 -0
- geo_intel_offline/modular_data_loader.py +212 -0
- geo_intel_offline/pip.py +150 -0
- geo_intel_offline/polygon_utils.py +104 -0
- geo_intel_offline/resolver.py +306 -0
- geo_intel_offline-1.0.1.dist-info/LICENSE +21 -0
- geo_intel_offline-1.0.1.dist-info/METADATA +784 -0
- geo_intel_offline-1.0.1.dist-info/RECORD +24 -0
- geo_intel_offline-1.0.1.dist-info/WHEEL +5 -0
- geo_intel_offline-1.0.1.dist-info/entry_points.txt +2 -0
- geo_intel_offline-1.0.1.dist-info/top_level.txt +1 -0
geo_intel_offline/data_builder_modular.py
@@ -0,0 +1,474 @@
"""
Modular data builder - generates country-wise data files.

Design:
- Each country in its own file
- Organized by continent directories
- Master index file for lookup
- Supports selective country/continent building
- Uses PIP validation for accurate geohash indexing
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Set
from .geohash import encode
from .data_builder import simplify_polygon  # Uses improved simplify_polygon with validation
from .pip import point_in_polygon_with_holes
from .polygon_utils import (
    calculate_bounding_box,
    calculate_adaptive_step_size,
    calculate_safe_iteration_limits,
    get_polygon_centroid,
    convert_geojson_coords_to_latlon
)


# Continent mapping (normalize various continent name formats)
CONTINENT_MAPPING = {
    "Africa": "africa",
    "africa": "africa",
    "AFRICA": "africa",

    "Asia": "asia",
    "asia": "asia",
    "ASIA": "asia",

    "Europe": "europe",
    "europe": "europe",
    "EUROPE": "europe",

    "North America": "north_america",
    "north america": "north_america",
    "NorthAmerica": "north_america",
    "north_america": "north_america",

    "South America": "south_america",
    "south america": "south_america",
    "SouthAmerica": "south_america",
    "south_america": "south_america",

    "Oceania": "oceania",
    "oceania": "oceania",
    "OCEANIA": "oceania",
    "Australia": "oceania",

    "Antarctica": "antarctica",
    "antarctica": "antarctica",
    "ANTARCTICA": "antarctica",
}


def normalize_continent(continent: str) -> str:
    """Normalize continent name to directory format."""
    return CONTINENT_MAPPING.get(continent, continent.lower().replace(" ", "_"))

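As a quick illustration of the normalization above, a sketch of the expected behaviour (assuming the installed package is importable; this snippet is not part of the wheel contents):

from geo_intel_offline.data_builder_modular import normalize_continent

normalize_continent("North America")   # -> "north_america" (direct hit in CONTINENT_MAPPING)
normalize_continent("Australia")       # -> "oceania" (folded into Oceania)
normalize_continent("Middle Earth")    # -> "middle_earth" (fallback: lowercase, spaces to underscores)
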
def build_country_geohash_index(
    polygon_exterior: List[Tuple[float, float]],
    polygon_holes: List[List[Tuple[float, float]]],
    country_id: int,
    geohash_precision: int = 6,
    validate_with_pip: bool = True
) -> Dict[str, List[int]]:
    """
    Build geohash index for a single country with PIP validation.

    Args:
        polygon_exterior: Exterior polygon coordinates
        polygon_holes: Interior rings (holes) if any
        country_id: Country ID
        geohash_precision: Geohash precision level
        validate_with_pip: If True, validate points are in polygon before indexing

    Returns:
        Dict mapping geohash strings to [country_id]
    """
    if not polygon_exterior:
        return {}

    # Calculate bounding box using shared utility
    min_lat, max_lat, min_lon, max_lon = calculate_bounding_box(polygon_exterior)

    lat_range = max_lat - min_lat
    lon_range = max_lon - min_lon

    # Calculate adaptive step size using shared utility
    step = calculate_adaptive_step_size(lat_range, lon_range)

    # For very small polygons, ensure we sample multiple points
    if lat_range < 0.001 or lon_range < 0.001:
        # Sample multiple points for tiny polygons
        sample_points = []
        sample_points.append(get_polygon_centroid(polygon_exterior))
        sample_points.append((min_lat, min_lon))
        sample_points.append((min_lat, max_lon))
        sample_points.append((max_lat, min_lon))
        sample_points.append((max_lat, max_lon))

        index = {}
        for point in sample_points:
            if validate_with_pip:
                if not point_in_polygon_with_holes(point, polygon_exterior, polygon_holes):
                    continue
            geohash = encode(point[0], point[1], geohash_precision)
            if geohash not in index:
                index[geohash] = []
            if country_id not in index[geohash]:
                index[geohash].append(country_id)
        return index

    # Sample points from bounding boxes with validation
    index = {}
    geohashes_added = set()
    lat = min_lat

    while lat <= max_lat:
        lon = min_lon
        while lon <= max_lon:
            point = (lat, lon)

            # Validate point is in polygon if enabled
            if validate_with_pip:
                if not point_in_polygon_with_holes(point, polygon_exterior, polygon_holes):
                    lon += step
                    continue

            # Add to index
            geohash = encode(lat, lon, geohash_precision)
            geohash_key = (geohash, country_id)

            if geohash_key not in geohashes_added:
                if geohash not in index:
                    index[geohash] = []
                if country_id not in index[geohash]:
                    index[geohash].append(country_id)
                geohashes_added.add(geohash_key)

            lon += step
        lat += step

    # Ensure at least one geohash for very small countries
    if len(geohashes_added) == 0 and validate_with_pip:
        centroid_lat, centroid_lon = get_polygon_centroid(polygon_exterior)
        centroid_point = (centroid_lat, centroid_lon)

        if point_in_polygon_with_holes(centroid_point, polygon_exterior, polygon_holes):
            geohash = encode(centroid_lat, centroid_lon, geohash_precision)
            if geohash not in index:
                index[geohash] = []
            if country_id not in index[geohash]:
                index[geohash].append(country_id)

    return index

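A minimal sketch of exercising the index builder on a simple shape (illustrative only, not part of the package; the actual cells and their count depend on the step returned by calculate_adaptive_step_size and on the PIP test):

from geo_intel_offline.data_builder_modular import build_country_geohash_index

# A one-degree square with no holes, indexed as country_id 42.
square = [(46.0, 6.0), (46.0, 7.0), (47.0, 7.0), (47.0, 6.0)]
index = build_country_geohash_index(square, [], country_id=42, geohash_precision=6)

# Every sampled grid point that passes the point-in-polygon check contributes
# one precision-6 cell, and every value is the one-element list [42].
print(len(index))
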
def process_country(
    feature: Dict,
    country_id: int,
    geohash_precision: int = 6,
    polygon_tolerance: float = 0.005
) -> Optional[Dict]:
    """
    Process a single country feature into modular format.

    Returns:
        Dict with country data or None if invalid
    """
    geometry = feature.get('geometry', {})
    properties = feature.get('properties', {})

    if geometry.get('type') not in ('Polygon', 'MultiPolygon'):
        return None

    coords = geometry.get('coordinates', [])

    # Extract metadata
    metadata = {
        'name': properties.get('NAME', properties.get('name', f'Country {country_id}')),
        'iso2': properties.get('ISO_A2', properties.get('iso_a2', '')),
        'iso3': properties.get('ISO_A3', properties.get('iso_a3', '')),
        'continent': properties.get('CONTINENT', properties.get('continent', '')),
        'timezone': properties.get('TIMEZONE', properties.get('timezone', ''))
    }

    # Process polygon
    if geometry['type'] == 'Polygon':
        exterior_coords = coords[0] if coords else []
        hole_coords = coords[1:] if len(coords) > 1 else []

        # Convert GeoJSON [lon, lat] to [lat, lon]
        exterior = convert_geojson_coords_to_latlon(exterior_coords)
        holes = [convert_geojson_coords_to_latlon(hole) for hole in hole_coords] if hole_coords else []

        # Simplify with tighter tolerance for better accuracy
        exterior_simplified = simplify_polygon(exterior, tolerance=polygon_tolerance)

        # Ensure simplified polygon is still valid
        if len(exterior_simplified) < 3 and len(exterior) >= 3:
            exterior_simplified = exterior  # Use original if simplification made it invalid

        holes_simplified = []
        for hole in holes:
            if len(hole) >= 3:
                hole_simpl = simplify_polygon(hole, tolerance=polygon_tolerance)
                if len(hole_simpl) >= 3:
                    holes_simplified.append(hole_simpl)
                elif len(hole) >= 3:
                    holes_simplified.append(hole)  # Use original

        # Build geohash index for single polygon
        geohashes = build_country_geohash_index(
            exterior_simplified,
            holes_simplified if holes_simplified else [],
            country_id,
            geohash_precision,
            validate_with_pip=True
        )

        polygon_data = {
            'exterior': exterior_simplified,
            'holes': holes_simplified if holes_simplified else []
        }
    else:  # MultiPolygon - process ALL polygons
        all_exteriors = []
        all_holes = []
        all_geohashes = {}

        for poly_part in coords:
            if not poly_part or not poly_part[0]:
                continue

            exterior_coords = poly_part[0]
            hole_coords = poly_part[1:] if len(poly_part) > 1 else []

            exterior = convert_geojson_coords_to_latlon(exterior_coords)
            holes = [convert_geojson_coords_to_latlon(hole) for hole in hole_coords] if hole_coords else []

            exterior_simplified = simplify_polygon(exterior, tolerance=polygon_tolerance)

            if len(exterior_simplified) >= 3:  # Valid polygon
                all_exteriors.append(exterior_simplified)

                if holes:
                    holes_simplified = [simplify_polygon(hole, tolerance=polygon_tolerance) for hole in holes]
                    all_holes.extend(holes_simplified)

                # Build geohash index for each polygon part
                part_geohashes = build_country_geohash_index(
                    exterior_simplified,
                    holes_simplified if holes else [],
                    country_id,
                    geohash_precision,
                    validate_with_pip=True
                )
                # Merge geohashes
                for gh, ids in part_geohashes.items():
                    if gh not in all_geohashes:
                        all_geohashes[gh] = []
                    all_geohashes[gh].extend(ids)

        if not all_exteriors:
            return None

        geohashes = all_geohashes

        # Store MultiPolygon structure
        polygon_data = {
            'exterior': all_exteriors[0],  # Primary exterior
            'holes': all_holes if all_holes else [],
            'multi': True,
            'exteriors': all_exteriors  # All exteriors for complete coverage
        }

    return {
        'country_id': country_id,
        'metadata': metadata,
        'geohashes': geohashes,
        'polygon': polygon_data
    }

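The record returned by process_country is what build_modular_data below writes, unmodified, to continents/<continent>/<ISO2>.json. A rough sketch of its shape as a Python literal (all values illustrative; the 'multi' and 'exteriors' keys appear only for MultiPolygon countries):

{
    'country_id': 42,
    'metadata': {'name': 'Switzerland', 'iso2': 'CH', 'iso3': 'CHE',
                 'continent': 'Europe', 'timezone': ''},
    'geohashes': {'u0...': [42]},                      # one entry per precision-6 cell inside the polygon
    'polygon': {'exterior': [(46.0, 6.0), ...], 'holes': []},
}
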
def build_modular_data(
    geojson_path: Path,
    output_dir: Path,
    countries: Optional[List[str]] = None,
    continents: Optional[List[str]] = None,
    exclude_countries: Optional[List[str]] = None,
    geohash_precision: int = 6,
    polygon_tolerance: float = 0.005
) -> Dict:
    """
    Build modular country-wise data files.

    Args:
        geojson_path: Path to source GeoJSON file
        output_dir: Output directory for data files
        countries: List of ISO2 codes to include (None = all)
        continents: List of continent names to include (None = all)
        exclude_countries: List of ISO2 codes to exclude
        geohash_precision: Geohash precision level
        polygon_tolerance: Polygon simplification tolerance (0.005 = better accuracy)

    Returns:
        Dict with build statistics
    """
    # Load GeoJSON
    with open(geojson_path, 'r', encoding='utf-8') as f:
        geojson = json.load(f)

    features = geojson.get('features', [])

    # Normalize filters
    countries_set = set(c.upper() for c in (countries or []))
    exclude_set = set(c.upper() for c in (exclude_countries or []))
    continents_normalized = [normalize_continent(c) for c in (continents or [])]

    # Create output structure
    output_dir.mkdir(parents=True, exist_ok=True)
    continents_dir = output_dir / "continents"
    continents_dir.mkdir(exist_ok=True)

    # Process countries
    master_index = {
        'version': '1.0.0',
        'countries': {},
        'continents': {}
    }

    continent_countries = {}  # continent -> [iso2 codes]

    processed = 0
    skipped = 0

    print(f"Processing {len(features)} countries with PIP validation...")

    for idx, feature in enumerate(features):
        country_id = idx + 1
        properties = feature.get('properties', {})

        iso2 = properties.get('ISO_A2', properties.get('iso_a2', '')).upper()
        continent_raw = properties.get('CONTINENT', properties.get('continent', ''))
        continent = normalize_continent(continent_raw) if continent_raw else 'unknown'

        # Apply filters
        if countries_set and iso2 not in countries_set:
            skipped += 1
            continue

        if exclude_set and iso2 in exclude_set:
            skipped += 1
            continue

        if continents_normalized and continent not in continents_normalized:
            skipped += 1
            continue

        # Process country
        country_data = process_country(feature, country_id, geohash_precision, polygon_tolerance)
        if not country_data:
            skipped += 1
            continue

        # Create continent directory
        continent_dir = continents_dir / continent
        continent_dir.mkdir(exist_ok=True)

        # Save country file
        country_file = continent_dir / f"{iso2}.json"
        with open(country_file, 'w', encoding='utf-8') as f:
            json.dump(country_data, f, separators=(',', ':'))

        # Update master index
        relative_path = f"continents/{continent}/{iso2}.json"
        master_index['countries'][iso2] = {
            'id': country_id,
            'name': country_data['metadata']['name'],
            'iso2': iso2,
            'iso3': country_data['metadata'].get('iso3', ''),
            'continent': continent_raw,
            'file': relative_path,
            'size_bytes': country_file.stat().st_size
        }

        # Track continent membership
        if continent not in continent_countries:
            continent_countries[continent] = []
        continent_countries[continent].append(iso2)

        processed += 1

        # Progress indicator
        if processed % 20 == 0:
            print(f" Processed {processed} countries...")

    # Update continent index
    master_index['continents'] = continent_countries

    # Save master index
    index_file = output_dir / "index.json"
    with open(index_file, 'w', encoding='utf-8') as f:
        json.dump(master_index, f, indent=2, ensure_ascii=False)

    return {
        'processed': processed,
        'skipped': skipped,
        'countries': list(master_index['countries'].keys()),
        'continents': list(continent_countries.keys())
    }

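A minimal programmatic sketch of driving the builder (file names are placeholders; the source is expected to be a GeoJSON FeatureCollection with NAME / ISO_A2 / ISO_A3 / CONTINENT-style properties):

from pathlib import Path
from geo_intel_offline.data_builder_modular import build_modular_data

stats = build_modular_data(
    Path("countries.geojson"),       # placeholder source file
    Path("build/modular_data"),      # output directory
    continents=["Europe"],           # optional continent filter
    exclude_countries=["RU"],        # optional exclusions
)
print(stats["processed"], stats["skipped"], stats["continents"])

# Expected layout after a run:
#   build/modular_data/index.json                    (master index)
#   build/modular_data/continents/europe/<ISO2>.json (one file per country)
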
def main():
    """CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Build modular country-wise data files with high accuracy'
    )
    parser.add_argument('source', type=Path, help='Source GeoJSON file')
    parser.add_argument('output', type=Path, help='Output directory')
    parser.add_argument('--countries', help='Comma-separated ISO2 codes (e.g., US,CA,MX)')
    parser.add_argument('--continents', help='Comma-separated continent names')
    parser.add_argument('--exclude', help='Comma-separated ISO2 codes to exclude')
    parser.add_argument('--precision', type=int, default=6, help='Geohash precision (default: 6)')
    parser.add_argument('--tolerance', type=float, default=0.005, help='Polygon tolerance (default: 0.005 for high accuracy)')

    args = parser.parse_args()

    # Parse filters
    countries = [c.strip().upper() for c in args.countries.split(',')] if args.countries else None
    continents = [c.strip() for c in args.continents.split(',')] if args.continents else None
    exclude = [c.strip().upper() for c in args.exclude.split(',')] if args.exclude else None

    print(f"Building modular data from {args.source}...")
    print(f" Polygon tolerance: {args.tolerance}° (smaller = more accurate)")
    print(f" Geohash precision: {args.precision}")
    print(f" PIP validation: Enabled")
    if countries:
        print(f" Countries: {', '.join(countries)}")
    if continents:
        print(f" Continents: {', '.join(continents)}")
    if exclude:
        print(f" Exclude: {', '.join(exclude)}")
    print()

    stats = build_modular_data(
        args.source,
        args.output,
        countries=countries,
        continents=continents,
        exclude_countries=exclude,
        geohash_precision=args.precision,
        polygon_tolerance=args.tolerance
    )

    print(f"\n✓ Processed: {stats['processed']} countries")
    print(f" Skipped: {stats['skipped']} countries")
    print(f" Continents: {', '.join(stats['continents'])}")
    print(f"\nData saved to: {args.output}")
    print(f"Master index: {args.output / 'index.json'}")


if __name__ == '__main__':
    main()
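Because the module guards main() behind __name__ == '__main__', the builder can also be driven from the command line. A usage sketch matching the argparse definition above (the two-line entry_points.txt in this wheel likely registers a console-script wrapper as well, but its contents are not shown in this section):

python -m geo_intel_offline.data_builder_modular countries.geojson build/modular_data \
    --continents Europe,Asia --exclude RU --precision 6 --tolerance 0.005
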
geo_intel_offline/data_loader.py
@@ -0,0 +1,170 @@
"""
Data loading and binary format handling.

Binary Format Design:
1. geohash_index.json(.gz) - Geohash → country_id mappings (compressed)
2. polygons.json(.gz) - Country polygons (simplified, coordinate arrays)
3. metadata.json(.gz) - Country metadata (ISO codes, continent, timezone)

Design Decisions:
- JSON for simplicity (can be compressed/gzipped in production)
- Supports gzip compression for reduced file size
- Alternative: msgpack/binary for even smaller size
- Lazy loading: Load only when needed
- In-memory caching: Keep in memory after first load

For production with < 15 MB constraint:
- Simplify polygons (reduce vertices using Douglas-Peucker)
- Compress geohash index (sparse representation)
- Use efficient coordinate storage (fixed-point integers)
- Gzip compression reduces file size by ~60-80%
"""

import json
import gzip
from typing import Dict, List, Tuple, Optional
from pathlib import Path


class DataLoader:
    """Loads and caches geo-intelligence data."""

    def __init__(self, data_dir: Optional[str] = None):
        """
        Initialize data loader.

        Args:
            data_dir: Directory containing data files. If None, uses package data directory.
        """
        if data_dir is None:
            # Default to package data directory
            package_dir = Path(__file__).parent
            data_dir = package_dir / "data"

        self.data_dir = Path(data_dir)
        self._geohash_index: Optional[Dict[str, List[int]]] = None
        self._polygons: Optional[Dict[int, Dict]] = None
        self._metadata: Optional[Dict[int, Dict]] = None

    def _load_json(self, filename: str) -> dict:
        """
        Load JSON file from data directory.

        Automatically detects and handles gzip-compressed files (.json.gz).
        Falls back to uncompressed .json in data_dev/ for development if needed.
        """
        filepath = self.data_dir / filename
        gzip_filepath = self.data_dir / f"{filename}.gz"
        dev_data_dir = self.data_dir.parent / "data_dev"
        dev_filepath = dev_data_dir / filename

        # Try compressed version first (smaller, preferred - for distribution)
        if gzip_filepath.exists():
            with gzip.open(gzip_filepath, 'rt', encoding='utf-8') as f:
                return json.load(f)

        # Fallback to uncompressed in data directory (development)
        if filepath.exists():
            with open(filepath, 'r', encoding='utf-8') as f:
                return json.load(f)

        # Fallback to dev directory (uncompressed files moved here)
        if dev_filepath.exists():
            with open(dev_filepath, 'r', encoding='utf-8') as f:
                return json.load(f)

        # None found
        raise FileNotFoundError(
            f"Data file not found: {gzip_filepath} or {filepath} or {dev_filepath}\n"
            f"Please run data_builder.py to generate data files."
        )

    @property
    def geohash_index(self) -> Dict[str, List[int]]:
        """Get geohash index (lazy-loaded)."""
        if self._geohash_index is None:
            data = self._load_json("geohash_index.json")
            self._geohash_index = {
                k: v if isinstance(v, list) else [v]
                for k, v in data.items()
            }
        return self._geohash_index

    @property
    def polygons(self) -> Dict[int, Dict]:
        """Get country polygons (lazy-loaded)."""
        if self._polygons is None:
            self._polygons = self._load_json("polygons.json")
            # Convert string keys to int (JSON doesn't support int keys)
            self._polygons = {
                int(k): v for k, v in self._polygons.items()
            }
        return self._polygons

    @property
    def metadata(self) -> Dict[int, Dict]:
        """Get country metadata (lazy-loaded)."""
        if self._metadata is None:
            self._metadata = self._load_json("metadata.json")
            # Convert string keys to int
            self._metadata = {
                int(k): v for k, v in self._metadata.items()
            }
        return self._metadata

    def get_candidate_countries(self, geohash: str) -> List[int]:
        """
        Get candidate country IDs for a geohash.

        Args:
            geohash: Geohash string

        Returns:
            List of country IDs that may contain this geohash
        """
        index = self.geohash_index

        # Try full geohash first
        candidates = index.get(geohash, [])

        # If no exact match, try prefixes (geohash can overlap borders)
        if not candidates:
            for prefix_len in range(len(geohash), 0, -1):
                prefix = geohash[:prefix_len]
                if prefix in index:
                    candidates.extend(index[prefix])
                    break

        return list(set(candidates))  # Deduplicate

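A sketch of the candidate lookup just above: an exact cell hit is preferred, and otherwise progressively shorter prefixes are tried, which helps near borders where only a coarser cell was indexed (the index below is injected by hand purely for illustration):

from geo_intel_offline.data_loader import DataLoader

loader = DataLoader()
loader._geohash_index = {"u0h7": [42]}            # illustration only; normally lazy-loaded from disk
print(loader.get_candidate_countries("u0h7w8"))   # [42]  (matched via the "u0h7" prefix)
print(loader.get_candidate_countries("u0h7"))     # [42]  (exact match)
print(loader.get_candidate_countries("zzzzzz"))   # []    (no candidate)
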
    def get_polygon(self, country_id: int) -> Optional[Dict]:
        """
        Get polygon data for a country.

        Returns:
            Dict with 'exterior' and optionally 'holes' keys,
            or None if country not found
        """
        return self.polygons.get(country_id)

    def get_metadata(self, country_id: int) -> Optional[Dict]:
        """
        Get metadata for a country.

        Returns:
            Dict with 'name', 'iso2', 'iso3', 'continent', 'timezone',
            or None if country not found
        """
        return self.metadata.get(country_id)


# Global instance (lazy-loaded)
_loader: Optional[DataLoader] = None


def get_loader(data_dir: Optional[str] = None) -> DataLoader:
    """Get or create global data loader instance."""
    global _loader
    if _loader is None:
        _loader = DataLoader(data_dir)
    return _loader
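Putting the loader together with the geohash encoder from this package, a typical offline lookup might be sketched as follows (the higher-level api.py and resolver.py modules in this wheel presumably wrap the same steps):

from geo_intel_offline.geohash import encode
from geo_intel_offline.data_loader import get_loader

loader = get_loader()                    # defaults to the packaged data/ directory
gh = encode(46.2044, 6.1432, 6)          # Geneva, precision 6

for country_id in loader.get_candidate_countries(gh):
    meta = loader.get_metadata(country_id)
    poly = loader.get_polygon(country_id)
    # A point-in-polygon test against poly['exterior'] / poly['holes']
    # (see geo_intel_offline.pip) would pick the final match among candidates.
    print(country_id, meta.get("name") if meta else None)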