geo-intel-offline 1.0.1 (geo_intel_offline-1.0.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geo_intel_offline/__init__.py +15 -0
- geo_intel_offline/api.py +114 -0
- geo_intel_offline/compression.py +238 -0
- geo_intel_offline/confidence.py +89 -0
- geo_intel_offline/data/geohash_index.json.gz +0 -0
- geo_intel_offline/data/metadata.json.gz +0 -0
- geo_intel_offline/data/polygons.json.gz +0 -0
- geo_intel_offline/data_builder.py +528 -0
- geo_intel_offline/data_builder_minimal.py +173 -0
- geo_intel_offline/data_builder_modular.py +474 -0
- geo_intel_offline/data_loader.py +170 -0
- geo_intel_offline/geohash.py +150 -0
- geo_intel_offline/hierarchical_resolver.py +136 -0
- geo_intel_offline/migrate_to_modular.py +159 -0
- geo_intel_offline/modular_data_loader.py +212 -0
- geo_intel_offline/pip.py +150 -0
- geo_intel_offline/polygon_utils.py +104 -0
- geo_intel_offline/resolver.py +306 -0
- geo_intel_offline-1.0.1.dist-info/LICENSE +21 -0
- geo_intel_offline-1.0.1.dist-info/METADATA +784 -0
- geo_intel_offline-1.0.1.dist-info/RECORD +24 -0
- geo_intel_offline-1.0.1.dist-info/WHEEL +5 -0
- geo_intel_offline-1.0.1.dist-info/entry_points.txt +2 -0
- geo_intel_offline-1.0.1.dist-info/top_level.txt +1 -0
geo_intel_offline/data_builder.py
@@ -0,0 +1,528 @@
"""
Data Builder - Process GeoJSON and generate optimized data files

Design Decisions:
1. Accept GeoJSON input (standard format, widely available)
2. Simplify polygons using Douglas-Peucker algorithm (reduces memory)
3. Build geohash index by sampling polygon coverage
4. Extract metadata from GeoJSON properties

Note: In production, you would download source data from Natural Earth
or similar authoritative sources. This script provides the processing pipeline.
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Tuple
from .geohash import encode
from .polygon_utils import (
    calculate_bounding_box,
    calculate_adaptive_step_size,
    calculate_safe_iteration_limits,
    get_polygon_centroid,
    convert_geojson_coords_to_latlon
)
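For orientation, the builder expects a standard GeoJSON FeatureCollection along the lines sketched below. This sketch is illustrative only and is not shipped with the package: the property keys NAME, ISO_A2, ISO_A3, CONTINENT and TIMEZONE are the ones process_geojson() looks for further down, and coordinates are in GeoJSON (lon, lat) order, which the builder flips to (lat, lon).

# Hypothetical input for the builder; names and coordinates are rough illustrations.
minimal_geojson = {
    "type": "FeatureCollection",
    "features": [{
        "type": "Feature",
        "properties": {
            "NAME": "Monaco",
            "ISO_A2": "MC",
            "ISO_A3": "MCO",
            "CONTINENT": "Europe",
            "TIMEZONE": "Europe/Monaco",
        },
        "geometry": {
            "type": "Polygon",
            # (lon, lat) pairs; the first and last vertex close the ring
            "coordinates": [[
                [7.41, 43.72], [7.44, 43.72], [7.44, 43.75],
                [7.41, 43.75], [7.41, 43.72],
            ]],
        },
    }],
}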
def simplify_polygon(
    polygon: List[Tuple[float, float]],
    tolerance: float = 0.01
) -> List[Tuple[float, float]]:
    """
    Simplify polygon using Douglas-Peucker algorithm.

    For very small polygons, skip simplification to preserve validity.

    Args:
        polygon: List of (lat, lon) tuples
        tolerance: Simplification tolerance in degrees

    Returns:
        Simplified polygon (guaranteed to have at least 3 vertices)
    """
    if len(polygon) <= 2:
        return polygon

    # For very small polygons, skip simplification to preserve validity
    if len(polygon) <= 10:
        return polygon if len(polygon) >= 3 else []

    # Douglas-Peucker algorithm
    def douglas_peucker(points: List[Tuple[float, float]], tol: float) -> List[Tuple[float, float]]:
        if len(points) <= 3:
            return points if len(points) >= 3 else []

        # Find point with maximum distance
        max_dist = 0
        max_idx = 0
        end_idx = len(points) - 1

        for i in range(1, end_idx):
            dist = _point_to_line_distance(points[i], points[0], points[end_idx])
            if dist > max_dist:
                max_dist = dist
                max_idx = i

        # If max distance is greater than tolerance, recursively simplify
        if max_dist > tol:
            left = douglas_peucker(points[:max_idx + 1], tol)
            right = douglas_peucker(points[max_idx:], tol)

            if left and right:
                combined = left[:-1] + right
                if len(combined) < 3 and len(points) >= 3:
                    return points
                return combined
            elif left:
                return left if len(left) >= 3 else points
            elif right:
                return right if len(right) >= 3 else points
            return points
        else:
            if len(points) >= 3:
                mid_idx = len(points) // 2
                return [points[0], points[mid_idx], points[end_idx]]
            return points

    result = douglas_peucker(polygon, tolerance)

    # Final validation
    if len(result) < 3:
        return polygon if len(polygon) >= 3 else result

    return result


def _point_to_line_distance(
    point: Tuple[float, float],
    line_start: Tuple[float, float],
    line_end: Tuple[float, float]
) -> float:
    """Calculate perpendicular distance from point to line segment."""
    px, py = point
    sx, sy = line_start
    ex, ey = line_end

    dx = ex - sx
    dy = ey - sy

    if dx == 0 and dy == 0:
        # Degenerate line (point)
        return ((px - sx) ** 2 + (py - sy) ** 2) ** 0.5

    # Calculate t parameter
    t = ((px - sx) * dx + (py - sy) * dy) / (dx * dx + dy * dy)

    # Clamp to line segment
    t = max(0, min(1, t))

    # Projection point
    proj_x = sx + t * dx
    proj_y = sy + t * dy

    # Distance
    return ((px - proj_x) ** 2 + (py - proj_y) ** 2) ** 0.5
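As a quick sanity check of the behaviour the docstring above promises, here is a hedged usage sketch (it assumes the wheel is installed so that geo_intel_offline.data_builder is importable; it is not part of the module):

# Illustrative only.
from geo_intel_offline.data_builder import simplify_polygon

# Rings with 10 or fewer vertices are returned untouched by design.
square = [(0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (1.0, 0.0)]
assert simplify_polygon(square, tolerance=0.01) == square

# Denser rings go through Douglas-Peucker; the result keeps at least 3 vertices.
dense = [(0.0, i / 10.0) for i in range(11)] + [(1.0, 1.0), (1.0, 0.0)]
assert len(simplify_polygon(dense, tolerance=0.01)) >= 3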
def build_geohash_index_from_polygons(
    polygons: Dict[int, Dict],
    geohash_precision: int = 6,
    validate_with_pip: bool = True
) -> Dict[str, List[int]]:
    """
    Build geohash index from processed polygons.

    Optimized version that uses already-processed polygon data.

    Args:
        polygons: Dict mapping country_id to polygon data
        geohash_precision: Geohash precision level
        validate_with_pip: If True, validate points are in polygon

    Returns:
        Dict mapping geohash strings to lists of country IDs
    """
    from .pip import point_in_polygon_with_holes

    index: Dict[str, List[int]] = {}
    total_countries = len(polygons)

    print(f"Building geohash index for {total_countries} countries...")

    for idx, (country_id, polygon_data) in enumerate(polygons.items(), 1):
        if (idx) % max(1, total_countries // 20) == 0 or idx == total_countries:
            progress = (idx / total_countries) * 100
            print(f" Progress: {progress:.1f}% ({idx}/{total_countries})", end='\r')

        is_multi = polygon_data.get('multi', False)
        exteriors_data = polygon_data.get('exteriors', [])
        exterior = polygon_data.get('exterior', [])
        holes = polygon_data.get('holes', [])

        # Handle MultiPolygon
        if is_multi and exteriors_data:
            exteriors_to_process = exteriors_data
        else:
            exteriors_to_process = [exterior] if exterior else []

        geohashes_added = set()

        for exterior_coords in exteriors_to_process:
            if not exterior_coords or len(exterior_coords) < 3:
                continue

            # Calculate bounding box using shared utility
            min_lat, max_lat, min_lon, max_lon = calculate_bounding_box(exterior_coords)

            lat_range = max_lat - min_lat
            lon_range = max_lon - min_lon

            # Handle extremely small polygons - sample multiple points, not just centroid
            if lat_range < 0.001 or lon_range < 0.001:
                # For tiny polygons, sample multiple points (centroid + corners + midpoints)
                # to ensure better geohash coverage
                sample_points = []
                sample_points.append(get_polygon_centroid(exterior_coords))
                # Add bounding box corners and midpoints
                sample_points.append((min_lat, min_lon))
                sample_points.append((min_lat, max_lon))
                sample_points.append((max_lat, min_lon))
                sample_points.append((max_lat, max_lon))
                sample_points.append((min_lat, (min_lon + max_lon) / 2))
                sample_points.append((max_lat, (min_lon + max_lon) / 2))
                sample_points.append(((min_lat + max_lat) / 2, min_lon))
                sample_points.append(((min_lat + max_lat) / 2, max_lon))

                if validate_with_pip:
                    for point in sample_points:
                        if point_in_polygon_with_holes(point, exterior_coords, holes):
                            geohash = encode(point[0], point[1], geohash_precision)
                            if geohash not in index:
                                index[geohash] = []
                            if country_id not in index[geohash]:
                                index[geohash].append(country_id)
                continue

            # Calculate adaptive step size using shared utility
            step = calculate_adaptive_step_size(lat_range, lon_range)

            # Calculate safe iteration limits (may adjust step for very large countries)
            max_lat_iterations, max_lon_iterations, max_total_iterations, step = calculate_safe_iteration_limits(
                min_lat, max_lat, min_lon, max_lon, step
            )

            # Convert to tuples for PIP (already in lat,lon format)
            exterior_tuples = [(p[0], p[1]) for p in exterior_coords]
            holes_tuples = [[(p[0], p[1]) for p in hole] for hole in holes] if holes else None

            # Sample points
            lat = min_lat
            iterations = 0

            while lat <= max_lat and iterations < max_lat_iterations * 2:
                lon = min_lon
                lon_iterations = 0

                while lon <= max_lon and lon_iterations < max_lon_iterations * 2:
                    point = (lat, lon)
                    iterations += 1
                    lon_iterations += 1

                    # Safety: prevent infinite loops
                    if iterations > max_total_iterations:
                        break

                    # Validate point is in polygon
                    if validate_with_pip:
                        if not point_in_polygon_with_holes(point, exterior_tuples, holes_tuples):
                            lon += step
                            continue

                    # Add to index
                    geohash = encode(lat, lon, geohash_precision)
                    geohash_key = (geohash, country_id)

                    if geohash_key not in geohashes_added:
                        if geohash not in index:
                            index[geohash] = []
                        if country_id not in index[geohash]:
                            index[geohash].append(country_id)
                        geohashes_added.add(geohash_key)

                    lon += step

                if iterations > max_total_iterations:
                    break
                lat += step

        # Fallback: ensure at least one geohash for very small countries
        if len(geohashes_added) == 0 and exterior:
            exterior_tuples = [(p[0], p[1]) for p in exterior]
            centroid_lat, centroid_lon = get_polygon_centroid(exterior)

            if not validate_with_pip or point_in_polygon_with_holes(
                (centroid_lat, centroid_lon), exterior_tuples,
                [[(p[0], p[1]) for p in hole] for hole in holes] if holes else None
            ):
                geohash = encode(centroid_lat, centroid_lon, geohash_precision)
                if geohash not in index:
                    index[geohash] = []
                if country_id not in index[geohash]:
                    index[geohash].append(country_id)

    print()  # New line after progress
    print(f"✓ Completed: {total_countries} countries indexed")
    return index
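The structure this function returns is simple to consume: keys are geohash cells at the chosen precision, values are lists of candidate country IDs. A small lookup sketch follows; the IDs and cell strings are invented for illustration, and the package's resolver modules presumably perform the real lookup plus point-in-polygon confirmation:

# Illustrative only; index values are made up.
from geo_intel_offline.geohash import encode

geohash_index = {
    "u09tvw": [4],     # a cell covered by a single country
    "u0h0q1": [4, 5],  # cells near borders can carry several candidates
}

cell = encode(48.85, 2.35, 6)          # same precision used at build time
candidates = geohash_index.get(cell, [])
# Each candidate would then be confirmed with a point-in-polygon test,
# e.g. pip.point_in_polygon_with_holes(), before reporting a match.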
def process_geojson(filepath: Path, polygon_tolerance: float = 0.005, geohash_precision: int = 6) -> Tuple[Dict[str, List[int]], Dict, Dict]:
    """
    Process GeoJSON file and generate data files.

    Returns:
        Tuple of (geohash_index, polygons_dict, metadata_dict)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        geojson = json.load(f)

    features = geojson.get('features', [])

    print(f"Loading {len(features)} countries from GeoJSON...")

    polygons: Dict[int, Dict] = {}
    metadata: Dict[int, Dict] = {}
    skipped_countries = []
    total_features = len(features)

    for idx, feature in enumerate(features):
        country_id = idx + 1
        progress_pct = ((idx + 1) / total_features) * 100

        if (idx + 1) % max(1, total_features // 20) == 0 or (idx + 1) == total_features:
            print(f" Processing polygons: {progress_pct:.1f}% ({idx + 1}/{total_features})", end='\r')

        geometry = feature.get('geometry', {})
        properties = feature.get('properties', {})
        country_name = properties.get('NAME', properties.get('name', f'Country {country_id}'))

        if geometry.get('type') not in ('Polygon', 'MultiPolygon'):
            skipped_countries.append(f"{country_name} (type: {geometry.get('type', 'unknown')})")
            continue

        coords = geometry.get('coordinates', [])

        # Process coordinates
        if geometry['type'] == 'Polygon':
            exterior_coords = coords[0] if coords else []
            hole_coords = coords[1:] if len(coords) > 1 else []

            if not exterior_coords or len(exterior_coords) < 3:
                skipped_countries.append(f"{country_name} (invalid polygon: {len(exterior_coords) if exterior_coords else 0} vertices)")
                continue

            exterior = [(p[1], p[0]) for p in exterior_coords]
            holes = [[(p[1], p[0]) for p in hole] for hole in hole_coords] if hole_coords else []

            # Simplify (skip for small polygons)
            if len(exterior) <= 10:
                exterior_simplified = exterior
            else:
                exterior_simplified = simplify_polygon(exterior, tolerance=polygon_tolerance)
                if len(exterior_simplified) < 3:
                    exterior_simplified = exterior

            holes_simplified = []
            for hole in holes:
                if len(hole) <= 10:
                    holes_simplified.append(hole)
                else:
                    hole_simpl = simplify_polygon(hole, tolerance=polygon_tolerance)
                    if len(hole_simpl) >= 3:
                        holes_simplified.append(hole_simpl)
                    elif len(hole) >= 3:
                        holes_simplified.append(hole)

            if len(exterior_simplified) >= 3:
                polygons[country_id] = {
                    'exterior': exterior_simplified,
                    'holes': holes_simplified if holes_simplified else []
                }
            else:
                skipped_countries.append(f"{country_name} (polygon invalid after processing)")
        else:  # MultiPolygon
            all_exteriors = []
            all_holes = []

            for poly_part in coords:
                if not poly_part:
                    continue

                exterior_coords = poly_part[0] if poly_part else []
                hole_coords = poly_part[1:] if len(poly_part) > 1 else []

                if not exterior_coords or len(exterior_coords) < 3:
                    continue

                exterior = [(p[1], p[0]) for p in exterior_coords]

                if len(exterior) <= 10:
                    exterior_simplified = exterior
                else:
                    exterior_simplified = simplify_polygon(exterior, tolerance=polygon_tolerance)
                    if len(exterior_simplified) < 3:
                        exterior_simplified = exterior

                if len(exterior_simplified) >= 3:
                    all_exteriors.append(exterior_simplified)

                if hole_coords:
                    for hole_coord in hole_coords:
                        hole = [(p[1], p[0]) for p in hole_coord]
                        if len(hole) >= 3:
                            if len(hole) <= 10:
                                all_holes.append(hole)
                            else:
                                hole_simpl = simplify_polygon(hole, tolerance=polygon_tolerance)
                                if len(hole_simpl) >= 3:
                                    all_holes.append(hole_simpl)
                                elif len(hole) >= 3:
                                    all_holes.append(hole)

            if all_exteriors:
                polygons[country_id] = {
                    'exterior': all_exteriors[0],
                    'holes': all_holes if all_holes else [],
                    'multi': True,
                    'exteriors': all_exteriors
                }

        # Extract metadata
        metadata[country_id] = {
            'name': country_name,
            'iso2': properties.get('ISO_A2', properties.get('iso_a2', '')),
            'iso3': properties.get('ISO_A3', properties.get('iso_a3', '')),
            'continent': properties.get('CONTINENT', properties.get('continent', '')),
            'timezone': properties.get('TIMEZONE', properties.get('timezone', ''))
        }

        # Validate stored polygon
        if country_id in polygons:
            polygon_data = polygons[country_id]
            exterior = polygon_data.get('exterior', [])
            is_multi = polygon_data.get('multi', False)

            if is_multi:
                exteriors = polygon_data.get('exteriors', [])
                if not exteriors or all(len(ext) < 3 for ext in exteriors):
                    skipped_countries.append(f"{country_name} (invalid MultiPolygon)")
                    del polygons[country_id]
            elif not exterior or len(exterior) < 3:
                skipped_countries.append(f"{country_name} (invalid polygon: {len(exterior)} vertices)")
                del polygons[country_id]

    print()  # New line after progress
    if skipped_countries:
        print(f"\n⚠ Skipped {len(skipped_countries)} countries with invalid geometry:")
        for country in skipped_countries[:15]:
            print(f" - {country}")
        if len(skipped_countries) > 15:
            print(f" ... and {len(skipped_countries) - 15} more")

    print(f"\n✓ Processed {len(polygons)} countries with valid polygons")

    # Build geohash index from processed polygons (optimized)
    print("\nBuilding geohash index with PIP validation...")
    geohash_index = build_geohash_index_from_polygons(
        polygons,
        geohash_precision=geohash_precision,
        validate_with_pip=True
    )

    return geohash_index, polygons, metadata


def main():
    """CLI entry point for data builder."""
    if len(sys.argv) < 3:
        print("Usage: python -m geo_intel_offline.data_builder <source.geojson> <output_dir> [tolerance] [precision]")
        sys.exit(1)

    source_path = Path(sys.argv[1])
    output_dir = Path(sys.argv[2])
    polygon_tolerance = float(sys.argv[3]) if len(sys.argv) > 3 else 0.005
    geohash_precision = int(sys.argv[4]) if len(sys.argv) > 4 else 6

    if not source_path.exists():
        print(f"Error: Source file not found: {source_path}")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 70)
    print("GEO_INTEL_OFFLINE - DATA BUILDER")
    print("=" * 70)
    print(f"Source: {source_path}")
    print(f"Output: {output_dir}")
    print(f"Polygon tolerance: {polygon_tolerance}°")
    print(f"Geohash precision: {geohash_precision}")
    print()

    # Process GeoJSON
    geohash_index, polygons, metadata = process_geojson(
        source_path,
        polygon_tolerance=polygon_tolerance,
        geohash_precision=geohash_precision
    )

    # Save data files (both uncompressed and compressed)
    print("\nSaving data files...")

    import gzip

    # Save uncompressed (for compatibility)
    print(" Saving uncompressed JSON files...")
    with open(output_dir / 'geohash_index.json', 'w', encoding='utf-8') as f:
        json.dump(geohash_index, f, separators=(',', ':'))

    with open(output_dir / 'polygons.json', 'w', encoding='utf-8') as f:
        json.dump(polygons, f, separators=(',', ':'))

    with open(output_dir / 'metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, separators=(',', ':'), ensure_ascii=False)

    # Save compressed versions (smaller file size)
    print(" Saving compressed JSON files (gzip)...")
    with gzip.open(output_dir / 'geohash_index.json.gz', 'wt', encoding='utf-8', compresslevel=9) as f:
        json.dump(geohash_index, f, separators=(',', ':'))

    with gzip.open(output_dir / 'polygons.json.gz', 'wt', encoding='utf-8', compresslevel=9) as f:
        json.dump(polygons, f, separators=(',', ':'))

    with gzip.open(output_dir / 'metadata.json.gz', 'wt', encoding='utf-8', compresslevel=9) as f:
        json.dump(metadata, f, separators=(',', ':'), ensure_ascii=False)

    # Show file sizes
    print()
    print("File sizes:")
    for filename in ['geohash_index.json', 'polygons.json', 'metadata.json']:
        json_file = output_dir / filename
        gzip_file = output_dir / f"{filename}.gz"
        if json_file.exists() and gzip_file.exists():
            json_size = json_file.stat().st_size / 1024 / 1024  # MB
            gzip_size = gzip_file.stat().st_size / 1024 / 1024  # MB
            ratio = (gzip_size / json_size) * 100 if json_size > 0 else 0
            print(f" {filename}: {json_size:.2f} MB -> {gzip_size:.2f} MB ({ratio:.1f}%)")

    print()
    print("=" * 70)
    print("BUILD COMPLETE")
    print("=" * 70)
    print(f"✓ Generated {len(geohash_index)} geohashes")
    print(f"✓ Processed {len(polygons)} countries")
    print(f"✓ Files saved to: {output_dir}")
    print()


if __name__ == '__main__':
    main()
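Putting the module together, main() wires the pipeline to a small CLI. A hedged invocation sketch follows; the file names are placeholders, and the three .gz outputs correspond to the files under geo_intel_offline/data/ listed at the top of this diff:

# Hypothetical invocation of the builder's CLI; paths are placeholders.
import subprocess

subprocess.run([
    "python", "-m", "geo_intel_offline.data_builder",
    "countries.geojson",       # source GeoJSON (e.g. exported from Natural Earth)
    "geo_intel_offline/data",  # output directory
    "0.005",                   # optional polygon tolerance in degrees (default 0.005)
    "6",                       # optional geohash precision (default 6)
], check=True)
# Writes geohash_index.json(.gz), polygons.json(.gz) and metadata.json(.gz)
# into the output directory.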
geo_intel_offline/data_builder_minimal.py
@@ -0,0 +1,173 @@
"""
Minimal test data generator for development/testing.

Creates a small test dataset with a few countries for testing without
requiring full Natural Earth data.

Usage:
    python -m geo_intel_offline.data_builder_minimal <output_dir>
"""

import json
import sys
from pathlib import Path
from .geohash import encode


def create_minimal_test_data():
    """
    Create minimal test data with a few countries.

    Returns:
        Tuple of (geohash_index, polygons, metadata)
    """
    # Define a few test countries with simple square polygons
    # Coordinates: (lat, lon)

    # United States (rough bounding box)
    usa_exterior = [
        (49.0, -125.0),  # NW
        (49.0, -66.0),   # NE
        (25.0, -66.0),   # SE
        (25.0, -125.0),  # SW
    ]

    # United Kingdom (rough bounding box)
    uk_exterior = [
        (60.0, -8.0),  # NW
        (60.0, 2.0),   # NE
        (50.0, 2.0),   # SE
        (50.0, -8.0),  # SW
    ]

    # Japan (rough bounding box)
    japan_exterior = [
        (45.0, 129.0),  # NW
        (45.0, 146.0),  # NE
        (31.0, 146.0),  # SE
        (31.0, 129.0),  # SW
    ]

    # France (rough bounding box)
    france_exterior = [
        (51.0, -5.0),  # NW
        (51.0, 10.0),  # NE
        (42.0, 10.0),  # SE
        (42.0, -5.0),  # SW
    ]

    # Germany (rough bounding box)
    germany_exterior = [
        (55.0, 6.0),   # NW
        (55.0, 15.0),  # NE
        (47.0, 15.0),  # SE
        (47.0, 6.0),   # SW
    ]

    polygons = {
        1: {"exterior": usa_exterior, "holes": []},
        2: {"exterior": uk_exterior, "holes": []},
        3: {"exterior": japan_exterior, "holes": []},
        4: {"exterior": france_exterior, "holes": []},
        5: {"exterior": germany_exterior, "holes": []},
    }

    metadata = {
        1: {
            "name": "United States",
            "iso2": "US",
            "iso3": "USA",
            "continent": "North America",
            "timezone": "America/New_York"
        },
        2: {
            "name": "United Kingdom",
            "iso2": "GB",
            "iso3": "GBR",
            "continent": "Europe",
            "timezone": "Europe/London"
        },
        3: {
            "name": "Japan",
            "iso2": "JP",
            "iso3": "JPN",
            "continent": "Asia",
            "timezone": "Asia/Tokyo"
        },
        4: {
            "name": "France",
            "iso2": "FR",
            "iso3": "FRA",
            "continent": "Europe",
            "timezone": "Europe/Paris"
        },
        5: {
            "name": "Germany",
            "iso2": "DE",
            "iso3": "DEU",
            "continent": "Europe",
            "timezone": "Europe/Berlin"
        },
    }

    # Build geohash index by sampling bounding boxes
    geohash_index = {}

    for country_id, poly_data in polygons.items():
        exterior = poly_data["exterior"]

        # Get bounding box
        lats = [p[0] for p in exterior]
        lons = [p[1] for p in exterior]

        min_lat, max_lat = min(lats), max(lats)
        min_lon, max_lon = min(lons), max(lons)

        # Sample points
        step = 1.0  # Larger step for test data
        lat = min_lat
        while lat <= max_lat:
            lon = min_lon
            while lon <= max_lon:
                geohash = encode(lat, lon, precision=6)
                if geohash not in geohash_index:
                    geohash_index[geohash] = []
                if country_id not in geohash_index[geohash]:
                    geohash_index[geohash].append(country_id)
                lon += step
            lat += step

    return geohash_index, polygons, metadata


def main():
    """CLI entry point."""
    if len(sys.argv) < 2:
        print("Usage: python -m geo_intel_offline.data_builder_minimal <output_dir>")
        sys.exit(1)

    output_dir = Path(sys.argv[1])
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Generating minimal test data...")
    geohash_index, polygons, metadata = create_minimal_test_data()

    # Save files
    print(f"Writing data files to {output_dir}...")

    with open(output_dir / "geohash_index.json", 'w', encoding='utf-8') as f:
        json.dump(geohash_index, f, separators=(',', ':'))

    with open(output_dir / "polygons.json", 'w', encoding='utf-8') as f:
        json.dump(polygons, f, separators=(',', ':'))

    with open(output_dir / "metadata.json", 'w', encoding='utf-8') as f:
        json.dump(metadata, f, separators=(',', ':'), ensure_ascii=False)

    print(f"Done! Generated {len(geohash_index)} geohashes, {len(polygons)} countries.")
    print("\nNote: This is minimal test data. For production, use data_builder.py")
    print(" with Natural Earth or similar authoritative source data.")


if __name__ == '__main__':
    main()
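For completeness, a sketch of how the minimal dataset behaves once generated (again assuming the installed package; the assertions follow directly from the hard-coded boxes above):

# Illustrative only; exercises the in-memory output of the minimal builder.
from geo_intel_offline.data_builder_minimal import create_minimal_test_data
from geo_intel_offline.geohash import encode

geohash_index, polygons, metadata = create_minimal_test_data()

# (49.0, 2.0) is one of the 1-degree grid points sampled inside the France box.
cell = encode(49.0, 2.0, precision=6)
assert geohash_index[cell] == [4]
assert metadata[4]["iso2"] == "FR"

# Arbitrary coordinates will usually miss: at precision 6 the cells are far
# smaller than the 1-degree sampling grid, which is why this data is only
# meant for testing.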