waze-logs 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
collector_worldwide.py ADDED
@@ -0,0 +1,532 @@
1
+ #!/usr/bin/env python3
2
+ # collector_worldwide.py
3
+ """Worldwide autonomous Waze data collector - all continents."""
4
+
5
+ import hashlib
6
+ import json
7
+ import time
8
+ import os
9
+ import signal
10
+ import yaml
11
+ import logging
12
+ import threading
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ from datetime import datetime, timezone
15
+ from typing import Dict, Any, Optional, List
16
+ from pathlib import Path
17
+
18
# Files used to coordinate with other processes:
# - STATUS_FILE: latest scan progress, consumed by a UI for real-time updates
# - CHECKPOINT_FILE: per-cycle scan progress, used to resume after a restart
STATUS_FILE = "./data/collector_status.json"
CHECKPOINT_FILE = "./data/collector_checkpoint.json"
# Serialize writers of each file across the region-scanner threads.
status_lock = threading.Lock()
checkpoint_lock = threading.Lock()
23
+
24
+
25
def write_status(region: str, cell_name: str, country: str, cell_idx: int, total_cells: int,
                 alerts_count: int, new_count: int,
                 event_types: Optional[List[str]] = None) -> None:
    """Write current collector status to file for UI consumption (thread-safe).

    Best-effort by design: any failure (missing directory, disk full, bad
    data) is swallowed so a status write can never crash collection.

    Args:
        region: Region name currently being scanned.
        cell_name: Grid-cell name of the current scan.
        country: Country code for the cell.
        cell_idx: 1-based position of the cell within the priority list.
        total_cells: Total number of cells at this priority.
        alerts_count: Alerts returned by the API for this cell.
        new_count: Alerts newly inserted into the database.
        event_types: Report types of the newly inserted events, if any.
    """
    try:
        status = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "region": region,
            "cell_name": cell_name,
            "country": country,
            "cell_idx": cell_idx,
            "total_cells": total_cells,
            "alerts_found": alerts_count,
            "new_events": new_count,
            "event_types": event_types or [],
            "status": "scanning"
        }
        # Status writes can happen before run() has created ./data; make sure
        # the directory exists so early updates aren't silently dropped.
        os.makedirs(os.path.dirname(STATUS_FILE), exist_ok=True)
        with status_lock:
            with open(STATUS_FILE, "w") as f:
                json.dump(status, f)
    except Exception:
        pass  # Don't crash on status write failures
46
+
47
+
48
def load_checkpoint() -> Dict[str, Any]:
    """Read the resume checkpoint; fall back to a fresh one on any problem.

    Returns:
        The parsed checkpoint dict, or ``{"cycle": 0, "scanned": {}}`` when
        the file is absent or unreadable.
    """
    fresh = {"cycle": 0, "scanned": {}}
    try:
        if not os.path.exists(CHECKPOINT_FILE):
            return fresh
        with open(CHECKPOINT_FILE, "r") as f:
            return json.load(f)
    except Exception:
        # Corrupt or unreadable checkpoint: start over rather than crash.
        return fresh
57
+
58
+
59
def save_checkpoint(cycle: int, scanned: Dict[str, List[str]]) -> None:
    """Persist scan progress so an interrupted cycle can resume (thread-safe).

    Args:
        cycle: Current cycle counter.
        scanned: Mapping of checkpoint key (e.g. "europe_p1") to the list of
            cell names already scanned in the current cycle.
    """
    try:
        payload = {
            "cycle": cycle,
            "scanned": scanned,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        with checkpoint_lock:
            with open(CHECKPOINT_FILE, "w") as f:
                json.dump(payload, f)
    except Exception:
        # Best-effort: checkpointing must never take down the collector.
        pass
72
+
73
+
74
def clear_checkpoint() -> None:
    """Delete the checkpoint file once a full cycle has finished cleanly."""
    try:
        os.remove(CHECKPOINT_FILE)
    except FileNotFoundError:
        pass  # Already gone - nothing to clear.
    except Exception:
        pass  # Best-effort, mirroring the other checkpoint helpers.
81
+
82
+
83
# Set up logging.
# The FileHandler needs logs/ to exist, and this code runs at import time --
# before WorldwideCollector.run() creates the directory -- so create it here
# to avoid a FileNotFoundError on a fresh install.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('logs/worldwide_collector.log')
    ]
)
logger = logging.getLogger("worldwide")
94
+
95
+
96
def generate_event_hash(username: str, latitude: float, longitude: float,
                        timestamp_ms: int, report_type: str) -> str:
    """Return a 16-hex-char fingerprint that deduplicates a report.

    Reports from the same user, at the same coordinates (rounded to 4
    decimals), of the same type, within the same wall-clock minute all
    map to the same hash.
    """
    minute_bucket = timestamp_ms // 60000
    parts = (username, round(latitude, 4), round(longitude, 4), minute_bucket, report_type)
    key = "|".join(str(part) for part in parts)
    return hashlib.sha256(key.encode()).hexdigest()[:16]
101
+
102
+
103
def process_alert(alert: Dict[str, Any], grid_cell: str) -> Dict[str, Any]:
    """Normalize one raw Waze alert into the flat event-row schema.

    Missing fields get safe defaults ("anonymous" user, 0.0 coordinates,
    current time, "UNKNOWN" type); the untouched payload is kept in
    ``raw_json`` for later inspection.
    """
    who = alert.get("reportBy", "anonymous")
    lat = alert.get("latitude", 0.0)
    lon = alert.get("longitude", 0.0)
    when_ms = alert.get("pubMillis", int(time.time() * 1000))
    kind = alert.get("type", "UNKNOWN")

    when_iso = datetime.fromtimestamp(when_ms / 1000, tz=timezone.utc).isoformat()

    event = {
        "event_hash": generate_event_hash(who, lat, lon, when_ms, kind),
        "username": who,
        "latitude": lat,
        "longitude": lon,
        "timestamp_utc": when_iso,
        "timestamp_ms": when_ms,
        "report_type": kind,
        "subtype": alert.get("subtype"),
        "raw_json": json.dumps(alert),
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "grid_cell": grid_cell,
    }
    return event
128
+
129
+
130
class RegionScanner:
    """Scanner for a specific region.

    Loads grid cells from a YAML config, groups them by priority, and scans
    them via the injected Waze client, inserting deduplicated events into the
    injected database.
    """

    def __init__(self, name: str, config_path: str, db, client):
        # db and client are project-defined objects (Database / WazeClient);
        # only the methods used below (insert_event, upsert_tracked_user,
        # get_traffic_notifications) are relied upon here.
        self.name = name
        self.config_path = config_path
        self.db = db
        self.client = client
        self.logger = logging.getLogger(name)
        # priority (int) -> list of grid-cell dicts loaded from the config
        self.cells_by_priority = {}
        self._load_cells()

    def _load_cells(self):
        """Group the config's grid cells by priority (default priority 2)."""
        with open(self.config_path) as f:
            config = yaml.safe_load(f)

        for cell in config.get("grid_cells", []):
            p = cell.get("priority", 2)
            if p not in self.cells_by_priority:
                self.cells_by_priority[p] = []
            self.cells_by_priority[p].append(cell)

    def get_cell_counts(self) -> Dict[int, int]:
        """Return a mapping of priority -> number of cells at that priority."""
        return {p: len(cells) for p, cells in self.cells_by_priority.items()}

    def scan(self, priority: int, running_flag: Callable[[], bool],
             already_scanned: Optional[set] = None,
             on_cell_scanned: Optional[Callable[[str], None]] = None) -> Dict[str, Any]:
        """Scan cells of given priority, skipping already-scanned cells.

        Args:
            priority: Priority level to scan (1 or 3)
            running_flag: Callable that returns False to stop scanning
            already_scanned: Set of cell names to skip
            on_cell_scanned: Callback called with cell_name after each cell is scanned

        Returns:
            Stats dict with "requests", "errors", "events" (rows inserted),
            "cells" (total cells at this priority) and "scanned_cells"
            (names visited this call, including cells that errored).
        """
        cells = self.cells_by_priority.get(priority, [])
        stats = {"requests": 0, "errors": 0, "events": 0, "cells": len(cells), "scanned_cells": []}
        total_cells = len(cells)
        already_scanned = already_scanned or set()

        # Filter out already-scanned cells. idx keeps its position in the FULL
        # list so "[idx/total]" progress logs stay consistent across resumes.
        remaining_cells = [(idx, cell) for idx, cell in enumerate(cells, 1)
                           if cell["name"] not in already_scanned]

        if len(remaining_cells) < len(cells):
            skipped = len(cells) - len(remaining_cells)
            self.logger.info(f"Resuming: skipping {skipped} already-scanned cells, {len(remaining_cells)} remaining")

        for idx, cell in remaining_cells:
            # Cooperative shutdown: the collector flips this flag on SIGINT/SIGTERM.
            if not running_flag():
                break

            try:
                stats["requests"] += 1
                cell_name = cell["name"]
                country = cell.get("country", "??")

                alerts, _ = self.client.get_traffic_notifications(
                    lat_top=cell["lat_top"],
                    lat_bottom=cell["lat_bottom"],
                    lon_left=cell["lon_left"],
                    lon_right=cell["lon_right"]
                )

                new_count = 0
                new_types = []
                for alert in alerts:
                    event = process_alert(alert, cell_name)
                    # NOTE(review): insert_event presumably returns falsy for
                    # duplicates (event_hash dedup) -- confirm against Database.
                    if self.db.insert_event(event):
                        new_count += 1
                        new_types.append(event["report_type"])
                        self.db.upsert_tracked_user(event["username"], event["timestamp_utc"])

                stats["events"] += new_count
                stats["scanned_cells"].append(cell_name)

                # Notify callback that cell was scanned (for checkpoint saving)
                if on_cell_scanned:
                    on_cell_scanned(cell_name)

                # Only log and write status when there are alerts or new events
                if len(alerts) > 0 or new_count > 0:
                    type_summary = ""
                    if new_types:
                        from collections import Counter
                        counts = Counter(new_types)
                        type_summary = " | " + ", ".join(f"{t}:{c}" for t, c in counts.most_common(3))

                    status = f"+{new_count}" if new_count > 0 else "0"
                    self.logger.info(f"[{idx:3}/{total_cells}] {cell_name:25} ({country}) -> {len(alerts):3} alerts, {status} new{type_summary}")

                    # Write status for real-time UI updates
                    write_status(
                        region=self.name,
                        cell_name=cell_name,
                        country=country,
                        cell_idx=idx,
                        total_cells=total_cells,
                        alerts_count=len(alerts),
                        new_count=new_count,
                        event_types=new_types
                    )

            except Exception as e:
                stats["errors"] += 1
                stats["scanned_cells"].append(cell["name"])  # Mark as scanned even on error
                self.logger.error(f"[{idx:3}/{total_cells}] {cell['name']:25} -> ERROR: {e}")

        return stats
239
+
240
+
241
class WorldwideCollector:
    """Autonomous worldwide Waze data collector.

    Owns one RegionScanner + Database + WazeClient per region and runs an
    endless cycle loop: parallel P1 (city) scans every cycle, a full P3
    (coverage) scan every 10th cycle, with per-cell checkpointing so an
    interrupted cycle resumes where it left off.
    """

    # (region name, YAML grid config path, SQLite database path)
    REGIONS = [
        ("europe", "config_europe.yaml", "./data/waze_europe.db"),
        ("americas", "config_americas.yaml", "./data/waze_americas.db"),
        ("asia", "config_asia.yaml", "./data/waze_asia.db"),
        ("oceania", "config_oceania.yaml", "./data/waze_oceania.db"),
        ("africa", "config_africa.yaml", "./data/waze_africa.db"),
    ]

    def __init__(self):
        self.running = False
        self.pid_file = "collector_worldwide.pid"
        self.scanners = {}
        self.databases = {}
        self.clients = {}

    def _generate_all_configs(self):
        """Generate all regional configs (only those missing on disk)."""
        configs = [
            ("europe_grid", "save_europe_config"),
            ("americas_grid", "save_americas_config"),
            ("asia_grid", "save_asia_config"),
            ("oceania_grid", "save_oceania_config"),
            ("africa_grid", "save_africa_config"),
        ]

        for module_name, func_name in configs:
            config_file = f"config_{module_name.replace('_grid', '')}.yaml"
            if not os.path.exists(config_file):
                logger.info(f"Generating {config_file}...")
                # Lazy import so missing grid modules only fail when needed.
                module = __import__(module_name)
                getattr(module, func_name)()

    def _save_pid(self):
        """Write this process's PID so external tooling can find/stop us."""
        with open(self.pid_file, "w") as f:
            f.write(str(os.getpid()))

    def _remove_pid(self):
        """Remove the PID file on shutdown, if present."""
        if os.path.exists(self.pid_file):
            os.remove(self.pid_file)

    @staticmethod
    def get_pid() -> Optional[int]:
        """Return the PID of a live collector process, or None.

        NOTE(review): the filename is hard-coded here, duplicating
        self.pid_file from __init__ -- keep the two in sync.
        """
        if os.path.exists("collector_worldwide.pid"):
            with open("collector_worldwide.pid") as f:
                pid = int(f.read().strip())
            try:
                # Signal 0 only probes that the process exists.
                os.kill(pid, 0)
                return pid
            except OSError:
                return None
        return None

    def run(self):
        """Main worldwide collection loop.

        Blocks until a SIGINT/SIGTERM flips self.running; always removes the
        PID file and closes databases on the way out.
        """
        from database import Database
        from waze_client import WazeClient

        # Create directories
        Path("data").mkdir(exist_ok=True)
        Path("logs").mkdir(exist_ok=True)

        # Generate configs
        self._generate_all_configs()

        # Initialize scanners for each region
        logger.info("=" * 70)
        logger.info("WORLDWIDE WAZE COLLECTOR")
        logger.info("Covering: Europe, Americas, Asia, Oceania, Africa")
        logger.info("=" * 70)

        total_p1 = 0
        total_p3 = 0

        for region_name, config_path, db_path in self.REGIONS:
            if not os.path.exists(config_path):
                logger.warning(f"Config not found: {config_path}, skipping {region_name}")
                continue

            db = Database(db_path, check_same_thread=False)  # Thread-safe for parallel scanning
            client = WazeClient()

            scanner = RegionScanner(region_name, config_path, db, client)
            self.scanners[region_name] = scanner
            self.databases[region_name] = db
            self.clients[region_name] = client

            counts = scanner.get_cell_counts()
            p1 = counts.get(1, 0)
            p3 = counts.get(3, 0)
            total_p1 += p1
            total_p3 += p3

            logger.info(f" {region_name.upper():10} - P1 (cities): {p1:4}, P3 (coverage): {p3:4}")

        logger.info("-" * 70)
        logger.info(f" {'TOTAL':10} - P1 (cities): {total_p1:4}, P3 (coverage): {total_p3:4}")
        logger.info(f" {'':10} Grand total: {total_p1 + total_p3} grid cells")
        logger.info("=" * 70)
        logger.info("Collection strategy (MULTITHREADED):")
        logger.info(" - All regions scanned in PARALLEL for P1 (city) scans")
        logger.info(" - Full P3 (coverage) scan every 10 cycles (parallel)")
        logger.info(" - 10 second pause between cycles")
        logger.info("=" * 70)

        self.running = True
        self._save_pid()

        def handle_signal(signum, frame):
            # Cooperative shutdown: scanners poll self.running between cells.
            logger.info("Shutdown signal received...")
            self.running = False

        signal.signal(signal.SIGINT, handle_signal)
        signal.signal(signal.SIGTERM, handle_signal)

        region_names = list(self.scanners.keys())

        # Load checkpoint to resume from where we left off
        checkpoint = load_checkpoint()
        cycle = checkpoint.get("cycle", 0)
        scanned_cells = checkpoint.get("scanned", {})

        if cycle > 0:
            logger.info(f"Resuming from checkpoint: cycle {cycle}")
            for key, cells in scanned_cells.items():
                logger.info(f" {key}: {len(cells)} cells already scanned")

        def scan_region(region_name: str, priority: int, today: str, already_scanned: set,
                        checkpoint_key: str) -> Dict[str, Any]:
            """Scan a single region (runs in thread)."""
            scanner = self.scanners[region_name]
            db = self.databases[region_name]

            p_count = scanner.get_cell_counts().get(priority, 0)
            if p_count == 0:
                return {"region": region_name, "events": 0, "errors": 0, "requests": 0, "cells": 0, "scanned_cells": []}

            def on_cell_scanned(cell_name):
                """Callback to save checkpoint after each cell (thread-safe)."""
                # checkpoint_lock guards the shared scanned_cells dict, which
                # all region threads mutate concurrently.
                with checkpoint_lock:
                    if checkpoint_key not in scanned_cells:
                        scanned_cells[checkpoint_key] = []
                    scanned_cells[checkpoint_key].append(cell_name)
                save_checkpoint(cycle, scanned_cells)

            stats = scanner.scan(priority, lambda: self.running, already_scanned, on_cell_scanned)

            # Update daily stats (thread-safe - SQLite handles this)
            db.update_daily_stats(
                date=today,
                events=stats["events"],
                requests=stats["requests"],
                errors=stats["errors"],
                cells=stats["cells"]
            )

            return {"region": region_name, **stats}

        try:
            while self.running:
                cycle += 1
                today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

                logger.info(f"\n{'='*50}")
                logger.info(f"CYCLE {cycle} (PARALLEL MODE)")
                logger.info(f"{'='*50}")

                # Parallel P1 scan - all regions at once
                logger.info(f"Starting parallel P1 scan across {len(region_names)} regions...")
                total_events = 0
                total_errors = 0
                cycle_complete = True

                with ThreadPoolExecutor(max_workers=len(region_names)) as executor:
                    futures = {}
                    for region in region_names:
                        key = f"{region}_p1"
                        already_scanned = set(scanned_cells.get(key, []))
                        futures[executor.submit(scan_region, region, 1, today, already_scanned, key)] = (region, key)

                    for future in as_completed(futures):
                        region, key = futures[future]
                        try:
                            result = future.result()
                            total_events += result["events"]
                            total_errors += result["errors"]

                            # Checkpoint is saved per-cell by callback, no need to save here

                            if result["events"] > 0 or result["errors"] > 0:
                                logger.info(f" [{region.upper()}] +{result['events']} events, {result['errors']} errors")
                        except Exception as e:
                            logger.error(f" [{region.upper()}] Thread error: {e}")
                            # A failed region keeps its checkpoint so it can resume.
                            cycle_complete = False

                logger.info(f"P1 cycle complete: +{total_events} total events, {total_errors} errors")

                # Clear P1 checkpoint data after successful cycle
                if cycle_complete:
                    for region in region_names:
                        scanned_cells.pop(f"{region}_p1", None)
                    save_checkpoint(cycle, scanned_cells)

                # Full coverage scan every 10 cycles (also parallel)
                if cycle % 10 == 0 and self.running:
                    logger.info("\n--- FULL COVERAGE SCAN (PARALLEL) ---")
                    total_p3_events = 0

                    with ThreadPoolExecutor(max_workers=len(region_names)) as executor:
                        futures = {}
                        for region in region_names:
                            key = f"{region}_p3"
                            already_scanned = set(scanned_cells.get(key, []))
                            futures[executor.submit(scan_region, region, 3, today, already_scanned, key)] = (region, key)

                        for future in as_completed(futures):
                            region, key = futures[future]
                            try:
                                result = future.result()
                                total_p3_events += result["events"]

                                # Checkpoint is saved per-cell by callback, no need to save here

                                if result["events"] > 0:
                                    logger.info(f" [{region.upper()}] +{result['events']} events")
                            except Exception as e:
                                logger.error(f" [{region.upper()}] Thread error: {e}")

                    logger.info(f"P3 coverage complete: +{total_p3_events} total events")

                    # Clear P3 checkpoint data after successful coverage scan
                    for region in region_names:
                        scanned_cells.pop(f"{region}_p3", None)
                    save_checkpoint(cycle, scanned_cells)

                # Print summary every 5 cycles
                if cycle % 5 == 0:
                    logger.info("\n--- DATABASE SUMMARY ---")
                    for region_name, db in self.databases.items():
                        result = db.execute(
                            "SELECT COUNT(*) as events, COUNT(DISTINCT username) as users FROM events"
                        ).fetchone()
                        logger.info(f" {region_name.upper():10}: {result[0]:,} events, {result[1]:,} users")

                # Wait between cycles (shorter since parallel is faster)
                if self.running:
                    time.sleep(10)

        except Exception as e:
            logger.error(f"Fatal error: {e}", exc_info=True)
            raise
        finally:
            # Always drop the PID file and close databases, even on crash.
            self._remove_pid()
            for db in self.databases.values():
                db.close()
            logger.info("Worldwide collector stopped.")
499
+
500
+
501
def main():
    """CLI entry point: handle --generate-configs / --status, else run the collector."""
    import argparse
    parser = argparse.ArgumentParser(description="Worldwide Waze Data Collector")
    parser.add_argument("--generate-configs", action="store_true", help="Generate all configs and exit")
    parser.add_argument("--status", action="store_true", help="Show collector status")
    opts = parser.parse_args()

    if opts.generate_configs:
        from europe_grid import save_europe_config
        from americas_grid import save_americas_config
        from asia_grid import save_asia_config
        from oceania_grid import save_oceania_config
        from africa_grid import save_africa_config

        # Regenerate every regional grid config unconditionally.
        for save_config in (save_europe_config, save_americas_config, save_asia_config,
                            save_oceania_config, save_africa_config):
            save_config()
        return

    if opts.status:
        pid = WorldwideCollector.get_pid()
        state = f"Running (PID {pid})" if pid else "Stopped"
        print(f"Worldwide Collector: {state}")
        return

    WorldwideCollector().run()
529
+
530
+
531
# Script entry point: run the CLI when executed directly.
if __name__ == "__main__":
    main()