waze_logs-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis.py +91 -0
- cli.py +1219 -0
- collector.py +193 -0
- collector_europe.py +312 -0
- collector_worldwide.py +532 -0
- database.py +176 -0
- waze_client.py +234 -0
- waze_logs-1.0.0.dist-info/METADATA +411 -0
- waze_logs-1.0.0.dist-info/RECORD +15 -0
- waze_logs-1.0.0.dist-info/WHEEL +5 -0
- waze_logs-1.0.0.dist-info/entry_points.txt +2 -0
- waze_logs-1.0.0.dist-info/licenses/LICENSE +21 -0
- waze_logs-1.0.0.dist-info/top_level.txt +8 -0
- web/app.py +536 -0
- web/templates/index.html +1241 -0
collector_worldwide.py
ADDED
@@ -0,0 +1,532 @@
#!/usr/bin/env python3
# collector_worldwide.py
"""Worldwide autonomous Waze data collector - all continents."""

import hashlib
import json
import time
import os
import signal
import yaml
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from typing import Dict, Any, Optional, List, Callable
from pathlib import Path

# Status file for real-time UI updates
STATUS_FILE = "./data/collector_status.json"
CHECKPOINT_FILE = "./data/collector_checkpoint.json"
status_lock = threading.Lock()
# Re-entrant on purpose: on_cell_scanned (defined in run() below) holds this
# lock while calling save_checkpoint, which acquires it again; a plain Lock
# would deadlock on the first scanned cell.
checkpoint_lock = threading.RLock()

def write_status(region: str, cell_name: str, country: str, cell_idx: int, total_cells: int,
                 alerts_count: int, new_count: int, event_types: Optional[List[str]] = None):
    """Write current collector status to file for UI consumption (thread-safe)."""
    try:
        status = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "region": region,
            "cell_name": cell_name,
            "country": country,
            "cell_idx": cell_idx,
            "total_cells": total_cells,
            "alerts_found": alerts_count,
            "new_events": new_count,
            "event_types": event_types or [],
            "status": "scanning"
        }
        with status_lock:
            with open(STATUS_FILE, "w") as f:
                json.dump(status, f)
    except Exception:
        pass  # Don't crash on status write failures


def load_checkpoint() -> Dict[str, Any]:
    """Load checkpoint from file. Returns a fresh, empty checkpoint if none exists."""
    try:
        if os.path.exists(CHECKPOINT_FILE):
            with open(CHECKPOINT_FILE, "r") as f:
                return json.load(f)
    except Exception:
        pass
    return {"cycle": 0, "scanned": {}}


def save_checkpoint(cycle: int, scanned: Dict[str, List[str]]):
    """Save checkpoint to file (thread-safe)."""
    try:
        checkpoint = {
            "cycle": cycle,
            "scanned": scanned,
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
        with checkpoint_lock:
            with open(CHECKPOINT_FILE, "w") as f:
                json.dump(checkpoint, f)
    except Exception:
        pass


def clear_checkpoint():
    """Clear the checkpoint file when a cycle completes."""
    try:
        if os.path.exists(CHECKPOINT_FILE):
            os.remove(CHECKPOINT_FILE)
    except Exception:
        pass

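# What the checkpoint looks like on disk (illustrative values, not taken from
# the package):
#   {"cycle": 7,
#    "scanned": {"europe_p1": ["cell_a", "cell_b"]},
#    "timestamp": "2024-01-01T12:00:00+00:00"}
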
# Set up logging
Path("logs").mkdir(parents=True, exist_ok=True)  # FileHandler below needs the directory at import time
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('logs/worldwide_collector.log')
    ]
)
logger = logging.getLogger("worldwide")

def generate_event_hash(username: str, latitude: float, longitude: float,
                        timestamp_ms: int, report_type: str) -> str:
    """Stable 16-hex-char ID: same user, place, and type within one minute dedupes."""
    timestamp_minute = timestamp_ms // 60000
    data = f"{username}|{round(latitude, 4)}|{round(longitude, 4)}|{timestamp_minute}|{report_type}"
    return hashlib.sha256(data.encode()).hexdigest()[:16]

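# Illustration (values invented): coordinates are rounded to 4 decimals and
# timestamps bucketed per minute, so these two calls yield the same hash --
#   generate_event_hash("some_user", 48.85661, 2.35222, 120_000, "JAM")
#   generate_event_hash("some_user", 48.85663, 2.35224, 179_999, "JAM")
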
def process_alert(alert: Dict[str, Any], grid_cell: str) -> Dict[str, Any]:
    """Normalize a raw Waze alert dict into a flat event record."""
    username = alert.get("reportBy", "anonymous")
    latitude = alert.get("latitude", 0.0)
    longitude = alert.get("longitude", 0.0)
    timestamp_ms = alert.get("pubMillis", int(time.time() * 1000))
    report_type = alert.get("type", "UNKNOWN")
    subtype = alert.get("subtype")

    timestamp_utc = datetime.fromtimestamp(
        timestamp_ms / 1000, tz=timezone.utc
    ).isoformat()

    return {
        "event_hash": generate_event_hash(username, latitude, longitude, timestamp_ms, report_type),
        "username": username,
        "latitude": latitude,
        "longitude": longitude,
        "timestamp_utc": timestamp_utc,
        "timestamp_ms": timestamp_ms,
        "report_type": report_type,
        "subtype": subtype,
        "raw_json": json.dumps(alert),
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "grid_cell": grid_cell
    }

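# Note: alerts missing pubMillis are stamped with the collection time, so the
# same alert seen on two scans can hash differently and be stored twice.
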
class RegionScanner:
    """Scanner for a specific region."""

    def __init__(self, name: str, config_path: str, db, client):
        self.name = name
        self.config_path = config_path
        self.db = db
        self.client = client
        self.logger = logging.getLogger(name)
        self.cells_by_priority = {}
        self._load_cells()

    def _load_cells(self):
        with open(self.config_path) as f:
            config = yaml.safe_load(f)

        for cell in config.get("grid_cells", []):
            p = cell.get("priority", 2)
            if p not in self.cells_by_priority:
                self.cells_by_priority[p] = []
            self.cells_by_priority[p].append(cell)

    def get_cell_counts(self) -> Dict[int, int]:
        return {p: len(cells) for p, cells in self.cells_by_priority.items()}

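    # Cells are bucketed by their YAML "priority" (default 2). The run loop
    # below only ever requests priorities 1 and 3, so default-priority cells
    # are loaded but never scanned.
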
    def scan(self, priority: int, running_flag, already_scanned: Optional[set] = None,
             on_cell_scanned: Optional[Callable[[str], None]] = None) -> Dict[str, Any]:
        """Scan cells of the given priority, skipping already-scanned cells.

        Args:
            priority: Priority level to scan (1 or 3)
            running_flag: Callable that returns False to stop scanning
            already_scanned: Set of cell names to skip
            on_cell_scanned: Callback invoked with cell_name after each cell is scanned
        """
        cells = self.cells_by_priority.get(priority, [])
        stats = {"requests": 0, "errors": 0, "events": 0, "cells": len(cells), "scanned_cells": []}
        total_cells = len(cells)
        already_scanned = already_scanned or set()

        # Filter out already-scanned cells
        remaining_cells = [(idx, cell) for idx, cell in enumerate(cells, 1)
                           if cell["name"] not in already_scanned]

        if len(remaining_cells) < len(cells):
            skipped = len(cells) - len(remaining_cells)
            self.logger.info(f"Resuming: skipping {skipped} already-scanned cells, {len(remaining_cells)} remaining")

        for idx, cell in remaining_cells:
            if not running_flag():
                break

            try:
                stats["requests"] += 1
                cell_name = cell["name"]
                country = cell.get("country", "??")

                alerts, _ = self.client.get_traffic_notifications(
                    lat_top=cell["lat_top"],
                    lat_bottom=cell["lat_bottom"],
                    lon_left=cell["lon_left"],
                    lon_right=cell["lon_right"]
                )

                new_count = 0
                new_types = []
                for alert in alerts:
                    event = process_alert(alert, cell_name)
                    if self.db.insert_event(event):
                        new_count += 1
                        new_types.append(event["report_type"])
                        self.db.upsert_tracked_user(event["username"], event["timestamp_utc"])

                stats["events"] += new_count
                stats["scanned_cells"].append(cell_name)

                # Notify callback that the cell was scanned (for checkpoint saving)
                if on_cell_scanned:
                    on_cell_scanned(cell_name)

                # Only log and write status when there are alerts or new events
                if len(alerts) > 0 or new_count > 0:
                    type_summary = ""
                    if new_types:
                        from collections import Counter
                        counts = Counter(new_types)
                        type_summary = " | " + ", ".join(f"{t}:{c}" for t, c in counts.most_common(3))

                    status = f"+{new_count}" if new_count > 0 else "0"
                    self.logger.info(f"[{idx:3}/{total_cells}] {cell_name:25} ({country}) -> {len(alerts):3} alerts, {status} new{type_summary}")

                    # Write status for real-time UI updates
                    write_status(
                        region=self.name,
                        cell_name=cell_name,
                        country=country,
                        cell_idx=idx,
                        total_cells=total_cells,
                        alerts_count=len(alerts),
                        new_count=new_count,
                        event_types=new_types
                    )

            except Exception as e:
                stats["errors"] += 1
                stats["scanned_cells"].append(cell["name"])  # Mark as scanned even on error
                if on_cell_scanned:
                    on_cell_scanned(cell["name"])  # Keep the checkpoint consistent with the line above
                self.logger.error(f"[{idx:3}/{total_cells}] {cell['name']:25} -> ERROR: {e}")

        return stats

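# For reference, scan() returns a stats dict shaped like (illustrative values):
#   {"requests": 42, "errors": 1, "events": 17, "cells": 42,
#    "scanned_cells": ["cell_a", "cell_b", ...]}
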
class WorldwideCollector:
    """Autonomous worldwide Waze data collector."""

    REGIONS = [
        ("europe", "config_europe.yaml", "./data/waze_europe.db"),
        ("americas", "config_americas.yaml", "./data/waze_americas.db"),
        ("asia", "config_asia.yaml", "./data/waze_asia.db"),
        ("oceania", "config_oceania.yaml", "./data/waze_oceania.db"),
        ("africa", "config_africa.yaml", "./data/waze_africa.db"),
    ]

    def __init__(self):
        self.running = False
        self.pid_file = "collector_worldwide.pid"
        self.scanners = {}
        self.databases = {}
        self.clients = {}

    def _generate_all_configs(self):
        """Generate any regional configs that are missing."""
        configs = [
            ("europe_grid", "save_europe_config"),
            ("americas_grid", "save_americas_config"),
            ("asia_grid", "save_asia_config"),
            ("oceania_grid", "save_oceania_config"),
            ("africa_grid", "save_africa_config"),
        ]

        for module_name, func_name in configs:
            config_file = f"config_{module_name.replace('_grid', '')}.yaml"
            if not os.path.exists(config_file):
                logger.info(f"Generating {config_file}...")
                module = __import__(module_name)
                getattr(module, func_name)()

    def _save_pid(self):
        with open(self.pid_file, "w") as f:
            f.write(str(os.getpid()))

    def _remove_pid(self):
        if os.path.exists(self.pid_file):
            os.remove(self.pid_file)

    @staticmethod
    def get_pid() -> Optional[int]:
        if os.path.exists("collector_worldwide.pid"):
            with open("collector_worldwide.pid") as f:
                pid = int(f.read().strip())
            try:
                os.kill(pid, 0)  # Signal 0 probes liveness without delivering anything
                return pid
            except OSError:
                return None
        return None

    def run(self):
        """Main worldwide collection loop."""
        from database import Database
        from waze_client import WazeClient

        # Create directories
        Path("data").mkdir(exist_ok=True)
        Path("logs").mkdir(exist_ok=True)

        # Generate configs
        self._generate_all_configs()

        # Initialize scanners for each region
        logger.info("=" * 70)
        logger.info("WORLDWIDE WAZE COLLECTOR")
        logger.info("Covering: Europe, Americas, Asia, Oceania, Africa")
        logger.info("=" * 70)

        total_p1 = 0
        total_p3 = 0

        for region_name, config_path, db_path in self.REGIONS:
            if not os.path.exists(config_path):
                logger.warning(f"Config not found: {config_path}, skipping {region_name}")
                continue

            db = Database(db_path, check_same_thread=False)  # Thread-safe for parallel scanning
            client = WazeClient()

            scanner = RegionScanner(region_name, config_path, db, client)
            self.scanners[region_name] = scanner
            self.databases[region_name] = db
            self.clients[region_name] = client

            counts = scanner.get_cell_counts()
            p1 = counts.get(1, 0)
            p3 = counts.get(3, 0)
            total_p1 += p1
            total_p3 += p3

            logger.info(f" {region_name.upper():10} - P1 (cities): {p1:4}, P3 (coverage): {p3:4}")

        logger.info("-" * 70)
        logger.info(f" {'TOTAL':10} - P1 (cities): {total_p1:4}, P3 (coverage): {total_p3:4}")
        logger.info(f" {'':10} Grand total: {total_p1 + total_p3} grid cells")
        logger.info("=" * 70)
        logger.info("Collection strategy (MULTITHREADED):")
        logger.info(" - All regions scanned in PARALLEL for P1 (city) scans")
        logger.info(" - Full P3 (coverage) scan every 10 cycles (parallel)")
        logger.info(" - 10 second pause between cycles")
        logger.info("=" * 70)

        self.running = True
        self._save_pid()

        def handle_signal(signum, frame):
            logger.info("Shutdown signal received...")
            self.running = False

        signal.signal(signal.SIGINT, handle_signal)
        signal.signal(signal.SIGTERM, handle_signal)

        region_names = list(self.scanners.keys())

        # Load checkpoint to resume from where we left off
        checkpoint = load_checkpoint()
        cycle = checkpoint.get("cycle", 0)
        scanned_cells = checkpoint.get("scanned", {})

        if cycle > 0:
            logger.info(f"Resuming from checkpoint: cycle {cycle}")
            for key, cells in scanned_cells.items():
                logger.info(f" {key}: {len(cells)} cells already scanned")

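        # Checkpoint keys are "<region>_p1" / "<region>_p3"; each entry lists the
        # cells finished in the current pass and is dropped once that pass completes.
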
        def scan_region(region_name: str, priority: int, today: str, already_scanned: set,
                        checkpoint_key: str) -> Dict[str, Any]:
            """Scan a single region (runs in a worker thread)."""
            scanner = self.scanners[region_name]
            db = self.databases[region_name]

            p_count = scanner.get_cell_counts().get(priority, 0)
            if p_count == 0:
                return {"region": region_name, "events": 0, "errors": 0, "requests": 0, "cells": 0, "scanned_cells": []}

            def on_cell_scanned(cell_name):
                """Record the cell in the shared checkpoint after each cell (thread-safe)."""
                with checkpoint_lock:  # Re-entrant, so the nested acquire in save_checkpoint is safe
                    if checkpoint_key not in scanned_cells:
                        scanned_cells[checkpoint_key] = []
                    scanned_cells[checkpoint_key].append(cell_name)
                    save_checkpoint(cycle, scanned_cells)

            stats = scanner.scan(priority, lambda: self.running, already_scanned, on_cell_scanned)

            # Update daily stats (thread-safe - SQLite handles this)
            db.update_daily_stats(
                date=today,
                events=stats["events"],
                requests=stats["requests"],
                errors=stats["errors"],
                cells=stats["cells"]
            )

            return {"region": region_name, **stats}

        try:
            while self.running:
                cycle += 1
                today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

                logger.info(f"\n{'='*50}")
                logger.info(f"CYCLE {cycle} (PARALLEL MODE)")
                logger.info(f"{'='*50}")

                # Parallel P1 scan - all regions at once
                logger.info(f"Starting parallel P1 scan across {len(region_names)} regions...")
                total_events = 0
                total_errors = 0
                cycle_complete = True

                with ThreadPoolExecutor(max_workers=len(region_names)) as executor:
                    futures = {}
                    for region in region_names:
                        key = f"{region}_p1"
                        already_scanned = set(scanned_cells.get(key, []))
                        futures[executor.submit(scan_region, region, 1, today, already_scanned, key)] = (region, key)

                    for future in as_completed(futures):
                        region, key = futures[future]
                        try:
                            result = future.result()
                            total_events += result["events"]
                            total_errors += result["errors"]

                            # Checkpoint is saved per-cell by the callback, no need to save here

                            if result["events"] > 0 or result["errors"] > 0:
                                logger.info(f" [{region.upper()}] +{result['events']} events, {result['errors']} errors")
                        except Exception as e:
                            logger.error(f" [{region.upper()}] Thread error: {e}")
                            cycle_complete = False

                logger.info(f"P1 cycle complete: +{total_events} total events, {total_errors} errors")

                # Clear P1 checkpoint data after a successful, uninterrupted cycle
                # (a shutdown mid-scan must keep its resume data)
                if cycle_complete and self.running:
                    for region in region_names:
                        scanned_cells.pop(f"{region}_p1", None)
                    save_checkpoint(cycle, scanned_cells)

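                # Both passes size the pool to one worker per region: the five
                # regions run in parallel, while each region scans its own
                # cells sequentially.
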
                # Full coverage scan every 10 cycles (also parallel)
                if cycle % 10 == 0 and self.running:
                    logger.info("\n--- FULL COVERAGE SCAN (PARALLEL) ---")
                    total_p3_events = 0

                    with ThreadPoolExecutor(max_workers=len(region_names)) as executor:
                        futures = {}
                        for region in region_names:
                            key = f"{region}_p3"
                            already_scanned = set(scanned_cells.get(key, []))
                            futures[executor.submit(scan_region, region, 3, today, already_scanned, key)] = (region, key)

                        for future in as_completed(futures):
                            region, key = futures[future]
                            try:
                                result = future.result()
                                total_p3_events += result["events"]

                                # Checkpoint is saved per-cell by the callback, no need to save here

                                if result["events"] > 0:
                                    logger.info(f" [{region.upper()}] +{result['events']} events")
                            except Exception as e:
                                logger.error(f" [{region.upper()}] Thread error: {e}")

                    logger.info(f"P3 coverage complete: +{total_p3_events} total events")

                    # Clear P3 checkpoint data after the coverage scan, unless we
                    # are shutting down mid-scan and need the resume data
                    if self.running:
                        for region in region_names:
                            scanned_cells.pop(f"{region}_p3", None)
                        save_checkpoint(cycle, scanned_cells)

                # Print summary every 5 cycles
                if cycle % 5 == 0:
                    logger.info("\n--- DATABASE SUMMARY ---")
                    for region_name, db in self.databases.items():
                        result = db.execute(
                            "SELECT COUNT(*) as events, COUNT(DISTINCT username) as users FROM events"
                        ).fetchone()
                        logger.info(f" {region_name.upper():10}: {result[0]:,} events, {result[1]:,} users")

                # Wait between cycles (shorter since parallel is faster)
                if self.running:
                    time.sleep(10)

        except Exception as e:
            logger.error(f"Fatal error: {e}", exc_info=True)
            raise
        finally:
            self._remove_pid()
            for db in self.databases.values():
                db.close()
            logger.info("Worldwide collector stopped.")

def main():
    import argparse
    parser = argparse.ArgumentParser(description="Worldwide Waze Data Collector")
    parser.add_argument("--generate-configs", action="store_true", help="Generate all configs and exit")
    parser.add_argument("--status", action="store_true", help="Show collector status")
    args = parser.parse_args()

    if args.generate_configs:
        from europe_grid import save_europe_config
        from americas_grid import save_americas_config
        from asia_grid import save_asia_config
        from oceania_grid import save_oceania_config
        from africa_grid import save_africa_config

        save_europe_config()
        save_americas_config()
        save_asia_config()
        save_oceania_config()
        save_africa_config()
        return

    if args.status:
        pid = WorldwideCollector.get_pid()
        print(f"Worldwide Collector: {'Running (PID ' + str(pid) + ')' if pid else 'Stopped'}")
        return

    collector = WorldwideCollector()
    collector.run()


if __name__ == "__main__":
    main()
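Note: the module above leans on database.Database (insert_event, upsert_tracked_user, update_daily_stats, execute, close) and waze_client.WazeClient.get_traffic_notifications, both shipped in this wheel but not shown in this diff. For orientation only, a minimal sketch (not part of the package, assuming the wheel and its dependencies are installed) that drives the one self-contained helper, process_alert, by hand; every field value and the cell name are invented:

# sketch.py - illustrative only; the alert values below are made up
from collector_worldwide import process_alert

sample_alert = {
    "reportBy": "some_user",
    "latitude": 48.8566,
    "longitude": 2.3522,
    "pubMillis": 1700000000000,
    "type": "JAM",
    "subtype": "JAM_HEAVY_TRAFFIC",
}

event = process_alert(sample_alert, grid_cell="example_cell")
print(event["event_hash"])     # 16-hex-char key, presumably what insert_event dedupes on
print(event["timestamp_utc"])  # 2023-11-14T22:13:20+00:00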