earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/statistics.py
@@ -0,0 +1,677 @@
+ """Catalog statistics collection for EarthCatalog ingestion pipeline.
+
+ This module provides comprehensive statistics tracking during STAC item ingestion,
+ enabling detailed insights into catalog composition, spatial/temporal distribution,
+ data quality, and processing performance without requiring post-hoc queries.
+
+ Key Features:
+ - HyperLogLog for memory-efficient approximate unique counting
+ - Spatial distribution tracking (items per cell, hotspots)
+ - Temporal distribution (items per year/month)
+ - Overhead metrics (spanning items, duplication ratio)
+ - Data quality metrics (null geometries, missing datetimes)
+ - Processing performance metrics
+
+ Memory Efficiency:
+     Uses the HyperLogLog algorithm for unique counting, requiring only ~16KB
+     at the default precision to track billions of unique items with a typical
+     error under 1%. This allows accurate overhead calculation even for
+     100M+ item catalogs.
+
+ Thread Safety:
+     The IngestionStatistics class is designed for single-threaded use within
+     each worker. For distributed processing, each worker maintains its own
+     statistics instance; the instances are merged at the end of processing.
+
+ Example:
+     >>> stats = IngestionStatistics()
+     >>> for item in items:
+     ...     stats.record_item(item, tiles=["abc123"], is_spanning=False)
+     >>> summary = stats.get_summary()
+     >>> print(f"Unique items: {summary['unique_granules']}")
+ """
+
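A minimal end-to-end sketch of the flow the docstring describes, using a single synthetic STAC-like item rather than a real catalog; the `earthcatalog.statistics` import path is inferred from the file listing above, and the tile ID and mission name are made up.

    from earthcatalog.statistics import IngestionStatistics

    # One synthetic STAC-like item; real items come from the ingestion pipeline.
    item = {
        "id": "S2A_MSIL2A_20210601T101031_R022_T32TQM",
        "geometry": {"type": "Point", "coordinates": [11.5, 46.1]},
        "properties": {"datetime": "2021-06-01T10:10:31Z"},
    }

    stats = IngestionStatistics()
    stats.start_processing()
    stats.record_item(item, tiles=["32TQM"], is_spanning=False, mission="sentinel-2")
    stats.finish_processing()

    summary = stats.get_summary()
    print(summary["unique_granules"])       # 1 (HyperLogLog estimate of distinct IDs)
    print(summary["spatial"]["bbox"])       # [11.5, 46.1, 11.5, 46.1]
    print(summary["temporal"]["earliest"])  # ISO 8601 string, or None if no datetimes were seen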
+ import hashlib
+ import math
+ import time
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime
+ from typing import Any
+
+
+ class HyperLogLog:
+     """Memory-efficient probabilistic cardinality estimator.
+
+     Implements the HyperLogLog algorithm for approximate distinct counting.
+     Uses only ~16KB of memory at the default precision to estimate cardinality
+     with roughly 1% error, regardless of the number of unique elements.
+
+     The algorithm works by:
+     1. Hashing each element to a 64-bit value
+     2. Partitioning hash space into 2^p buckets (default p=14 = 16384 buckets)
+     3. Tracking the maximum number of leading zeros seen in each bucket
+     4. Using harmonic mean of bucket estimates for final cardinality
+
+     Attributes:
+         p: Precision parameter (default 14, giving 16384 registers)
+         m: Number of registers (2^p)
+         registers: Array of maximum leading zero counts per bucket
+
+     Example:
+         >>> hll = HyperLogLog(precision=14)
+         >>> for item_id in item_ids:
+         ...     hll.add(item_id)
+         >>> estimated_unique = hll.count()
+         >>> print(f"Approximately {estimated_unique} unique items")
+
+     Error Rate:
+         Standard error is approximately 1.04 / sqrt(m)
+         With p=14 (m=16384): error ≈ 0.81% (typically <2% in practice)
+
+     Memory Usage:
+         Uses 1 byte per register = 2^p bytes
+         p=14: 16KB, p=12: 4KB, p=10: 1KB
+     """
+
+     def __init__(self, precision: int = 14):
+         """Initialize HyperLogLog with specified precision.
+
+         Args:
+             precision: Number of bits for bucket addressing (4-16).
+                 Higher precision = more accuracy but more memory.
+                 Default 14 gives ~0.8% error with 16KB memory.
+         """
+         if not 4 <= precision <= 16:
+             raise ValueError("Precision must be between 4 and 16")
+
+         self.p = precision
+         self.m = 1 << precision  # 2^p registers
+         self.registers = [0] * self.m
+
+         # Alpha constant for bias correction (depends on m)
+         if self.m >= 128:
+             self.alpha = 0.7213 / (1 + 1.079 / self.m)
+         elif self.m == 64:
+             self.alpha = 0.709
+         elif self.m == 32:
+             self.alpha = 0.697
+         else:
+             self.alpha = 0.673
+
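A small editorial sketch (not part of the package) of the precision/accuracy/memory trade-off quoted in the docstrings, using the standard-error formula 1.04/sqrt(m) and the idealized one-byte-per-register figure.

    import math

    for p in (10, 12, 14, 16):
        m = 1 << p                       # number of registers
        std_error = 1.04 / math.sqrt(m)  # standard error of the estimate
        ideal_bytes = m                  # 1 byte per register (idealized)
        print(f"p={p}: m={m}, error ~{std_error:.2%}, ~{ideal_bytes / 1024:.0f} KB")
    # p=14 -> 16384 registers, error ~0.81%, ~16 KB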
+     def _hash(self, value: str) -> int:
+         """Hash a string value to a 64-bit integer."""
+         # Use MD5 for speed and reasonable distribution (not for security)
+         h = hashlib.md5(value.encode("utf-8"), usedforsecurity=False).digest()
+         return int.from_bytes(h[:8], "big")
+
+     def _leading_zeros(self, value: int, max_bits: int = 64) -> int:
+         """Count leading zeros in the binary representation."""
+         if value == 0:
+             return max_bits
+         return max_bits - value.bit_length()
+
+     def add(self, value: str) -> None:
+         """Add an element to the HyperLogLog.
+
+         Args:
+             value: String value to add (typically an item ID)
+         """
+         h = self._hash(value)
+
+         # Use first p bits for bucket index
+         bucket = h >> (64 - self.p)
+
+         # Use remaining bits for leading zero count
+         remaining = h & ((1 << (64 - self.p)) - 1)
+         zeros = self._leading_zeros(remaining, 64 - self.p) + 1
+
+         # Update register if this is a new maximum
+         self.registers[bucket] = max(self.registers[bucket], zeros)
+
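To make the bucket/rank split in add() concrete, here is a standalone walk-through for one value at the default precision of 14. This is an editorial sketch that mirrors the logic above rather than calling the class.

    import hashlib

    p = 14
    value = "item-0001"

    # 64-bit prefix of an MD5 hash, mirroring HyperLogLog._hash()
    h = int.from_bytes(hashlib.md5(value.encode("utf-8")).digest()[:8], "big")

    # First p bits select one of 2^p registers
    bucket = h >> (64 - p)

    # The remaining 50 bits determine the rank: leading zeros + 1
    remaining = h & ((1 << (64 - p)) - 1)
    rank = (64 - p) - remaining.bit_length() + 1 if remaining else (64 - p) + 1

    print(bucket, rank)  # register index, and the value stored if it is a new maximum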
+     def count(self) -> int:
+         """Estimate the cardinality (number of unique elements).
+
+         Returns:
+             Estimated number of unique elements added.
+         """
+         # Compute harmonic mean of 2^register values
+         indicator = sum(2.0 ** (-r) for r in self.registers)
+         estimate = self.alpha * self.m * self.m / indicator
+
+         # Apply small range correction (linear counting)
+         if estimate <= 2.5 * self.m:
+             zeros = self.registers.count(0)
+             if zeros > 0:
+                 estimate = self.m * math.log(self.m / zeros)
+
+         # Apply large range correction
+         elif estimate > (1 << 32) / 30:
+             estimate = -(1 << 32) * math.log(1 - estimate / (1 << 32))
+
+         return int(estimate)
+
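A quick sanity check of the quoted ~0.8% error at the default precision; this is an editorial sketch rather than a shipped test, and the item IDs are synthetic.

    from earthcatalog.statistics import HyperLogLog

    hll = HyperLogLog(precision=14)
    true_count = 100_000
    for i in range(true_count):
        hll.add(f"granule-{i:07d}")

    estimate = hll.count()
    rel_error = abs(estimate - true_count) / true_count
    print(estimate, f"{rel_error:.2%}")  # typically within ~1-2% of 100000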
+     def merge(self, other: "HyperLogLog") -> None:
+         """Merge another HyperLogLog into this one.
+
+         Useful for combining statistics from multiple workers.
+
+         Args:
+             other: Another HyperLogLog instance with same precision
+         """
+         if self.p != other.p:
+             raise ValueError("Cannot merge HyperLogLogs with different precision")
+
+         for i in range(self.m):
+             self.registers[i] = max(self.registers[i], other.registers[i])
+
+
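merge() takes a register-wise maximum, so the merged estimator approximates the cardinality of the union of the two input streams, which is what makes per-worker counting safe. A short sketch, with the same assumed import path as above:

    from earthcatalog.statistics import HyperLogLog

    a, b = HyperLogLog(precision=14), HyperLogLog(precision=14)
    for i in range(50_000):
        a.add(f"id-{i}")
    for i in range(25_000, 75_000):  # overlaps a by 25,000 IDs
        b.add(f"id-{i}")

    a.merge(b)
    print(a.count())  # ~75,000: the size of the union, not the 100,000 total adds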
+ @dataclass
+ class IngestionStatistics:
+     """Comprehensive statistics collector for STAC ingestion pipeline.
+
+     Tracks all relevant metrics during ingestion with minimal memory overhead.
+     Designed to be used within a single worker process and merged after
+     distributed processing completes.
+
+     Categories of statistics tracked:
+     - Core counts: stored references, unique granules (via HyperLogLog)
+     - Overhead: spanning items, duplication metrics
+     - Spatial: items per cell, hotspots, bounding box
+     - Temporal: items per year/month, temporal extent
+     - Quality: null geometries, missing datetimes, geometry types
+     - Missions: items per dataset/collection
+     - Processing: timing, failures, throughput
+
+     Example:
+         >>> stats = IngestionStatistics()
+         >>> stats.start_processing()
+         >>> for item in items:
+         ...     tiles = grid.tiles_for_geometry(item['geometry'])
+         ...     stats.record_item(item, tiles, is_spanning=len(tiles)>1)
+         >>> stats.finish_processing()
+         >>> summary = stats.get_summary()
+     """
+
+     # HyperLogLog for unique counting (precision 14 = ~0.8% error, 16KB memory)
+     unique_ids: HyperLogLog = field(default_factory=lambda: HyperLogLog(precision=14))
+
+     # Core counts
+     stored_references: int = 0
+     items_routed_to_global: int = 0
+
+     # Spanning metrics
+     spanning_items_count: int = 0
+     total_tiles_for_spanning: int = 0  # Sum of tiles for all spanning items
+     max_tiles_per_item: int = 0
+     tiles_per_spanning_histogram: dict[int, int] = field(default_factory=lambda: defaultdict(int))
+
+     # Spatial distribution
+     items_per_cell: dict[str, int] = field(default_factory=lambda: defaultdict(int))
+     bbox_min_lon: float = 180.0
+     bbox_max_lon: float = -180.0
+     bbox_min_lat: float = 90.0
+     bbox_max_lat: float = -90.0
+
+     # Temporal distribution (year -> month -> count)
+     items_per_year_month: dict[str, dict[str, int]] = field(
+         default_factory=lambda: defaultdict(lambda: defaultdict(int))
+     )
+     earliest_datetime: datetime | None = None
+     latest_datetime: datetime | None = None
+
+     # Data quality
+     null_geometry_count: int = 0
+     missing_datetime_count: int = 0
+     geometry_types: dict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # Mission breakdown
+     items_per_mission: dict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # Processing metrics
+     start_time: float | None = None
+     end_time: float | None = None
+     urls_processed: int = 0
+     urls_failed: int = 0
+
+     # File statistics (populated during consolidation)
+     file_sizes: list[int] = field(default_factory=list)
+     items_per_file: list[int] = field(default_factory=list)
+
+     # Consolidation metrics
+     duplicates_removed: int = 0
+     new_items: int = 0
+     existing_items: int = 0
+
+     def start_processing(self) -> None:
+         """Mark the start of processing for timing metrics."""
+         self.start_time = time.time()
+
+     def finish_processing(self) -> None:
+         """Mark the end of processing for timing metrics."""
+         self.end_time = time.time()
+
+     def record_url_processed(self, success: bool = True) -> None:
+         """Record a URL processing attempt.
+
+         Args:
+             success: Whether the URL was successfully processed
+         """
+         self.urls_processed += 1
+         if not success:
+             self.urls_failed += 1
+
+     def record_item(
+         self,
+         item: dict[str, Any],
+         tiles: list[str],
+         is_spanning: bool,
+         routed_to_global: bool = False,
+         mission: str | None = None,
+     ) -> None:
+         """Record statistics for a single STAC item.
+
+         Args:
+             item: STAC item dictionary
+             tiles: List of tile IDs this item maps to
+             is_spanning: Whether item spans multiple tiles
+             routed_to_global: Whether item was routed to global partition
+             mission: Extracted mission/dataset name
+         """
+         item_id = item.get("id", "")
+
+         # Track unique IDs via HyperLogLog
+         if item_id:
+             self.unique_ids.add(item_id)
+
+         # Count stored reference (this counts each tile assignment)
+         if routed_to_global:
+             self.stored_references += 1
+             self.items_routed_to_global += 1
+         else:
+             # Item stored once per tile it intersects
+             self.stored_references += len(tiles) if tiles else 1
+
+         # Track spanning metrics
+         if is_spanning and not routed_to_global:
+             self.spanning_items_count += 1
+             tile_count = len(tiles)
+             self.total_tiles_for_spanning += tile_count
+             self.max_tiles_per_item = max(self.max_tiles_per_item, tile_count)
+
+             # Bucket into histogram keyed by range upper bound: 1, 2, 5 (3-5), 10 (6-10), 20 (11-20), 50 (21-50), 100 (>50)
+             if tile_count <= 2:
+                 bucket = tile_count
+             elif tile_count <= 5:
+                 bucket = 5
+             elif tile_count <= 10:
+                 bucket = 10
+             elif tile_count <= 20:
+                 bucket = 20
+             elif tile_count <= 50:
+                 bucket = 50
+             else:
+                 bucket = 100  # >50 tiles
+             self.tiles_per_spanning_histogram[bucket] += 1
+
+         # Track spatial distribution
+         for tile in tiles:
+             self.items_per_cell[tile] += 1
+
+         # Update bounding box from geometry
+         geometry = item.get("geometry")
+         if geometry:
+             self._update_bbox_from_geometry(geometry)
+             geom_type = geometry.get("type", "Unknown")
+             self.geometry_types[geom_type] += 1
+         else:
+             self.null_geometry_count += 1
+
+         # Track temporal distribution
+         props = item.get("properties", {})
+         dt_str = props.get("datetime")
+         if dt_str:
+             self._record_datetime(dt_str)
+         else:
+             self.missing_datetime_count += 1
+
+         # Track mission
+         if mission:
+             self.items_per_mission[mission] += 1
+
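To illustrate how stored_references, the spanning histogram, and items_per_cell move together, here is an editorial sketch with one spanning item assigned to three hypothetical tile IDs:

    from earthcatalog.statistics import IngestionStatistics

    stats = IngestionStatistics()
    item = {
        "id": "granule-spanning-001",
        "geometry": {
            "type": "Polygon",
            "coordinates": [[[9.0, 45.0], [12.0, 45.0], [12.0, 47.0], [9.0, 47.0], [9.0, 45.0]]],
        },
        "properties": {"datetime": "2022-03-15T09:30:00Z"},
    }
    tiles = ["cell-a", "cell-b", "cell-c"]           # hypothetical grid cell IDs
    stats.record_item(item, tiles, is_spanning=True)

    print(stats.stored_references)                   # 3: one reference per intersected tile
    print(stats.spanning_items_count)                # 1
    print(dict(stats.tiles_per_spanning_histogram))  # {5: 1}: 3 tiles falls in the 3-5 bucket
    print(dict(stats.items_per_cell))                # {'cell-a': 1, 'cell-b': 1, 'cell-c': 1}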
+     def _update_bbox_from_geometry(self, geometry: dict[str, Any]) -> None:
+         """Update bounding box from GeoJSON geometry."""
+         try:
+             # Extract coordinates based on geometry type
+             geom_type = geometry.get("type", "")
+             coords = geometry.get("coordinates", [])
+
+             if not coords:
+                 return
+
+             # Flatten coordinates to list of [lon, lat] pairs
+             flat_coords = self._flatten_coordinates(coords, geom_type)
+
+             for lon, lat in flat_coords:
+                 self.bbox_min_lon = min(self.bbox_min_lon, lon)
+                 self.bbox_max_lon = max(self.bbox_max_lon, lon)
+                 self.bbox_min_lat = min(self.bbox_min_lat, lat)
+                 self.bbox_max_lat = max(self.bbox_max_lat, lat)
+         except (TypeError, ValueError, KeyError, IndexError):
+             # Silently ignore malformed geometries that can't be processed
+             pass
+
+     def _flatten_coordinates(self, coords: Any, geom_type: str) -> list[tuple[float, float]]:
+         """Flatten nested coordinate arrays to list of (lon, lat) tuples."""
+         result = []
+
+         if geom_type == "Point":
+             if len(coords) >= 2:
+                 result.append((coords[0], coords[1]))
+         elif geom_type == "LineString":
+             for coord in coords:
+                 if len(coord) >= 2:
+                     result.append((coord[0], coord[1]))
+         elif geom_type == "Polygon":
+             for ring in coords:
+                 for coord in ring:
+                     if len(coord) >= 2:
+                         result.append((coord[0], coord[1]))
+         elif geom_type == "MultiPoint":
+             for coord in coords:
+                 if len(coord) >= 2:
+                     result.append((coord[0], coord[1]))
+         elif geom_type == "MultiLineString":
+             for line in coords:
+                 for coord in line:
+                     if len(coord) >= 2:
+                         result.append((coord[0], coord[1]))
+         elif geom_type == "MultiPolygon":
+             for polygon in coords:
+                 for ring in polygon:
+                     for coord in ring:
+                         if len(coord) >= 2:
+                             result.append((coord[0], coord[1]))
+
+         return result
+
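A short editorial check of the geometry flattening and running bbox update for a Polygon; it pokes at a private helper purely for illustration, and the coordinates are arbitrary.

    from earthcatalog.statistics import IngestionStatistics

    stats = IngestionStatistics()
    polygon = {
        "type": "Polygon",
        "coordinates": [[[-10.0, 35.0], [5.0, 35.0], [5.0, 48.5], [-10.0, 48.5], [-10.0, 35.0]]],
    }
    stats._update_bbox_from_geometry(polygon)

    # Running extent becomes [min_lon, min_lat, max_lon, max_lat] once a geometry is seen
    print(stats.bbox_min_lon, stats.bbox_min_lat, stats.bbox_max_lon, stats.bbox_max_lat)
    # -10.0 35.0 5.0 48.5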
+     def _record_datetime(self, dt_str: str) -> None:
+         """Record datetime for temporal statistics.
+
+         Uses datetime.fromisoformat() for fast parsing of ISO 8601 strings.
+         Strings that cannot be parsed are counted as missing datetimes.
+         """
+         try:
+             # Fast path: datetime.fromisoformat() is ~100x faster than pd.to_datetime()
+             # Normalize the 'Z' suffix (fromisoformat() only accepts it natively on Python 3.11+)
+             if dt_str.endswith("Z"):
+                 dt_str = dt_str[:-1] + "+00:00"
+             dt = datetime.fromisoformat(dt_str)
+
+             # Update temporal extent
+             if self.earliest_datetime is None or dt < self.earliest_datetime:
+                 self.earliest_datetime = dt
+             if self.latest_datetime is None or dt > self.latest_datetime:
+                 self.latest_datetime = dt
+
+             # Track by year/month
+             year = str(dt.year)
+             month = f"{dt.month:02d}"
+             self.items_per_year_month[year][month] += 1
+
+         except (ValueError, TypeError):
+             # Unparseable or inconsistent (naive vs. aware) datetimes are counted as missing
+             self.missing_datetime_count += 1
+
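The 'Z' normalization above is what lets the common STAC datetime form parse with the standard library; a two-line illustration of the same transformation:

    from datetime import datetime

    dt_str = "2021-06-01T10:10:31Z"
    dt = datetime.fromisoformat(dt_str[:-1] + "+00:00")  # same normalization as _record_datetime()
    print(dt.year, f"{dt.month:02d}")  # 2021 06 -> the year/month keys used for the histogram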
+     def record_file(self, file_size_bytes: int, item_count: int) -> None:
+         """Record statistics for a written file.
+
+         Args:
+             file_size_bytes: Size of the written file in bytes
+             item_count: Number of items in the file
+         """
+         self.file_sizes.append(file_size_bytes)
+         self.items_per_file.append(item_count)
+
+     def record_consolidation(self, new_items: int, existing_items: int, duplicates_removed: int) -> None:
+         """Record consolidation statistics.
+
+         Args:
+             new_items: Number of new items added
+             existing_items: Number of items that existed before
+             duplicates_removed: Number of duplicate items removed
+         """
+         self.new_items += new_items
+         self.existing_items += existing_items
+         self.duplicates_removed += duplicates_removed
+
+     def merge(self, other: "IngestionStatistics") -> None:
+         """Merge statistics from another instance.
+
+         Used to combine statistics from multiple workers in distributed processing.
+
+         Args:
+             other: Another IngestionStatistics instance to merge
+         """
+         # Merge HyperLogLog
+         self.unique_ids.merge(other.unique_ids)
+
+         # Merge counts
+         self.stored_references += other.stored_references
+         self.items_routed_to_global += other.items_routed_to_global
+         self.spanning_items_count += other.spanning_items_count
+         self.total_tiles_for_spanning += other.total_tiles_for_spanning
+         self.max_tiles_per_item = max(self.max_tiles_per_item, other.max_tiles_per_item)
+
+         # Merge histograms
+         for bucket, count in other.tiles_per_spanning_histogram.items():
+             self.tiles_per_spanning_histogram[bucket] += count
+
+         # Merge spatial distribution
+         for cell, count in other.items_per_cell.items():
+             self.items_per_cell[cell] += count
+
+         # Merge bbox
+         self.bbox_min_lon = min(self.bbox_min_lon, other.bbox_min_lon)
+         self.bbox_max_lon = max(self.bbox_max_lon, other.bbox_max_lon)
+         self.bbox_min_lat = min(self.bbox_min_lat, other.bbox_min_lat)
+         self.bbox_max_lat = max(self.bbox_max_lat, other.bbox_max_lat)
+
+         # Merge temporal distribution
+         for year, months in other.items_per_year_month.items():
+             for month, count in months.items():
+                 self.items_per_year_month[year][month] += count
+
+         # Merge temporal extent
+         if other.earliest_datetime:
+             if self.earliest_datetime is None or other.earliest_datetime < self.earliest_datetime:
+                 self.earliest_datetime = other.earliest_datetime
+         if other.latest_datetime:
+             if self.latest_datetime is None or other.latest_datetime > self.latest_datetime:
+                 self.latest_datetime = other.latest_datetime
+
+         # Merge quality metrics
+         self.null_geometry_count += other.null_geometry_count
+         self.missing_datetime_count += other.missing_datetime_count
+         for geom_type, count in other.geometry_types.items():
+             self.geometry_types[geom_type] += count
+
+         # Merge mission counts
+         for mission, count in other.items_per_mission.items():
+             self.items_per_mission[mission] += count
+
+         # Merge processing metrics
+         self.urls_processed += other.urls_processed
+         self.urls_failed += other.urls_failed
+
+         # Merge file statistics
+         self.file_sizes.extend(other.file_sizes)
+         self.items_per_file.extend(other.items_per_file)
+
+         # Merge consolidation metrics
+         self.new_items += other.new_items
+         self.existing_items += other.existing_items
+         self.duplicates_removed += other.duplicates_removed
+
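A sketch of the per-worker pattern described in the module docstring: each worker keeps its own IngestionStatistics and the driver merges them before calling get_summary(). The process_batch helper and the toy batches are placeholders for whatever the pipeline actually distributes.

    from earthcatalog.statistics import IngestionStatistics

    def process_batch(items):
        """Hypothetical worker: records one batch into its own statistics instance."""
        local = IngestionStatistics()
        for item, tiles in items:
            local.record_item(item, tiles, is_spanning=len(tiles) > 1)
        return local

    # Toy stand-in for the real work distribution (e.g. Dask or multiprocessing batches)
    batches = [
        [({"id": f"item-{w}-{i}", "geometry": None, "properties": {}}, ["cell-a"]) for i in range(3)]
        for w in range(2)
    ]

    worker_stats = [process_batch(batch) for batch in batches]

    combined = IngestionStatistics()
    for ws in worker_stats:
        combined.merge(ws)  # registers, counters, bbox, and histograms all combine

    summary = combined.get_summary()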
+     def get_summary(self) -> dict[str, Any]:
+         """Generate comprehensive statistics summary for schema output.
+
+         Returns:
+             Dictionary containing all collected statistics formatted for
+             inclusion in the catalog schema JSON.
+         """
+         unique_granules = self.unique_ids.count()
+
+         # Calculate derived metrics
+         duplication_ratio = self.stored_references / unique_granules if unique_granules > 0 else 1.0
+         overhead_percentage = (
+             ((self.stored_references - unique_granules) / unique_granules) * 100 if unique_granules > 0 else 0.0
+         )
+         spanning_percentage = (self.spanning_items_count / unique_granules) * 100 if unique_granules > 0 else 0.0
+         avg_tiles_per_spanning = (
+             self.total_tiles_for_spanning / self.spanning_items_count if self.spanning_items_count > 0 else 0.0
+         )
+
+         # Calculate spatial statistics
+         cell_counts = list(self.items_per_cell.values())
+         spatial_stats = self._calculate_distribution_stats(cell_counts)
+
+         # Get hotspot cells (top 10 by count)
+         sorted_cells = sorted(self.items_per_cell.items(), key=lambda x: x[1], reverse=True)[:10]
+         hotspot_cells = [{"cell": cell, "count": count} for cell, count in sorted_cells]
+
+         # Calculate file statistics
+         file_size_stats = self._calculate_distribution_stats(self.file_sizes)
+         items_per_file_stats = self._calculate_distribution_stats(self.items_per_file)
+
+         # Calculate processing metrics
+         duration = self.end_time - self.start_time if self.start_time and self.end_time else 0.0
+         items_per_second = unique_granules / duration if duration > 0 else 0.0
+
+         # Format temporal distribution
+         temporal_distribution = {}
+         for year in sorted(self.items_per_year_month.keys()):
+             year_data: dict[str, Any] = {"total": 0, "months": {}}
+             for month in sorted(self.items_per_year_month[year].keys()):
+                 count = self.items_per_year_month[year][month]
+                 year_data["months"][month] = count
+                 year_data["total"] += count
+             temporal_distribution[year] = year_data
+
+         # Build summary
+         summary = {
+             # Core counts
+             "unique_granules": unique_granules,
+             "stored_references": self.stored_references,
+             "total_partitions": len(self.items_per_cell),
+             "total_files": len(self.items_per_file) if self.items_per_file else len(self.items_per_cell),
+             # Overhead metrics
+             "overhead": {
+                 "spanning_items": self.spanning_items_count,
+                 "spanning_percentage": round(spanning_percentage, 2),
+                 "duplication_ratio": round(duplication_ratio, 3),
+                 "overhead_percentage": round(overhead_percentage, 2),
+                 "avg_tiles_per_spanning_item": round(avg_tiles_per_spanning, 2),
+                 "max_tiles_per_item": self.max_tiles_per_item,
+                 "tiles_distribution": dict(self.tiles_per_spanning_histogram),
+             },
+             # Global partition metrics
+             "global_partition": {
+                 "items_routed_to_global": self.items_routed_to_global,
+                 "percentage_global": round(
+                     (self.items_routed_to_global / unique_granules) * 100 if unique_granules > 0 else 0.0, 2
+                 ),
+             },
+             # Spatial distribution
+             "spatial": {
+                 "bbox": (
+                     [
+                         round(self.bbox_min_lon, 6),
+                         round(self.bbox_min_lat, 6),
+                         round(self.bbox_max_lon, 6),
+                         round(self.bbox_max_lat, 6),
+                     ]
+                     if self.bbox_min_lon <= self.bbox_max_lon
+                     else None
+                 ),
+                 "cells_with_data": len(self.items_per_cell),
+                 "items_per_cell": spatial_stats,
+                 "hotspot_cells": hotspot_cells,
+             },
+             # Temporal distribution
+             "temporal": {
+                 # Normalize "+00:00" to "Z"; appending "Z" would produce "...+00:00Z" for aware datetimes
+                 "earliest": (self.earliest_datetime.isoformat().replace("+00:00", "Z") if self.earliest_datetime else None),
+                 "latest": (self.latest_datetime.isoformat().replace("+00:00", "Z") if self.latest_datetime else None),
+                 "years_with_data": sorted(self.items_per_year_month.keys()),
+                 "distribution": temporal_distribution,
+             },
+             # Data quality
+             "quality": {
+                 "null_geometries": self.null_geometry_count,
+                 "missing_datetime": self.missing_datetime_count,
+                 "geometry_types": dict(self.geometry_types),
+             },
+             # Mission breakdown
+             "missions": dict(self.items_per_mission),
+             # File statistics
+             "files": {
+                 "total_count": len(self.items_per_file) if self.items_per_file else 0,
+                 "total_size_bytes": sum(self.file_sizes) if self.file_sizes else None,
+                 "size_stats": file_size_stats if self.file_sizes else None,
+                 "items_per_file": items_per_file_stats if self.items_per_file else None,
+             },
+             # Processing metrics
+             "processing": {
+                 "run_timestamp": (datetime.now(UTC).isoformat().replace("+00:00", "Z")),
+                 "duration_seconds": round(duration, 2),
+                 "urls_processed": self.urls_processed,
+                 "urls_failed": self.urls_failed,
+                 "success_rate": round(
+                     (
+                         ((self.urls_processed - self.urls_failed) / self.urls_processed) * 100
+                         if self.urls_processed > 0
+                         else 100.0
+                     ),
+                     2,
+                 ),
+                 "items_per_second": round(items_per_second, 2),
+                 "new_items": self.new_items,
+                 "existing_items": self.existing_items,
+                 "duplicates_removed": self.duplicates_removed,
+             },
+         }
+
+         return summary
+
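The overhead numbers in the summary are simple ratios of the two core counters; a worked example with made-up totals:

    # Derived overhead metrics, computed the same way as in get_summary()
    unique_granules = 1_000_000    # HyperLogLog estimate of distinct item IDs
    stored_references = 1_150_000  # one row per (item, tile) assignment

    duplication_ratio = stored_references / unique_granules                              # 1.15
    overhead_percentage = (stored_references - unique_granules) / unique_granules * 100  # 15.0
    print(duplication_ratio, overhead_percentage)  # 15% extra rows caused by spanning items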
+     def _calculate_distribution_stats(self, values: list[int] | list[float]) -> dict[str, float] | None:
+         """Calculate distribution statistics for a list of values.
+
+         Args:
+             values: List of numeric values
+
+         Returns:
+             Dictionary with min, max, mean, median, std_dev, or None if empty
+         """
+         if not values:
+             return None
+
+         sorted_values = sorted(values)
+         n = len(sorted_values)
+
+         mean = sum(sorted_values) / n
+
+         # Calculate median
+         if n % 2 == 0:
+             median = (sorted_values[n // 2 - 1] + sorted_values[n // 2]) / 2
+         else:
+             median = sorted_values[n // 2]
+
+         # Calculate standard deviation
+         variance = sum((x - mean) ** 2 for x in sorted_values) / n
+         std_dev = variance**0.5
+
+         return {
+             "min": sorted_values[0],
+             "max": sorted_values[-1],
+             "mean": round(mean, 2),
+             "median": round(median, 2),
+             "std_dev": round(std_dev, 2),
+         }