earthcatalog-0.2.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,677 @@
"""Catalog statistics collection for EarthCatalog ingestion pipeline.

This module provides comprehensive statistics tracking during STAC item ingestion,
enabling detailed insights into catalog composition, spatial/temporal distribution,
data quality, and processing performance without requiring post-hoc queries.

Key Features:
    - HyperLogLog for memory-efficient approximate unique counting
    - Spatial distribution tracking (items per cell, hotspots)
    - Temporal distribution (items per year/month)
    - Overhead metrics (spanning items, duplication ratio)
    - Data quality metrics (null geometries, missing datetimes)
    - Processing performance metrics

Memory Efficiency:
    Uses HyperLogLog algorithm for unique counting, requiring only ~1.5KB
    to track billions of unique items with ~2% error rate. This allows
    accurate overhead calculation even for 100M+ item catalogs.

Thread Safety:
    The IngestionStatistics class is designed for single-threaded use within
    each worker. For distributed processing, each worker maintains its own
    statistics instance, which are merged at the end of processing.

Example:
    >>> stats = IngestionStatistics()
    >>> for item in items:
    ...     stats.record_item(item, tiles=["abc123"], is_spanning=False)
    >>> summary = stats.get_summary()
    >>> print(f"Unique items: {summary['unique_granules']}")
"""

import hashlib
import math
import time
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Any

class HyperLogLog:
    """Memory-efficient probabilistic cardinality estimator.

    Implements the HyperLogLog algorithm for approximate distinct counting.
    Uses only ~1.5KB of memory to estimate cardinality with ~2% error rate,
    regardless of the number of unique elements.

    The algorithm works by:
    1. Hashing each element to a 64-bit value
    2. Partitioning hash space into 2^p buckets (default p=14 = 16384 buckets)
    3. Tracking the maximum number of leading zeros seen in each bucket
    4. Using harmonic mean of bucket estimates for final cardinality

    Attributes:
        p: Precision parameter (default 14, giving 16384 registers)
        m: Number of registers (2^p)
        registers: Array of maximum leading zero counts per bucket

    Example:
        >>> hll = HyperLogLog(precision=14)
        >>> for item_id in item_ids:
        ...     hll.add(item_id)
        >>> estimated_unique = hll.count()
        >>> print(f"Approximately {estimated_unique} unique items")

    Error Rate:
        Standard error is approximately 1.04 / sqrt(m)
        With p=14 (m=16384): error ≈ 0.81% (typically <2% in practice)

    Memory Usage:
        Uses 1 byte per register = 2^p bytes
        p=14: 16KB, p=12: 4KB, p=10: 1KB
    """

    def __init__(self, precision: int = 14):
        """Initialize HyperLogLog with specified precision.

        Args:
            precision: Number of bits for bucket addressing (4-16).
                Higher precision = more accuracy but more memory.
                Default 14 gives ~0.8% error with 16KB memory.
        """
        if not 4 <= precision <= 16:
            raise ValueError("Precision must be between 4 and 16")

        self.p = precision
        self.m = 1 << precision  # 2^p registers
        self.registers = [0] * self.m

        # Alpha constant for bias correction (depends on m)
        if self.m >= 128:
            self.alpha = 0.7213 / (1 + 1.079 / self.m)
        elif self.m == 64:
            self.alpha = 0.709
        elif self.m == 32:
            self.alpha = 0.697
        else:
            self.alpha = 0.673

    def _hash(self, value: str) -> int:
        """Hash a string value to a 64-bit integer."""
        # Use MD5 for speed and reasonable distribution (not for security)
        h = hashlib.md5(value.encode("utf-8"), usedforsecurity=False).digest()
        return int.from_bytes(h[:8], "big")

    def _leading_zeros(self, value: int, max_bits: int = 64) -> int:
        """Count leading zeros in the binary representation."""
        if value == 0:
            return max_bits
        return max_bits - value.bit_length()

    def add(self, value: str) -> None:
        """Add an element to the HyperLogLog.

        Args:
            value: String value to add (typically an item ID)
        """
        h = self._hash(value)

        # Use first p bits for bucket index
        bucket = h >> (64 - self.p)

        # Use remaining bits for leading zero count
        remaining = h & ((1 << (64 - self.p)) - 1)
        zeros = self._leading_zeros(remaining, 64 - self.p) + 1

        # Update register if this is a new maximum
        self.registers[bucket] = max(self.registers[bucket], zeros)

    def count(self) -> int:
        """Estimate the cardinality (number of unique elements).

        Returns:
            Estimated number of unique elements added.
        """
        # Compute harmonic mean of 2^register values
        indicator = sum(2.0 ** (-r) for r in self.registers)
        estimate = self.alpha * self.m * self.m / indicator

        # Apply small range correction (linear counting)
        if estimate <= 2.5 * self.m:
            zeros = self.registers.count(0)
            if zeros > 0:
                estimate = self.m * math.log(self.m / zeros)

        # Apply large range correction
        elif estimate > (1 << 32) / 30:
            estimate = -(1 << 32) * math.log(1 - estimate / (1 << 32))

        return int(estimate)

    def merge(self, other: "HyperLogLog") -> None:
        """Merge another HyperLogLog into this one.

        Useful for combining statistics from multiple workers.

        Args:
            other: Another HyperLogLog instance with same precision
        """
        if self.p != other.p:
            raise ValueError("Cannot merge HyperLogLogs with different precision")

        for i in range(self.m):
            self.registers[i] = max(self.registers[i], other.registers[i])

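For a sense of how the estimator above behaves, here is a minimal standalone sketch (editorial illustration only, not part of the packaged file) that exercises the add(), count(), and merge() methods defined in this class; the granule ID strings and counts are invented for the example:

# Illustrative, hypothetical usage of the HyperLogLog class defined above.
hll_a = HyperLogLog(precision=14)
hll_b = HyperLogLog(precision=14)

# Two "workers" see overlapping ranges of item IDs.
for i in range(60_000):
    hll_a.add(f"granule-{i}")
for i in range(40_000, 100_000):
    hll_b.add(f"granule-{i}")

# Expected standard error at p=14: 1.04 / sqrt(2**14) = 1.04 / 128 ≈ 0.81%.
print(hll_a.count())  # close to 60_000 (within roughly 1%)
hll_a.merge(hll_b)    # register-wise max, same operation used when combining worker stats
print(hll_a.count())  # close to 100_000; the overlap is counted only once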
@dataclass
class IngestionStatistics:
    """Comprehensive statistics collector for STAC ingestion pipeline.

    Tracks all relevant metrics during ingestion with minimal memory overhead.
    Designed to be used within a single worker process and merged after
    distributed processing completes.

    Categories of statistics tracked:
    - Core counts: stored references, unique granules (via HyperLogLog)
    - Overhead: spanning items, duplication metrics
    - Spatial: items per cell, hotspots, bounding box
    - Temporal: items per year/month, temporal extent
    - Quality: null geometries, missing datetimes, geometry types
    - Missions: items per dataset/collection
    - Processing: timing, failures, throughput

    Example:
        >>> stats = IngestionStatistics()
        >>> stats.start_processing()
        >>> for item in items:
        ...     tiles = grid.tiles_for_geometry(item['geometry'])
        ...     stats.record_item(item, tiles, is_spanning=len(tiles)>1)
        >>> stats.finish_processing()
        >>> summary = stats.get_summary()
    """

    # HyperLogLog for unique counting (precision 14 = ~0.8% error, 16KB memory)
    unique_ids: HyperLogLog = field(default_factory=lambda: HyperLogLog(precision=14))

    # Core counts
    stored_references: int = 0
    items_routed_to_global: int = 0

    # Spanning metrics
    spanning_items_count: int = 0
    total_tiles_for_spanning: int = 0  # Sum of tiles for all spanning items
    max_tiles_per_item: int = 0
    tiles_per_spanning_histogram: dict[int, int] = field(default_factory=lambda: defaultdict(int))

    # Spatial distribution
    items_per_cell: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    bbox_min_lon: float = 180.0
    bbox_max_lon: float = -180.0
    bbox_min_lat: float = 90.0
    bbox_max_lat: float = -90.0

    # Temporal distribution (year -> month -> count)
    items_per_year_month: dict[str, dict[str, int]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(int))
    )
    earliest_datetime: datetime | None = None
    latest_datetime: datetime | None = None

    # Data quality
    null_geometry_count: int = 0
    missing_datetime_count: int = 0
    geometry_types: dict[str, int] = field(default_factory=lambda: defaultdict(int))

    # Mission breakdown
    items_per_mission: dict[str, int] = field(default_factory=lambda: defaultdict(int))

    # Processing metrics
    start_time: float | None = None
    end_time: float | None = None
    urls_processed: int = 0
    urls_failed: int = 0

    # File statistics (populated during consolidation)
    file_sizes: list[int] = field(default_factory=list)
    items_per_file: list[int] = field(default_factory=list)

    # Consolidation metrics
    duplicates_removed: int = 0
    new_items: int = 0
    existing_items: int = 0

    def start_processing(self) -> None:
        """Mark the start of processing for timing metrics."""
        self.start_time = time.time()

    def finish_processing(self) -> None:
        """Mark the end of processing for timing metrics."""
        self.end_time = time.time()

    def record_url_processed(self, success: bool = True) -> None:
        """Record a URL processing attempt.

        Args:
            success: Whether the URL was successfully processed
        """
        self.urls_processed += 1
        if not success:
            self.urls_failed += 1

    def record_item(
        self,
        item: dict[str, Any],
        tiles: list[str],
        is_spanning: bool,
        routed_to_global: bool = False,
        mission: str | None = None,
    ) -> None:
        """Record statistics for a single STAC item.

        Args:
            item: STAC item dictionary
            tiles: List of tile IDs this item maps to
            is_spanning: Whether item spans multiple tiles
            routed_to_global: Whether item was routed to global partition
            mission: Extracted mission/dataset name
        """
        item_id = item.get("id", "")

        # Track unique IDs via HyperLogLog
        if item_id:
            self.unique_ids.add(item_id)

        # Count stored reference (this counts each tile assignment)
        if routed_to_global:
            self.stored_references += 1
            self.items_routed_to_global += 1
        else:
            # Item stored once per tile it intersects
            self.stored_references += len(tiles) if tiles else 1

        # Track spanning metrics
        if is_spanning and not routed_to_global:
            self.spanning_items_count += 1
            tile_count = len(tiles)
            self.total_tiles_for_spanning += tile_count
            self.max_tiles_per_item = max(self.max_tiles_per_item, tile_count)

            # Bucket into histogram (1, 2, 3-5, 6-10, 11-20, 21-50, 50+)
            if tile_count <= 2:
                bucket = tile_count
            elif tile_count <= 5:
                bucket = 5
            elif tile_count <= 10:
                bucket = 10
            elif tile_count <= 20:
                bucket = 20
            elif tile_count <= 50:
                bucket = 50
            else:
                bucket = 100  # 50+
            self.tiles_per_spanning_histogram[bucket] += 1

        # Track spatial distribution
        for tile in tiles:
            self.items_per_cell[tile] += 1

        # Update bounding box from geometry
        geometry = item.get("geometry")
        if geometry:
            self._update_bbox_from_geometry(geometry)
            geom_type = geometry.get("type", "Unknown")
            self.geometry_types[geom_type] += 1
        else:
            self.null_geometry_count += 1

        # Track temporal distribution
        props = item.get("properties", {})
        dt_str = props.get("datetime")
        if dt_str:
            self._record_datetime(dt_str)
        else:
            self.missing_datetime_count += 1

        # Track mission
        if mission:
            self.items_per_mission[mission] += 1

    def _update_bbox_from_geometry(self, geometry: dict[str, Any]) -> None:
        """Update bounding box from GeoJSON geometry."""
        try:
            # Extract coordinates based on geometry type
            geom_type = geometry.get("type", "")
            coords = geometry.get("coordinates", [])

            if not coords:
                return

            # Flatten coordinates to list of [lon, lat] pairs
            flat_coords = self._flatten_coordinates(coords, geom_type)

            for lon, lat in flat_coords:
                self.bbox_min_lon = min(self.bbox_min_lon, lon)
                self.bbox_max_lon = max(self.bbox_max_lon, lon)
                self.bbox_min_lat = min(self.bbox_min_lat, lat)
                self.bbox_max_lat = max(self.bbox_max_lat, lat)
        except (TypeError, ValueError, KeyError, IndexError):
            # Silently ignore malformed geometries that can't be processed
            pass

    def _flatten_coordinates(self, coords: Any, geom_type: str) -> list[tuple[float, float]]:
        """Flatten nested coordinate arrays to list of (lon, lat) tuples."""
        result = []

        if geom_type == "Point":
            if len(coords) >= 2:
                result.append((coords[0], coords[1]))
        elif geom_type == "LineString":
            for coord in coords:
                if len(coord) >= 2:
                    result.append((coord[0], coord[1]))
        elif geom_type == "Polygon":
            for ring in coords:
                for coord in ring:
                    if len(coord) >= 2:
                        result.append((coord[0], coord[1]))
        elif geom_type == "MultiPoint":
            for coord in coords:
                if len(coord) >= 2:
                    result.append((coord[0], coord[1]))
        elif geom_type == "MultiLineString":
            for line in coords:
                for coord in line:
                    if len(coord) >= 2:
                        result.append((coord[0], coord[1]))
        elif geom_type == "MultiPolygon":
            for polygon in coords:
                for ring in polygon:
                    for coord in ring:
                        if len(coord) >= 2:
                            result.append((coord[0], coord[1]))

        return result

    def _record_datetime(self, dt_str: str) -> None:
        """Record datetime for temporal statistics.

        Uses datetime.fromisoformat() for fast parsing of ISO 8601 strings.
        Falls back gracefully for non-standard formats.
        """
        try:
            # Fast path: datetime.fromisoformat() is ~100x faster than pd.to_datetime()
            # Handle 'Z' suffix which fromisoformat doesn't support directly
            if dt_str.endswith("Z"):
                dt_str = dt_str[:-1] + "+00:00"
            dt = datetime.fromisoformat(dt_str)

            # Update temporal extent
            if self.earliest_datetime is None or dt < self.earliest_datetime:
                self.earliest_datetime = dt
            if self.latest_datetime is None or dt > self.latest_datetime:
                self.latest_datetime = dt

            # Track by year/month
            year = str(dt.year)
            month = f"{dt.month:02d}"
            self.items_per_year_month[year][month] += 1

        except (ValueError, TypeError):
            self.missing_datetime_count += 1

    def record_file(self, file_size_bytes: int, item_count: int) -> None:
        """Record statistics for a written file.

        Args:
            file_size_bytes: Size of the written file in bytes
            item_count: Number of items in the file
        """
        self.file_sizes.append(file_size_bytes)
        self.items_per_file.append(item_count)

    def record_consolidation(self, new_items: int, existing_items: int, duplicates_removed: int) -> None:
        """Record consolidation statistics.

        Args:
            new_items: Number of new items added
            existing_items: Number of items that existed before
            duplicates_removed: Number of duplicate items removed
        """
        self.new_items += new_items
        self.existing_items += existing_items
        self.duplicates_removed += duplicates_removed

    def merge(self, other: "IngestionStatistics") -> None:
        """Merge statistics from another instance.

        Used to combine statistics from multiple workers in distributed processing.

        Args:
            other: Another IngestionStatistics instance to merge
        """
        # Merge HyperLogLog
        self.unique_ids.merge(other.unique_ids)

        # Merge counts
        self.stored_references += other.stored_references
        self.items_routed_to_global += other.items_routed_to_global
        self.spanning_items_count += other.spanning_items_count
        self.total_tiles_for_spanning += other.total_tiles_for_spanning
        self.max_tiles_per_item = max(self.max_tiles_per_item, other.max_tiles_per_item)

        # Merge histograms
        for bucket, count in other.tiles_per_spanning_histogram.items():
            self.tiles_per_spanning_histogram[bucket] += count

        # Merge spatial distribution
        for cell, count in other.items_per_cell.items():
            self.items_per_cell[cell] += count

        # Merge bbox
        self.bbox_min_lon = min(self.bbox_min_lon, other.bbox_min_lon)
        self.bbox_max_lon = max(self.bbox_max_lon, other.bbox_max_lon)
        self.bbox_min_lat = min(self.bbox_min_lat, other.bbox_min_lat)
        self.bbox_max_lat = max(self.bbox_max_lat, other.bbox_max_lat)

        # Merge temporal distribution
        for year, months in other.items_per_year_month.items():
            for month, count in months.items():
                self.items_per_year_month[year][month] += count

        # Merge temporal extent
        if other.earliest_datetime:
            if self.earliest_datetime is None or other.earliest_datetime < self.earliest_datetime:
                self.earliest_datetime = other.earliest_datetime
        if other.latest_datetime:
            if self.latest_datetime is None or other.latest_datetime > self.latest_datetime:
                self.latest_datetime = other.latest_datetime

        # Merge quality metrics
        self.null_geometry_count += other.null_geometry_count
        self.missing_datetime_count += other.missing_datetime_count
        for geom_type, count in other.geometry_types.items():
            self.geometry_types[geom_type] += count

        # Merge mission counts
        for mission, count in other.items_per_mission.items():
            self.items_per_mission[mission] += count

        # Merge processing metrics
        self.urls_processed += other.urls_processed
        self.urls_failed += other.urls_failed

        # Merge file statistics
        self.file_sizes.extend(other.file_sizes)
        self.items_per_file.extend(other.items_per_file)

        # Merge consolidation metrics
        self.new_items += other.new_items
        self.existing_items += other.existing_items
        self.duplicates_removed += other.duplicates_removed

    def get_summary(self) -> dict[str, Any]:
        """Generate comprehensive statistics summary for schema output.

        Returns:
            Dictionary containing all collected statistics formatted for
            inclusion in the catalog schema JSON.
        """
        unique_granules = self.unique_ids.count()

        # Calculate derived metrics
        duplication_ratio = self.stored_references / unique_granules if unique_granules > 0 else 1.0
        overhead_percentage = (
            ((self.stored_references - unique_granules) / unique_granules) * 100 if unique_granules > 0 else 0.0
        )
        spanning_percentage = (self.spanning_items_count / unique_granules) * 100 if unique_granules > 0 else 0.0
        avg_tiles_per_spanning = (
            self.total_tiles_for_spanning / self.spanning_items_count if self.spanning_items_count > 0 else 0.0
        )

        # Calculate spatial statistics
        cell_counts = list(self.items_per_cell.values())
        spatial_stats = self._calculate_distribution_stats(cell_counts)

        # Get hotspot cells (top 10 by count)
        sorted_cells = sorted(self.items_per_cell.items(), key=lambda x: x[1], reverse=True)[:10]
        hotspot_cells = [{"cell": cell, "count": count} for cell, count in sorted_cells]

        # Calculate file statistics
        file_size_stats = self._calculate_distribution_stats(self.file_sizes)
        items_per_file_stats = self._calculate_distribution_stats(self.items_per_file)

        # Calculate processing metrics
        duration = self.end_time - self.start_time if self.start_time and self.end_time else 0.0
        items_per_second = unique_granules / duration if duration > 0 else 0.0

        # Format temporal distribution
        temporal_distribution = {}
        for year in sorted(self.items_per_year_month.keys()):
            year_data: dict[str, Any] = {"total": 0, "months": {}}
            for month in sorted(self.items_per_year_month[year].keys()):
                count = self.items_per_year_month[year][month]
                year_data["months"][month] = count
                year_data["total"] += count
            temporal_distribution[year] = year_data

        # Build summary
        summary = {
            # Core counts
            "unique_granules": unique_granules,
            "stored_references": self.stored_references,
            "total_partitions": len(self.items_per_cell),
            "total_files": len(self.items_per_file) if self.items_per_file else len(self.items_per_cell),
            # Overhead metrics
            "overhead": {
                "spanning_items": self.spanning_items_count,
                "spanning_percentage": round(spanning_percentage, 2),
                "duplication_ratio": round(duplication_ratio, 3),
                "overhead_percentage": round(overhead_percentage, 2),
                "avg_tiles_per_spanning_item": round(avg_tiles_per_spanning, 2),
                "max_tiles_per_item": self.max_tiles_per_item,
                "tiles_distribution": dict(self.tiles_per_spanning_histogram),
            },
            # Global partition metrics
            "global_partition": {
                "items_routed_to_global": self.items_routed_to_global,
                "percentage_global": round(
                    (self.items_routed_to_global / unique_granules) * 100 if unique_granules > 0 else 0.0, 2
                ),
            },
            # Spatial distribution
            "spatial": {
                "bbox": (
                    [
                        round(self.bbox_min_lon, 6),
                        round(self.bbox_min_lat, 6),
                        round(self.bbox_max_lon, 6),
                        round(self.bbox_max_lat, 6),
                    ]
                    if self.bbox_min_lon <= self.bbox_max_lon
                    else None
                ),
                "cells_with_data": len(self.items_per_cell),
                "items_per_cell": spatial_stats,
                "hotspot_cells": hotspot_cells,
            },
            # Temporal distribution
            "temporal": {
                "earliest": (self.earliest_datetime.isoformat() + "Z" if self.earliest_datetime else None),
                "latest": (self.latest_datetime.isoformat() + "Z" if self.latest_datetime else None),
                "years_with_data": sorted(self.items_per_year_month.keys()),
                "distribution": temporal_distribution,
            },
            # Data quality
            "quality": {
                "null_geometries": self.null_geometry_count,
                "missing_datetime": self.missing_datetime_count,
                "geometry_types": dict(self.geometry_types),
            },
            # Mission breakdown
            "missions": dict(self.items_per_mission),
            # File statistics
            "files": {
                "total_count": len(self.items_per_file) if self.items_per_file else 0,
                "total_size_bytes": sum(self.file_sizes) if self.file_sizes else None,
                "size_stats": file_size_stats if self.file_sizes else None,
                "items_per_file": items_per_file_stats if self.items_per_file else None,
            },
            # Processing metrics
            "processing": {
                "run_timestamp": (datetime.now(UTC).isoformat().replace("+00:00", "Z")),
                "duration_seconds": round(duration, 2),
                "urls_processed": self.urls_processed,
                "urls_failed": self.urls_failed,
                "success_rate": round(
                    (
                        ((self.urls_processed - self.urls_failed) / self.urls_processed) * 100
                        if self.urls_processed > 0
                        else 100.0
                    ),
                    2,
                ),
                "items_per_second": round(items_per_second, 2),
                "new_items": self.new_items,
                "existing_items": self.existing_items,
                "duplicates_removed": self.duplicates_removed,
            },
        }

        return summary

    def _calculate_distribution_stats(self, values: list[int] | list[float]) -> dict[str, float] | None:
        """Calculate distribution statistics for a list of values.

        Args:
            values: List of numeric values

        Returns:
            Dictionary with min, max, mean, median, std_dev, or None if empty
        """
        if not values:
            return None

        sorted_values = sorted(values)
        n = len(sorted_values)

        mean = sum(sorted_values) / n

        # Calculate median
        if n % 2 == 0:
            median = (sorted_values[n // 2 - 1] + sorted_values[n // 2]) / 2
        else:
            median = sorted_values[n // 2]

        # Calculate standard deviation
        variance = sum((x - mean) ** 2 for x in sorted_values) / n
        std_dev = variance**0.5

        return {
            "min": sorted_values[0],
            "max": sorted_values[-1],
            "mean": round(mean, 2),
            "median": round(median, 2),
            "std_dev": round(std_dev, 2),
        }
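To make the per-worker collection and merge flow described in the module and class docstrings concrete, here is a short hypothetical driver (an editorial sketch, not shipped in the wheel); the synthetic STAC item, tile IDs, and mission name are invented for the example, and only the IngestionStatistics API defined in this file is assumed:

# Hypothetical end-to-end sketch of the statistics workflow.
worker_a = IngestionStatistics()
worker_b = IngestionStatistics()

item = {
    "id": "S2A_MSIL2A_20240101T000000_EXAMPLE",
    "geometry": {"type": "Point", "coordinates": [12.5, 41.9]},
    "properties": {"datetime": "2024-01-01T00:00:00Z"},
}

for stats in (worker_a, worker_b):
    stats.start_processing()
    stats.record_item(
        item, tiles=["tile-001", "tile-002"], is_spanning=True, mission="sentinel-2"
    )
    stats.record_url_processed(success=True)
    stats.finish_processing()

# Combine per-worker statistics, as done after distributed processing.
worker_a.merge(worker_b)
summary = worker_a.get_summary()
print(summary["unique_granules"])             # 1 (both workers saw the same granule ID)
print(summary["stored_references"])           # 4 (one reference per tile, per worker)
print(summary["overhead"]["spanning_items"])  # 2 (each worker counted the spanning item once)
print(summary["temporal"]["earliest"])        # earliest item datetime as recorded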