earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,754 @@
|
|
|
1
|
+
# stac_hooks.py
|
|
2
|
+
"""STAC fetch hooks for custom item generation.
|
|
3
|
+
|
|
4
|
+
This module provides a flexible hook system for generating STAC items from URLs
|
|
5
|
+
when the URL doesn't point directly to a STAC JSON. This is useful when:
|
|
6
|
+
|
|
7
|
+
1. URLs point to raw data files (COG, NetCDF, HDF5) that need metadata extraction
|
|
8
|
+
2. URLs are identifiers that need to be transformed via an external API
|
|
9
|
+
3. Custom STAC generation logic is required for specific data providers
|
|
10
|
+
|
|
11
|
+
Architecture:
|
|
12
|
+
Hooks are designed to be serializable for Dask distributed processing.
|
|
13
|
+
Instead of passing callable objects (which can't be pickled), hooks are
|
|
14
|
+
configured via module paths that can be imported on remote workers.
|
|
15
|
+
|
|
16
|
+
Hook Types:
|
|
17
|
+
- default: Standard STAC JSON fetch from URL (current behavior)
|
|
18
|
+
- callable: Python callable (local processing only, not serializable)
|
|
19
|
+
- module: Import path to Python function (serializable for Dask)
|
|
20
|
+
- script: External executable (serializable, runs as subprocess)
- passthrough: Input "URL" is already a STAC item JSON string (no fetch performed)
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
>>> # Module-based hook (serializable, recommended for Dask)
|
|
24
|
+
>>> config = ProcessingConfig(
|
|
25
|
+
... input_file='urls.parquet',
|
|
26
|
+
... output_catalog='./catalog',
|
|
27
|
+
... scratch_location='./scratch',
|
|
28
|
+
... stac_hook='module:mypackage.hooks:url_to_stac_item',
|
|
29
|
+
... )
|
|
30
|
+
|
|
31
|
+
>>> # Script-based hook (serializable)
|
|
32
|
+
>>> config = ProcessingConfig(
|
|
33
|
+
... stac_hook='script:/path/to/generate_stac.py',
|
|
34
|
+
... )
|
|
35
|
+
|
|
36
|
+
>>> # Callable hook (local processing only)
|
|
37
|
+
>>> def my_hook(url: str) -> dict:
|
|
38
|
+
... return {...}
|
|
39
|
+
>>> config = ProcessingConfig(
|
|
40
|
+
... stac_hook=my_hook, # Only works with LocalProcessor
|
|
41
|
+
... )
|
|
42
|
+
|
|
43
|
+
Hook Function Signature:
|
|
44
|
+
All hook functions must implement the following signature:
|
|
45
|
+
|
|
46
|
+
def hook_function(url: str, **kwargs) -> dict[str, Any] | pystac.Item | None:
|
|
47
|
+
'''Generate a STAC item from a URL.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
url: The input URL/identifier to process.
|
|
51
|
+
**kwargs: Additional context (timeout, retry_attempts, etc.)
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
A valid STAC item dictionary, pystac.Item object, or None if generation failed.
|
|
55
|
+
pystac.Item objects are automatically converted to dictionaries.
|
|
56
|
+
'''
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
Batch Hook Signature (Optional):
|
|
60
|
+
For efficiency, hooks can optionally support batch processing:
|
|
61
|
+
|
|
62
|
+
def batch_hook_function(urls: list[str], **kwargs) -> list[dict[str, Any] | pystac.Item | None]:
|
|
63
|
+
'''Generate STAC items from multiple URLs.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
urls: List of URLs/identifiers to process.
|
|
67
|
+
**kwargs: Additional context.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
List of STAC items (dict or pystac.Item) or None for failures, same order as input.
|
|
71
|
+
pystac.Item objects are automatically converted to dictionaries.
|
|
72
|
+
'''
|
|
73
|
+
pass
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
from __future__ import annotations
|
|
77
|
+
|
|
78
|
+
import importlib
|
|
79
|
+
import json
|
|
80
|
+
import logging
|
|
81
|
+
import subprocess # nosec B404 - subprocess used for external tool integration
|
|
82
|
+
import tempfile
|
|
83
|
+
from abc import ABC, abstractmethod
|
|
84
|
+
from collections.abc import Callable
|
|
85
|
+
from typing import Any, Protocol, runtime_checkable
|
|
86
|
+
|
|
87
|
+
import requests
|
|
88
|
+
|
|
89
|
+
# Module-level logger; the hooks below report failures through it as warnings.
logger = logging.getLogger(__name__)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# =============================================================================
|
|
93
|
+
# Helper Functions
|
|
94
|
+
# =============================================================================
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _normalize_stac_result(result: Any) -> dict[str, Any] | None:
|
|
98
|
+
"""Convert hook result to a dictionary, handling pystac.Item objects.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
result: The result from a hook function. Can be:
|
|
102
|
+
- dict: Returned as-is
|
|
103
|
+
- pystac.Item: Converted to dict via to_dict()
|
|
104
|
+
- None: Returned as-is
|
|
105
|
+
- Other: Returns None
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
STAC item dictionary or None.
|
|
109
|
+
"""
|
|
110
|
+
if result is None:
|
|
111
|
+
return None
|
|
112
|
+
if isinstance(result, dict):
|
|
113
|
+
return result
|
|
114
|
+
# Check for pystac.Item or any object with to_dict method
|
|
115
|
+
if hasattr(result, "to_dict") and callable(result.to_dict):
|
|
116
|
+
try:
|
|
117
|
+
converted = result.to_dict()
|
|
118
|
+
if isinstance(converted, dict):
|
|
119
|
+
return converted
|
|
120
|
+
return None
|
|
121
|
+
except (ValueError, TypeError, AttributeError, RuntimeError) as e:
|
|
122
|
+
logger.warning(f"Failed to convert pystac.Item to dict: {e}")
|
|
123
|
+
return None
|
|
124
|
+
return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _normalize_stac_results(results: list[Any]) -> list[dict[str, Any] | None]:
    """Normalize every entry of a batch hook result.

    Args:
        results: Values returned by a batch hook function, one per input URL.

    Returns:
        A new list of the same length and order in which each entry is a STAC
        item dictionary, or None where the entry could not be normalized.
    """
    return list(map(_normalize_stac_result, results))
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# =============================================================================
|
|
140
|
+
# Protocol Definitions
|
|
141
|
+
# =============================================================================
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@runtime_checkable
|
|
145
|
+
class STACHookProtocol(Protocol):
|
|
146
|
+
"""Protocol for STAC fetch hooks."""
|
|
147
|
+
|
|
148
|
+
def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
|
|
149
|
+
"""Fetch/generate a STAC item from a URL.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
url: The URL or identifier to process.
|
|
153
|
+
**kwargs: Additional context like timeout, retry_attempts.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
STAC item dictionary or None if fetch failed.
|
|
157
|
+
"""
|
|
158
|
+
...
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@runtime_checkable
|
|
162
|
+
class BatchSTACHookProtocol(Protocol):
|
|
163
|
+
"""Protocol for batch STAC fetch hooks."""
|
|
164
|
+
|
|
165
|
+
def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
|
|
166
|
+
"""Fetch/generate STAC items from multiple URLs.
|
|
167
|
+
|
|
168
|
+
Args:
|
|
169
|
+
urls: List of URLs or identifiers to process.
|
|
170
|
+
**kwargs: Additional context.
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
List of STAC items (or None for failures), in same order as input.
|
|
174
|
+
"""
|
|
175
|
+
...
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# =============================================================================
|
|
179
|
+
# Hook Implementations
|
|
180
|
+
# =============================================================================
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class BaseSTACHook(ABC):
|
|
184
|
+
"""Base class for STAC fetch hooks."""
|
|
185
|
+
|
|
186
|
+
@abstractmethod
|
|
187
|
+
def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
|
|
188
|
+
"""Fetch a single STAC item."""
|
|
189
|
+
pass
|
|
190
|
+
|
|
191
|
+
def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
|
|
192
|
+
"""Fetch multiple STAC items. Default: sequential fetch."""
|
|
193
|
+
return [self.fetch(url, **kwargs) for url in urls]
|
|
194
|
+
|
|
195
|
+
def to_config(self) -> str:
|
|
196
|
+
"""Serialize hook to config string for storage/transmission."""
|
|
197
|
+
raise NotImplementedError("Subclass must implement to_config()")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class DefaultSTACHook(BaseSTACHook):
    """Default hook: fetch STAC JSON directly from URL.

    This is the standard behavior - assumes the URL points to a valid STAC item JSON.
    """

    def fetch(
        self,
        url: str,
        timeout: int = 30,
        retry_attempts: int = 3,
        **kwargs,
    ) -> dict[str, Any] | None:
        """Download and parse a STAC item from ``url``.

        Each failed attempt is followed by an exponentially growing sleep
        (1s, 2s, 4s, ...) before the next one; there is no sleep after the
        final attempt.

        Args:
            url: URL to STAC item JSON.
            timeout: Request timeout in seconds, passed to ``requests.get``.
            retry_attempts: Total number of attempts. Values below 1 are
                treated as 1 so the URL is always tried at least once instead
                of silently returning None without any request.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The decoded JSON body, or None once every attempt has failed.
        """
        import time

        attempts = max(1, retry_attempts)

        for attempt in range(attempts):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                return response.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests releases whose
                # JSON decode error is not a RequestException subclass; it is
                # handled exactly like any other failed attempt.
                if attempt == attempts - 1:
                    logger.warning(f"Failed to download {url} after {attempts} attempts: {e}")
                    return None
                time.sleep(2**attempt)  # Exponential backoff
        return None

    def to_config(self) -> str:
        """Return config string for default hook."""
        return "default"
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class ModuleSTACHook(BaseSTACHook):
    """Hook backed by a function that is looked up by import path.

    Only the path string is stored at construction time; the function itself
    is imported lazily on first use. This is what makes the hook suitable for
    Dask distributed processing: the string can be shipped to a worker and the
    import happens there.

    Path format: 'package.module:function_name'

    If the module also defines ``<function_name>_batch``, that function is
    used for batch requests.

    Example:
        >>> hook = ModuleSTACHook('mypackage.stac_generator:url_to_item')
        >>> item = hook.fetch('https://example.com/data.tif')
    """

    def __init__(self, module_path: str):
        """Remember the import path; nothing is imported yet.

        Args:
            module_path: Import path in format 'package.module:function_name'
        """
        self.module_path = module_path
        # Both are filled in by _load_function() on first use.
        self._func: Callable | None = None
        self._batch_func: Callable | None = None

    def _load_function(self) -> Callable[..., Any]:
        """Import the hook function once and cache it on the instance.

        Returns:
            The function named by ``module_path``.

        Raises:
            ImportError: If the path has no ':' separator, the module cannot
                be imported, or the module lacks the named attribute.
        """
        cached = self._func
        if cached is not None:
            return cached

        try:
            # Split on the last ':' so dotted module names stay intact.
            module_name, func_name = self.module_path.rsplit(":", 1)
            module = importlib.import_module(module_name)
            loaded = getattr(module, func_name)
            self._func = loaded

            # Optional companion, by convention named '<func_name>_batch'.
            companion_name = f"{func_name}_batch"
            if hasattr(module, companion_name):
                self._batch_func = getattr(module, companion_name)

            return loaded
        except (ValueError, ImportError, AttributeError) as e:
            raise ImportError(f"Failed to import hook function from '{self.module_path}': {e}") from e

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Run the imported function for one URL.

        Args:
            url: URL or identifier to process.
            **kwargs: Extra context handed to the function unchanged.

        Returns:
            STAC item dictionary, or None when the function fails or returns
            something that cannot be normalized. pystac.Item results are
            converted to dictionaries.
        """
        hook_fn = self._load_function()
        try:
            return _normalize_stac_result(hook_fn(url, **kwargs))
        except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
            logger.warning(f"Hook function failed for {url}: {e}")
            return None

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Run the batch companion if there is one, else go URL by URL.

        The batch result is accepted only when it is a list with exactly one
        entry per URL; otherwise, or when the batch function raises one of the
        handled exceptions, processing falls back to sequential ``fetch``.

        Args:
            urls: List of URLs to process.
            **kwargs: Extra context handed to the function unchanged.

        Returns:
            List of STAC items (or None for failures), one per input URL.
        """
        self._load_function()  # populates self._batch_func as a side effect

        batch_fn = self._batch_func
        if batch_fn is None:
            return super().fetch_batch(urls, **kwargs)

        try:
            results = batch_fn(urls, **kwargs)
            if isinstance(results, list) and len(results) == len(urls):
                return _normalize_stac_results(results)
            logger.warning("Batch hook returned invalid result, falling back to sequential")
        except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
            logger.warning(f"Batch hook failed: {e}, falling back to sequential")

        # Sequential fallback.
        return super().fetch_batch(urls, **kwargs)

    def to_config(self) -> str:
        """Return the 'module:<path>' config string for this hook."""
        return f"module:{self.module_path}"
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class PassthroughSTACHook(BaseSTACHook):
    """Hook that treats input URLs as pre-fetched STAC item JSON.

    This hook is designed for processing NDJSON files that already contain
    complete STAC items, eliminating the need for HTTP fetching. The "URL"
    in the input file is actually the full STAC item JSON as a string.

    Use Cases:
        - ITS_LIVE bulk data: NDJSON files with pre-aggregated STAC items
        - Local STAC collections: STAC items already downloaded
        - Performance optimization: Skip HTTP fetch for cached data

    Example:
        >>> hook = PassthroughSTACHook()
        >>> # url is actually a JSON string
        >>> item = hook.fetch('{"type": "Feature", "id": "abc", ...}')
        >>> print(item['id'])
        'abc'

    Performance:
        - Eliminates HTTP overhead (no network calls)
        - Faster processing for local/cached STAC items
        - Reduces load on STAC catalog servers
    """

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Parse input URL as STAC item JSON directly.

        The parsed value is accepted only when it is a dict whose ``type`` is
        ``"Feature"`` and which has both ``geometry`` and ``properties`` keys.

        Args:
            url: JSON string containing a complete STAC item.
            **kwargs: Additional context (timeout, retry_attempts) - ignored for passthrough.

        Returns:
            Parsed STAC item dictionary, or None if the JSON is invalid or
            fails one of the structural checks above.
        """
        try:
            # Input "url" is actually a JSON string
            item = json.loads(url)

            # Validate it's a dictionary with STAC item structure
            if not isinstance(item, dict):
                # Report the type of the parsed value; the input itself is
                # always a string here, which would make the message useless.
                logger.warning(f"Passthrough hook input is not a dict: {type(item)}")
                return None

            # Basic STAC item validation - should have at minimum 'type': 'Feature'
            if item.get("type") != "Feature":
                logger.warning(f"Passthrough hook input missing 'type': 'Feature': {url[:100]}...")
                return None

            # Should have geometry and properties
            if "geometry" not in item:
                logger.warning(f"Passthrough hook input missing 'geometry': {url[:100]}...")
                return None

            if "properties" not in item:
                logger.warning(f"Passthrough hook input missing 'properties': {url[:100]}...")
                return None

            return item

        except json.JSONDecodeError as e:
            logger.warning(f"Passthrough hook failed to parse JSON: {e}")
            return None
        except (ValueError, TypeError, OSError) as e:
            # e.g. TypeError when the input is not str/bytes at all.
            logger.warning(f"Passthrough hook error: {e}")
            return None

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Parse multiple JSON strings as STAC items.

        Args:
            urls: List of JSON strings containing STAC items.
            **kwargs: Forwarded to ``fetch``, which ignores them.

        Returns:
            List of STAC item dictionaries (or None for failures), in same order as input.
        """
        return [self.fetch(url, **kwargs) for url in urls]

    def to_config(self) -> str:
        """Return config string for passthrough hook."""
        return "passthrough"
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
class ScriptSTACHook(BaseSTACHook):
    """Hook that calls an external script/executable.

    The script receives the URL as an argument and should output valid STAC JSON
    to stdout. For batch processing, the script can accept multiple URLs.

    Script invocation:
        Single: script_path URL
        Batch: script_path --batch < urls.txt (one URL per line)

    The script should exit with code 0 on success, non-zero on failure.
    For batch mode, output should be NDJSON (one JSON object per line).

    Example:
        >>> hook = ScriptSTACHook('/path/to/generate_stac.py')
        >>> item = hook.fetch('https://example.com/data.tif')
    """

    def __init__(self, script_path: str, interpreter: str | None = None):
        """Initialize with script path.

        Args:
            script_path: Path to the script/executable.
            interpreter: Optional interpreter (e.g., 'python', 'python3').
                        If None, script is executed directly.
        """
        self.script_path = script_path
        self.interpreter = interpreter

    def fetch(
        self,
        url: str,
        timeout: int = 60,
        **kwargs,
    ) -> dict[str, Any] | None:
        """Run script to generate STAC item.

        Args:
            url: URL to pass to the script as its single argument.
            timeout: Script execution timeout in seconds.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Parsed JSON from the script's stdout, or None if the script timed
            out, exited non-zero, or printed invalid JSON.
        """
        cmd = self._build_command([url])

        try:
            result = subprocess.run(  # nosec B603 - command built from validated config
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                check=True,
            )
            return json.loads(result.stdout.strip())
        except subprocess.TimeoutExpired:
            logger.warning(f"Script timed out for {url}")
            return None
        except subprocess.CalledProcessError as e:
            logger.warning(f"Script failed for {url}: {e.stderr}")
            return None
        except json.JSONDecodeError as e:
            logger.warning(f"Script output is not valid JSON for {url}: {e}")
            return None

    def fetch_batch(
        self,
        urls: list[str],
        timeout: int = 300,
        **kwargs,
    ) -> list[dict[str, Any] | None]:
        """Run script in batch mode for multiple URLs.

        A single ``--batch`` invocation is tried first. If it times out, cannot
        be started, exits non-zero, or does not yield exactly one output line
        per URL, every URL is processed individually through ``fetch``.

        Args:
            urls: List of URLs to process.
            timeout: Timeout in seconds for the batch invocation. It is also
                forwarded to ``fetch`` in the single-URL shortcut.
                NOTE(review): it is not forwarded in the sequential fallback,
                so each URL then runs with ``fetch``'s own default - confirm
                this is intended.
            **kwargs: Forwarded to ``fetch`` where it is called.

        Returns:
            List of STAC items (or None for failures), one per input URL.
        """
        if len(urls) == 0:
            return []

        if len(urls) == 1:
            return [self.fetch(urls[0], timeout=timeout, **kwargs)]

        # Try batch mode first
        items = self._run_batch_script(urls, timeout)
        if items is not None:
            return items

        # Fall back to sequential processing
        return super().fetch_batch(urls, **kwargs)

    def _run_batch_script(self, urls: list[str], timeout: int) -> list[dict[str, Any] | None] | None:
        """Invoke the script once with ``--batch`` and parse its NDJSON output.

        Returns:
            One entry per URL (None for lines that are blank or not valid
            JSON), or None when the batch attempt as a whole is unusable.
        """
        import os

        cmd = self._build_command(["--batch"])
        temp_path: str | None = None

        try:
            # Write URLs to temp file for stdin
            with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
                temp_path = f.name
                f.write("\n".join(urls))

            with open(temp_path) as stdin_file:
                result = subprocess.run(  # nosec B603 - command built from validated config
                    cmd,
                    stdin=stdin_file,
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
        except subprocess.TimeoutExpired:
            logger.warning("Batch script timed out, falling back to sequential")
            return None
        except (subprocess.CalledProcessError, OSError, ValueError) as e:
            logger.warning(f"Batch script failed: {e}, falling back to sequential")
            return None
        finally:
            # Remove the temp file on every path, including timeouts and
            # launch failures, so repeated batches do not leak files.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError as e:
                    logger.debug(f"Could not remove temp file {temp_path}: {e}")

        if result.returncode != 0:
            logger.warning(f"Batch script exited with code {result.returncode}, falling back to sequential")
            return None

        # Parse NDJSON output; a blank or malformed line still occupies a
        # slot so positions keep lining up with the input URLs.
        items: list[dict[str, Any] | None] = []
        for line in result.stdout.strip().split("\n"):
            if not line.strip():
                items.append(None)
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError:
                items.append(None)

        if len(items) != len(urls):
            logger.warning("Batch script returned wrong number of items, falling back to sequential")
            return None

        return items

    def _build_command(self, args: list[str]) -> list[str]:
        """Build command list for subprocess (interpreter first when set)."""
        if self.interpreter:
            return [self.interpreter, self.script_path, *args]
        return [self.script_path, *args]

    def to_config(self) -> str:
        """Return config string for script hook."""
        if self.interpreter:
            return f"script:{self.interpreter}:{self.script_path}"
        return f"script:{self.script_path}"
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
class CallableSTACHook(BaseSTACHook):
    """Hook that wraps a Python callable.

    WARNING: This hook is NOT serializable and will not work with Dask
    distributed processing. Use ModuleSTACHook for distributed workloads.

    This hook is useful for local processing or testing where you want to
    pass a function directly without creating a module.

    Example:
        >>> def my_hook(url, **kwargs):
        ...     return {'type': 'Feature', 'id': url, ...}
        >>> hook = CallableSTACHook(my_hook)
        >>> item = hook.fetch('https://example.com/data.tif')
    """

    def __init__(
        self,
        func: Callable[..., Any],
        batch_func: Callable[..., Any] | None = None,
    ):
        """Initialize with callable.

        Args:
            func: Function called as ``func(url, **kwargs)``; should return a
                STAC item dict, an object with ``to_dict()``, or None.
            batch_func: Optional function called as
                ``batch_func(urls, **kwargs)`` for batch processing.
        """
        self.func = func
        self.batch_func = batch_func

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Invoke the callable with the URL.

        Args:
            url: URL to process.
            **kwargs: Additional context, passed through to the callable.

        Returns:
            STAC item or None. Supports both dict and pystac.Item returns.
        """
        try:
            result = self.func(url, **kwargs)
            return _normalize_stac_result(result)
        except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
            logger.warning(f"Callable hook failed for {url}: {e}")
            return None

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Invoke batch callable if available, otherwise sequential.

        The batch result is used only when it is a list with one entry per
        URL. Any other result, or one of the handled exceptions, is logged
        and processing falls back to per-URL ``fetch`` calls.

        Args:
            urls: List of URLs to process.
            **kwargs: Additional context, passed through to the callable.

        Returns:
            List of STAC items (or None). Supports pystac.Item returns.
        """
        if self.batch_func is not None:
            try:
                results = self.batch_func(urls, **kwargs)
                if isinstance(results, list) and len(results) == len(urls):
                    return _normalize_stac_results(results)
                # Log the malformed result instead of discarding it silently.
                logger.warning("Batch callable returned invalid result, falling back to sequential")
            except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
                logger.warning(f"Batch callable failed: {e}, falling back to sequential")

        return super().fetch_batch(urls, **kwargs)

    def to_config(self) -> str:
        """Return a descriptive 'callable:<module>.<name>' label.

        The label is informational only: ``parse_hook_config`` has no
        ``callable:`` branch, so it cannot be turned back into a hook.
        """
        func_name = getattr(self.func, "__name__", "anonymous")
        module = getattr(self.func, "__module__", "unknown")
        return f"callable:{module}.{func_name}"
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
# =============================================================================
|
|
638
|
+
# Hook Factory
|
|
639
|
+
# =============================================================================
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
def parse_hook_config(config: str | Callable | BaseSTACHook | None) -> BaseSTACHook:
    """Parse hook configuration and return appropriate hook instance.

    Args:
        config: Hook configuration, can be:
            - None or "default": Use DefaultSTACHook
            - "passthrough": Use PassthroughSTACHook (URLs are pre-fetched STAC JSON)
            - "module:path.to.module:function": Use ModuleSTACHook
            - "script:/path/to/script": Use ScriptSTACHook
            - "script:python:/path/to/script.py": Use ScriptSTACHook with interpreter
            - Callable: Use CallableSTACHook (local only)
            - BaseSTACHook instance: Use as-is

    Returns:
        Configured hook instance.

    Raises:
        ValueError: If config format is invalid.
    """
    if config is None or config == "default":
        return DefaultSTACHook()

    if isinstance(config, BaseSTACHook):
        return config

    if callable(config):
        return CallableSTACHook(config)

    if not isinstance(config, str):
        raise ValueError(f"Invalid hook config type: {type(config)}")

    # Parse string config
    if config == "passthrough":
        return PassthroughSTACHook()

    if config.startswith("module:"):
        module_path = config[len("module:"):]
        return ModuleSTACHook(module_path)

    if config.startswith("script:"):
        import os

        script_config = config[len("script:"):]

        # A leading drive specifier (e.g. 'C:\\tools\\gen.py' on Windows) also
        # contains ':' but is part of the path, not an interpreter prefix.
        # os.path.splitdrive never reports a drive on POSIX, so behaviour
        # there is unchanged.
        starts_with_drive = bool(os.path.splitdrive(script_config)[0])

        # Check for interpreter:path format
        if ":" in script_config and not script_config.startswith("/") and not starts_with_drive:
            interpreter, script_path = script_config.split(":", 1)
            return ScriptSTACHook(script_path, interpreter=interpreter)
        return ScriptSTACHook(script_config)

    raise ValueError(
        f"Invalid hook config: {config}. Expected 'default', 'passthrough', 'module:path:func', "
        "'script:/path', or a callable."
    )
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def get_hook(config: str | Callable | BaseSTACHook | None = None) -> BaseSTACHook:
    """Build a STAC hook from a configuration value.

    Convenience entry point that simply delegates to ``parse_hook_config``;
    the only difference is that ``config`` may be omitted, in which case
    None is passed on.

    Args:
        config: Hook configuration (see parse_hook_config for accepted forms).

    Returns:
        Configured hook instance.
    """
    hook = parse_hook_config(config)
    return hook
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
def serialize_hook(hook: BaseSTACHook | Callable | None) -> str:
    """Serialize a hook to a config string for transmission.

    Args:
        hook: Hook instance or callable.

    Returns:
        Config string that can be used with parse_hook_config: "default" for
        None, the hook's own ``to_config()`` for hook instances, and
        "module:<module>:<name>" for importable module-level callables.

    Raises:
        ValueError: If ``hook`` is a callable that cannot be re-imported by
            name (a lambda, a nested function, a bound method, anything
            defined in ``__main__`` or lacking a name), or is not a hook or
            callable at all.
    """
    if hook is None:
        return "default"

    if isinstance(hook, BaseSTACHook):
        return hook.to_config()

    if callable(hook):
        # Try to create a module path for the callable
        func_name = getattr(hook, "__name__", None)
        qualname = getattr(hook, "__qualname__", func_name)
        module = getattr(hook, "__module__", None)

        # getattr(module, func_name) on a worker only finds plain module-level
        # attributes. Lambdas have a non-identifier name, and nested functions
        # or methods have a qualified name that differs from their short name.
        importable = (
            bool(func_name)
            and bool(module)
            and module != "__main__"
            and func_name.isidentifier()
            and qualname == func_name
        )
        if importable:
            return f"module:{module}:{func_name}"

        # Can't serialize anonymous, nested or __main__ functions
        raise ValueError(
            f"Cannot serialize callable {hook}. Use ModuleSTACHook with an importable "
            "function path for distributed processing."
        )

    raise ValueError(f"Cannot serialize hook: {hook}")
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
# Names exported by a star-import of this module.
__all__ = [
    "STACHookProtocol",
    "BatchSTACHookProtocol",
    "BaseSTACHook",
    "DefaultSTACHook",
    "PassthroughSTACHook",
    "ModuleSTACHook",
    "ScriptSTACHook",
    "CallableSTACHook",
    "parse_hook_config",
    "get_hook",
    "serialize_hook",
    # NOTE(review): the underscore-prefixed helpers are exported as well -
    # presumably so tests or sibling modules can import them; confirm intent.
    "_normalize_stac_result",
    "_normalize_stac_results",
]
|