earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,754 @@
1
+ # stac_hooks.py
2
+ """STAC fetch hooks for custom item generation.
3
+
4
+ This module provides a flexible hook system for generating STAC items from URLs
5
+ when the URL doesn't point directly to a STAC JSON. This is useful when:
6
+
7
+ 1. URLs point to raw data files (COG, NetCDF, HDF5) that need metadata extraction
8
+ 2. URLs are identifiers that need to be transformed via an external API
9
+ 3. Custom STAC generation logic is required for specific data providers
10
+
11
+ Architecture:
12
+ Hooks are designed to be serializable for Dask distributed processing.
13
+ Instead of passing callable objects (which can't be pickled), hooks are
14
+ configured via module paths that can be imported on remote workers.
15
+
16
+ Hook Types:
17
+ - default: Standard STAC JSON fetch from URL (current behavior)
18
+ - callable: Python callable (local processing only, not serializable)
19
+ - module: Import path to Python function (serializable for Dask)
20
+ - script: External executable (serializable, runs as subprocess)
21
+
22
+ Usage:
23
+ >>> # Module-based hook (serializable, recommended for Dask)
24
+ >>> config = ProcessingConfig(
25
+ ... input_file='urls.parquet',
26
+ ... output_catalog='./catalog',
27
+ ... scratch_location='./scratch',
28
+ ... stac_hook='module:mypackage.hooks:url_to_stac_item',
29
+ ... )
30
+
31
+ >>> # Script-based hook (serializable)
32
+ >>> config = ProcessingConfig(
33
+ ... stac_hook='script:/path/to/generate_stac.py',
34
+ ... )
35
+
36
+ >>> # Callable hook (local processing only)
37
+ >>> def my_hook(url: str) -> dict:
38
+ ... return {...}
39
+ >>> config = ProcessingConfig(
40
+ ... stac_hook=my_hook, # Only works with LocalProcessor
41
+ ... )
42
+
43
+ Hook Function Signature:
44
+ All hook functions must implement the following signature:
45
+
46
+ def hook_function(url: str, **kwargs) -> dict[str, Any] | pystac.Item | None:
47
+ '''Generate a STAC item from a URL.
48
+
49
+ Args:
50
+ url: The input URL/identifier to process.
51
+ **kwargs: Additional context (timeout, retry_attempts, etc.)
52
+
53
+ Returns:
54
+ A valid STAC item dictionary, pystac.Item object, or None if generation failed.
55
+ pystac.Item objects are automatically converted to dictionaries.
56
+ '''
57
+ pass
58
+
59
+ Batch Hook Signature (Optional):
60
+ For efficiency, hooks can optionally support batch processing:
61
+
62
+ def batch_hook_function(urls: list[str], **kwargs) -> list[dict[str, Any] | pystac.Item | None]:
63
+ '''Generate STAC items from multiple URLs.
64
+
65
+ Args:
66
+ urls: List of URLs/identifiers to process.
67
+ **kwargs: Additional context.
68
+
69
+ Returns:
70
+ List of STAC items (dict or pystac.Item) or None for failures, same order as input.
71
+ pystac.Item objects are automatically converted to dictionaries.
72
+ '''
73
+ pass
74
+ """
75
+
76
+ from __future__ import annotations
77
+
78
+ import importlib
79
+ import json
80
+ import logging
81
+ import subprocess # nosec B404 - subprocess used for external tool integration
82
+ import tempfile
83
+ from abc import ABC, abstractmethod
84
+ from collections.abc import Callable
85
+ from typing import Any, Protocol, runtime_checkable
86
+
87
+ import requests
88
+
89
+ logger = logging.getLogger(__name__)
90
+
91
+
92
+ # =============================================================================
93
+ # Helper Functions
94
+ # =============================================================================
95
+
96
+
97
+ def _normalize_stac_result(result: Any) -> dict[str, Any] | None:
98
+ """Convert hook result to a dictionary, handling pystac.Item objects.
99
+
100
+ Args:
101
+ result: The result from a hook function. Can be:
102
+ - dict: Returned as-is
103
+ - pystac.Item: Converted to dict via to_dict()
104
+ - None: Returned as-is
105
+ - Other: Returns None
106
+
107
+ Returns:
108
+ STAC item dictionary or None.
109
+ """
110
+ if result is None:
111
+ return None
112
+ if isinstance(result, dict):
113
+ return result
114
+ # Check for pystac.Item or any object with to_dict method
115
+ if hasattr(result, "to_dict") and callable(result.to_dict):
116
+ try:
117
+ converted = result.to_dict()
118
+ if isinstance(converted, dict):
119
+ return converted
120
+ return None
121
+ except (ValueError, TypeError, AttributeError, RuntimeError) as e:
122
+ logger.warning(f"Failed to convert pystac.Item to dict: {e}")
123
+ return None
124
+ return None
125
+
126
+
127
+ def _normalize_stac_results(results: list[Any]) -> list[dict[str, Any] | None]:
128
+ """Convert a list of hook results to dictionaries.
129
+
130
+ Args:
131
+ results: List of results from a batch hook function.
132
+
133
+ Returns:
134
+ List of STAC item dictionaries (or None for failures).
135
+ """
136
+ return [_normalize_stac_result(r) for r in results]
137
+
138
+
139
+ # =============================================================================
140
+ # Protocol Definitions
141
+ # =============================================================================
142
+
143
+
144
@runtime_checkable
class STACHookProtocol(Protocol):
    """Structural interface for objects that can produce one STAC item per URL.

    Being ``runtime_checkable``, ``isinstance(obj, STACHookProtocol)`` succeeds
    for any object that has a ``fetch`` attribute.
    """

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Produce a STAC item for a single URL or identifier.

        Args:
            url: The URL or identifier to process.
            **kwargs: Extra context such as ``timeout`` or ``retry_attempts``.

        Returns:
            The STAC item as a dictionary, or None if it could not be produced.
        """
159
+
160
+
161
@runtime_checkable
class BatchSTACHookProtocol(Protocol):
    """Structural interface for objects that can produce STAC items in bulk.

    Being ``runtime_checkable``, ``isinstance(obj, BatchSTACHookProtocol)``
    succeeds for any object that has a ``fetch_batch`` attribute.
    """

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Produce STAC items for several URLs or identifiers at once.

        Args:
            urls: The URLs or identifiers to process.
            **kwargs: Extra context forwarded by the caller.

        Returns:
            One entry per input, in input order: a STAC item dictionary, or
            None where that input failed.
        """
176
+
177
+
178
+ # =============================================================================
179
+ # Hook Implementations
180
+ # =============================================================================
181
+
182
+
183
class BaseSTACHook(ABC):
    """Abstract parent of every STAC fetch hook.

    Subclasses must provide ``fetch``. ``fetch_batch`` has a sequential
    default, and ``to_config`` must be overridden by hooks that can be
    expressed as a config string.
    """

    @abstractmethod
    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Produce the STAC item for one URL, or None on failure."""

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Produce STAC items for many URLs by calling ``fetch`` once per URL.

        Args:
            urls: URLs or identifiers to process.
            **kwargs: Forwarded unchanged to every ``fetch`` call.

        Returns:
            One result per input URL, in input order.
        """
        collected: list[dict[str, Any] | None] = []
        for single_url in urls:
            collected.append(self.fetch(single_url, **kwargs))
        return collected

    def to_config(self) -> str:
        """Return the config string describing this hook.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this method.
        """
        raise NotImplementedError("Subclass must implement to_config()")
198
+
199
+
200
class DefaultSTACHook(BaseSTACHook):
    """Default hook: fetch STAC JSON directly from URL.

    This is the standard behavior - assumes the URL points to a valid STAC item JSON.
    """

    def fetch(
        self,
        url: str,
        timeout: int = 30,
        retry_attempts: int = 3,
        **kwargs,
    ) -> dict[str, Any] | None:
        """Download and parse a STAC item from a URL.

        Request failures (``requests.RequestException``) are retried with an
        exponential backoff of ``2**attempt`` seconds between attempts. A
        response body that is not valid JSON, or whose top-level JSON value is
        not an object, is reported and yields None.

        Args:
            url: URL to STAC item JSON.
            timeout: Request timeout in seconds.
            retry_attempts: Maximum number of attempts. With a value of 0 or
                less no request is made and None is returned.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Parsed STAC item dictionary, or None if the download or parsing failed.
        """
        import time

        for attempt in range(retry_attempts):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                item = response.json()
            except requests.RequestException as e:
                if attempt == retry_attempts - 1:
                    logger.warning(f"Failed to download {url} after {retry_attempts} attempts: {e}")
                    return None
                time.sleep(2**attempt)  # Exponential backoff
                continue
            except ValueError as e:
                # Some requests versions raise a plain ValueError (not a
                # RequestException) for a non-JSON body; do not let it escape.
                logger.warning(f"Response from {url} is not valid JSON: {e}")
                return None

            if not isinstance(item, dict):
                # Valid JSON, but a list/string/number cannot be a STAC item.
                logger.warning(f"Response from {url} is not a JSON object: got {type(item).__name__}")
                return None
            return item

        return None

    def to_config(self) -> str:
        """Return config string for default hook."""
        return "default"
240
+
241
+
242
class ModuleSTACHook(BaseSTACHook):
    """Hook backed by a Python function that is located through an import path.

    Only the path string is stored at construction time; the function itself
    is imported lazily on first use and then cached on the instance. This is
    the approach recommended for Dask distributed processing, because the
    path string can be serialized and resolved again on each worker.

    The path format is ``'package.module:function_name'``. If the module also
    defines ``function_name_batch``, that function is used for batch calls.

    Example:
        >>> hook = ModuleSTACHook('mypackage.stac_generator:url_to_item')
        >>> item = hook.fetch('https://example.com/data.tif')
    """

    def __init__(self, module_path: str):
        """Store the import path; nothing is imported yet.

        Args:
            module_path: Import path in format 'package.module:function_name'
        """
        self.module_path = module_path
        self._func: Callable | None = None
        self._batch_func: Callable | None = None

    def _load_function(self) -> Callable[..., Any]:
        """Resolve the hook function, importing it on the first call.

        Returns:
            The single-URL hook function.

        Raises:
            ImportError: If the path is malformed, the module cannot be
                imported, or the function is missing from the module.
        """
        if self._func is None:
            try:
                module_name, func_name = self.module_path.rsplit(":", 1)
                module = importlib.import_module(module_name)
                self._func = getattr(module, func_name)

                # Convention: an optional batch variant is named '<func_name>_batch'.
                batch_name = f"{func_name}_batch"
                if hasattr(module, batch_name):
                    self._batch_func = getattr(module, batch_name)
            except (ValueError, ImportError, AttributeError) as e:
                raise ImportError(f"Failed to import hook function from '{self.module_path}': {e}") from e

        return self._func

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Run the imported function for a single URL.

        Args:
            url: URL or identifier to process.
            **kwargs: Additional context passed to the function.

        Returns:
            STAC item dictionary (``pystac.Item`` results are converted), or
            None if the function raised one of the handled errors.
        """
        hook_fn = self._load_function()
        try:
            return _normalize_stac_result(hook_fn(url, **kwargs))
        except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
            logger.warning(f"Hook function failed for {url}: {e}")
            return None

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Process several URLs, preferring the module's batch function.

        The batch function's answer is only accepted when it is a list with
        one entry per input URL; otherwise, or when it raises one of the
        handled errors, every URL is processed individually via ``fetch``.

        Args:
            urls: List of URLs to process.
            **kwargs: Additional context.

        Returns:
            List of STAC item dictionaries (or None for failures).
        """
        self._load_function()  # Populates self._batch_func as a side effect
        batch_fn = self._batch_func

        if batch_fn is not None:
            try:
                batch_output = batch_fn(urls, **kwargs)
                shape_ok = isinstance(batch_output, list) and len(batch_output) == len(urls)
                if shape_ok:
                    return _normalize_stac_results(batch_output)
                logger.warning("Batch hook returned invalid result, falling back to sequential")
            except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
                logger.warning(f"Batch hook failed: {e}, falling back to sequential")

        # Sequential fallback: one fetch() per URL.
        return super().fetch_batch(urls, **kwargs)

    def to_config(self) -> str:
        """Return the ``module:<import path>`` config string for this hook."""
        return "module:" + self.module_path
331
+
332
+
333
class PassthroughSTACHook(BaseSTACHook):
    """Hook that treats input URLs as pre-fetched STAC item JSON.

    This hook is designed for processing NDJSON files that already contain
    complete STAC items, eliminating the need for HTTP fetching. The "URL"
    in the input file is actually the full STAC item JSON as a string.

    Use Cases:
        - ITS_LIVE bulk data: NDJSON files with pre-aggregated STAC items
        - Local STAC collections: STAC items already downloaded
        - Performance optimization: Skip HTTP fetch for cached data

    Example:
        >>> hook = PassthroughSTACHook()
        >>> # url is actually a JSON string
        >>> item = hook.fetch('{"type": "Feature", "id": "abc", ...}')
        >>> print(item['id'])
        'abc'
    """

    # Keys that must be present for the parsed object to be accepted.
    _REQUIRED_KEYS = ("geometry", "properties")

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Parse the input string as a STAC item and run basic structural checks.

        The parsed value must be a JSON object with ``"type": "Feature"`` and
        with ``geometry`` and ``properties`` keys present. No network access
        is performed.

        Args:
            url: JSON string containing a complete STAC item.
            **kwargs: Additional context (timeout, retry_attempts) - ignored for passthrough.

        Returns:
            Parsed STAC item dictionary, or None if the JSON is invalid or
            fails the structural checks.
        """
        try:
            # Input "url" is actually a JSON string
            item = json.loads(url)
        except json.JSONDecodeError as e:
            logger.warning(f"Passthrough hook failed to parse JSON: {e}")
            return None
        except (ValueError, TypeError, OSError) as e:
            # e.g. TypeError when the input is not str/bytes at all
            logger.warning(f"Passthrough hook error: {e}")
            return None

        if not isinstance(item, dict):
            # Report the type of the parsed value; the raw input is always a string.
            logger.warning(f"Passthrough hook input is not a dict: {type(item)}")
            return None

        # Basic STAC item validation - should have at minimum 'type': 'Feature'
        if item.get("type") != "Feature":
            logger.warning(f"Passthrough hook input missing 'type': 'Feature': {url[:100]}...")
            return None

        for key in self._REQUIRED_KEYS:
            if key not in item:
                logger.warning(f"Passthrough hook input missing '{key}': {url[:100]}...")
                return None

        return item

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Parse multiple JSON strings as STAC items.

        Args:
            urls: List of JSON strings containing STAC items.
            **kwargs: Forwarded to ``fetch``, which ignores them.

        Returns:
            List of STAC item dictionaries (or None for failures), in same order as input.
        """
        return [self.fetch(url, **kwargs) for url in urls]

    def to_config(self) -> str:
        """Return config string for passthrough hook."""
        return "passthrough"
414
+
415
+
416
class ScriptSTACHook(BaseSTACHook):
    """Hook that calls an external script/executable.

    The script receives the URL as an argument and should output valid STAC JSON
    to stdout. For batch processing, the script can accept multiple URLs.

    Script invocation:
        Single: script_path URL
        Batch: script_path --batch   (URLs on stdin, one per line)

    The script should exit with code 0 on success, non-zero on failure.
    For batch mode, output should be NDJSON (one JSON object per line).

    Example:
        >>> hook = ScriptSTACHook('/path/to/generate_stac.py')
        >>> item = hook.fetch('https://example.com/data.tif')
    """

    def __init__(self, script_path: str, interpreter: str | None = None):
        """Initialize with script path.

        Args:
            script_path: Path to the script/executable.
            interpreter: Optional interpreter (e.g., 'python', 'python3').
                If None, script is executed directly.
        """
        self.script_path = script_path
        self.interpreter = interpreter

    def fetch(
        self,
        url: str,
        timeout: int = 60,
        **kwargs,
    ) -> dict[str, Any] | None:
        """Run the script for one URL and parse its stdout as JSON.

        Args:
            url: URL to pass to the script as its only argument.
            timeout: Script execution timeout in seconds.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Parsed JSON from the script's stdout, or None if the script timed
            out, exited non-zero, could not be started, or printed invalid JSON.
        """
        cmd = self._build_command([url])

        try:
            result = subprocess.run(  # nosec B603 - command built from validated config
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                check=True,
            )
            return json.loads(result.stdout.strip())
        except subprocess.TimeoutExpired:
            logger.warning(f"Script timed out for {url}")
            return None
        except subprocess.CalledProcessError as e:
            logger.warning(f"Script failed for {url}: {e.stderr}")
            return None
        except json.JSONDecodeError as e:
            logger.warning(f"Script output is not valid JSON for {url}: {e}")
            return None
        except OSError as e:
            # Missing or non-executable script/interpreter. fetch_batch falls
            # back to this method after the same error, so it must not raise.
            logger.warning(f"Script could not be executed for {url}: {e}")
            return None

    def fetch_batch(
        self,
        urls: list[str],
        timeout: int = 300,
        **kwargs,
    ) -> list[dict[str, Any] | None]:
        """Run the script in batch mode, falling back to one call per URL.

        The URLs are sent to ``script --batch`` on stdin, one per line, and
        stdout is parsed as NDJSON. The batch answer is only used when the
        script exits with code 0 and yields exactly one line per input URL.

        Args:
            urls: List of URLs to process.
            timeout: Script execution timeout in seconds for the batch run.
            **kwargs: Forwarded to ``fetch`` on the single-URL and fallback paths.

        Returns:
            List of STAC items (or None for failures), in input order.
        """
        if not urls:
            return []

        if len(urls) == 1:
            return [self.fetch(urls[0], timeout=timeout, **kwargs)]

        # Try batch mode first
        cmd = self._build_command(["--batch"])

        try:
            # Feed the URLs through a pipe instead of a temp file, so nothing
            # is left on disk when the run times out or fails to start.
            result = subprocess.run(  # nosec B603 - command built from validated config
                cmd,
                input="\n".join(urls),
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            logger.warning("Batch script timed out, falling back to sequential")
        except (subprocess.CalledProcessError, OSError, ValueError) as e:
            logger.warning(f"Batch script failed: {e}, falling back to sequential")
        else:
            if result.returncode == 0:
                # Parse NDJSON output; blank or malformed lines become None
                items: list[dict[str, Any] | None] = []
                for line in result.stdout.strip().split("\n"):
                    if not line.strip():
                        items.append(None)
                        continue
                    try:
                        items.append(json.loads(line))
                    except json.JSONDecodeError:
                        items.append(None)

                if len(items) == len(urls):
                    return items

                logger.warning("Batch script returned wrong number of items, falling back to sequential")
            else:
                logger.warning(
                    f"Batch script exited with code {result.returncode}, falling back to sequential"
                )

        # Fall back to sequential processing.
        # NOTE: ``timeout`` is a named parameter and therefore not part of
        # ``kwargs``; each fallback fetch() uses its own default timeout.
        return super().fetch_batch(urls, **kwargs)

    def _build_command(self, args: list[str]) -> list[str]:
        """Build the argv list: optional interpreter, script path, then ``args``."""
        if self.interpreter:
            return [self.interpreter, self.script_path, *args]
        return [self.script_path, *args]

    def to_config(self) -> str:
        """Return config string for script hook."""
        if self.interpreter:
            return f"script:{self.interpreter}:{self.script_path}"
        return f"script:{self.script_path}"
561
+
562
+
563
class CallableSTACHook(BaseSTACHook):
    """Hook that wraps a Python callable.

    WARNING: This hook is NOT serializable and will not work with Dask
    distributed processing. Use ModuleSTACHook for distributed workloads.

    This hook is useful for local processing or testing where you want to
    pass a function directly without creating a module.

    Example:
        >>> def my_hook(url, **kwargs):
        ...     return {'type': 'Feature', 'id': url, ...}
        >>> hook = CallableSTACHook(my_hook)
        >>> item = hook.fetch('https://example.com/data.tif')
    """

    def __init__(
        self,
        func: Callable[[str], dict[str, Any] | None],
        batch_func: Callable[[list[str]], list[dict[str, Any] | None]] | None = None,
    ):
        """Initialize with callable.

        Args:
            func: Function that takes URL and returns STAC item or None.
                It is called as ``func(url, **kwargs)``.
            batch_func: Optional batch function for efficient batch processing.
                It is called as ``batch_func(urls, **kwargs)``.
        """
        self.func = func
        self.batch_func = batch_func

    def fetch(self, url: str, **kwargs) -> dict[str, Any] | None:
        """Invoke the callable with the URL.

        Args:
            url: URL to process.
            **kwargs: Additional context, forwarded to the callable.

        Returns:
            STAC item dictionary (``pystac.Item`` results are converted), or
            None if the callable raised one of the handled errors.
        """
        try:
            result = self.func(url, **kwargs)
            return _normalize_stac_result(result)
        except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
            logger.warning(f"Callable hook failed for {url}: {e}")
            return None

    def fetch_batch(self, urls: list[str], **kwargs) -> list[dict[str, Any] | None]:
        """Invoke batch callable if available, otherwise sequential.

        The batch result is only accepted when it is a list with one entry per
        input URL; otherwise, or when the batch callable raises one of the
        handled errors, every URL is processed individually via ``fetch``.

        Args:
            urls: List of URLs to process.
            **kwargs: Additional context.

        Returns:
            List of STAC item dictionaries (or None for failures).
        """
        if self.batch_func is not None:
            try:
                results = self.batch_func(urls, **kwargs)
                if isinstance(results, list) and len(results) == len(urls):
                    return _normalize_stac_results(results)
                # Mirror ModuleSTACHook: make the fallback visible instead of silent.
                logger.warning("Batch callable returned invalid result, falling back to sequential")
            except (ValueError, TypeError, AttributeError, RuntimeError, OSError) as e:
                logger.warning(f"Batch callable failed: {e}, falling back to sequential")

        return super().fetch_batch(urls, **kwargs)

    def to_config(self) -> str:
        """Return a descriptive ``callable:<module>.<name>`` string.

        The string is informational only: it identifies the wrapped function
        but does not make the hook reconstructible from configuration.
        """
        func_name = getattr(self.func, "__name__", "anonymous")
        module = getattr(self.func, "__module__", "unknown")
        return f"callable:{module}.{func_name}"
635
+
636
+
637
+ # =============================================================================
638
+ # Hook Factory
639
+ # =============================================================================
640
+
641
+
642
def parse_hook_config(config: str | Callable | BaseSTACHook | None) -> BaseSTACHook:
    """Turn a hook configuration value into a ready-to-use hook instance.

    Args:
        config: One of the following:
            - None or "default": yields a DefaultSTACHook
            - "passthrough": yields a PassthroughSTACHook (inputs are STAC JSON strings)
            - "module:path.to.module:function": yields a ModuleSTACHook
            - "script:/path/to/script": yields a ScriptSTACHook
            - "script:python:/path/to/script.py": yields a ScriptSTACHook with interpreter
            - a callable: wrapped in a CallableSTACHook (local only)
            - a BaseSTACHook instance: returned unchanged

    Returns:
        Configured hook instance.

    Raises:
        ValueError: If ``config`` has an unsupported type or an unrecognized
            string format.
    """
    # Non-string inputs (and the default) are resolved first.
    if config is None or config == "default":
        return DefaultSTACHook()
    if isinstance(config, BaseSTACHook):
        return config
    if callable(config):
        return CallableSTACHook(config)
    if not isinstance(config, str):
        raise ValueError(f"Invalid hook config type: {type(config)}")

    if config == "passthrough":
        return PassthroughSTACHook()

    # Everything else is '<scheme>:<remainder>'.
    scheme, colon, remainder = config.partition(":")

    if colon and scheme == "module":
        return ModuleSTACHook(remainder)

    if colon and scheme == "script":
        # A remainder starting with '/' is an absolute path; otherwise a
        # colon separates the interpreter from the script path.
        if remainder.startswith("/") or ":" not in remainder:
            return ScriptSTACHook(remainder)
        interpreter, _, script_path = remainder.partition(":")
        return ScriptSTACHook(script_path, interpreter=interpreter)

    raise ValueError(
        f"Invalid hook config: {config}. Expected 'default', 'passthrough', 'module:path:func', "
        "'script:/path', or a callable."
    )
693
+
694
+
695
def get_hook(config: str | Callable | BaseSTACHook | None = None) -> BaseSTACHook:
    """Return the hook instance described by ``config``.

    Convenience entry point that delegates to ``parse_hook_config``; the only
    difference is that ``config`` is optional here and defaults to None.

    Args:
        config: Hook configuration, in any form ``parse_hook_config`` accepts.

    Returns:
        Configured hook instance.
    """
    hook = parse_hook_config(config)
    return hook
708
+
709
+
710
def serialize_hook(hook: BaseSTACHook | Callable | None) -> str:
    """Serialize a hook to a config string for transmission.

    Args:
        hook: Hook instance, plain callable, or None.

    Returns:
        - "default" for None
        - ``hook.to_config()`` for BaseSTACHook instances
        - ``"module:<module>:<name>"`` for callables that can be re-imported
          by name from their module

    Raises:
        ValueError: If ``hook`` is a callable that cannot be re-imported by
            name (lambda, nested function, method, anything defined in
            ``__main__``, or an object without ``__name__``), or is not a
            hook or callable at all.
    """
    if hook is None:
        return "default"

    if isinstance(hook, BaseSTACHook):
        return hook.to_config()

    if callable(hook):
        # Try to create a module path for the callable
        func_name = getattr(hook, "__name__", None)
        module = getattr(hook, "__module__", None)
        qualname = getattr(hook, "__qualname__", func_name)

        # Only a plain module-level name can be found again with
        # getattr(module, name) on a worker. Lambdas ('<lambda>') are not
        # identifiers; nested functions and methods have a dotted qualname.
        importable = (
            isinstance(func_name, str)
            and func_name.isidentifier()
            and qualname == func_name
            and bool(module)
            and module != "__main__"
        )
        if importable:
            return f"module:{module}:{func_name}"

        # Can't serialize anonymous, nested or __main__ functions
        raise ValueError(
            f"Cannot serialize callable {hook}. Use ModuleSTACHook with an importable "
            "function path for distributed processing."
        )

    raise ValueError(f"Cannot serialize hook: {hook}")
738
+
739
+
740
# Names exported by ``from <this module> import *``.
__all__ = [
    # Structural protocols
    "STACHookProtocol",
    "BatchSTACHookProtocol",
    # Hook classes
    "BaseSTACHook",
    "DefaultSTACHook",
    "PassthroughSTACHook",
    "ModuleSTACHook",
    "ScriptSTACHook",
    "CallableSTACHook",
    # Factory and serialization helpers
    "parse_hook_config",
    "get_hook",
    "serialize_hook",
    # Result normalization helpers (underscore-prefixed, but exported here)
    "_normalize_stac_result",
    "_normalize_stac_results",
]