sfeos-helpers 6.1.0__py3-none-any.whl → 6.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
1
+ """Async index insertion strategies."""
2
+ import logging
3
+ from datetime import timedelta
4
+ from typing import Any, Dict, List
5
+
6
+ from fastapi import HTTPException, status
7
+
8
+ from stac_fastapi.sfeos_helpers.database import (
9
+ extract_date,
10
+ extract_first_date_from_index,
11
+ index_alias_by_collection_id,
12
+ mk_item_id,
13
+ )
14
+
15
+ from .base import BaseIndexInserter
16
+ from .index_operations import IndexOperations
17
+ from .managers import DatetimeIndexManager
18
+ from .selection import DatetimeBasedIndexSelector
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class DatetimeIndexInserter(BaseIndexInserter):
    """Async datetime-based index insertion strategy.

    Routes each item to a datetime-partitioned index for its collection,
    creating a new index when the collection is new, when an item predates
    every existing index, or when the newest index exceeds its size limit.
    """

    def __init__(self, client: Any, index_operations: IndexOperations):
        """Initialize the async datetime index inserter.

        Args:
            client: Async search engine client instance.
            index_operations (IndexOperations): Search engine adapter instance.
        """
        self.client = client
        self.index_operations = index_operations
        self.datetime_manager = DatetimeIndexManager(client, index_operations)

    @staticmethod
    def should_create_collection_index() -> bool:
        """Whether this strategy requires collection index creation.

        Returns:
            bool: False, as datetime strategy doesn't create collection indexes.
        """
        return False

    async def create_simple_index(self, client: Any, collection_id: str) -> str:
        """Create a simple index asynchronously.

        Args:
            client: Search engine client instance.
            collection_id (str): Collection identifier.

        Returns:
            str: Created index name.
        """
        return await self.index_operations.create_simple_index(client, collection_id)

    async def get_target_index(
        self, collection_id: str, product: Dict[str, Any]
    ) -> str:
        """Get target index for a single product.

        Args:
            collection_id (str): Collection identifier.
            product (Dict[str, Any]): Product data containing datetime information.

        Returns:
            str: Target index name for the product.
        """
        index_selector = DatetimeBasedIndexSelector(self.client)
        return await self._get_target_index_internal(
            index_selector, collection_id, product, check_size=True
        )

    async def prepare_bulk_actions(
        self, collection_id: str, items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Prepare bulk actions for multiple items.

        Items are sorted chronologically first so index existence and size
        handling can be resolved once per batch before routing each item.

        Args:
            collection_id (str): Collection identifier.
            items (List[Dict[str, Any]]): List of items to process.

        Returns:
            List[Dict[str, Any]]: List of bulk actions ready for execution.

        Raises:
            HTTPException: If the items list is empty.
        """
        if not items:
            msg = "The product list cannot be empty."
            logger.error(msg)
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=msg)

        items.sort(key=lambda item: item["properties"]["datetime"])
        index_selector = DatetimeBasedIndexSelector(self.client)

        await self._ensure_indexes_exist(index_selector, collection_id, items)
        await self._check_and_handle_oversized_index(
            index_selector, collection_id, items
        )

        actions = []
        for item in items:
            # Size was already handled for the whole batch above, so skip
            # per-item size checks here.
            target_index = await self._get_target_index_internal(
                index_selector, collection_id, item, check_size=False
            )
            actions.append(
                {
                    "_index": target_index,
                    "_id": mk_item_id(item["id"], item["collection"]),
                    "_source": item,
                }
            )

        return actions

    async def _get_target_index_internal(
        self,
        index_selector,
        collection_id: str,
        product: Dict[str, Any],
        check_size: bool = True,
    ) -> str:
        """Get target index with size checking internally.

        Args:
            index_selector: Index selector instance.
            collection_id (str): Collection identifier.
            product (Dict[str, Any]): Product data.
            check_size (bool): Whether to check index size limits.

        Returns:
            str: Target index name.
        """
        product_datetime = self.datetime_manager.validate_product_datetime(product)
        datetime_range = {"gte": product_datetime, "lte": product_datetime}
        target_index = await index_selector.select_indexes(
            [collection_id], datetime_range
        )
        all_indexes = await index_selector.get_collection_indexes(collection_id)

        # No indexes yet: bootstrap the collection's first datetime index.
        if not all_indexes:
            target_index = await self.datetime_manager.handle_new_collection(
                collection_id, product_datetime
            )
            await index_selector.refresh_cache()
            return target_index

        all_indexes.sort()
        start_date = extract_date(product_datetime)
        end_date = extract_first_date_from_index(all_indexes[0])

        # Item predates every existing index: extend the earliest alias back.
        if start_date < end_date:
            alias = await self.datetime_manager.handle_early_date(
                collection_id, start_date, end_date
            )
            await index_selector.refresh_cache()

            return alias

        # Only the newest (last) index can still grow, so only it is
        # subject to the size check below.
        if target_index != all_indexes[-1]:
            return target_index

        if check_size and await self.datetime_manager.size_manager.is_index_oversized(
            target_index
        ):
            target_index = await self.datetime_manager.handle_oversized_index(
                collection_id, target_index, product_datetime
            )
            await index_selector.refresh_cache()

        return target_index

    async def _ensure_indexes_exist(
        self, index_selector, collection_id: str, items: List[Dict[str, Any]]
    ):
        """Ensure necessary indexes exist for the items.

        Creates the first datetime index (named after the earliest item's
        date — items are pre-sorted by the caller) when none exist yet.

        Args:
            index_selector: Index selector instance.
            collection_id (str): Collection identifier.
            items (List[Dict[str, Any]]): List of items to process.
        """
        all_indexes = await index_selector.get_collection_indexes(collection_id)

        if not all_indexes:
            first_item = items[0]
            await self.index_operations.create_datetime_index(
                self.client,
                collection_id,
                extract_date(first_item["properties"]["datetime"]),
            )
            await index_selector.refresh_cache()

    async def _check_and_handle_oversized_index(
        self, index_selector, collection_id: str, items: List[Dict[str, Any]]
    ) -> None:
        """Check if index is oversized and create new index if needed.

        Checks if the index where the first item would be inserted is oversized.
        If so, creates a new index starting from the next day.

        Args:
            index_selector: Index selector instance.
            collection_id (str): Collection identifier.
            items (List[Dict[str, Any]]): List of items to process.

        Returns:
            None
        """
        first_item = items[0]
        first_item_index = await self._get_target_index_internal(
            index_selector, collection_id, first_item, check_size=False
        )

        all_indexes = await index_selector.get_collection_indexes(collection_id)
        all_indexes.sort()
        latest_index = all_indexes[-1]

        # Older indexes are closed ranges; only the latest can be oversized.
        if first_item_index != latest_index:
            return None

        if not await self.datetime_manager.size_manager.is_index_oversized(
            first_item_index
        ):
            return None

        # Close the oversized index at its latest item's date and open a
        # fresh index starting the following day.
        latest_item = await self.index_operations.find_latest_item_in_index(
            self.client, latest_index
        )
        product_datetime = latest_item["_source"]["properties"]["datetime"]
        end_date = extract_date(product_datetime)
        await self.index_operations.update_index_alias(
            self.client, str(end_date), latest_index
        )
        next_day_start = end_date + timedelta(days=1)
        await self.index_operations.create_datetime_index(
            self.client, collection_id, str(next_day_start)
        )
        await index_selector.refresh_cache()
239
+
240
+
241
class SimpleIndexInserter(BaseIndexInserter):
    """Simple async index insertion strategy.

    All items for a collection are written through the collection's single
    alias; no datetime partitioning or size management is involved.
    """

    def __init__(self, index_operations: IndexOperations, client: Any):
        """Initialize the async simple index inserter.

        Args:
            index_operations (IndexOperations): Search engine adapter instance.
            client: Async search engine client instance.
        """
        self.search_adapter = index_operations
        self.client = client

    @staticmethod
    def should_create_collection_index() -> bool:
        """Report whether a per-collection index must be created up front.

        Returns:
            bool: True, as simple strategy creates collection indexes.
        """
        return True

    async def create_simple_index(self, client: Any, collection_id: str) -> str:
        """Create a plain (non-partitioned) index for a collection.

        Args:
            client: Search engine client instance.
            collection_id (str): Collection identifier.

        Returns:
            str: Created index name.
        """
        created_index = await self.search_adapter.create_simple_index(
            client, collection_id
        )
        return created_index

    async def get_target_index(
        self, collection_id: str, product: Dict[str, Any]
    ) -> str:
        """Resolve the write target, which is always the collection alias.

        Args:
            collection_id (str): Collection identifier.
            product (Dict[str, Any]): Product data (ignored by this strategy).

        Returns:
            str: Collection alias name.
        """
        return index_alias_by_collection_id(collection_id)

    async def prepare_bulk_actions(
        self, collection_id: str, items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Build bulk actions that all target the collection alias.

        Args:
            collection_id (str): Collection identifier.
            items (List[Dict[str, Any]]): List of items to process.

        Returns:
            List[Dict[str, Any]]: List of bulk actions with collection alias as target.
        """
        alias = index_alias_by_collection_id(collection_id)
        actions = []
        for item in items:
            actions.append(
                {
                    "_index": alias,
                    "_id": mk_item_id(item["id"], item["collection"]),
                    "_source": item,
                }
            )
        return actions
@@ -0,0 +1,198 @@
1
+ """Index management utilities."""
2
+
3
+ import logging
4
+ import os
5
+ from datetime import datetime, timedelta
6
+ from typing import Any, Dict
7
+
8
+ from fastapi import HTTPException, status
9
+
10
+ from stac_fastapi.sfeos_helpers.database import (
11
+ extract_date,
12
+ extract_first_date_from_index,
13
+ )
14
+
15
+ from .index_operations import IndexOperations
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class IndexSizeManager:
    """Manages index size limits and operations."""

    def __init__(self, client: Any):
        """Initialize the index size manager.

        Args:
            client: Search engine client instance.
        """
        self.client = client
        # The limit is read once at construction; later environment changes
        # do not affect an existing manager instance.
        self.max_size_gb = self._get_max_size_from_env()

    async def get_index_size_in_gb(self, index_name: str) -> float:
        """Get index size in gigabytes asynchronously.

        Args:
            index_name (str): Name of the index to check.

        Returns:
            float: Size of the index primaries in decimal gigabytes.
        """
        data = await self.client.indices.stats(index=index_name)
        return data["_all"]["primaries"]["store"]["size_in_bytes"] / 1e9

    async def is_index_oversized(self, index_name: str) -> bool:
        """Check if index exceeds size limit asynchronously.

        Args:
            index_name (str): Name of the index to check.

        Returns:
            bool: True if index exceeds size limit, False otherwise.
        """
        size_gb = await self.get_index_size_in_gb(index_name)
        is_oversized = size_gb > self.max_size_gb
        gb_milestone = int(size_gb)
        if gb_milestone > 0:
            logger.info(f"Index '{index_name}' size: {gb_milestone}GB")

        if is_oversized:
            logger.warning(
                f"Index '{index_name}' is oversized: {size_gb:.2f} GB "
                f"(limit: {self.max_size_gb} GB)"
            )

        return is_oversized

    @staticmethod
    def _get_max_size_from_env() -> float:
        """Get max size from environment variable with error handling.

        Reads DATETIME_INDEX_MAX_SIZE_GB. Values that are unparsable or not
        strictly positive are logged and replaced by the 25.0 GB default;
        no exception propagates to the caller.

        Returns:
            float: Maximum index size in GB.
        """
        env_value = os.getenv("DATETIME_INDEX_MAX_SIZE_GB", "25")

        try:
            max_size = float(env_value)
        except (ValueError, TypeError):
            # Unparsable value: force the fallback branch below.
            max_size = -1.0

        # Validation happens outside the try block so that non-positive
        # values are not raised and immediately swallowed by our own except.
        if max_size > 0:
            return max_size

        logger.warning(
            f"Invalid value for DATETIME_INDEX_MAX_SIZE_GB environment variable: "
            f"'{env_value}'. Must be a positive number. Using default value 25.0 GB."
        )
        return 25.0
94
+
95
+
96
class DatetimeIndexManager:
    """Manages datetime-based index operations."""

    def __init__(self, client: Any, index_operations: IndexOperations):
        """Initialize the datetime index manager.

        Args:
            client: Search engine client instance.
            index_operations (IndexOperations): Search engine adapter instance.
        """
        self.client = client
        self.index_operations = index_operations
        self.size_manager = IndexSizeManager(client)

    @staticmethod
    def validate_product_datetime(product: Dict[str, Any]) -> str:
        """Validate and extract datetime from product.

        Args:
            product (Dict[str, Any]): Product data containing datetime information.

        Returns:
            str: Validated product datetime.

        Raises:
            HTTPException: If product datetime is missing, empty, or the
                "properties"/"datetime" keys are absent.
        """
        # Chained .get() so an absent "properties" or "datetime" key yields
        # the documented HTTPException instead of an unhandled KeyError.
        product_datetime = product.get("properties", {}).get("datetime")
        if not product_datetime:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Product datetime is required for indexing",
            )
        return product_datetime

    async def handle_new_collection(
        self, collection_id: str, product_datetime: str
    ) -> str:
        """Handle index creation for new collection asynchronously.

        Args:
            collection_id (str): Collection identifier.
            product_datetime (str): Product datetime for index naming.

        Returns:
            str: Created index name.
        """
        target_index = await self.index_operations.create_datetime_index(
            self.client, collection_id, extract_date(product_datetime)
        )
        logger.info(
            f"Successfully created index '{target_index}' for collection '{collection_id}'"
        )
        return target_index

    async def handle_early_date(
        self, collection_id: str, start_date: datetime, end_date: datetime
    ) -> str:
        """Handle product with date earlier than existing indexes asynchronously.

        Moves the earliest index's alias so its covered range starts at the
        new, earlier date.

        Args:
            collection_id (str): Collection identifier.
            start_date (datetime): Start date for the new index.
            end_date (datetime): End date for alias update.

        Returns:
            str: Updated alias name.
        """
        old_alias = self.index_operations.create_alias_name(
            collection_id, str(end_date)
        )
        new_alias = self.index_operations.create_alias_name(
            collection_id, str(start_date)
        )
        await self.index_operations.change_alias_name(self.client, old_alias, new_alias)
        return new_alias

    async def handle_oversized_index(
        self, collection_id: str, target_index: str, product_datetime: str
    ) -> str:
        """Handle index that exceeds size limit asynchronously.

        Args:
            collection_id (str): Collection identifier.
            target_index (str): Current target index name.
            product_datetime (str): Product datetime for new index.

        Returns:
            str: New or updated index name.
        """
        end_date = extract_date(product_datetime)
        latest_index_start = extract_first_date_from_index(target_index)

        # Roll over only when the product's date differs from the current
        # index's start date; same-day items keep writing to the existing
        # index even though it is oversized.
        if end_date != latest_index_start:
            await self.index_operations.update_index_alias(
                self.client, str(end_date), target_index
            )
            target_index = await self.index_operations.create_datetime_index(
                self.client, collection_id, str(end_date + timedelta(days=1))
            )

        return target_index
@@ -0,0 +1,15 @@
1
+ """Index selection strategies package."""
2
+
3
+ from .base import BaseIndexSelector
4
+ from .cache_manager import IndexAliasLoader, IndexCacheManager
5
+ from .factory import IndexSelectorFactory
6
+ from .selectors import DatetimeBasedIndexSelector, UnfilteredIndexSelector
7
+
8
# Public API of the index selection strategies package, sorted alphabetically.
__all__ = [
    "BaseIndexSelector",
    "DatetimeBasedIndexSelector",
    "IndexAliasLoader",
    "IndexCacheManager",
    "IndexSelectorFactory",
    "UnfilteredIndexSelector",
]
@@ -0,0 +1,30 @@
1
+ """Base classes for index selection strategies."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List, Optional
5
+
6
+
7
class BaseIndexSelector(ABC):
    """Abstract contract for asynchronous index selectors."""

    @abstractmethod
    async def select_indexes(
        self,
        collection_ids: Optional[List[str]],
        datetime_search: Dict[str, Optional[str]],
    ) -> str:
        """Resolve which indexes a search should run against.

        Args:
            collection_ids (Optional[List[str]]): List of collection IDs to filter by.
            datetime_search (Dict[str, Optional[str]]): Datetime search criteria.

        Returns:
            str: Comma-separated string of selected index names.
        """
        ...

    @abstractmethod
    async def refresh_cache(self):
        """Refresh any cached index data (may be a no-op for some selectors)."""
        ...
@@ -0,0 +1,127 @@
1
+ """Cache management for index selection strategies."""
2
+
3
+ import threading
4
+ import time
5
+ from collections import defaultdict
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from stac_fastapi.sfeos_helpers.database import index_alias_by_collection_id
9
+ from stac_fastapi.sfeos_helpers.mappings import ITEMS_INDEX_PREFIX
10
+
11
+
12
class IndexCacheManager:
    """Manages caching of index aliases with expiration.

    All reads and writes of the cached mapping go through ``self._lock`` so
    concurrent readers never observe a partially updated cache.
    """

    def __init__(self, cache_ttl_seconds: int = 3600):
        """Initialize the cache manager.

        Args:
            cache_ttl_seconds (int): Time-to-live for cache entries in seconds.
        """
        self._cache: Optional[Dict[str, List[str]]] = None
        self._timestamp: float = 0
        self._ttl = cache_ttl_seconds
        self._lock = threading.Lock()

    @property
    def is_expired(self) -> bool:
        """Check if the cache has expired.

        Returns:
            bool: True if cache is expired, False otherwise.
        """
        return time.time() - self._timestamp > self._ttl

    def get_cache(self) -> Optional[Dict[str, List[str]]]:
        """Get the current cache if present and not expired.

        Returns:
            Optional[Dict[str, List[str]]]: A deep-enough copy of the cache
            (lists copied so callers cannot mutate shared state), or None if
            the cache is unset or expired.
        """
        with self._lock:
            # Explicit None check guards against reading a cleared cache.
            if self._cache is None or self.is_expired:
                return None
            return {k: v.copy() for k, v in self._cache.items()}

    def set_cache(self, data: Dict[str, List[str]]) -> None:
        """Set cache data and update timestamp.

        Args:
            data (Dict[str, List[str]]): Cache data to store.
        """
        # Writers must hold the same lock as get_cache(); the original
        # unlocked write raced with concurrent readers.
        with self._lock:
            self._cache = data
            self._timestamp = time.time()

    def clear_cache(self) -> None:
        """Clear the cache and reset timestamp."""
        with self._lock:
            self._cache = None
            self._timestamp = 0
59
+
60
+
61
class IndexAliasLoader:
    """Asynchronous loader for index aliases."""

    def __init__(self, client: Any, cache_manager: IndexCacheManager):
        """Initialize the async alias loader.

        Args:
            client: Async search engine client instance.
            cache_manager (IndexCacheManager): Cache manager instance.
        """
        self.client = client
        self.cache_manager = cache_manager

    async def load_aliases(self) -> Dict[str, List[str]]:
        """Load index aliases from the search engine and cache the result.

        Returns:
            Dict[str, List[str]]: Mapping of base aliases to item aliases.
        """
        response = await self.client.indices.get_alias(index=f"{ITEMS_INDEX_PREFIX}*")
        mapping = defaultdict(list)
        for index_info in response.values():
            alias_names = index_info.get("aliases", {})
            item_aliases = sorted(
                name for name in alias_names if name.startswith(ITEMS_INDEX_PREFIX)
            )
            if item_aliases:
                # First (lexicographically smallest) alias is the base key;
                # the remainder are grouped under it.
                base, *rest = item_aliases
                mapping[base].extend(rest)

        self.cache_manager.set_cache(mapping)
        return mapping

    async def get_aliases(self) -> Dict[str, List[str]]:
        """Return aliases from cache, reloading when the cache has expired.

        Returns:
            Dict[str, List[str]]: Alias mapping data.
        """
        cached = self.cache_manager.get_cache()
        if cached is None:
            return await self.load_aliases()
        return cached

    async def refresh_aliases(self) -> Dict[str, List[str]]:
        """Force a reload of aliases from the search engine.

        Returns:
            Dict[str, List[str]]: Fresh alias mapping data.
        """
        return await self.load_aliases()

    async def get_collection_indexes(self, collection_id: str) -> List[str]:
        """List every index alias belonging to a specific collection.

        Args:
            collection_id (str): Collection identifier.

        Returns:
            List[str]: List of index aliases for the collection.
        """
        alias_map = await self.get_aliases()
        return alias_map.get(index_alias_by_collection_id(collection_id), [])