sfeos-helpers 6.9.0__py3-none-any.whl → 6.10.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -2,14 +2,17 @@
 
 import logging
 import os
-from datetime import datetime, timedelta
-from typing import Any, Dict
+from datetime import timedelta
+from typing import Any, Dict, NamedTuple
 
+from dateutil import parser  # type: ignore
 from fastapi import HTTPException, status
 
 from stac_fastapi.sfeos_helpers.database import (
     extract_date,
     extract_first_date_from_index,
+    extract_last_date_from_index,
+    is_index_closed,
 )
 
 from .index_operations import IndexOperations
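
Note: the new dateutil import replaces stdlib datetime parsing for ISO strings. A minimal sketch of the parsing behavior the module now relies on (values are illustrative):

    from dateutil import parser

    dt = parser.isoparse("2024-06-01T12:30:00Z")
    dt.isoformat()  # '2024-06-01T12:30:00+00:00', a timezone-aware datetime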
@@ -17,6 +20,20 @@ from .index_operations import IndexOperations
 logger = logging.getLogger(__name__)
 
 
+class ProductDatetimes(NamedTuple):
+    """Named tuple representing product datetime fields.
+
+    Attributes:
+        start_datetime (str | None): ISO format start datetime string or None.
+        datetime (str | None): ISO format datetime string or None.
+        end_datetime (str | None): ISO format end datetime string or None.
+    """
+
+    start_datetime: str | None
+    datetime: str | None
+    end_datetime: str | None
+
+
 class IndexSizeManager:
     """Manages index size limits and operations."""
 
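For readers unfamiliar with typing.NamedTuple: ProductDatetimes instances behave like ordinary tuples with named, read-only fields. A minimal sketch (values are illustrative):

    pd = ProductDatetimes(
        start_datetime="2024-01-01T00:00:00Z",
        datetime=None,
        end_datetime="2024-01-31T23:59:59Z",
    )
    pd.start_datetime    # access by field name
    start, dt, end = pd  # plain tuple unpacking also works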
@@ -29,18 +46,6 @@ class IndexSizeManager:
         self.client = client
         self.max_size_gb = self._get_max_size_from_env()
 
-    async def get_index_size_in_gb(self, index_name: str) -> float:
-        """Get index size in gigabytes asynchronously.
-
-        Args:
-            index_name (str): Name of the index to check.
-
-        Returns:
-            float: Size of the index in gigabytes.
-        """
-        data = await self.client.indices.stats(index=index_name)
-        return data["_all"]["primaries"]["store"]["size_in_bytes"] / 1e9
-
     async def is_index_oversized(self, index_name: str) -> bool:
         """Check if index exceeds size limit asynchronously.
 
@@ -50,16 +55,33 @@
         Returns:
             bool: True if index exceeds size limit, False otherwise.
         """
-        size_gb = await self.get_index_size_in_gb(index_name)
+        stats = await self.client.indices.stats(index=index_name)
+
+        total_size_bytes = 0
+        total_doc_count = 0
+
+        for idx_name, idx_stats in stats["indices"].items():
+            primaries = idx_stats["primaries"]
+            total_size_bytes += primaries["store"]["size_in_bytes"]
+            total_doc_count += primaries["docs"]["count"]
+
+        if total_doc_count == 0:
+            logger.debug(f"Index '{index_name}' is empty (0 documents)")
+            return False
+
+        size_gb = total_size_bytes / (1024**3)
         is_oversized = size_gb > self.max_size_gb
         gb_milestone = int(size_gb)
+
         if gb_milestone > 0:
-            logger.info(f"Index '{index_name}' size: {gb_milestone}GB")
+            logger.info(
+                f"Index '{index_name}' size: {gb_milestone}GB ({total_doc_count} documents)"
+            )
 
         if is_oversized:
             logger.warning(
                 f"Index '{index_name}' is oversized: {size_gb:.2f} GB "
-                f"(limit: {self.max_size_gb} GB)"
+                f"(limit: {self.max_size_gb} GB, documents: {total_doc_count})"
             )
 
         return is_oversized
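
Two behavioral changes are worth noting in this hunk: the size check now sums primary-shard statistics across every concrete index matched by the name or alias (the removed get_index_size_in_gb read only the "_all" rollup), and the byte-to-gigabyte conversion switched from decimal (1e9) to binary (1024**3), so the same index now reports a slightly smaller figure. An illustrative sketch of the arithmetic:

    total_size_bytes = 5_368_709_120   # example: 5 GiB of primary-shard data
    total_size_bytes / (1024 ** 3)     # 5.0         -- new conversion
    total_size_bytes / 1e9             # 5.36870912  -- old conversion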
@@ -108,91 +130,353 @@ class DatetimeIndexManager:
         self.size_manager = IndexSizeManager(client)
 
     @staticmethod
-    def validate_product_datetime(product: Dict[str, Any]) -> str:
-        """Validate and extract datetime from product.
+    def validate_product_datetimes(
+        product: Dict[str, Any], use_datetime
+    ) -> ProductDatetimes:
+        """Validate and extract datetime fields from product.
+
+        Validation rules depend on USE_DATETIME:
+        - USE_DATETIME=True: 'datetime' is required, optional start/end
+        - USE_DATETIME=False: both 'start_datetime' and 'end_datetime' required, start <= end
 
         Args:
             product (Dict[str, Any]): Product data containing datetime information.
+            use_datetime (bool): Flag determining validation mode.
+                - True: validates against 'datetime' field.
+                - False: validates against 'start_datetime' and 'end_datetime' fields.
 
         Returns:
-            str: Validated product datetime.
+            ProductDatetimes: Named tuple containing parsed datetime values:
+                - start_datetime (str | None): ISO 8601 start datetime string or None.
+                - datetime (str | None): ISO 8601 datetime string or None.
+                - end_datetime (str | None): ISO 8601 end datetime string or None.
 
         Raises:
-            HTTPException: If product datetime is missing or invalid.
+            HTTPException: If validation fails based on USE_DATETIME configuration.
         """
-        product_datetime = product["properties"]["datetime"]
-        if not product_datetime:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail="Product datetime is required for indexing",
-            )
-        return product_datetime
+        properties = product.get("properties", {})
+        start_str = properties.get("start_datetime")
+        dt_str = properties.get("datetime")
+        end_str = properties.get("end_datetime")
+
+        start = parser.isoparse(start_str) if start_str else None
+        dt = parser.isoparse(dt_str) if dt_str else None
+        end = parser.isoparse(end_str) if end_str else None
+
+        if use_datetime:
+            if not dt:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="'datetime' field is required",
+                )
+        else:
+            if not start or not end:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="Both 'start_datetime' and 'end_datetime' fields are required",
+                )
+            if not (start <= end):
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="'start_datetime' must be <= 'end_datetime'",
+                )
+            if dt and not (start <= dt <= end):
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="'start_datetime' <= 'datetime' <= 'end_datetime' is required",
+                )
+
+        return ProductDatetimes(
+            start_datetime=start_str,
+            datetime=dt_str,
+            end_datetime=end_str,
+        )
 
     async def handle_new_collection(
-        self, collection_id: str, product_datetime: str
+        self,
+        collection_id: str,
+        primary_datetime_name: str,
+        product_datetimes: ProductDatetimes,
     ) -> str:
         """Handle index creation for new collection asynchronously.
 
         Args:
             collection_id (str): Collection identifier.
-            product_datetime (str): Product datetime for index naming.
-
+            primary_datetime_name (str): Name of the primary datetime field.
+                If "start_datetime", indexes are created on start_datetime and end_datetime fields.
+                If "datetime", indexes are created on the datetime field.
+            product_datetimes (ProductDatetimes): Object containing start_datetime, datetime, and end_datetime.
 
         Returns:
-            str: Created index name.
+            str: Created datetime index name.
         """
+        index_params = {
+            "start_datetime": str(extract_date(product_datetimes.start_datetime))
+            if primary_datetime_name == "start_datetime"
+            else None,
+            "datetime": str(extract_date(product_datetimes.datetime))
+            if primary_datetime_name == "datetime"
+            else None,
+            "end_datetime": str(extract_date(product_datetimes.end_datetime))
+            if primary_datetime_name == "start_datetime"
+            else None,
+        }
+
         target_index = await self.index_operations.create_datetime_index(
-            self.client, collection_id, extract_date(product_datetime)
+            self.client, collection_id, **index_params
         )
+
         logger.info(
             f"Successfully created index '{target_index}' for collection '{collection_id}'"
         )
         return target_index
 
     async def handle_early_date(
-        self, collection_id: str, start_date: datetime, end_date: datetime
+        self,
+        collection_id: str,
+        primary_datetime_name: str,
+        product_datetimes: ProductDatetimes,
+        old_aliases: Dict[str, str],
+        is_first_index: bool,
     ) -> str:
-        """Handle product with date earlier than existing indexes asynchronously.
+        """Handle product with datetime earlier than current index range.
 
         Args:
             collection_id (str): Collection identifier.
-            start_date (datetime): Start date for the new index.
-            end_date (datetime): End date for alias update.
+            primary_datetime_name (str): Primary datetime field name.
+            product_datetimes (ProductDatetimes): Product datetime values.
+            old_aliases (Dict[str, str]): Current datetime aliases.
+            is_first_index (bool): Whether this is the first index in the collection.
 
         Returns:
-            str: Updated alias name.
+            str: Datetime alias to use.
         """
-        old_alias = self.index_operations.create_alias_name(
-            collection_id, str(end_date)
-        )
-        new_alias = self.index_operations.create_alias_name(
-            collection_id, str(start_date)
+        if primary_datetime_name == "start_datetime":
+            return await self._handle_start_datetime_mode(
+                collection_id, product_datetimes, old_aliases, is_first_index
+            )
+        else:
+            return await self._handle_datetime_mode(
+                collection_id, product_datetimes, old_aliases, is_first_index
+            )
+
+    async def _handle_start_datetime_mode(
+        self,
+        collection_id: str,
+        product_datetimes: ProductDatetimes,
+        old_aliases: Dict[str, str],
+        is_first_index: bool,
+    ) -> str:
+        """Handle early-date logic for start/end datetime indexing.
+
+        Args:
+            collection_id (str): Collection identifier.
+            product_datetimes (ProductDatetimes): Product datetime values.
+            old_aliases (Dict[str, str]): Current start/end datetime aliases.
+            is_first_index (bool): Whether this is the first index in the collection.
+
+        Returns:
+            str: Primary datetime alias.
+        """
+        product_start = extract_date(product_datetimes.start_datetime)
+        product_end = extract_date(product_datetimes.end_datetime)
+
+        index_start = extract_first_date_from_index(old_aliases["start_datetime"])
+        index_end = extract_first_date_from_index(old_aliases["end_datetime"])
+        index_is_closed = is_index_closed(old_aliases["start_datetime"])
+
+        start_changed = product_start < index_start
+        end_changed = product_end > index_end
+
+        if not start_changed and not end_changed:
+            return old_aliases["start_datetime"]
+
+        new_aliases = []
+        old_alias_names = []
+        new_primary_alias = old_aliases["start_datetime"]
+
+        if start_changed:
+            if index_is_closed and is_first_index:
+                new_index_start = f"{product_start}-{index_start - timedelta(days=1)}"
+                return await self.index_operations.create_datetime_index(
+                    self.client,
+                    collection_id,
+                    str(new_index_start),
+                    None,
+                    str(product_end),
+                )
+            elif index_is_closed:
+                closed_end = extract_last_date_from_index(old_aliases["start_datetime"])
+                new_start_alias = self.index_operations.create_alias_name(
+                    collection_id, "start_datetime", f"{product_start}-{closed_end}"
+                )
+            else:
+                new_start_alias = self.index_operations.create_alias_name(
+                    collection_id, "start_datetime", str(product_start)
+                )
+            new_aliases.append(new_start_alias)
+            old_alias_names.append(old_aliases["start_datetime"])
+            new_primary_alias = new_start_alias
+
+        if end_changed:
+            new_end_alias = self.index_operations.create_alias_name(
+                collection_id, "end_datetime", str(product_end)
+            )
+            new_aliases.append(new_end_alias)
+            old_alias_names.append(old_aliases["end_datetime"])
+
+        if old_alias_names:
+            await self.index_operations.change_alias_name(
+                self.client,
+                old_aliases["start_datetime"],
+                old_alias_names,
+                new_aliases,
+            )
+
+        return new_primary_alias
+
+    async def _handle_datetime_mode(
+        self,
+        collection_id: str,
+        product_datetimes: ProductDatetimes,
+        old_aliases: Dict[str, str],
+        is_first_index: bool,
+    ) -> str:
+        """Handle early-date logic for single datetime indexing.
+
+        Args:
+            collection_id (str): Collection identifier.
+            product_datetimes (ProductDatetimes): Product datetime values.
+            old_aliases (Dict[str, str]): Current datetime alias.
+            is_first_index (bool): Whether this is the first index in the collection.
+
+        Returns:
+            str: Datetime alias to use.
+        """
+        product_dt = extract_date(product_datetimes.datetime)
+
+        index_start = extract_first_date_from_index(old_aliases["datetime"])
+        index_is_closed = is_index_closed(old_aliases["datetime"])
+
+        if is_first_index and index_is_closed:
+            new_index_start = f"{product_dt}-{index_start - timedelta(days=1)}"
+            return await self.index_operations.create_datetime_index(
+                self.client, collection_id, None, str(new_index_start), None
+            )
+        elif index_is_closed:
+            index_end = extract_last_date_from_index(old_aliases["datetime"])
+            start_changed = product_dt < index_start
+
+            if not start_changed:
+                return old_aliases["datetime"]
+
+            new_alias = self.index_operations.create_alias_name(
+                collection_id, "datetime", f"{product_dt}-{index_end}"
+            )
+        else:
+            if product_dt >= index_start:
+                return old_aliases["datetime"]
+
+            new_alias = self.index_operations.create_alias_name(
+                collection_id, "datetime", str(product_dt)
+            )
+
+        await self.index_operations.change_alias_name(
+            self.client,
+            old_aliases["datetime"],
+            [old_aliases["datetime"]],
+            [new_alias],
         )
-        await self.index_operations.change_alias_name(self.client, old_alias, new_alias)
+
         return new_alias
 
     async def handle_oversized_index(
-        self, collection_id: str, target_index: str, product_datetime: str
+        self,
+        collection_id: str,
+        primary_datetime_name: str,
+        product_datetimes: ProductDatetimes,
+        latest_index_datetimes: ProductDatetimes | None,
+        old_aliases: Dict[str, str],
     ) -> str:
         """Handle index that exceeds size limit asynchronously.
 
         Args:
             collection_id (str): Collection identifier.
-            target_index (str): Current target index name.
-            product_datetime (str): Product datetime for new index.
+            primary_datetime_name (str): Name of the primary datetime field.
+                If "start_datetime", handles start_datetime and end_datetime fields.
+                If "datetime", handles the datetime field.
+            product_datetimes (ProductDatetimes): Product datetime values.
+            latest_index_datetimes (ProductDatetimes | None): Datetime range of the latest index.
+            old_aliases (Dict[str, str]): Current datetime aliases.
 
         Returns:
-            str: New or updated index name.
+            str: Updated or newly created datetime alias name.
         """
-        end_date = extract_date(product_datetime)
-        latest_index_start = extract_first_date_from_index(target_index)
+        current_alias = old_aliases[primary_datetime_name]
+        new_aliases = []
+        old_alias_names = []
 
-        if end_date != latest_index_start:
-            await self.index_operations.update_index_alias(
-                self.client, str(end_date), target_index
+        if primary_datetime_name == "start_datetime":
+            new_start_alias = (
+                f"{current_alias}-{str(latest_index_datetimes.start_datetime)}"
             )
-            target_index = await self.index_operations.create_datetime_index(
-                self.client, collection_id, str(end_date + timedelta(days=1))
+            new_aliases.append(new_start_alias)
+            old_alias_names.append(current_alias)
+
+            product_start_datetime = parser.isoparse(
+                product_datetimes.start_datetime
+            ).date()
+            latest_start_datetime_in_index = parser.isoparse(
+                latest_index_datetimes.start_datetime
+            ).date()
+            product_end_date = parser.isoparse(product_datetimes.end_datetime).date()
+            latest_end_datetime_in_index = parser.isoparse(
+                latest_index_datetimes.end_datetime
+            ).date()
+
+            if product_start_datetime > latest_start_datetime_in_index:
+                end_datetime = latest_end_datetime_in_index
+            else:
+                end_datetime = max(product_end_date, latest_end_datetime_in_index)
+
+            new_end_alias = self.index_operations.create_alias_name(
+                collection_id, "end_datetime", str(end_datetime)
             )
+            new_aliases.append(new_end_alias)
+            old_alias_names.append(old_aliases["end_datetime"])
 
-        return target_index
+            await self.index_operations.change_alias_name(
+                self.client, current_alias, old_alias_names, new_aliases
+            )
+
+            if product_start_datetime > latest_start_datetime_in_index:
+                end_date = str(parser.isoparse(product_datetimes.end_datetime).date())
+            else:
+                end_date = str(
+                    parser.isoparse(latest_index_datetimes.start_datetime).date()
+                    + timedelta(days=1)
+                )
+
+            return await self.index_operations.create_datetime_index(
+                self.client,
+                collection_id,
+                start_datetime=str(latest_start_datetime_in_index + timedelta(days=1)),
+                datetime=None,
+                end_datetime=end_date,
+            )
+        else:
+            dt = extract_date(product_datetimes.datetime)
+
+            new_datetime_alias = (
+                f"{current_alias}-{str(latest_index_datetimes.datetime)}"
+            )
+            await self.index_operations.change_alias_name(
+                self.client, current_alias, [current_alias], [new_datetime_alias]
+            )
+            return await self.index_operations.create_datetime_index(
+                self.client,
+                collection_id,
+                start_datetime=None,
+                datetime=str(dt + timedelta(days=1)),
+                end_datetime=None,
+            )
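
A sketch of how the reworked validation is meant to be called, assuming range mode (use_datetime=False) and a hypothetical STAC item; the field names follow the properties read above:

    product = {
        "properties": {
            "start_datetime": "2024-03-01T00:00:00Z",
            "datetime": "2024-03-10T00:00:00Z",
            "end_datetime": "2024-03-31T00:00:00Z",
        }
    }

    dts = DatetimeIndexManager.validate_product_datetimes(product, use_datetime=False)
    dts.start_datetime  # '2024-03-01T00:00:00Z', a ProductDatetimes field
    # Swapping start and end would raise HTTPException 400:
    # "'start_datetime' must be <= 'end_datetime'"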
@@ -1,7 +1,7 @@
 """Base classes for index selection strategies."""
 
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 
 class BaseIndexSelector(ABC):
@@ -11,13 +11,17 @@ class BaseIndexSelector(ABC):
     async def select_indexes(
         self,
         collection_ids: Optional[List[str]],
-        datetime_search: Dict[str, Optional[str]],
+        datetime_search: str,
+        for_insertion: bool = False,
     ) -> str:
         """Select appropriate indexes asynchronously.
 
         Args:
             collection_ids (Optional[List[str]]): List of collection IDs to filter by.
-            datetime_search (Dict[str, Optional[str]]): Datetime search criteria.
+            datetime_search (str): Datetime search criteria.
+            for_insertion (bool): If True, selects indexes for inserting items into
+                the database. If False, selects indexes for searching/querying items.
+                Defaults to False (search mode).
 
         Returns:
             str: Comma-separated string of selected index names.
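
A sketch of the new call shape, using a hypothetical concrete selector (BaseIndexSelector is abstract) and an assumed interval-style datetime string; the exact format datetime_search now carries is not shown in this diff:

    indexes = await selector.select_indexes(
        collection_ids=["sentinel-2-l2a"],
        datetime_search="2024-01-01T00:00:00Z/2024-06-30T23:59:59Z",
        for_insertion=False,  # search mode; True selects the insertion target
    )
    # -> comma-separated index names, per the documented return value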