geoai-py 0.16.0__py2.py3-none-any.whl → 0.17.0__py2.py3-none-any.whl

This diff shows the content changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only. The single hunk below adds a new 907-line module of catalog-search tools.
@@ -0,0 +1,907 @@
+ """Tools for searching data catalogs."""
+
+ import io
+ import json
+ from typing import List, Optional, Union
+
+ import pandas as pd
+ import requests
+ from strands import tool
+
+ from .catalog_models import CatalogDatasetInfo, CatalogSearchResult, LocationInfo
+
+
+ class CatalogTools:
+     """Collection of tools for searching and interacting with data catalogs."""
+
+     # Common location cache to avoid repeated geocoding
+     _LOCATION_CACHE = {
+         "san francisco": {
+             "name": "San Francisco",
+             "bbox": [-122.5155, 37.7034, -122.3549, 37.8324],
+             "center": [-122.4194, 37.7749],
+         },
+         "new york": {
+             "name": "New York",
+             "bbox": [-74.0479, 40.6829, -73.9067, 40.8820],
+             "center": [-73.9352, 40.7306],
+         },
+         "new york city": {
+             "name": "New York City",
+             "bbox": [-74.0479, 40.6829, -73.9067, 40.8820],
+             "center": [-73.9352, 40.7306],
+         },
+         "paris": {
+             "name": "Paris",
+             "bbox": [2.2241, 48.8156, 2.4698, 48.9022],
+             "center": [2.3522, 48.8566],
+         },
+         "london": {
+             "name": "London",
+             "bbox": [-0.5103, 51.2868, 0.3340, 51.6919],
+             "center": [-0.1276, 51.5074],
+         },
+         "tokyo": {
+             "name": "Tokyo",
+             "bbox": [139.5694, 35.5232, 139.9182, 35.8173],
+             "center": [139.6917, 35.6895],
+         },
+         "los angeles": {
+             "name": "Los Angeles",
+             "bbox": [-118.6682, 33.7037, -118.1553, 34.3373],
+             "center": [-118.2437, 34.0522],
+         },
+         "chicago": {
+             "name": "Chicago",
+             "bbox": [-87.9401, 41.6445, -87.5241, 42.0230],
+             "center": [-87.6298, 41.8781],
+         },
+         "seattle": {
+             "name": "Seattle",
+             "bbox": [-122.4595, 47.4810, -122.2244, 47.7341],
+             "center": [-122.3321, 47.6062],
+         },
+         "california": {
+             "name": "California",
+             "bbox": [-124.4820, 32.5288, -114.1315, 42.0095],
+             "center": [-119.4179, 36.7783],
+         },
+         "las vegas": {
+             "name": "Las Vegas",
+             "bbox": [-115.3711, 35.9630, -114.9372, 36.2610],
+             "center": [-115.1400, 36.1177],
+         },
+     }
+
+     def __init__(
+         self,
+         catalog_url: Optional[str] = None,
+         catalog_df: Optional[pd.DataFrame] = None,
+     ) -> None:
+         """Initialize CatalogTools.
+
+         Args:
+             catalog_url: URL to a catalog file (TSV, CSV, or JSON). If None, must provide catalog_df.
+             catalog_df: Pre-loaded catalog as a pandas DataFrame. If None, must provide catalog_url.
+         """
+         self.catalog_url = catalog_url
+         self._catalog_df = catalog_df
+         self._cache = {}
+         # Runtime cache for geocoding results
+         self._geocode_cache = {}
+
+         # Load catalog if URL provided
+         if catalog_url and catalog_df is None:
+             self._catalog_df = self._load_catalog(catalog_url)
+
+     def _load_catalog(self, url: str) -> pd.DataFrame:
+         """Load catalog from a URL.
+
+         Args:
+             url: URL to catalog file (TSV, CSV, or JSON).
+
+         Returns:
+             DataFrame containing catalog data.
+         """
+         # Check cache first
+         if url in self._cache:
+             return self._cache[url]
+
+         try:
+             # Download the file
+             response = requests.get(url, timeout=30)
+             response.raise_for_status()
+
+             # Determine file type and parse
+             if url.endswith(".tsv"):
+                 df = pd.read_csv(io.StringIO(response.text), sep="\t")
+             elif url.endswith(".csv"):
+                 df = pd.read_csv(io.StringIO(response.text))
+             elif url.endswith(".json"):
+                 df = pd.read_json(io.StringIO(response.text))
+             else:
+                 # Try to auto-detect (default to TSV)
+                 df = pd.read_csv(io.StringIO(response.text), sep="\t")
+
+             # Cache the result
+             self._cache[url] = df
+             return df
+
+         except Exception as e:
+             raise ValueError(f"Failed to load catalog from {url}: {e}") from e
+
+     def _parse_bbox_string(self, bbox_str: str) -> Optional[List[float]]:
+         """Parse a bbox string to a list of floats.
+
+         Args:
+             bbox_str: Bounding box string in format "minLon, minLat, maxLon, maxLat".
+
+         Returns:
+             List of floats [minLon, minLat, maxLon, maxLat] or None if parsing fails.
+         """
+         try:
+             if pd.isna(bbox_str) or not bbox_str:
+                 return None
+             parts = str(bbox_str).split(",")
+             if len(parts) != 4:
+                 return None
+             bbox = [float(p.strip()) for p in parts]
+             return bbox
+         except (ValueError, AttributeError):
+             return None
+
+     def _bbox_intersects(self, bbox1: List[float], bbox2: List[float]) -> bool:
+         """Check if two bounding boxes intersect.
+
+         Args:
+             bbox1: First bbox as [minLon, minLat, maxLon, maxLat].
+             bbox2: Second bbox as [minLon, minLat, maxLon, maxLat].
+
+         Returns:
+             True if bboxes intersect, False otherwise.
+         """
+         # Check if boxes do NOT intersect, then negate:
+         # bbox1 is completely to the left, right, below, or above bbox2
+         return not (
+             bbox1[2] < bbox2[0]  # bbox1 maxLon < bbox2 minLon (left of)
+             or bbox1[0] > bbox2[2]  # bbox1 minLon > bbox2 maxLon (right of)
+             or bbox1[3] < bbox2[1]  # bbox1 maxLat < bbox2 minLat (below)
+             or bbox1[1] > bbox2[3]  # bbox1 minLat > bbox2 maxLat (above)
+         )
+
+     def _bbox_contains_point(self, bbox: List[float], lon: float, lat: float) -> bool:
+         """Check if a bounding box contains a point.
+
+         Args:
+             bbox: Bounding box as [minLon, minLat, maxLon, maxLat].
+             lon: Longitude of the point.
+             lat: Latitude of the point.
+
+         Returns:
+             True if bbox contains the point, False otherwise.
+         """
+         return bbox[0] <= lon <= bbox[2] and bbox[1] <= lat <= bbox[3]
+
+     def _search_dataframe(
+         self,
+         df: pd.DataFrame,
+         keywords: Optional[str] = None,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None,
+         max_results: int = 10,
+     ) -> pd.DataFrame:
+         """Search dataframe with filters.
+
+         Args:
+             df: DataFrame to search.
+             keywords: Keywords to search for (searches in id, title, keywords, description).
+             dataset_type: Filter by dataset type.
+             provider: Filter by provider.
+             start_date: Filter datasets that have data after this date (YYYY-MM-DD).
+             end_date: Filter datasets that have data before this date (YYYY-MM-DD).
+             max_results: Maximum number of results to return.
+
+         Returns:
+             Filtered DataFrame.
+         """
+         result_df = df.copy()
+
+         # Apply keyword search
+         if keywords:
+             keyword_lower = keywords.lower()
+             mask = pd.Series([False] * len(result_df), index=result_df.index)
+
+             # Search in id
+             if "id" in result_df.columns:
+                 mask |= (
+                     result_df["id"]
+                     .astype(str)
+                     .str.lower()
+                     .str.contains(keyword_lower, na=False)
+                 )
+
+             # Search in title
+             if "title" in result_df.columns:
+                 mask |= (
+                     result_df["title"]
+                     .astype(str)
+                     .str.lower()
+                     .str.contains(keyword_lower, na=False)
+                 )
+
+             # Search in keywords
+             if "keywords" in result_df.columns:
+                 mask |= (
+                     result_df["keywords"]
+                     .astype(str)
+                     .str.lower()
+                     .str.contains(keyword_lower, na=False)
+                 )
+
+             # Search in description
+             if "description" in result_df.columns:
+                 mask |= (
+                     result_df["description"]
+                     .astype(str)
+                     .str.lower()
+                     .str.contains(keyword_lower, na=False)
+                 )
+
+             result_df = result_df[mask]
+
+         # Filter by type
+         if dataset_type and "type" in result_df.columns:
+             result_df = result_df[
+                 result_df["type"]
+                 .astype(str)
+                 .str.lower()
+                 .str.contains(dataset_type.lower(), na=False)
+             ]
+
+         # Filter by provider
+         if provider and "provider" in result_df.columns:
+             result_df = result_df[
+                 result_df["provider"]
+                 .astype(str)
+                 .str.lower()
+                 .str.contains(provider.lower(), na=False)
+             ]
+
+         # Filter by temporal range
+         if start_date and "end_date" in result_df.columns:
+             # Keep datasets where end_date >= start_date (dataset has data after start_date)
+             result_df = result_df[
+                 (result_df["end_date"].notna()) & (result_df["end_date"] >= start_date)
+             ]
+
+         if end_date and "start_date" in result_df.columns:
+             # Keep datasets where start_date <= end_date (dataset has data before end_date)
+             result_df = result_df[
+                 (result_df["start_date"].notna())
+                 & (result_df["start_date"] <= end_date)
+             ]
+
+         # Limit results
+         if len(result_df) > max_results:
+             result_df = result_df.head(max_results)
+
+         return result_df
+
+     @tool(
+         description="Search for datasets in the catalog using keywords, filters, and date range"
+     )
+     def search_datasets(
+         self,
+         keywords: Optional[str] = None,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None,
+         max_results: Optional[Union[str, int]] = 10,
+     ) -> str:
+         """Search for datasets in the catalog.
+
+         Args:
+             keywords: Keywords to search for. Searches in id, title, keywords, and description fields.
+                 Example: "landcover" will find datasets with "landcover" in any searchable field.
+             dataset_type: Filter by dataset type (e.g., "image", "image_collection", "table").
+                 Example: "image_collection" to find only image collections.
+             provider: Filter by data provider name.
+                 Example: "NASA" to find only NASA datasets.
+             start_date: Filter datasets that have data after this date in YYYY-MM-DD format.
+                 Example: "2020-01-01" to find datasets with data from 2020 onwards.
+             end_date: Filter datasets that have data before this date in YYYY-MM-DD format.
+                 Example: "2023-12-31" to find datasets with data up to 2023.
+             max_results: Maximum number of results to return (default: 10).
+
+         Returns:
+             JSON string containing search results with dataset information.
+         """
+         try:
+             if self._catalog_df is None:
+                 return json.dumps(
+                     {
+                         "error": "No catalog loaded. Please provide catalog_url or catalog_df."
+                     }
+                 )
+
+             # Parse max_results if it's a string
+             if isinstance(max_results, str):
+                 try:
+                     max_results = int(max_results)
+                 except ValueError:
+                     max_results = 10
+
+             # Search the dataframe
+             result_df = self._search_dataframe(
+                 self._catalog_df,
+                 keywords=keywords,
+                 dataset_type=dataset_type,
+                 provider=provider,
+                 start_date=start_date,
+                 end_date=end_date,
+                 max_results=max_results,
+             )
+
+             # Convert to models
+             dataset_models = []
+             for _, row in result_df.iterrows():
+                 dataset_models.append(
+                     CatalogDatasetInfo(
+                         id=str(row.get("id", "")),
+                         title=str(row.get("title", "")),
+                         type=(
+                             str(row.get("type", ""))
+                             if pd.notna(row.get("type"))
+                             else None
+                         ),
+                         provider=(
+                             str(row.get("provider", ""))
+                             if pd.notna(row.get("provider"))
+                             else None
+                         ),
+                         description=(
+                             str(row.get("description", ""))
+                             if pd.notna(row.get("description"))
+                             else None
+                         ),
+                         keywords=(
+                             str(row.get("keywords", ""))
+                             if pd.notna(row.get("keywords"))
+                             else None
+                         ),
+                         snippet=(
+                             str(row.get("snippet", ""))
+                             if pd.notna(row.get("snippet"))
+                             else None
+                         ),
+                         start_date=(
+                             str(row.get("start_date", ""))
+                             if pd.notna(row.get("start_date"))
+                             else None
+                         ),
+                         end_date=(
+                             str(row.get("end_date", ""))
+                             if pd.notna(row.get("end_date"))
+                             else None
+                         ),
+                         bbox=(
+                             str(row.get("bbox", ""))
+                             if pd.notna(row.get("bbox"))
+                             else None
+                         ),
+                         license=(
+                             str(row.get("license", ""))
+                             if pd.notna(row.get("license"))
+                             else None
+                         ),
+                         url=(
+                             str(row.get("url", ""))
+                             if pd.notna(row.get("url"))
+                             else None
+                         ),
+                         catalog=(
+                             str(row.get("catalog", ""))
+                             if pd.notna(row.get("catalog"))
+                             else None
+                         ),
+                         deprecated=(
+                             str(row.get("deprecated", ""))
+                             if pd.notna(row.get("deprecated"))
+                             else None
+                         ),
+                     )
+                 )
+
+             # Create search result
+             filters = {}
+             if keywords:
+                 filters["keywords"] = keywords
+             if dataset_type:
+                 filters["dataset_type"] = dataset_type
+             if provider:
+                 filters["provider"] = provider
+
+             query_parts = []
+             if keywords:
+                 query_parts.append(f"keywords: {keywords}")
+             if dataset_type:
+                 query_parts.append(f"type: {dataset_type}")
+             if provider:
+                 query_parts.append(f"provider: {provider}")
+             query_str = ", ".join(query_parts) if query_parts else "all datasets"
+
+             result = CatalogSearchResult(
+                 query=query_str,
+                 dataset_count=len(dataset_models),
+                 datasets=dataset_models,
+                 filters=filters if filters else None,
+             )
+
+             return json.dumps(result.model_dump(), indent=2)
+
+         except Exception as e:
+             return json.dumps({"error": str(e)})
+
+ @tool(description="Get detailed information about a specific dataset")
449
+ def get_dataset_info(
450
+ self,
451
+ dataset_id: str,
452
+ ) -> str:
453
+ """Get detailed information about a specific dataset.
454
+
455
+ Args:
456
+ dataset_id: The dataset ID to retrieve.
457
+
458
+ Returns:
459
+ JSON string with detailed dataset information.
460
+ """
461
+ try:
462
+ if self._catalog_df is None:
463
+ return json.dumps({"error": "No catalog loaded."})
464
+
465
+ # Find the dataset
466
+ if "id" not in self._catalog_df.columns:
467
+ return json.dumps({"error": "Catalog does not have 'id' column."})
468
+
469
+ result_df = self._catalog_df[self._catalog_df["id"] == dataset_id]
470
+
471
+ if len(result_df) == 0:
472
+ return json.dumps(
473
+ {"error": f"Dataset '{dataset_id}' not found in catalog."}
474
+ )
475
+
476
+ row = result_df.iloc[0]
477
+
478
+ # Convert to model
479
+ dataset = CatalogDatasetInfo(
480
+ id=str(row.get("id", "")),
481
+ title=str(row.get("title", "")),
482
+ type=str(row.get("type", "")) if pd.notna(row.get("type")) else None,
483
+ provider=(
484
+ str(row.get("provider", ""))
485
+ if pd.notna(row.get("provider"))
486
+ else None
487
+ ),
488
+ description=(
489
+ str(row.get("description", ""))
490
+ if pd.notna(row.get("description"))
491
+ else None
492
+ ),
493
+ keywords=(
494
+ str(row.get("keywords", ""))
495
+ if pd.notna(row.get("keywords"))
496
+ else None
497
+ ),
498
+ snippet=(
499
+ str(row.get("snippet", ""))
500
+ if pd.notna(row.get("snippet"))
501
+ else None
502
+ ),
503
+ start_date=(
504
+ str(row.get("start_date", ""))
505
+ if pd.notna(row.get("start_date"))
506
+ else None
507
+ ),
508
+ end_date=(
509
+ str(row.get("end_date", ""))
510
+ if pd.notna(row.get("end_date"))
511
+ else None
512
+ ),
513
+ bbox=str(row.get("bbox", "")) if pd.notna(row.get("bbox")) else None,
514
+ license=(
515
+ str(row.get("license", ""))
516
+ if pd.notna(row.get("license"))
517
+ else None
518
+ ),
519
+ url=str(row.get("url", "")) if pd.notna(row.get("url")) else None,
520
+ catalog=(
521
+ str(row.get("catalog", ""))
522
+ if pd.notna(row.get("catalog"))
523
+ else None
524
+ ),
525
+ deprecated=(
526
+ str(row.get("deprecated", ""))
527
+ if pd.notna(row.get("deprecated"))
528
+ else None
529
+ ),
530
+ )
531
+
532
+ return json.dumps(dataset.model_dump(), indent=2)
533
+
534
+ except Exception as e:
535
+ return json.dumps({"error": str(e)})
536
+
537
+ @tool(description="List unique dataset types available in the catalog")
538
+ def list_dataset_types(self) -> str:
539
+ """List unique dataset types available in the catalog.
540
+
541
+ Returns:
542
+ JSON string with list of dataset types.
543
+ """
544
+ try:
545
+ if self._catalog_df is None:
546
+ return json.dumps({"error": "No catalog loaded."})
547
+
548
+ if "type" not in self._catalog_df.columns:
549
+ return json.dumps({"error": "Catalog does not have 'type' column."})
550
+
551
+ types = self._catalog_df["type"].dropna().unique().tolist()
552
+ types.sort()
553
+
554
+ result = {
555
+ "count": len(types),
556
+ "types": types,
557
+ }
558
+
559
+ return json.dumps(result, indent=2)
560
+
561
+ except Exception as e:
562
+ return json.dumps({"error": str(e)})
563
+
564
+ @tool(description="List unique data providers in the catalog")
565
+ def list_providers(self) -> str:
566
+ """List unique data providers in the catalog.
567
+
568
+ Returns:
569
+ JSON string with list of providers.
570
+ """
571
+ try:
572
+ if self._catalog_df is None:
573
+ return json.dumps({"error": "No catalog loaded."})
574
+
575
+ if "provider" not in self._catalog_df.columns:
576
+ return json.dumps({"error": "Catalog does not have 'provider' column."})
577
+
578
+ providers = self._catalog_df["provider"].dropna().unique().tolist()
579
+ providers.sort()
580
+
581
+ result = {
582
+ "count": len(providers),
583
+ "providers": providers,
584
+ }
585
+
586
+ return json.dumps(result, indent=2)
587
+
588
+ except Exception as e:
589
+ return json.dumps({"error": str(e)})
590
+
591
+ @tool(description="Get catalog statistics and summary information")
592
+ def get_catalog_stats(self) -> str:
593
+ """Get statistics about the catalog.
594
+
595
+ Returns:
596
+ JSON string with catalog statistics.
597
+ """
598
+ try:
599
+ if self._catalog_df is None:
600
+ return json.dumps({"error": "No catalog loaded."})
601
+
602
+ stats = {
603
+ "total_datasets": len(self._catalog_df),
604
+ "columns": list(self._catalog_df.columns),
605
+ }
606
+
607
+ # Add type counts if available
608
+ if "type" in self._catalog_df.columns:
609
+ type_counts = self._catalog_df["type"].value_counts().to_dict()
610
+ stats["dataset_types"] = type_counts
611
+
612
+ # Add provider counts if available
613
+ if "provider" in self._catalog_df.columns:
614
+ # Get top 10 providers
615
+ provider_counts = (
616
+ self._catalog_df["provider"].value_counts().head(10).to_dict()
617
+ )
618
+ stats["top_providers"] = provider_counts
619
+
620
+ return json.dumps(stats, indent=2)
621
+
622
+ except Exception as e:
623
+ return json.dumps({"error": str(e)})
624
+
625
+ @tool(description="Parse a location name and return its bounding box coordinates")
626
+ def geocode_location(self, location_name: str) -> str:
627
+ """Convert a location name to geographic coordinates and bounding box.
628
+
629
+ This tool uses a geocoding service to find the coordinates for a given location name.
630
+
631
+ Args:
632
+ location_name: Name of the location (e.g., "San Francisco", "New York", "Paris, France", "California").
633
+
634
+ Returns:
635
+ JSON string with location info including bounding box and center coordinates.
636
+ """
637
+ try:
638
+ # Check static cache first (common locations)
639
+ location_key = location_name.lower().strip()
640
+ if location_key in self._LOCATION_CACHE:
641
+ cached = self._LOCATION_CACHE[location_key]
642
+ location_info = LocationInfo(
643
+ name=cached["name"],
644
+ bbox=cached["bbox"],
645
+ center=cached["center"],
646
+ )
647
+ return json.dumps(location_info.model_dump(), indent=2)
648
+
649
+ # Check runtime cache
650
+ if location_key in self._geocode_cache:
651
+ return self._geocode_cache[location_key]
652
+
653
+ # Geocode using Nominatim
654
+ url = "https://nominatim.openstreetmap.org/search"
655
+ params = {
656
+ "q": location_name,
657
+ "format": "json",
658
+ "limit": 1,
659
+ }
660
+ headers = {"User-Agent": "GeoAI-Catalog-Agent/1.0"}
661
+
662
+ response = requests.get(url, params=params, headers=headers, timeout=10)
663
+ response.raise_for_status()
664
+
665
+ results = response.json()
666
+
667
+ if not results:
668
+ error_result = json.dumps(
669
+ {"error": f"Location '{location_name}' not found"}
670
+ )
671
+ self._geocode_cache[location_key] = error_result
672
+ return error_result
673
+
674
+ result = results[0]
675
+ bbox = [
676
+ float(result["boundingbox"][2]), # west
677
+ float(result["boundingbox"][0]), # south
678
+ float(result["boundingbox"][3]), # east
679
+ float(result["boundingbox"][1]), # north
680
+ ]
681
+ center = [float(result["lon"]), float(result["lat"])]
682
+
683
+ location_info = LocationInfo(
684
+ name=result.get("display_name", location_name),
685
+ bbox=bbox,
686
+ center=center,
687
+ )
688
+
689
+ result_json = json.dumps(location_info.model_dump(), indent=2)
690
+ # Cache the result
691
+ self._geocode_cache[location_key] = result_json
692
+
693
+ return result_json
694
+
695
+ except Exception as e:
696
+ return json.dumps({"error": f"Geocoding error: {str(e)}"})
697
+
698
+     @tool(
+         description="Search for datasets by geographic region, keywords, and date range"
+     )
+     def search_by_region(
+         self,
+         bbox: Optional[Union[str, List[float]]] = None,
+         location: Optional[str] = None,
+         keywords: Optional[str] = None,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None,
+         max_results: Optional[Union[str, int]] = 10,
+     ) -> str:
+         """Search for datasets that cover a specific geographic region.
+
+         Args:
+             bbox: Bounding box as [west, south, east, north] or comma-separated string.
+                 Example: [-122.5, 37.5, -122.0, 38.0] for San Francisco Bay Area.
+             location: Location name to geocode into a bounding box.
+                 Example: "California", "San Francisco", "New York City".
+             keywords: Additional keywords to search for in dataset metadata.
+             dataset_type: Filter by dataset type (e.g., "image", "image_collection").
+             provider: Filter by data provider name.
+             start_date: Filter datasets that have data after this date in YYYY-MM-DD format.
+                 Example: "2020-01-01" to find datasets with data from 2020 onwards.
+             end_date: Filter datasets that have data before this date in YYYY-MM-DD format.
+                 Example: "2023-12-31" to find datasets with data up to 2023.
+             max_results: Maximum number of results to return (default: 10).
+
+         Returns:
+             JSON string containing search results with datasets that intersect the search region.
+         """
+         try:
+             if self._catalog_df is None:
+                 return json.dumps({"error": "No catalog loaded."})
+
+             # Parse max_results if it's a string
+             if isinstance(max_results, str):
+                 try:
+                     max_results = int(max_results)
+                 except ValueError:
+                     max_results = 10
+
+             # Determine search bbox
+             search_bbox = None
+
+             if bbox is not None:
+                 # Parse bbox if it's a string
+                 if isinstance(bbox, str):
+                     search_bbox = self._parse_bbox_string(bbox)
+                     if search_bbox is None:
+                         return json.dumps({"error": f"Invalid bbox format: {bbox}"})
+                 else:
+                     search_bbox = bbox
+
+             elif location is not None:
+                 # Geocode location to bbox
+                 geocode_result = json.loads(self.geocode_location(location))
+                 if "error" in geocode_result:
+                     return json.dumps(geocode_result)
+                 search_bbox = geocode_result["bbox"]
+
+             if search_bbox is None:
+                 return json.dumps(
+                     {"error": "Either bbox or location must be provided."}
+                 )
+
+             # Validate search bbox
+             if len(search_bbox) != 4:
+                 return json.dumps(
+                     {
+                         "error": "Bbox must have 4 values [minLon, minLat, maxLon, maxLat]"
+                     }
+                 )
+
+             # Filter by spatial intersection
+             if "bbox" not in self._catalog_df.columns:
+                 return json.dumps(
+                     {
+                         "error": "Catalog does not have 'bbox' column. Try using a JSON format catalog."
+                     }
+                 )
+
+             # Create mask for spatial intersection
+             spatial_mask = pd.Series(
+                 [False] * len(self._catalog_df), index=self._catalog_df.index
+             )
+
+             for idx, row in self._catalog_df.iterrows():
+                 dataset_bbox = self._parse_bbox_string(row.get("bbox"))
+                 if dataset_bbox and self._bbox_intersects(dataset_bbox, search_bbox):
+                     spatial_mask[idx] = True
+
+             result_df = self._catalog_df[spatial_mask]
+
+             # Apply additional filters using existing _search_dataframe logic
+             result_df = self._search_dataframe(
+                 result_df,
+                 keywords=keywords,
+                 dataset_type=dataset_type,
+                 provider=provider,
+                 start_date=start_date,
+                 end_date=end_date,
+                 max_results=max_results,
+             )
+
+             # Convert to models
+             dataset_models = []
+             for _, row in result_df.iterrows():
+                 dataset_models.append(
+                     CatalogDatasetInfo(
+                         id=str(row.get("id", "")),
+                         title=str(row.get("title", "")),
+                         type=(
+                             str(row.get("type", ""))
+                             if pd.notna(row.get("type"))
+                             else None
+                         ),
+                         provider=(
+                             str(row.get("provider", ""))
+                             if pd.notna(row.get("provider"))
+                             else None
+                         ),
+                         description=(
+                             str(row.get("description", ""))
+                             if pd.notna(row.get("description"))
+                             else None
+                         ),
+                         keywords=(
+                             str(row.get("keywords", ""))
+                             if pd.notna(row.get("keywords"))
+                             else None
+                         ),
+                         snippet=(
+                             str(row.get("snippet", ""))
+                             if pd.notna(row.get("snippet"))
+                             else None
+                         ),
+                         start_date=(
+                             str(row.get("start_date", ""))
+                             if pd.notna(row.get("start_date"))
+                             else None
+                         ),
+                         end_date=(
+                             str(row.get("end_date", ""))
+                             if pd.notna(row.get("end_date"))
+                             else None
+                         ),
+                         bbox=(
+                             str(row.get("bbox", ""))
+                             if pd.notna(row.get("bbox"))
+                             else None
+                         ),
+                         license=(
+                             str(row.get("license", ""))
+                             if pd.notna(row.get("license"))
+                             else None
+                         ),
+                         url=(
+                             str(row.get("url", ""))
+                             if pd.notna(row.get("url"))
+                             else None
+                         ),
+                         catalog=(
+                             str(row.get("catalog", ""))
+                             if pd.notna(row.get("catalog"))
+                             else None
+                         ),
+                         deprecated=(
+                             str(row.get("deprecated", ""))
+                             if pd.notna(row.get("deprecated"))
+                             else None
+                         ),
+                     )
+                 )
+
+             # Create search result
+             filters = {"search_bbox": search_bbox}
+             if keywords:
+                 filters["keywords"] = keywords
+             if dataset_type:
+                 filters["dataset_type"] = dataset_type
+             if provider:
+                 filters["provider"] = provider
+
+             query_parts = []
+             if location:
+                 query_parts.append(f"location: {location}")
+             elif bbox:
+                 query_parts.append(f"bbox: {search_bbox}")
+             if keywords:
+                 query_parts.append(f"keywords: {keywords}")
+             if dataset_type:
+                 query_parts.append(f"type: {dataset_type}")
+             if provider:
+                 query_parts.append(f"provider: {provider}")
+             query_str = ", ".join(query_parts) if query_parts else "spatial search"
+
+             result = CatalogSearchResult(
+                 query=query_str,
+                 dataset_count=len(dataset_models),
+                 datasets=dataset_models,
+                 filters=filters,
+             )
+
+             return json.dumps(result.model_dump(), indent=2)
+
+         except Exception as e:
+             return json.dumps({"error": str(e)})