geoai-py 0.16.0__py2.py3-none-any.whl → 0.17.0__py2.py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- geoai/__init__.py +1 -1
- geoai/agents/__init__.py +4 -0
- geoai/agents/catalog_models.py +51 -0
- geoai/agents/catalog_tools.py +907 -0
- geoai/agents/geo_agents.py +925 -41
- geoai/agents/stac_models.py +67 -0
- geoai/agents/stac_tools.py +435 -0
- geoai/download.py +5 -1
- {geoai_py-0.16.0.dist-info → geoai_py-0.17.0.dist-info}/METADATA +1 -1
- {geoai_py-0.16.0.dist-info → geoai_py-0.17.0.dist-info}/RECORD +14 -10
- {geoai_py-0.16.0.dist-info → geoai_py-0.17.0.dist-info}/WHEEL +0 -0
- {geoai_py-0.16.0.dist-info → geoai_py-0.17.0.dist-info}/entry_points.txt +0 -0
- {geoai_py-0.16.0.dist-info → geoai_py-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {geoai_py-0.16.0.dist-info → geoai_py-0.17.0.dist-info}/top_level.txt +0 -0
geoai/agents/catalog_tools.py (added)
@@ -0,0 +1,907 @@
+"""Tools for searching data catalogs."""
+
+import io
+import json
+from typing import Any, Dict, List, Optional, Union
+
+import pandas as pd
+import requests
+from strands import tool
+
+from .catalog_models import CatalogDatasetInfo, CatalogSearchResult, LocationInfo
+
+
+class CatalogTools:
+    """Collection of tools for searching and interacting with data catalogs."""
+
+    # Common location cache to avoid repeated geocoding
+    _LOCATION_CACHE = {
+        "san francisco": {
+            "name": "San Francisco",
+            "bbox": [-122.5155, 37.7034, -122.3549, 37.8324],
+            "center": [-122.4194, 37.7749],
+        },
+        "new york": {
+            "name": "New York",
+            "bbox": [-74.0479, 40.6829, -73.9067, 40.8820],
+            "center": [-73.9352, 40.7306],
+        },
+        "new york city": {
+            "name": "New York City",
+            "bbox": [-74.0479, 40.6829, -73.9067, 40.8820],
+            "center": [-73.9352, 40.7306],
+        },
+        "paris": {
+            "name": "Paris",
+            "bbox": [2.2241, 48.8156, 2.4698, 48.9022],
+            "center": [2.3522, 48.8566],
+        },
+        "london": {
+            "name": "London",
+            "bbox": [-0.5103, 51.2868, 0.3340, 51.6919],
+            "center": [-0.1276, 51.5074],
+        },
+        "tokyo": {
+            "name": "Tokyo",
+            "bbox": [139.5694, 35.5232, 139.9182, 35.8173],
+            "center": [139.6917, 35.6895],
+        },
+        "los angeles": {
+            "name": "Los Angeles",
+            "bbox": [-118.6682, 33.7037, -118.1553, 34.3373],
+            "center": [-118.2437, 34.0522],
+        },
+        "chicago": {
+            "name": "Chicago",
+            "bbox": [-87.9401, 41.6445, -87.5241, 42.0230],
+            "center": [-87.6298, 41.8781],
+        },
+        "seattle": {
+            "name": "Seattle",
+            "bbox": [-122.4595, 47.4810, -122.2244, 47.7341],
+            "center": [-122.3321, 47.6062],
+        },
+        "california": {
+            "name": "California",
+            "bbox": [-124.4820, 32.5288, -114.1315, 42.0095],
+            "center": [-119.4179, 36.7783],
+        },
+        "las vegas": {
+            "name": "Las Vegas",
+            "bbox": [-115.3711, 35.9630, -114.9372, 36.2610],
+            "center": [-115.1400, 36.1177],
+        },
+    }
+
+    def __init__(
+        self,
+        catalog_url: Optional[str] = None,
+        catalog_df: Optional[pd.DataFrame] = None,
+    ) -> None:
+        """Initialize CatalogTools.
+
+        Args:
+            catalog_url: URL to a catalog file (TSV, CSV, or JSON). If None, must provide catalog_df.
+            catalog_df: Pre-loaded catalog as a pandas DataFrame. If None, must provide catalog_url.
+        """
+        self.catalog_url = catalog_url
+        self._catalog_df = catalog_df
+        self._cache = {}
+        # Runtime cache for geocoding results
+        self._geocode_cache = {}
+
+        # Load catalog if URL provided
+        if catalog_url and catalog_df is None:
+            self._catalog_df = self._load_catalog(catalog_url)
+
+    def _load_catalog(self, url: str) -> pd.DataFrame:
+        """Load catalog from a URL.
+
+        Args:
+            url: URL to catalog file (TSV, CSV, or JSON).
+
+        Returns:
+            DataFrame containing catalog data.
+        """
+        # Check cache first
+        if url in self._cache:
+            return self._cache[url]
+
+        try:
+            # Download the file
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+
+            # Determine file type and parse
+            if url.endswith(".tsv"):
+                df = pd.read_csv(io.StringIO(response.text), sep="\t")
+            elif url.endswith(".csv"):
+                df = pd.read_csv(io.StringIO(response.text))
+            elif url.endswith(".json"):
+                df = pd.read_json(io.StringIO(response.text))
+            else:
+                # Try to auto-detect (default to TSV)
+                df = pd.read_csv(io.StringIO(response.text), sep="\t")
+
+            # Cache the result
+            self._cache[url] = df
+            return df
+
+        except Exception as e:
+            raise ValueError(f"Failed to load catalog from {url}: {str(e)}")
+
+    def _parse_bbox_string(self, bbox_str: str) -> Optional[List[float]]:
+        """Parse a bbox string to a list of floats.
+
+        Args:
+            bbox_str: Bounding box string in format "minLon, minLat, maxLon, maxLat".
+
+        Returns:
+            List of floats [minLon, minLat, maxLon, maxLat] or None if parsing fails.
+        """
+        try:
+            if pd.isna(bbox_str) or not bbox_str:
+                return None
+            parts = str(bbox_str).split(",")
+            if len(parts) != 4:
+                return None
+            bbox = [float(p.strip()) for p in parts]
+            return bbox
+        except (ValueError, AttributeError):
+            return None
+
+    def _bbox_intersects(self, bbox1: List[float], bbox2: List[float]) -> bool:
+        """Check if two bounding boxes intersect.
+
+        Args:
+            bbox1: First bbox as [minLon, minLat, maxLon, maxLat].
+            bbox2: Second bbox as [minLon, minLat, maxLon, maxLat].
+
+        Returns:
+            True if bboxes intersect, False otherwise.
+        """
+        # Check if boxes do NOT intersect, then negate
+        # bbox1 is completely to the left, right, below, or above bbox2
+        return not (
+            bbox1[2] < bbox2[0]  # bbox1 maxLon < bbox2 minLon (left of)
+            or bbox1[0] > bbox2[2]  # bbox1 minLon > bbox2 maxLon (right of)
+            or bbox1[3] < bbox2[1]  # bbox1 maxLat < bbox2 minLat (below)
+            or bbox1[1] > bbox2[3]  # bbox1 minLat > bbox2 maxLat (above)
+        )
+
+    def _bbox_contains_point(self, bbox: List[float], lon: float, lat: float) -> bool:
+        """Check if a bounding box contains a point.
+
+        Args:
+            bbox: Bounding box as [minLon, minLat, maxLon, maxLat].
+            lon: Longitude of the point.
+            lat: Latitude of the point.
+
+        Returns:
+            True if bbox contains the point, False otherwise.
+        """
+        return bbox[0] <= lon <= bbox[2] and bbox[1] <= lat <= bbox[3]
+
+    def _search_dataframe(
+        self,
+        df: pd.DataFrame,
+        keywords: Optional[str] = None,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        max_results: int = 10,
+    ) -> pd.DataFrame:
+        """Search dataframe with filters.
+
+        Args:
+            df: DataFrame to search.
+            keywords: Keywords to search for (searches in id, title, keywords, description).
+            dataset_type: Filter by dataset type.
+            provider: Filter by provider.
+            start_date: Filter datasets that have data after this date (YYYY-MM-DD).
+            end_date: Filter datasets that have data before this date (YYYY-MM-DD).
+            max_results: Maximum number of results to return.
+
+        Returns:
+            Filtered DataFrame.
+        """
+        result_df = df.copy()
+
+        # Apply keyword search
+        if keywords:
+            keyword_lower = keywords.lower()
+            mask = pd.Series([False] * len(result_df), index=result_df.index)
+
+            # Search in id
+            if "id" in result_df.columns:
+                mask |= (
+                    result_df["id"]
+                    .astype(str)
+                    .str.lower()
+                    .str.contains(keyword_lower, na=False)
+                )
+
+            # Search in title
+            if "title" in result_df.columns:
+                mask |= (
+                    result_df["title"]
+                    .astype(str)
+                    .str.lower()
+                    .str.contains(keyword_lower, na=False)
+                )
+
+            # Search in keywords
+            if "keywords" in result_df.columns:
+                mask |= (
+                    result_df["keywords"]
+                    .astype(str)
+                    .str.lower()
+                    .str.contains(keyword_lower, na=False)
+                )
+
+            # Search in description
+            if "description" in result_df.columns:
+                mask |= (
+                    result_df["description"]
+                    .astype(str)
+                    .str.lower()
+                    .str.contains(keyword_lower, na=False)
+                )
+
+            result_df = result_df[mask]
+
+        # Filter by type
+        if dataset_type and "type" in result_df.columns:
+            result_df = result_df[
+                result_df["type"]
+                .astype(str)
+                .str.lower()
+                .str.contains(dataset_type.lower(), na=False)
+            ]
+
+        # Filter by provider
+        if provider and "provider" in result_df.columns:
+            result_df = result_df[
+                result_df["provider"]
+                .astype(str)
+                .str.lower()
+                .str.contains(provider.lower(), na=False)
+            ]
+
+        # Filter by temporal range
+        if start_date and "end_date" in result_df.columns:
+            # Keep datasets where end_date >= start_date (dataset has data after start_date)
+            result_df = result_df[
+                (result_df["end_date"].notna()) & (result_df["end_date"] >= start_date)
+            ]
+
+        if end_date and "start_date" in result_df.columns:
+            # Keep datasets where start_date <= end_date (dataset has data before end_date)
+            result_df = result_df[
+                (result_df["start_date"].notna())
+                & (result_df["start_date"] <= end_date)
+            ]
+
+        # Limit results
+        if len(result_df) > max_results:
+            result_df = result_df.head(max_results)
+
+        return result_df
+
+    @tool(
+        description="Search for datasets in the catalog using keywords, filters, and date range"
+    )
+    def search_datasets(
+        self,
+        keywords: Optional[str] = None,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        max_results: Optional[Union[str, int]] = 10,
+    ) -> str:
+        """Search for datasets in the catalog.
+
+        Args:
+            keywords: Keywords to search for. Searches in id, title, keywords, and description fields.
+                Example: "landcover" will find datasets with "landcover" in any searchable field.
+            dataset_type: Filter by dataset type (e.g., "image", "image_collection", "table").
+                Example: "image_collection" to find only image collections.
+            provider: Filter by data provider name.
+                Example: "NASA" to find only NASA datasets.
+            start_date: Filter datasets that have data after this date in YYYY-MM-DD format.
+                Example: "2020-01-01" to find datasets with data from 2020 onwards.
+            end_date: Filter datasets that have data before this date in YYYY-MM-DD format.
+                Example: "2023-12-31" to find datasets with data up to 2023.
+            max_results: Maximum number of results to return (default: 10).
+
+        Returns:
+            JSON string containing search results with dataset information.
+        """
+        try:
+            if self._catalog_df is None:
+                return json.dumps(
+                    {
+                        "error": "No catalog loaded. Please provide catalog_url or catalog_df."
+                    }
+                )
+
+            # Parse max_results if it's a string
+            if isinstance(max_results, str):
+                try:
+                    max_results = int(max_results)
+                except ValueError:
+                    max_results = 10
+
+            # Search the dataframe
+            result_df = self._search_dataframe(
+                self._catalog_df,
+                keywords=keywords,
+                dataset_type=dataset_type,
+                provider=provider,
+                start_date=start_date,
+                end_date=end_date,
+                max_results=max_results,
+            )
+
+            # Convert to models
+            dataset_models = []
+            for _, row in result_df.iterrows():
+                dataset_models.append(
+                    CatalogDatasetInfo(
+                        id=str(row.get("id", "")),
+                        title=str(row.get("title", "")),
+                        type=(
+                            str(row.get("type", ""))
+                            if pd.notna(row.get("type"))
+                            else None
+                        ),
+                        provider=(
+                            str(row.get("provider", ""))
+                            if pd.notna(row.get("provider"))
+                            else None
+                        ),
+                        description=(
+                            str(row.get("description", ""))
+                            if pd.notna(row.get("description"))
+                            else None
+                        ),
+                        keywords=(
+                            str(row.get("keywords", ""))
+                            if pd.notna(row.get("keywords"))
+                            else None
+                        ),
+                        snippet=(
+                            str(row.get("snippet", ""))
+                            if pd.notna(row.get("snippet"))
+                            else None
+                        ),
+                        start_date=(
+                            str(row.get("start_date", ""))
+                            if pd.notna(row.get("start_date"))
+                            else None
+                        ),
+                        end_date=(
+                            str(row.get("end_date", ""))
+                            if pd.notna(row.get("end_date"))
+                            else None
+                        ),
+                        bbox=(
+                            str(row.get("bbox", ""))
+                            if pd.notna(row.get("bbox"))
+                            else None
+                        ),
+                        license=(
+                            str(row.get("license", ""))
+                            if pd.notna(row.get("license"))
+                            else None
+                        ),
+                        url=(
+                            str(row.get("url", ""))
+                            if pd.notna(row.get("url"))
+                            else None
+                        ),
+                        catalog=(
+                            str(row.get("catalog", ""))
+                            if pd.notna(row.get("catalog"))
+                            else None
+                        ),
+                        deprecated=(
+                            str(row.get("deprecated", ""))
+                            if pd.notna(row.get("deprecated"))
+                            else None
+                        ),
+                    )
+                )
+
+            # Create search result
+            filters = {}
+            if keywords:
+                filters["keywords"] = keywords
+            if dataset_type:
+                filters["dataset_type"] = dataset_type
+            if provider:
+                filters["provider"] = provider
+
+            query_parts = []
+            if keywords:
+                query_parts.append(f"keywords: {keywords}")
+            if dataset_type:
+                query_parts.append(f"type: {dataset_type}")
+            if provider:
+                query_parts.append(f"provider: {provider}")
+            query_str = ", ".join(query_parts) if query_parts else "all datasets"
+
+            result = CatalogSearchResult(
+                query=query_str,
+                dataset_count=len(dataset_models),
+                datasets=dataset_models,
+                filters=filters if filters else None,
+            )
+
+            return json.dumps(result.model_dump(), indent=2)
+
+        except Exception as e:
+            return json.dumps({"error": str(e)})
+
@tool(description="Get detailed information about a specific dataset")
|
449
|
+
def get_dataset_info(
|
450
|
+
self,
|
451
|
+
dataset_id: str,
|
452
|
+
) -> str:
|
453
|
+
"""Get detailed information about a specific dataset.
|
454
|
+
|
455
|
+
Args:
|
456
|
+
dataset_id: The dataset ID to retrieve.
|
457
|
+
|
458
|
+
Returns:
|
459
|
+
JSON string with detailed dataset information.
|
460
|
+
"""
|
461
|
+
try:
|
462
|
+
if self._catalog_df is None:
|
463
|
+
return json.dumps({"error": "No catalog loaded."})
|
464
|
+
|
465
|
+
# Find the dataset
|
466
|
+
if "id" not in self._catalog_df.columns:
|
467
|
+
return json.dumps({"error": "Catalog does not have 'id' column."})
|
468
|
+
|
469
|
+
result_df = self._catalog_df[self._catalog_df["id"] == dataset_id]
|
470
|
+
|
471
|
+
if len(result_df) == 0:
|
472
|
+
return json.dumps(
|
473
|
+
{"error": f"Dataset '{dataset_id}' not found in catalog."}
|
474
|
+
)
|
475
|
+
|
476
|
+
row = result_df.iloc[0]
|
477
|
+
|
478
|
+
# Convert to model
|
479
|
+
dataset = CatalogDatasetInfo(
|
480
|
+
id=str(row.get("id", "")),
|
481
|
+
title=str(row.get("title", "")),
|
482
|
+
type=str(row.get("type", "")) if pd.notna(row.get("type")) else None,
|
483
|
+
provider=(
|
484
|
+
str(row.get("provider", ""))
|
485
|
+
if pd.notna(row.get("provider"))
|
486
|
+
else None
|
487
|
+
),
|
488
|
+
description=(
|
489
|
+
str(row.get("description", ""))
|
490
|
+
if pd.notna(row.get("description"))
|
491
|
+
else None
|
492
|
+
),
|
493
|
+
keywords=(
|
494
|
+
str(row.get("keywords", ""))
|
495
|
+
if pd.notna(row.get("keywords"))
|
496
|
+
else None
|
497
|
+
),
|
498
|
+
snippet=(
|
499
|
+
str(row.get("snippet", ""))
|
500
|
+
if pd.notna(row.get("snippet"))
|
501
|
+
else None
|
502
|
+
),
|
503
|
+
start_date=(
|
504
|
+
str(row.get("start_date", ""))
|
505
|
+
if pd.notna(row.get("start_date"))
|
506
|
+
else None
|
507
|
+
),
|
508
|
+
end_date=(
|
509
|
+
str(row.get("end_date", ""))
|
510
|
+
if pd.notna(row.get("end_date"))
|
511
|
+
else None
|
512
|
+
),
|
513
|
+
bbox=str(row.get("bbox", "")) if pd.notna(row.get("bbox")) else None,
|
514
|
+
license=(
|
515
|
+
str(row.get("license", ""))
|
516
|
+
if pd.notna(row.get("license"))
|
517
|
+
else None
|
518
|
+
),
|
519
|
+
url=str(row.get("url", "")) if pd.notna(row.get("url")) else None,
|
520
|
+
catalog=(
|
521
|
+
str(row.get("catalog", ""))
|
522
|
+
if pd.notna(row.get("catalog"))
|
523
|
+
else None
|
524
|
+
),
|
525
|
+
deprecated=(
|
526
|
+
str(row.get("deprecated", ""))
|
527
|
+
if pd.notna(row.get("deprecated"))
|
528
|
+
else None
|
529
|
+
),
|
530
|
+
)
|
531
|
+
|
532
|
+
return json.dumps(dataset.model_dump(), indent=2)
|
533
|
+
|
534
|
+
except Exception as e:
|
535
|
+
return json.dumps({"error": str(e)})
|
536
|
+
|
537
|
+
@tool(description="List unique dataset types available in the catalog")
|
538
|
+
def list_dataset_types(self) -> str:
|
539
|
+
"""List unique dataset types available in the catalog.
|
540
|
+
|
541
|
+
Returns:
|
542
|
+
JSON string with list of dataset types.
|
543
|
+
"""
|
544
|
+
try:
|
545
|
+
if self._catalog_df is None:
|
546
|
+
return json.dumps({"error": "No catalog loaded."})
|
547
|
+
|
548
|
+
if "type" not in self._catalog_df.columns:
|
549
|
+
return json.dumps({"error": "Catalog does not have 'type' column."})
|
550
|
+
|
551
|
+
types = self._catalog_df["type"].dropna().unique().tolist()
|
552
|
+
types.sort()
|
553
|
+
|
554
|
+
result = {
|
555
|
+
"count": len(types),
|
556
|
+
"types": types,
|
557
|
+
}
|
558
|
+
|
559
|
+
return json.dumps(result, indent=2)
|
560
|
+
|
561
|
+
except Exception as e:
|
562
|
+
return json.dumps({"error": str(e)})
|
563
|
+
|
564
|
+
@tool(description="List unique data providers in the catalog")
|
565
|
+
def list_providers(self) -> str:
|
566
|
+
"""List unique data providers in the catalog.
|
567
|
+
|
568
|
+
Returns:
|
569
|
+
JSON string with list of providers.
|
570
|
+
"""
|
571
|
+
try:
|
572
|
+
if self._catalog_df is None:
|
573
|
+
return json.dumps({"error": "No catalog loaded."})
|
574
|
+
|
575
|
+
if "provider" not in self._catalog_df.columns:
|
576
|
+
return json.dumps({"error": "Catalog does not have 'provider' column."})
|
577
|
+
|
578
|
+
providers = self._catalog_df["provider"].dropna().unique().tolist()
|
579
|
+
providers.sort()
|
580
|
+
|
581
|
+
result = {
|
582
|
+
"count": len(providers),
|
583
|
+
"providers": providers,
|
584
|
+
}
|
585
|
+
|
586
|
+
return json.dumps(result, indent=2)
|
587
|
+
|
588
|
+
except Exception as e:
|
589
|
+
return json.dumps({"error": str(e)})
|
590
|
+
|
591
|
+
@tool(description="Get catalog statistics and summary information")
|
592
|
+
def get_catalog_stats(self) -> str:
|
593
|
+
"""Get statistics about the catalog.
|
594
|
+
|
595
|
+
Returns:
|
596
|
+
JSON string with catalog statistics.
|
597
|
+
"""
|
598
|
+
try:
|
599
|
+
if self._catalog_df is None:
|
600
|
+
return json.dumps({"error": "No catalog loaded."})
|
601
|
+
|
602
|
+
stats = {
|
603
|
+
"total_datasets": len(self._catalog_df),
|
604
|
+
"columns": list(self._catalog_df.columns),
|
605
|
+
}
|
606
|
+
|
607
|
+
# Add type counts if available
|
608
|
+
if "type" in self._catalog_df.columns:
|
609
|
+
type_counts = self._catalog_df["type"].value_counts().to_dict()
|
610
|
+
stats["dataset_types"] = type_counts
|
611
|
+
|
612
|
+
# Add provider counts if available
|
613
|
+
if "provider" in self._catalog_df.columns:
|
614
|
+
# Get top 10 providers
|
615
|
+
provider_counts = (
|
616
|
+
self._catalog_df["provider"].value_counts().head(10).to_dict()
|
617
|
+
)
|
618
|
+
stats["top_providers"] = provider_counts
|
619
|
+
|
620
|
+
return json.dumps(stats, indent=2)
|
621
|
+
|
622
|
+
except Exception as e:
|
623
|
+
return json.dumps({"error": str(e)})
|
624
|
+
|
625
|
+
@tool(description="Parse a location name and return its bounding box coordinates")
|
626
|
+
def geocode_location(self, location_name: str) -> str:
|
627
|
+
"""Convert a location name to geographic coordinates and bounding box.
|
628
|
+
|
629
|
+
This tool uses a geocoding service to find the coordinates for a given location name.
|
630
|
+
|
631
|
+
Args:
|
632
|
+
location_name: Name of the location (e.g., "San Francisco", "New York", "Paris, France", "California").
|
633
|
+
|
634
|
+
Returns:
|
635
|
+
JSON string with location info including bounding box and center coordinates.
|
636
|
+
"""
|
637
|
+
try:
|
638
|
+
# Check static cache first (common locations)
|
639
|
+
location_key = location_name.lower().strip()
|
640
|
+
if location_key in self._LOCATION_CACHE:
|
641
|
+
cached = self._LOCATION_CACHE[location_key]
|
642
|
+
location_info = LocationInfo(
|
643
|
+
name=cached["name"],
|
644
|
+
bbox=cached["bbox"],
|
645
|
+
center=cached["center"],
|
646
|
+
)
|
647
|
+
return json.dumps(location_info.model_dump(), indent=2)
|
648
|
+
|
649
|
+
# Check runtime cache
|
650
|
+
if location_key in self._geocode_cache:
|
651
|
+
return self._geocode_cache[location_key]
|
652
|
+
|
653
|
+
# Geocode using Nominatim
|
654
|
+
url = "https://nominatim.openstreetmap.org/search"
|
655
|
+
params = {
|
656
|
+
"q": location_name,
|
657
|
+
"format": "json",
|
658
|
+
"limit": 1,
|
659
|
+
}
|
660
|
+
headers = {"User-Agent": "GeoAI-Catalog-Agent/1.0"}
|
661
|
+
|
662
|
+
response = requests.get(url, params=params, headers=headers, timeout=10)
|
663
|
+
response.raise_for_status()
|
664
|
+
|
665
|
+
results = response.json()
|
666
|
+
|
667
|
+
if not results:
|
668
|
+
error_result = json.dumps(
|
669
|
+
{"error": f"Location '{location_name}' not found"}
|
670
|
+
)
|
671
|
+
self._geocode_cache[location_key] = error_result
|
672
|
+
return error_result
|
673
|
+
|
674
|
+
result = results[0]
|
675
|
+
bbox = [
|
676
|
+
float(result["boundingbox"][2]), # west
|
677
|
+
float(result["boundingbox"][0]), # south
|
678
|
+
float(result["boundingbox"][3]), # east
|
679
|
+
float(result["boundingbox"][1]), # north
|
680
|
+
]
|
681
|
+
center = [float(result["lon"]), float(result["lat"])]
|
682
|
+
|
683
|
+
location_info = LocationInfo(
|
684
|
+
name=result.get("display_name", location_name),
|
685
|
+
bbox=bbox,
|
686
|
+
center=center,
|
687
|
+
)
|
688
|
+
|
689
|
+
result_json = json.dumps(location_info.model_dump(), indent=2)
|
690
|
+
# Cache the result
|
691
|
+
self._geocode_cache[location_key] = result_json
|
692
|
+
|
693
|
+
return result_json
|
694
|
+
|
695
|
+
except Exception as e:
|
696
|
+
return json.dumps({"error": f"Geocoding error: {str(e)}"})
|
697
|
+
|
698
|
+
+    @tool(
+        description="Search for datasets by geographic region, keywords, and date range"
+    )
+    def search_by_region(
+        self,
+        bbox: Optional[Union[str, List[float]]] = None,
+        location: Optional[str] = None,
+        keywords: Optional[str] = None,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        max_results: Optional[Union[str, int]] = 10,
+    ) -> str:
+        """Search for datasets that cover a specific geographic region.
+
+        Args:
+            bbox: Bounding box as [west, south, east, north] or comma-separated string.
+                Example: [-122.5, 37.5, -122.0, 38.0] for San Francisco Bay Area.
+            location: Location name to geocode into a bounding box.
+                Example: "California", "San Francisco", "New York City".
+            keywords: Additional keywords to search for in dataset metadata.
+            dataset_type: Filter by dataset type (e.g., "image", "image_collection").
+            provider: Filter by data provider name.
+            start_date: Filter datasets that have data after this date in YYYY-MM-DD format.
+                Example: "2020-01-01" to find datasets with data from 2020 onwards.
+            end_date: Filter datasets that have data before this date in YYYY-MM-DD format.
+                Example: "2023-12-31" to find datasets with data up to 2023.
+            max_results: Maximum number of results to return (default: 10).
+
+        Returns:
+            JSON string containing search results with datasets that intersect the search region.
+        """
+        try:
+            if self._catalog_df is None:
+                return json.dumps({"error": "No catalog loaded."})
+
+            # Parse max_results if it's a string
+            if isinstance(max_results, str):
+                try:
+                    max_results = int(max_results)
+                except ValueError:
+                    max_results = 10
+
+            # Determine search bbox
+            search_bbox = None
+
+            if bbox is not None:
+                # Parse bbox if it's a string
+                if isinstance(bbox, str):
+                    search_bbox = self._parse_bbox_string(bbox)
+                    if search_bbox is None:
+                        return json.dumps({"error": f"Invalid bbox format: {bbox}"})
+                else:
+                    search_bbox = bbox
+
+            elif location is not None:
+                # Geocode location to bbox
+                geocode_result = json.loads(self.geocode_location(location))
+                if "error" in geocode_result:
+                    return json.dumps(geocode_result)
+                search_bbox = geocode_result["bbox"]
+
+            if search_bbox is None:
+                return json.dumps(
+                    {"error": "Either bbox or location must be provided."}
+                )
+
+            # Validate search bbox
+            if len(search_bbox) != 4:
+                return json.dumps(
+                    {
+                        "error": "Bbox must have 4 values [minLon, minLat, maxLon, maxLat]"
+                    }
+                )
+
+            # Filter by spatial intersection
+            if "bbox" not in self._catalog_df.columns:
+                return json.dumps(
+                    {
+                        "error": "Catalog does not have 'bbox' column. Try using a JSON format catalog."
+                    }
+                )
+
+            # Create mask for spatial intersection
+            spatial_mask = pd.Series(
+                [False] * len(self._catalog_df), index=self._catalog_df.index
+            )
+
+            for idx, row in self._catalog_df.iterrows():
+                dataset_bbox = self._parse_bbox_string(row.get("bbox"))
+                if dataset_bbox and self._bbox_intersects(dataset_bbox, search_bbox):
+                    spatial_mask[idx] = True
+
+            result_df = self._catalog_df[spatial_mask]
+
+            # Apply additional filters using existing _search_dataframe logic
+            result_df = self._search_dataframe(
+                result_df,
+                keywords=keywords,
+                dataset_type=dataset_type,
+                provider=provider,
+                start_date=start_date,
+                end_date=end_date,
+                max_results=max_results,
+            )
+
+            # Convert to models
+            dataset_models = []
+            for _, row in result_df.iterrows():
+                dataset_models.append(
+                    CatalogDatasetInfo(
+                        id=str(row.get("id", "")),
+                        title=str(row.get("title", "")),
+                        type=(
+                            str(row.get("type", ""))
+                            if pd.notna(row.get("type"))
+                            else None
+                        ),
+                        provider=(
+                            str(row.get("provider", ""))
+                            if pd.notna(row.get("provider"))
+                            else None
+                        ),
+                        description=(
+                            str(row.get("description", ""))
+                            if pd.notna(row.get("description"))
+                            else None
+                        ),
+                        keywords=(
+                            str(row.get("keywords", ""))
+                            if pd.notna(row.get("keywords"))
+                            else None
+                        ),
+                        snippet=(
+                            str(row.get("snippet", ""))
+                            if pd.notna(row.get("snippet"))
+                            else None
+                        ),
+                        start_date=(
+                            str(row.get("start_date", ""))
+                            if pd.notna(row.get("start_date"))
+                            else None
+                        ),
+                        end_date=(
+                            str(row.get("end_date", ""))
+                            if pd.notna(row.get("end_date"))
+                            else None
+                        ),
+                        bbox=(
+                            str(row.get("bbox", ""))
+                            if pd.notna(row.get("bbox"))
+                            else None
+                        ),
+                        license=(
+                            str(row.get("license", ""))
+                            if pd.notna(row.get("license"))
+                            else None
+                        ),
+                        url=(
+                            str(row.get("url", ""))
+                            if pd.notna(row.get("url"))
+                            else None
+                        ),
+                        catalog=(
+                            str(row.get("catalog", ""))
+                            if pd.notna(row.get("catalog"))
+                            else None
+                        ),
+                        deprecated=(
+                            str(row.get("deprecated", ""))
+                            if pd.notna(row.get("deprecated"))
+                            else None
+                        ),
+                    )
+                )
+
+            # Create search result
+            filters = {"search_bbox": search_bbox}
+            if keywords:
+                filters["keywords"] = keywords
+            if dataset_type:
+                filters["dataset_type"] = dataset_type
+            if provider:
+                filters["provider"] = provider
+
+            query_parts = []
+            if location:
+                query_parts.append(f"location: {location}")
+            elif bbox:
+                query_parts.append(f"bbox: {search_bbox}")
+            if keywords:
+                query_parts.append(f"keywords: {keywords}")
+            if dataset_type:
+                query_parts.append(f"type: {dataset_type}")
+            if provider:
+                query_parts.append(f"provider: {provider}")
+            query_str = ", ".join(query_parts) if query_parts else "spatial search"
+
+            result = CatalogSearchResult(
+                query=query_str,
+                dataset_count=len(dataset_models),
+                datasets=dataset_models,
+                filters=filters,
+            )
+
+            return json.dumps(result.model_dump(), indent=2)
+
+        except Exception as e:
+            return json.dumps({"error": str(e)})
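For orientation, here is a minimal usage sketch of the new CatalogTools class. It is not part of the released package: the catalog rows are invented, and it assumes the strands @tool decorator leaves the decorated methods callable as ordinary Python methods.

# Hedged sketch, not from the package: sample rows are hypothetical, and the
# direct method calls assume strands' @tool wrapper preserves plain-call
# semantics on instance methods.
import pandas as pd

from geoai.agents.catalog_tools import CatalogTools

catalog = pd.DataFrame(
    {
        "id": ["esa/worldcover", "usgs/3dep"],
        "title": ["ESA WorldCover", "USGS 3DEP Elevation"],
        "type": ["image_collection", "image"],
        "provider": ["ESA", "USGS"],
        "keywords": ["landcover", "elevation"],
        "start_date": ["2020-01-01", "2000-01-01"],
        "end_date": ["2021-12-31", "2023-12-31"],
        # bbox strings use "minLon, minLat, maxLon, maxLat", the format
        # _parse_bbox_string expects
        "bbox": ["-180, -90, 180, 90", "-125, 24, -66, 50"],
    }
)

tools = CatalogTools(catalog_df=catalog)

# Keyword search over the id, title, keywords, and description fields;
# returns a JSON string built from CatalogSearchResult.
print(tools.search_datasets(keywords="landcover", max_results=5))

# Spatial search with an explicit bbox. Passing location="Seattle" instead
# would resolve from the built-in _LOCATION_CACHE; an uncached name would
# trigger a Nominatim request inside geocode_location.
print(tools.search_by_region(bbox=[-122.46, 47.48, -122.22, 47.73], keywords="elevation"))

Passing catalog_df skips the network fetch in _load_catalog, so the sketch runs offline; against a real deployment one would point catalog_url at a TSV, CSV, or JSON catalog instead.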