foundry-mcp 0.3.3__py3-none-any.whl → 0.8.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foundry_mcp/__init__.py +7 -1
- foundry_mcp/cli/__init__.py +0 -13
- foundry_mcp/cli/commands/plan.py +10 -3
- foundry_mcp/cli/commands/review.py +19 -4
- foundry_mcp/cli/commands/session.py +1 -8
- foundry_mcp/cli/commands/specs.py +38 -208
- foundry_mcp/cli/context.py +39 -0
- foundry_mcp/cli/output.py +3 -3
- foundry_mcp/config.py +615 -11
- foundry_mcp/core/ai_consultation.py +146 -9
- foundry_mcp/core/batch_operations.py +1196 -0
- foundry_mcp/core/discovery.py +7 -7
- foundry_mcp/core/error_store.py +2 -2
- foundry_mcp/core/intake.py +933 -0
- foundry_mcp/core/llm_config.py +28 -2
- foundry_mcp/core/metrics_store.py +2 -2
- foundry_mcp/core/naming.py +25 -2
- foundry_mcp/core/progress.py +70 -0
- foundry_mcp/core/prometheus.py +0 -13
- foundry_mcp/core/prompts/fidelity_review.py +149 -4
- foundry_mcp/core/prompts/markdown_plan_review.py +5 -1
- foundry_mcp/core/prompts/plan_review.py +5 -1
- foundry_mcp/core/providers/__init__.py +12 -0
- foundry_mcp/core/providers/base.py +39 -0
- foundry_mcp/core/providers/claude.py +51 -48
- foundry_mcp/core/providers/codex.py +70 -60
- foundry_mcp/core/providers/cursor_agent.py +25 -47
- foundry_mcp/core/providers/detectors.py +34 -7
- foundry_mcp/core/providers/gemini.py +69 -58
- foundry_mcp/core/providers/opencode.py +101 -47
- foundry_mcp/core/providers/package-lock.json +4 -4
- foundry_mcp/core/providers/package.json +1 -1
- foundry_mcp/core/providers/validation.py +128 -0
- foundry_mcp/core/research/__init__.py +68 -0
- foundry_mcp/core/research/memory.py +528 -0
- foundry_mcp/core/research/models.py +1220 -0
- foundry_mcp/core/research/providers/__init__.py +40 -0
- foundry_mcp/core/research/providers/base.py +242 -0
- foundry_mcp/core/research/providers/google.py +507 -0
- foundry_mcp/core/research/providers/perplexity.py +442 -0
- foundry_mcp/core/research/providers/semantic_scholar.py +544 -0
- foundry_mcp/core/research/providers/tavily.py +383 -0
- foundry_mcp/core/research/workflows/__init__.py +25 -0
- foundry_mcp/core/research/workflows/base.py +298 -0
- foundry_mcp/core/research/workflows/chat.py +271 -0
- foundry_mcp/core/research/workflows/consensus.py +539 -0
- foundry_mcp/core/research/workflows/deep_research.py +4020 -0
- foundry_mcp/core/research/workflows/ideate.py +682 -0
- foundry_mcp/core/research/workflows/thinkdeep.py +405 -0
- foundry_mcp/core/responses.py +690 -0
- foundry_mcp/core/spec.py +2439 -236
- foundry_mcp/core/task.py +1205 -31
- foundry_mcp/core/testing.py +512 -123
- foundry_mcp/core/validation.py +319 -43
- foundry_mcp/dashboard/components/charts.py +0 -57
- foundry_mcp/dashboard/launcher.py +11 -0
- foundry_mcp/dashboard/views/metrics.py +25 -35
- foundry_mcp/dashboard/views/overview.py +1 -65
- foundry_mcp/resources/specs.py +25 -25
- foundry_mcp/schemas/intake-schema.json +89 -0
- foundry_mcp/schemas/sdd-spec-schema.json +33 -5
- foundry_mcp/server.py +0 -14
- foundry_mcp/tools/unified/__init__.py +39 -18
- foundry_mcp/tools/unified/authoring.py +2371 -248
- foundry_mcp/tools/unified/documentation_helpers.py +69 -6
- foundry_mcp/tools/unified/environment.py +434 -32
- foundry_mcp/tools/unified/error.py +18 -1
- foundry_mcp/tools/unified/lifecycle.py +8 -0
- foundry_mcp/tools/unified/plan.py +133 -2
- foundry_mcp/tools/unified/provider.py +0 -40
- foundry_mcp/tools/unified/research.py +1283 -0
- foundry_mcp/tools/unified/review.py +374 -17
- foundry_mcp/tools/unified/review_helpers.py +16 -1
- foundry_mcp/tools/unified/server.py +9 -24
- foundry_mcp/tools/unified/spec.py +367 -0
- foundry_mcp/tools/unified/task.py +1664 -30
- foundry_mcp/tools/unified/test.py +69 -8
- {foundry_mcp-0.3.3.dist-info → foundry_mcp-0.8.10.dist-info}/METADATA +8 -1
- foundry_mcp-0.8.10.dist-info/RECORD +153 -0
- foundry_mcp/cli/flags.py +0 -266
- foundry_mcp/core/feature_flags.py +0 -592
- foundry_mcp-0.3.3.dist-info/RECORD +0 -135
- {foundry_mcp-0.3.3.dist-info → foundry_mcp-0.8.10.dist-info}/WHEEL +0 -0
- {foundry_mcp-0.3.3.dist-info → foundry_mcp-0.8.10.dist-info}/entry_points.txt +0 -0
- {foundry_mcp-0.3.3.dist-info → foundry_mcp-0.8.10.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
"""Google Custom Search provider for web search.
|
|
2
|
+
|
|
3
|
+
This module implements GoogleSearchProvider, which wraps the Google Custom Search
|
|
4
|
+
JSON API to provide web search capabilities for the deep research workflow.
|
|
5
|
+
|
|
6
|
+
Google Custom Search API documentation:
|
|
7
|
+
https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list
|
|
8
|
+
|
|
9
|
+
Example usage:
|
|
10
|
+
provider = GoogleSearchProvider(
|
|
11
|
+
api_key="AIza...",
|
|
12
|
+
cx="017576662512468239146:omuauf_lfve",
|
|
13
|
+
)
|
|
14
|
+
sources = await provider.search("machine learning trends", max_results=5)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from typing import Any, Optional
|
|
22
|
+
|
|
23
|
+
import httpx
|
|
24
|
+
|
|
25
|
+
from foundry_mcp.core.research.models import ResearchSource, SourceType
|
|
26
|
+
from foundry_mcp.core.research.providers.base import (
|
|
27
|
+
AuthenticationError,
|
|
28
|
+
RateLimitError,
|
|
29
|
+
SearchProvider,
|
|
30
|
+
SearchProviderError,
|
|
31
|
+
SearchResult,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)

# Google Custom Search API constants
GOOGLE_API_BASE_URL = "https://www.googleapis.com/customsearch/v1"
DEFAULT_TIMEOUT = 30.0  # seconds per HTTP request
DEFAULT_MAX_RETRIES = 3  # attempts for retryable failures (quota, 429, timeouts)
DEFAULT_RATE_LIMIT = 1.0  # requests per second (Google CSE has daily quota limits)
class GoogleSearchProvider(SearchProvider):
    """Web search provider backed by the Google Custom Search JSON API.

    Requires a Google API key plus a Custom Search Engine (CSE) ID.

    Setup:
        1. Create a project in Google Cloud Console
        2. Enable the Custom Search API
        3. Create an API key
        4. Create a Custom Search Engine at https://cse.google.com/
        5. Get the Search Engine ID (cx parameter)

    Attributes:
        api_key: Google API key (required)
        cx: Custom Search Engine ID (required)
        base_url: API base URL (default: https://www.googleapis.com/customsearch/v1)
        timeout: Request timeout in seconds (default: 30.0)
        max_retries: Maximum retry attempts for rate limits (default: 3)

    Example:
        provider = GoogleSearchProvider(
            api_key="AIza...",
            cx="017576662512468239146:omuauf_lfve",
        )
        sources = await provider.search(
            "AI trends 2024",
            max_results=5,
        )
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        cx: Optional[str] = None,
        base_url: str = GOOGLE_API_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """Initialize the provider, resolving credentials from the environment.

        Args:
            api_key: Google API key. If not provided, reads from GOOGLE_API_KEY env var.
            cx: Custom Search Engine ID. If not provided, reads from GOOGLE_CSE_ID env var.
            base_url: API base URL (default: https://www.googleapis.com/customsearch/v1)
            timeout: Request timeout in seconds (default: 30.0)
            max_retries: Maximum retry attempts for rate limits (default: 3)

        Raises:
            ValueError: If API key or CSE ID is not provided or found in environment
        """
        resolved_key = api_key or os.environ.get("GOOGLE_API_KEY")
        if not resolved_key:
            raise ValueError(
                "Google API key required. Provide via api_key parameter "
                "or GOOGLE_API_KEY environment variable."
            )

        resolved_cx = cx or os.environ.get("GOOGLE_CSE_ID")
        if not resolved_cx:
            raise ValueError(
                "Google Custom Search Engine ID required. Provide via cx parameter "
                "or GOOGLE_CSE_ID environment variable."
            )

        self._api_key = resolved_key
        self._cx = resolved_cx
        # Normalize so later request building never produces a double slash.
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._max_retries = max_retries
        self._rate_limit_value = DEFAULT_RATE_LIMIT
def get_provider_name(self) -> str:
|
|
114
|
+
"""Return the provider identifier.
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
"google"
|
|
118
|
+
"""
|
|
119
|
+
return "google"
|
|
120
|
+
|
|
121
|
+
@property
|
|
122
|
+
def rate_limit(self) -> Optional[float]:
|
|
123
|
+
"""Return the rate limit in requests per second.
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
1.0 (one request per second)
|
|
127
|
+
"""
|
|
128
|
+
return self._rate_limit_value
|
|
129
|
+
|
|
130
|
+
async def search(
|
|
131
|
+
self,
|
|
132
|
+
query: str,
|
|
133
|
+
max_results: int = 10,
|
|
134
|
+
**kwargs: Any,
|
|
135
|
+
) -> list[ResearchSource]:
|
|
136
|
+
"""Execute a web search via Google Custom Search API.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
query: The search query string
|
|
140
|
+
max_results: Maximum number of results to return (default: 10, max: 10 per request)
|
|
141
|
+
**kwargs: Additional Google CSE options:
|
|
142
|
+
- site_search: Restrict results to a specific site
|
|
143
|
+
- date_restrict: Restrict by date (e.g., "d7" for past week, "m1" for past month)
|
|
144
|
+
- file_type: Restrict to specific file types (e.g., "pdf")
|
|
145
|
+
- safe: Safe search level ("off", "medium", "high")
|
|
146
|
+
- sub_query_id: SubQuery ID for source tracking
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
List of ResearchSource objects
|
|
150
|
+
|
|
151
|
+
Raises:
|
|
152
|
+
AuthenticationError: If API key is invalid
|
|
153
|
+
RateLimitError: If rate limit/quota exceeded after all retries
|
|
154
|
+
SearchProviderError: For other API errors
|
|
155
|
+
"""
|
|
156
|
+
# Extract Google-specific options
|
|
157
|
+
site_search = kwargs.get("site_search")
|
|
158
|
+
date_restrict = kwargs.get("date_restrict")
|
|
159
|
+
file_type = kwargs.get("file_type")
|
|
160
|
+
safe = kwargs.get("safe", "off")
|
|
161
|
+
sub_query_id = kwargs.get("sub_query_id")
|
|
162
|
+
|
|
163
|
+
# Google CSE returns max 10 results per request
|
|
164
|
+
# For more results, pagination with 'start' parameter would be needed
|
|
165
|
+
max_results = min(max_results, 10)
|
|
166
|
+
|
|
167
|
+
# Build query parameters
|
|
168
|
+
params: dict[str, Any] = {
|
|
169
|
+
"key": self._api_key,
|
|
170
|
+
"cx": self._cx,
|
|
171
|
+
"q": query,
|
|
172
|
+
"num": max_results,
|
|
173
|
+
"safe": safe,
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
if site_search:
|
|
177
|
+
params["siteSearch"] = site_search
|
|
178
|
+
if date_restrict:
|
|
179
|
+
params["dateRestrict"] = date_restrict
|
|
180
|
+
if file_type:
|
|
181
|
+
params["fileType"] = file_type
|
|
182
|
+
|
|
183
|
+
# Execute with retry logic
|
|
184
|
+
response_data = await self._execute_with_retry(params)
|
|
185
|
+
|
|
186
|
+
# Parse results
|
|
187
|
+
return self._parse_response(response_data, sub_query_id)
|
|
188
|
+
|
|
189
|
+
    async def _execute_with_retry(
        self,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Execute API request with exponential backoff retry.

        Retryable conditions (quota 403s, 429s, timeouts, transport errors)
        back off for Retry-After seconds when given, else 2**attempt seconds.
        Non-retryable conditions (401, non-quota 403, 4xx) raise immediately.

        Args:
            params: Query parameters

        Returns:
            Parsed JSON response

        Raises:
            AuthenticationError: If API key is invalid
            RateLimitError: If rate limit exceeded after all retries
            SearchProviderError: For other API errors
        """
        last_error: Optional[Exception] = None

        for attempt in range(self._max_retries):
            try:
                # Fresh client per attempt; closed by the context manager even
                # when we `continue` out for a retry.
                async with httpx.AsyncClient(timeout=self._timeout) as client:
                    response = await client.get(self._base_url, params=params)

                    # Handle authentication errors (not retryable)
                    if response.status_code == 401:
                        raise AuthenticationError(
                            provider="google",
                            message="Invalid API key",
                        )

                    # Handle forbidden (invalid CSE ID or API not enabled)
                    if response.status_code == 403:
                        error_data = self._parse_error_response(response)
                        # Check if it's a quota error (retryable) vs auth error (not retryable)
                        if "quota" in error_data.lower() or "limit" in error_data.lower():
                            retry_after = self._parse_retry_after(response)
                            if attempt < self._max_retries - 1:
                                wait_time = retry_after or (2**attempt)
                                logger.warning(
                                    f"Google CSE quota limit hit, waiting {wait_time}s "
                                    f"(attempt {attempt + 1}/{self._max_retries})"
                                )
                                await asyncio.sleep(wait_time)
                                continue
                            raise RateLimitError(
                                provider="google",
                                retry_after=retry_after,
                            )
                        # Non-quota 403 errors (bad CSE ID, API not enabled)
                        raise AuthenticationError(
                            provider="google",
                            message=f"Access denied: {error_data}",
                        )

                    # Handle rate limiting (429)
                    if response.status_code == 429:
                        retry_after = self._parse_retry_after(response)
                        if attempt < self._max_retries - 1:
                            wait_time = retry_after or (2**attempt)
                            logger.warning(
                                f"Google CSE rate limit hit, waiting {wait_time}s "
                                f"(attempt {attempt + 1}/{self._max_retries})"
                            )
                            await asyncio.sleep(wait_time)
                            continue
                        raise RateLimitError(
                            provider="google",
                            retry_after=retry_after,
                        )

                    # Handle other errors; only 5xx responses are flagged retryable
                    if response.status_code >= 400:
                        error_msg = self._parse_error_response(response)
                        raise SearchProviderError(
                            provider="google",
                            message=f"API error {response.status_code}: {error_msg}",
                            retryable=response.status_code >= 500,
                        )

                    return response.json()

            except httpx.TimeoutException as e:
                last_error = e
                if attempt < self._max_retries - 1:
                    wait_time = 2**attempt
                    logger.warning(
                        f"Google CSE request timeout, retrying in {wait_time}s "
                        f"(attempt {attempt + 1}/{self._max_retries})"
                    )
                    await asyncio.sleep(wait_time)
                    continue

            except httpx.RequestError as e:
                last_error = e
                if attempt < self._max_retries - 1:
                    wait_time = 2**attempt
                    logger.warning(
                        f"Google CSE request error: {e}, retrying in {wait_time}s "
                        f"(attempt {attempt + 1}/{self._max_retries})"
                    )
                    await asyncio.sleep(wait_time)
                    continue

            except (AuthenticationError, RateLimitError, SearchProviderError):
                # Our own error types propagate unchanged to the caller.
                raise

        # All retries exhausted (timeouts/transport errors on every attempt)
        raise SearchProviderError(
            provider="google",
            message=f"Request failed after {self._max_retries} attempts",
            retryable=False,
            original_error=last_error,
        )
def _parse_retry_after(self, response: httpx.Response) -> Optional[float]:
|
|
305
|
+
"""Parse Retry-After header from response.
|
|
306
|
+
|
|
307
|
+
Args:
|
|
308
|
+
response: HTTP response
|
|
309
|
+
|
|
310
|
+
Returns:
|
|
311
|
+
Seconds to wait, or None if not provided
|
|
312
|
+
"""
|
|
313
|
+
retry_after = response.headers.get("Retry-After")
|
|
314
|
+
if retry_after:
|
|
315
|
+
try:
|
|
316
|
+
return float(retry_after)
|
|
317
|
+
except ValueError:
|
|
318
|
+
pass
|
|
319
|
+
return None
|
|
320
|
+
|
|
321
|
+
def _parse_error_response(self, response: httpx.Response) -> str:
|
|
322
|
+
"""Extract error message from Google API error response.
|
|
323
|
+
|
|
324
|
+
Google API returns errors in format:
|
|
325
|
+
{
|
|
326
|
+
"error": {
|
|
327
|
+
"code": 403,
|
|
328
|
+
"message": "...",
|
|
329
|
+
"errors": [...]
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
Args:
|
|
334
|
+
response: HTTP response
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
Error message string
|
|
338
|
+
"""
|
|
339
|
+
try:
|
|
340
|
+
data = response.json()
|
|
341
|
+
error = data.get("error", {})
|
|
342
|
+
if isinstance(error, dict):
|
|
343
|
+
return error.get("message", str(error))
|
|
344
|
+
return str(error)
|
|
345
|
+
except Exception:
|
|
346
|
+
return response.text[:200] if response.text else "Unknown error"
|
|
347
|
+
|
|
348
|
+
def _parse_response(
|
|
349
|
+
self,
|
|
350
|
+
data: dict[str, Any],
|
|
351
|
+
sub_query_id: Optional[str] = None,
|
|
352
|
+
) -> list[ResearchSource]:
|
|
353
|
+
"""Parse Google Custom Search API response into ResearchSource objects.
|
|
354
|
+
|
|
355
|
+
Google CSE response structure:
|
|
356
|
+
{
|
|
357
|
+
"items": [
|
|
358
|
+
{
|
|
359
|
+
"title": "...",
|
|
360
|
+
"link": "...",
|
|
361
|
+
"snippet": "...",
|
|
362
|
+
"displayLink": "example.com",
|
|
363
|
+
"pagemap": {
|
|
364
|
+
"metatags": [{"og:description": "...", "article:published_time": "..."}]
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
],
|
|
368
|
+
"searchInformation": {
|
|
369
|
+
"totalResults": "123456"
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
Args:
|
|
374
|
+
data: Google CSE API response JSON
|
|
375
|
+
sub_query_id: SubQuery ID for source tracking
|
|
376
|
+
|
|
377
|
+
Returns:
|
|
378
|
+
List of ResearchSource objects
|
|
379
|
+
"""
|
|
380
|
+
sources: list[ResearchSource] = []
|
|
381
|
+
items = data.get("items", [])
|
|
382
|
+
|
|
383
|
+
for item in items:
|
|
384
|
+
# Extract published date from pagemap metatags if available
|
|
385
|
+
published_date = self._extract_published_date(item)
|
|
386
|
+
|
|
387
|
+
# Create SearchResult from Google response
|
|
388
|
+
search_result = SearchResult(
|
|
389
|
+
url=item.get("link", ""),
|
|
390
|
+
title=item.get("title", "Untitled"),
|
|
391
|
+
snippet=item.get("snippet"),
|
|
392
|
+
content=None, # Google CSE doesn't provide full content
|
|
393
|
+
score=None, # Google CSE doesn't provide relevance scores
|
|
394
|
+
published_date=published_date,
|
|
395
|
+
source=item.get("displayLink"),
|
|
396
|
+
metadata={
|
|
397
|
+
"google_cache_id": item.get("cacheId"),
|
|
398
|
+
"mime_type": item.get("mime"),
|
|
399
|
+
"file_format": item.get("fileFormat"),
|
|
400
|
+
},
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
# Convert to ResearchSource
|
|
404
|
+
research_source = search_result.to_research_source(
|
|
405
|
+
source_type=SourceType.WEB,
|
|
406
|
+
sub_query_id=sub_query_id,
|
|
407
|
+
)
|
|
408
|
+
sources.append(research_source)
|
|
409
|
+
|
|
410
|
+
return sources
|
|
411
|
+
|
|
412
|
+
def _extract_published_date(self, item: dict[str, Any]) -> Optional[datetime]:
|
|
413
|
+
"""Extract published date from Google CSE item pagemap.
|
|
414
|
+
|
|
415
|
+
Looks for common metatag fields that contain publication dates:
|
|
416
|
+
- article:published_time
|
|
417
|
+
- datePublished
|
|
418
|
+
- og:published_time
|
|
419
|
+
- article:modified_time (fallback)
|
|
420
|
+
|
|
421
|
+
Args:
|
|
422
|
+
item: Single item from Google CSE response
|
|
423
|
+
|
|
424
|
+
Returns:
|
|
425
|
+
Parsed datetime or None
|
|
426
|
+
"""
|
|
427
|
+
pagemap = item.get("pagemap", {})
|
|
428
|
+
metatags = pagemap.get("metatags", [])
|
|
429
|
+
|
|
430
|
+
if not metatags:
|
|
431
|
+
return None
|
|
432
|
+
|
|
433
|
+
# Metatags is a list, typically with one element
|
|
434
|
+
tags = metatags[0] if metatags else {}
|
|
435
|
+
|
|
436
|
+
# Try various date fields in order of preference
|
|
437
|
+
date_fields = [
|
|
438
|
+
"article:published_time",
|
|
439
|
+
"datepublished",
|
|
440
|
+
"og:published_time",
|
|
441
|
+
"article:modified_time",
|
|
442
|
+
"datemodified",
|
|
443
|
+
]
|
|
444
|
+
|
|
445
|
+
for field in date_fields:
|
|
446
|
+
date_str = tags.get(field)
|
|
447
|
+
if date_str:
|
|
448
|
+
parsed = self._parse_date(date_str)
|
|
449
|
+
if parsed:
|
|
450
|
+
return parsed
|
|
451
|
+
|
|
452
|
+
return None
|
|
453
|
+
|
|
454
|
+
def _parse_date(self, date_str: str) -> Optional[datetime]:
|
|
455
|
+
"""Parse date string from various formats.
|
|
456
|
+
|
|
457
|
+
Args:
|
|
458
|
+
date_str: Date string (ISO format or other common formats)
|
|
459
|
+
|
|
460
|
+
Returns:
|
|
461
|
+
Parsed datetime or None
|
|
462
|
+
"""
|
|
463
|
+
if not date_str:
|
|
464
|
+
return None
|
|
465
|
+
|
|
466
|
+
# Try ISO format first
|
|
467
|
+
try:
|
|
468
|
+
return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
|
|
469
|
+
except ValueError:
|
|
470
|
+
pass
|
|
471
|
+
|
|
472
|
+
# Try common date formats
|
|
473
|
+
formats = [
|
|
474
|
+
"%Y-%m-%d",
|
|
475
|
+
"%Y/%m/%d",
|
|
476
|
+
"%d-%m-%Y",
|
|
477
|
+
"%d/%m/%Y",
|
|
478
|
+
"%B %d, %Y",
|
|
479
|
+
"%b %d, %Y",
|
|
480
|
+
]
|
|
481
|
+
|
|
482
|
+
for fmt in formats:
|
|
483
|
+
try:
|
|
484
|
+
return datetime.strptime(date_str, fmt)
|
|
485
|
+
except ValueError:
|
|
486
|
+
continue
|
|
487
|
+
|
|
488
|
+
return None
|
|
489
|
+
|
|
490
|
+
async def health_check(self) -> bool:
|
|
491
|
+
"""Check if Google Custom Search API is accessible.
|
|
492
|
+
|
|
493
|
+
Performs a lightweight search to verify API key, CSE ID, and connectivity.
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
True if provider is healthy, False otherwise
|
|
497
|
+
"""
|
|
498
|
+
try:
|
|
499
|
+
# Perform minimal search to verify connectivity
|
|
500
|
+
await self.search("test", max_results=1)
|
|
501
|
+
return True
|
|
502
|
+
except AuthenticationError:
|
|
503
|
+
logger.error("Google CSE health check failed: invalid API key or CSE ID")
|
|
504
|
+
return False
|
|
505
|
+
except Exception as e:
|
|
506
|
+
logger.warning(f"Google CSE health check failed: {e}")
|
|
507
|
+
return False
|