aiecs 1.2.1__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +1 -1
- aiecs/config/config.py +2 -1
- aiecs/llm/clients/vertex_client.py +5 -0
- aiecs/main.py +2 -2
- aiecs/scripts/tools_develop/README.md +111 -2
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +80 -21
- aiecs/scripts/tools_develop/verify_tools.py +347 -0
- aiecs/tools/__init__.py +94 -30
- aiecs/tools/apisource/__init__.py +106 -0
- aiecs/tools/apisource/intelligence/__init__.py +20 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +378 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +387 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +384 -0
- aiecs/tools/apisource/monitoring/__init__.py +12 -0
- aiecs/tools/apisource/monitoring/metrics.py +308 -0
- aiecs/tools/apisource/providers/__init__.py +114 -0
- aiecs/tools/apisource/providers/base.py +684 -0
- aiecs/tools/apisource/providers/census.py +412 -0
- aiecs/tools/apisource/providers/fred.py +575 -0
- aiecs/tools/apisource/providers/newsapi.py +402 -0
- aiecs/tools/apisource/providers/worldbank.py +346 -0
- aiecs/tools/apisource/reliability/__init__.py +14 -0
- aiecs/tools/apisource/reliability/error_handler.py +362 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +420 -0
- aiecs/tools/apisource/tool.py +814 -0
- aiecs/tools/apisource/utils/__init__.py +12 -0
- aiecs/tools/apisource/utils/validators.py +343 -0
- aiecs/tools/langchain_adapter.py +95 -17
- aiecs/tools/search_tool/__init__.py +102 -0
- aiecs/tools/search_tool/analyzers.py +583 -0
- aiecs/tools/search_tool/cache.py +280 -0
- aiecs/tools/search_tool/constants.py +127 -0
- aiecs/tools/search_tool/context.py +219 -0
- aiecs/tools/search_tool/core.py +773 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +257 -0
- aiecs/tools/search_tool/metrics.py +375 -0
- aiecs/tools/search_tool/rate_limiter.py +177 -0
- aiecs/tools/search_tool/schemas.py +297 -0
- aiecs/tools/statistics/data_loader_tool.py +2 -2
- aiecs/tools/statistics/data_transformer_tool.py +1 -1
- aiecs/tools/task_tools/__init__.py +8 -8
- aiecs/tools/task_tools/report_tool.py +1 -1
- aiecs/tools/tool_executor/__init__.py +2 -0
- aiecs/tools/tool_executor/tool_executor.py +284 -14
- aiecs/utils/__init__.py +11 -0
- aiecs/utils/cache_provider.py +698 -0
- aiecs/utils/execution_utils.py +5 -5
- {aiecs-1.2.1.dist-info → aiecs-1.3.1.dist-info}/METADATA +1 -1
- {aiecs-1.2.1.dist-info → aiecs-1.3.1.dist-info}/RECORD +55 -23
- aiecs/tools/task_tools/search_tool.py +0 -1123
- {aiecs-1.2.1.dist-info → aiecs-1.3.1.dist-info}/WHEEL +0 -0
- {aiecs-1.2.1.dist-info → aiecs-1.3.1.dist-info}/entry_points.txt +0 -0
- {aiecs-1.2.1.dist-info → aiecs-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.2.1.dist-info → aiecs-1.3.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result Deduplication
|
|
3
|
+
|
|
4
|
+
This module handles detection and removal of duplicate and highly similar
|
|
5
|
+
search results.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
from urllib.parse import urlparse, urlunparse
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ResultDeduplicator:
    """Removes duplicate and similar search results."""

    def deduplicate_results(
        self,
        results: List[Dict[str, Any]],
        similarity_threshold: float = 0.85
    ) -> List[Dict[str, Any]]:
        """
        Remove duplicate and highly similar results.

        Args:
            results: List of search results (dicts with 'link', 'title',
                'snippet' keys; missing keys are treated as empty strings)
            similarity_threshold: Similarity threshold (0-1) for considering
                results as duplicates. NOTE: content similarity is currently
                hash-based (exact match scores 1.0, anything else 0.0), so
                every threshold below 1.0 behaves identically, and a
                threshold >= 1.0 disables content-based deduplication.

        Returns:
            Deduplicated list of results, preserving first-seen order
        """
        if not results:
            return []

        unique_results: List[Dict[str, Any]] = []
        seen_urls = set()
        seen_content_hashes = set()

        for result in results:
            url = result.get('link', '')

            # 1. URL deduplication (normalized: query/fragment stripped,
            # host lowercased, trailing slash removed)
            normalized_url = self._normalize_url(url)
            if normalized_url in seen_urls:
                continue

            # 2. Content similarity deduplication
            content_hash = self._calculate_content_hash(
                result.get('title', ''),
                result.get('snippet', '')
            )

            # _calculate_similarity returns 1.0 only on an exact hash match,
            # so scanning every seen hash reduces to an O(1) set membership
            # test. The threshold guard preserves the original semantics:
            # 1.0 > similarity_threshold must hold for a match to count,
            # i.e. a threshold >= 1.0 never flags a duplicate.
            if similarity_threshold < 1.0 and content_hash in seen_content_hashes:
                continue

            # Add to unique results
            unique_results.append(result)
            seen_urls.add(normalized_url)
            seen_content_hashes.add(content_hash)

        return unique_results

    def _normalize_url(self, url: str) -> str:
        """
        Normalize URL by removing query parameters and fragments.

        Args:
            url: URL to normalize

        Returns:
            Normalized URL (scheme + lowercased host + path without a
            trailing slash); falls back to the lowercased input if parsing
            fails.
        """
        try:
            parsed = urlparse(url)
            # Keep only scheme, netloc, and path
            normalized = urlunparse((
                parsed.scheme,
                parsed.netloc.lower(),
                parsed.path.rstrip('/'),
                '', '', ''  # Remove params, query, fragment
            ))
            return normalized
        except Exception:
            return url.lower()

    def _calculate_content_hash(self, title: str, snippet: str) -> str:
        """
        Calculate content hash for similarity detection.

        Title and snippet are lowercased, stripped of punctuation, and
        whitespace-normalized before hashing, so results that differ only
        in case/punctuation/spacing hash identically.

        Args:
            title: Result title
            snippet: Result snippet

        Returns:
            Content hash string (MD5 hex digest; used as a fingerprint,
            not for security)
        """
        content = f"{title.lower()} {snippet.lower()}"
        # Remove punctuation and normalize whitespace
        content = ''.join(c for c in content if c.isalnum() or c.isspace())
        content = ' '.join(content.split())
        return hashlib.md5(content.encode()).hexdigest()

    def _calculate_similarity(self, hash1: str, hash2: str) -> float:
        """
        Calculate similarity between two content hashes.

        Hashes cannot be compared gradually, so this is binary:
        identical hashes score 1.0, everything else 0.0.

        Args:
            hash1: First content hash
            hash2: Second content hash

        Returns:
            Similarity score (0-1)
        """
        # Exact hash match
        return 1.0 if hash1 == hash2 else 0.0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent-Friendly Error Handling
|
|
3
|
+
|
|
4
|
+
This module formats errors in an agent-friendly way with clear messages,
|
|
5
|
+
suggested actions, and alternative approaches.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Dict, List
|
|
9
|
+
|
|
10
|
+
from .constants import (
|
|
11
|
+
QuotaExceededError,
|
|
12
|
+
AuthenticationError,
|
|
13
|
+
RateLimitError,
|
|
14
|
+
CircuitBreakerOpenError,
|
|
15
|
+
ValidationError,
|
|
16
|
+
SearchAPIError
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AgentFriendlyErrorHandler:
    """Formats errors for agent consumption with actionable suggestions"""

    def format_error_for_agent(
        self,
        error: Exception,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format error for agent-friendly consumption.

        Classifies the error by isinstance checks and lowercase substring
        matches on its message, then delegates to a per-category handler
        that fills in severity, suggestions, and retry guidance.

        Args:
            error: The exception that occurred
            context: Context information (circuit breaker timeout, etc.)

        Returns:
            Structured error information dictionary with keys: error_type,
            severity, user_message, technical_details, suggested_actions,
            alternative_approaches, can_retry, estimated_recovery_time
        """
        # Baseline response; handlers overwrite the relevant fields via update().
        error_response = {
            'error_type': 'unknown',
            'severity': 'medium',
            'user_message': '',
            'technical_details': str(error),
            'suggested_actions': [],
            'alternative_approaches': [],
            'can_retry': False,
            'estimated_recovery_time': None
        }

        error_str = str(error).lower()
        # Exception class name; only used by the generic fallback branch below.
        error_type = type(error).__name__

        # Handle specific error types.
        # NOTE(review): branch order matters — a RateLimitError whose message
        # contains 'rate limit' is caught by the quota branch first; confirm
        # this routing is intentional.
        if isinstance(error, QuotaExceededError) or 'quota' in error_str or 'rate limit' in error_str:
            self._handle_quota_exceeded(error_response)

        elif isinstance(error, AuthenticationError) or 'auth' in error_str or 'credential' in error_str:
            self._handle_authentication_error(error_response)

        elif isinstance(error, RateLimitError):
            self._handle_rate_limit_error(error_response)

        elif isinstance(error, CircuitBreakerOpenError) or 'circuit breaker' in error_str:
            self._handle_circuit_breaker_error(error_response, context)

        elif isinstance(error, ValidationError) or 'invalid' in error_str or 'validation' in error_str:
            self._handle_validation_error(error_response)

        elif 'timeout' in error_str or 'connection' in error_str or 'network' in error_str:
            self._handle_network_error(error_response)

        elif 'no results' in error_str or 'not found' in error_str:
            self._handle_no_results(error_response)

        else:
            # Generic error handling: unknown category, report the raw
            # exception class name and message, allow retry.
            error_response.update({
                'error_type': error_type,
                'severity': 'medium',
                'user_message': f'An unexpected error occurred: {str(error)}',
                'suggested_actions': [
                    'Check your query parameters',
                    'Try simplifying the query',
                    'Retry the operation'
                ],
                'can_retry': True
            })

        return error_response

    def _handle_quota_exceeded(self, response: Dict[str, Any]) -> None:
        """Handle quota exceeded errors (mutates response in place)."""
        response.update({
            'error_type': 'quota_exceeded',
            'severity': 'high',
            'user_message': (
                'Search API quota has been exceeded. '
                'The service has temporarily reached its usage limit.'
            ),
            'suggested_actions': [
                'Wait 60-120 seconds before retrying',
                'Reduce the number of results requested',
                'Use more specific queries to get better results with fewer searches',
                'Check if cached results are available'
            ],
            'alternative_approaches': [
                'Use the scraper tool to extract information from known URLs',
                'Query specific authoritative domains using site: operator',
                'Defer non-urgent searches to later'
            ],
            'can_retry': True,
            'estimated_recovery_time': '1-2 minutes'
        })

    def _handle_authentication_error(self, response: Dict[str, Any]) -> None:
        """Handle authentication errors (not retryable — credentials must be fixed)."""
        response.update({
            'error_type': 'authentication_failed',
            'severity': 'high',
            'user_message': (
                'Search API authentication failed. '
                'The API credentials may be invalid or expired.'
            ),
            'suggested_actions': [
                'Verify that GOOGLE_API_KEY is set correctly in environment',
                'Check that GOOGLE_CSE_ID is valid',
                'Ensure API key has not expired',
                'Verify API key has Custom Search API enabled'
            ],
            'alternative_approaches': [
                'Use alternative data sources (apisource_tool)',
                'Request manual search from user'
            ],
            # Retrying with the same bad credentials cannot succeed.
            'can_retry': False,
            'estimated_recovery_time': None
        })

    def _handle_rate_limit_error(self, response: Dict[str, Any]) -> None:
        """Handle rate limit errors (mutates response in place)."""
        response.update({
            'error_type': 'rate_limit_exceeded',
            'severity': 'medium',
            'user_message': (
                'Rate limit has been exceeded. '
                'Too many requests in a short time period.'
            ),
            'suggested_actions': [
                'Wait for the suggested time before retrying',
                'Reduce request frequency',
                'Use cached results when available',
                'Batch similar queries together'
            ],
            'alternative_approaches': [
                'Use cached or historical data',
                'Prioritize critical searches'
            ],
            'can_retry': True,
            'estimated_recovery_time': 'As indicated in error message'
        })

    def _handle_circuit_breaker_error(
        self,
        response: Dict[str, Any],
        context: Dict[str, Any]
    ) -> None:
        """Handle circuit breaker open errors.

        Reads 'circuit_breaker_timeout' from context (default 60 seconds)
        to tell the agent how long to wait before the circuit resets.
        """
        timeout = context.get('circuit_breaker_timeout', 60)

        response.update({
            'error_type': 'circuit_breaker_open',
            'severity': 'high',
            'user_message': (
                'Search service is temporarily unavailable due to repeated failures. '
                'The circuit breaker has been triggered for protection.'
            ),
            'suggested_actions': [
                f'Wait {timeout} seconds for circuit to reset',
                'Check search service status',
                'Review recent error logs'
            ],
            'alternative_approaches': [
                'Use alternative data sources',
                'Defer search to later',
                'Use cached or historical data'
            ],
            'can_retry': True,
            'estimated_recovery_time': f'{timeout} seconds'
        })

    def _handle_validation_error(self, response: Dict[str, Any]) -> None:
        """Handle validation errors (retryable once the query is fixed)."""
        response.update({
            'error_type': 'invalid_query',
            'severity': 'low',
            'user_message': (
                'The search query or parameters are invalid. '
                'Please check the query format.'
            ),
            'suggested_actions': [
                'Simplify the query - remove special characters',
                'Check that all parameters are within valid ranges',
                'Ensure query is not empty',
                'Review query syntax for search operators'
            ],
            'alternative_approaches': [
                'Break complex query into simpler parts',
                'Use basic search without advanced operators'
            ],
            'can_retry': True,
            'estimated_recovery_time': 'immediate (after fixing query)'
        })

    def _handle_network_error(self, response: Dict[str, Any]) -> None:
        """Handle network-related errors (timeouts, connection failures)."""
        response.update({
            'error_type': 'network_error',
            'severity': 'medium',
            'user_message': (
                'Network connection to search API failed. '
                'This is usually a temporary issue.'
            ),
            'suggested_actions': [
                'Retry the search in 5-10 seconds',
                'Check internet connectivity',
                'Try with a shorter timeout if query is complex'
            ],
            'alternative_approaches': [
                'Use cached results if available',
                'Try alternative search parameters'
            ],
            'can_retry': True,
            'estimated_recovery_time': '10-30 seconds'
        })

    def _handle_no_results(self, response: Dict[str, Any]) -> None:
        """Handle no results found (low severity; suggest broadening the query)."""
        response.update({
            'error_type': 'no_results',
            'severity': 'low',
            'user_message': (
                'No search results found for the query. '
                'Try broadening your search terms.'
            ),
            'suggested_actions': [
                'Remove overly specific terms',
                'Try synonyms or related terms',
                'Remove date restrictions',
                'Broaden the search scope'
            ],
            'alternative_approaches': [
                'Search for related topics',
                'Try different search engines or sources',
                'Break down into sub-queries'
            ],
            'can_retry': True,
            'estimated_recovery_time': 'immediate (with modified query)'
        })