aiecs 1.2.2__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic. Click here for more details.

Files changed (55)
  1. aiecs/__init__.py +1 -1
  2. aiecs/llm/clients/vertex_client.py +22 -2
  3. aiecs/main.py +2 -2
  4. aiecs/scripts/tools_develop/README.md +111 -2
  5. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  6. aiecs/scripts/tools_develop/validate_tool_schemas.py +80 -21
  7. aiecs/scripts/tools_develop/verify_tools.py +347 -0
  8. aiecs/tools/__init__.py +94 -30
  9. aiecs/tools/apisource/__init__.py +106 -0
  10. aiecs/tools/apisource/intelligence/__init__.py +20 -0
  11. aiecs/tools/apisource/intelligence/data_fusion.py +378 -0
  12. aiecs/tools/apisource/intelligence/query_analyzer.py +387 -0
  13. aiecs/tools/apisource/intelligence/search_enhancer.py +384 -0
  14. aiecs/tools/apisource/monitoring/__init__.py +12 -0
  15. aiecs/tools/apisource/monitoring/metrics.py +308 -0
  16. aiecs/tools/apisource/providers/__init__.py +114 -0
  17. aiecs/tools/apisource/providers/base.py +684 -0
  18. aiecs/tools/apisource/providers/census.py +412 -0
  19. aiecs/tools/apisource/providers/fred.py +575 -0
  20. aiecs/tools/apisource/providers/newsapi.py +402 -0
  21. aiecs/tools/apisource/providers/worldbank.py +346 -0
  22. aiecs/tools/apisource/reliability/__init__.py +14 -0
  23. aiecs/tools/apisource/reliability/error_handler.py +362 -0
  24. aiecs/tools/apisource/reliability/fallback_strategy.py +420 -0
  25. aiecs/tools/apisource/tool.py +814 -0
  26. aiecs/tools/apisource/utils/__init__.py +12 -0
  27. aiecs/tools/apisource/utils/validators.py +343 -0
  28. aiecs/tools/langchain_adapter.py +95 -17
  29. aiecs/tools/search_tool/__init__.py +102 -0
  30. aiecs/tools/search_tool/analyzers.py +583 -0
  31. aiecs/tools/search_tool/cache.py +280 -0
  32. aiecs/tools/search_tool/constants.py +127 -0
  33. aiecs/tools/search_tool/context.py +219 -0
  34. aiecs/tools/search_tool/core.py +773 -0
  35. aiecs/tools/search_tool/deduplicator.py +123 -0
  36. aiecs/tools/search_tool/error_handler.py +257 -0
  37. aiecs/tools/search_tool/metrics.py +375 -0
  38. aiecs/tools/search_tool/rate_limiter.py +177 -0
  39. aiecs/tools/search_tool/schemas.py +297 -0
  40. aiecs/tools/statistics/data_loader_tool.py +2 -2
  41. aiecs/tools/statistics/data_transformer_tool.py +1 -1
  42. aiecs/tools/task_tools/__init__.py +8 -8
  43. aiecs/tools/task_tools/report_tool.py +1 -1
  44. aiecs/tools/tool_executor/__init__.py +2 -0
  45. aiecs/tools/tool_executor/tool_executor.py +284 -14
  46. aiecs/utils/__init__.py +11 -0
  47. aiecs/utils/cache_provider.py +698 -0
  48. aiecs/utils/execution_utils.py +5 -5
  49. {aiecs-1.2.2.dist-info → aiecs-1.3.3.dist-info}/METADATA +1 -1
  50. {aiecs-1.2.2.dist-info → aiecs-1.3.3.dist-info}/RECORD +54 -22
  51. aiecs/tools/task_tools/search_tool.py +0 -1123
  52. {aiecs-1.2.2.dist-info → aiecs-1.3.3.dist-info}/WHEEL +0 -0
  53. {aiecs-1.2.2.dist-info → aiecs-1.3.3.dist-info}/entry_points.txt +0 -0
  54. {aiecs-1.2.2.dist-info → aiecs-1.3.3.dist-info}/licenses/LICENSE +0 -0
  55. {aiecs-1.2.2.dist-info → aiecs-1.3.3.dist-info}/top_level.txt +0 -0
"""
Data Fusion Engine for Cross-Provider Results

Intelligently merges results from multiple API providers:
- Detect and handle duplicate data
- Resolve conflicts based on quality scores
- Support multiple fusion strategies
- Preserve provenance information
"""

import logging
from typing import Any, Dict, List, Optional, Tuple

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
class DataFusionEngine:
    """
    Fuses data from multiple providers intelligently.

    Handles duplicate detection, conflict resolution, and data quality
    optimization when combining results from different sources.

    A provider result is a dict with at least a 'data' key; 'provider',
    'operation', and 'metadata' (which may carry {'quality': {'score': float}}
    and a 'timestamp') are read when present — TODO confirm against the
    provider base class.
    """

    # Fusion strategies
    STRATEGY_BEST_QUALITY = 'best_quality'
    STRATEGY_MERGE_ALL = 'merge_all'
    STRATEGY_CONSENSUS = 'consensus'
    STRATEGY_FIRST_SUCCESS = 'first_success'

    # Fields that commonly identify a record; used when callers do not
    # supply explicit key fields for duplicate detection.
    DEFAULT_KEY_FIELDS = [
        'id', 'series_id', 'indicator_code', 'indicator_id',
        'title', 'name', 'code'
    ]

    def __init__(self):
        """Initialize data fusion engine (stateless; nothing to configure)."""

    def fuse_multi_provider_results(
        self,
        results: List[Dict[str, Any]],
        fusion_strategy: str = STRATEGY_BEST_QUALITY
    ) -> Optional[Dict[str, Any]]:
        """
        Fuse results from multiple providers.

        Args:
            results: List of results from different providers
            fusion_strategy: Strategy to use for fusion:
                - 'best_quality': Select result with highest quality score
                - 'merge_all': Merge all results, preserving sources
                - 'consensus': Use data points agreed upon by multiple sources
                - 'first_success': Use first successful result

        Returns:
            Fused result dictionary, or None if no result carries data
        """
        if not results:
            return None

        # Discard failed results: anything without a payload at all.
        valid_results = [r for r in results if r.get('data') is not None]
        if not valid_results:
            return None

        if fusion_strategy == self.STRATEGY_MERGE_ALL:
            return self._fuse_merge_all(valid_results)
        if fusion_strategy == self.STRATEGY_CONSENSUS:
            return self._fuse_consensus(valid_results)
        if fusion_strategy == self.STRATEGY_FIRST_SUCCESS:
            return valid_results[0]
        if fusion_strategy != self.STRATEGY_BEST_QUALITY:
            # Unknown strategy: warn and fall back to the safest default.
            logger.warning(f"Unknown fusion strategy: {fusion_strategy}, using best_quality")
        return self._fuse_best_quality(valid_results)

    @staticmethod
    def _quality_score(result: Dict[str, Any]) -> float:
        """Extract the quality score from a result (0.5 when absent)."""
        metadata = result.get('metadata') or {}
        quality = metadata.get('quality') or {}
        return quality.get('score', 0.5)

    def _fuse_best_quality(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Select the result with the highest quality score.

        The winner is returned as a shallow copy with fusion metadata
        attached; the caller's input dicts are never mutated, and a result
        without a 'metadata' key no longer raises KeyError.

        Args:
            results: List of valid results

        Returns:
            Best-quality result, annotated with 'fusion_info' metadata
        """
        best = max(results, key=self._quality_score)

        # Copy before annotating so the caller's input stays untouched.
        best_result = dict(best)
        metadata = dict(best_result.get('metadata') or {})
        metadata['fusion_info'] = {
            'strategy': self.STRATEGY_BEST_QUALITY,
            'total_providers_queried': len(results),
            'selected_provider': best_result.get('provider'),
            'quality_score': self._quality_score(best_result),
            'alternative_providers': [
                r.get('provider') for r in results
                if r.get('provider') != best_result.get('provider')
            ]
        }
        best_result['metadata'] = metadata
        return best_result

    def _fuse_merge_all(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Merge all results into one, tagging every record with its source.

        Args:
            results: List of valid results

        Returns:
            Merged result with all data and per-source provenance
        """
        merged: Dict[str, Any] = {
            'operation': 'multi_provider_search',
            'data': [],
            'metadata': {
                'fusion_info': {
                    'strategy': self.STRATEGY_MERGE_ALL,
                    'total_providers': len(results),
                    'sources': []
                }
            }
        }

        for result in results:
            provider = result.get('provider', 'unknown')
            data = result.get('data', [])
            metadata = result.get('metadata', {})
            quality = metadata.get('quality', {})

            # Handle the different payload shapes providers return.
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict):
                        # Tag each record with where it came from.
                        enriched = dict(item)
                        enriched['_source_provider'] = provider
                        enriched['_source_quality'] = quality
                        enriched['_source_timestamp'] = metadata.get('timestamp')
                        merged['data'].append(enriched)
                    else:
                        # Non-dict items are wrapped so they can carry tags.
                        merged['data'].append({
                            'value': item,
                            '_source_provider': provider,
                            '_source_quality': quality
                        })
            elif isinstance(data, dict):
                enriched = dict(data)
                enriched['_source_provider'] = provider
                enriched['_source_quality'] = quality
                merged['data'].append(enriched)
            else:
                # Scalar payloads (str/int/...) were silently dropped before
                # while still being counted in 'sources'; wrap them like
                # non-dict list items so nothing is lost.
                merged['data'].append({
                    'value': data,
                    '_source_provider': provider,
                    '_source_quality': quality
                })

            # Record per-provider provenance alongside the merged data.
            merged['metadata']['fusion_info']['sources'].append({
                'provider': provider,
                'operation': result.get('operation'),
                'record_count': len(data) if isinstance(data, list) else 1,
                'quality': quality
            })

        return merged

    def _fuse_consensus(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Use consensus-based fusion (data agreed upon by multiple sources).

        TODO: Implement real cross-source agreement. For now this picks the
        best-quality result and records that fact in the fusion metadata.

        Args:
            results: List of valid results

        Returns:
            Consensus result (currently the best-quality baseline)
        """
        consensus = self._fuse_best_quality(results)

        # _fuse_best_quality guarantees 'fusion_info' exists on its copy.
        info = consensus['metadata']['fusion_info']
        info['strategy'] = self.STRATEGY_CONSENSUS
        info['note'] = (
            'Consensus strategy currently uses best quality baseline'
        )
        return consensus

    def detect_duplicate_data(
        self,
        data1: Dict[str, Any],
        data2: Dict[str, Any],
        key_fields: Optional[List[str]] = None
    ) -> Tuple[bool, float]:
        """
        Detect if two data items are duplicates.

        Args:
            data1: First data item
            data2: Second data item
            key_fields: Fields to compare (DEFAULT_KEY_FIELDS if None)

        Returns:
            Tuple of (is_duplicate, similarity_score); duplicates are pairs
            whose shared key fields agree on more than 80% of values
        """
        if key_fields is None:
            key_fields = self.DEFAULT_KEY_FIELDS

        matches = 0
        total_fields = 0
        for field in key_fields:
            if field in data1 and field in data2:
                total_fields += 1
                if data1[field] == data2[field]:
                    matches += 1

        if total_fields == 0:
            # No shared key fields: fall back to fuzzy text comparison.
            return self._check_text_similarity(data1, data2)

        similarity = matches / total_fields
        return similarity > 0.8, similarity

    def _check_text_similarity(
        self,
        data1: Dict[str, Any],
        data2: Dict[str, Any]
    ) -> Tuple[bool, float]:
        """
        Check text similarity for title/name/description fields.

        Uses word-set Jaccard similarity; the first field pair exceeding
        0.7 marks the items as duplicates.

        Args:
            data1: First data item
            data2: Second data item

        Returns:
            Tuple of (is_duplicate, similarity_score)
        """
        for field in ('title', 'name', 'description'):
            if field not in data1 or field not in data2:
                continue

            words1 = set(str(data1[field]).lower().split())
            words2 = set(str(data2[field]).lower().split())
            if not words1 or not words2:
                continue

            union = len(words1 | words2)
            similarity = len(words1 & words2) / union if union > 0 else 0.0
            if similarity > 0.7:
                return True, similarity

        return False, 0.0

    def resolve_conflict(
        self,
        values: List[Dict[str, Any]],
        resolution_strategy: str = 'quality'
    ) -> Any:
        """
        Resolve conflicts when multiple sources provide different values.

        Args:
            values: List of dicts shaped like
                {'value': ..., 'quality': {'score': float}, 'source': ...}
            resolution_strategy: 'quality' (highest-score source wins),
                'majority' (most common value), or 'average' (mean of numeric
                values; falls back to 'quality' for non-numeric values)

        Returns:
            Resolved value, or None when `values` is empty
        """
        if not values:
            return None
        if len(values) == 1:
            return values[0].get('value')

        if resolution_strategy == 'quality':
            best = max(values, key=lambda v: v.get('quality', {}).get('score', 0))
            return best.get('value')

        if resolution_strategy == 'majority':
            from collections import Counter
            counts = Counter(str(v.get('value')) for v in values)
            winner = counts.most_common(1)[0][0]
            # Return the value with its original type, not its str() form.
            for v in values:
                if str(v.get('value')) == winner:
                    return v.get('value')

        elif resolution_strategy == 'average':
            try:
                numeric_values = [
                    float(v.get('value'))
                    for v in values
                    if v.get('value') is not None
                ]
                if numeric_values:
                    return sum(numeric_values) / len(numeric_values)
            except (ValueError, TypeError):
                # Non-numeric values cannot be averaged: defer to quality.
                return self.resolve_conflict(values, 'quality')

        # Unknown strategy (or nothing to average): first value wins.
        return values[0].get('value')

    def deduplicate_results(
        self,
        data_list: List[Dict[str, Any]],
        key_fields: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Remove duplicate entries from a data list.

        Args:
            data_list: List of data items
            key_fields: Fields to use for duplicate detection (a default
                id/title-style field set when None)

        Returns:
            Deduplicated list; the first occurrence of each signature is kept,
            and items with no identifying fields are always kept
        """
        if not data_list:
            return []

        signature_fields = key_fields or [
            'id', 'series_id', 'indicator_code', 'title', 'name'
        ]

        unique_data = []
        seen_signatures = set()
        for item in data_list:
            # Signatures pair (field, value) so that e.g. {'id': 7} and
            # {'series_id': 7} are not mistaken for the same record — the
            # value-only signatures used previously collided across fields.
            signature = tuple(
                (field, item.get(field))
                for field in signature_fields
                if field in item
            )
            if not signature:
                # No identifiable signature: include the item.
                unique_data.append(item)
            elif signature not in seen_signatures:
                seen_signatures.add(signature)
                unique_data.append(item)

        return unique_data