local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,3 @@
|
|
1
|
-
import json
|
2
1
|
import logging
|
3
2
|
from typing import Any, Dict, List, Optional
|
4
3
|
|
@@ -48,40 +47,48 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
48
47
|
"""
|
49
48
|
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
50
49
|
super().__init__(
|
51
|
-
llm=llm,
|
50
|
+
llm=llm,
|
51
|
+
max_filtered_results=max_filtered_results,
|
52
|
+
max_results=max_results,
|
52
53
|
)
|
53
|
-
|
54
|
+
|
54
55
|
self.index_name = index_name
|
55
56
|
self.highlight_fields = highlight_fields
|
56
57
|
self.search_fields = search_fields
|
57
58
|
self.filter_query = filter_query or {}
|
58
|
-
|
59
|
+
|
59
60
|
# Initialize the Elasticsearch client
|
60
61
|
es_args = {}
|
61
|
-
|
62
|
+
|
62
63
|
# Basic authentication
|
63
64
|
if username and password:
|
64
65
|
es_args["basic_auth"] = (username, password)
|
65
|
-
|
66
|
+
|
66
67
|
# API key authentication
|
67
68
|
if api_key:
|
68
69
|
es_args["api_key"] = api_key
|
69
|
-
|
70
|
+
|
70
71
|
# Cloud ID for Elastic Cloud
|
71
72
|
if cloud_id:
|
72
73
|
es_args["cloud_id"] = cloud_id
|
73
|
-
|
74
|
+
|
74
75
|
# Connect to Elasticsearch
|
75
76
|
self.client = Elasticsearch(hosts, **es_args)
|
76
|
-
|
77
|
+
|
77
78
|
# Verify connection
|
78
79
|
try:
|
79
80
|
info = self.client.info()
|
80
|
-
logger.info(
|
81
|
-
|
81
|
+
logger.info(
|
82
|
+
f"Connected to Elasticsearch cluster: {info.get('cluster_name')}"
|
83
|
+
)
|
84
|
+
logger.info(
|
85
|
+
f"Elasticsearch version: {info.get('version', {}).get('number')}"
|
86
|
+
)
|
82
87
|
except Exception as e:
|
83
88
|
logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
|
84
|
-
raise ConnectionError(
|
89
|
+
raise ConnectionError(
|
90
|
+
f"Could not connect to Elasticsearch: {str(e)}"
|
91
|
+
)
|
85
92
|
|
86
93
|
def _get_previews(self, query: str) -> List[Dict[str, Any]]:
|
87
94
|
"""
|
@@ -93,7 +100,9 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
93
100
|
Returns:
|
94
101
|
List of preview dictionaries
|
95
102
|
"""
|
96
|
-
logger.info(
|
103
|
+
logger.info(
|
104
|
+
f"Getting document previews from Elasticsearch with query: {query}"
|
105
|
+
)
|
97
106
|
|
98
107
|
try:
|
99
108
|
# Build the search query
|
@@ -113,31 +122,31 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
113
122
|
},
|
114
123
|
"size": self.max_results,
|
115
124
|
}
|
116
|
-
|
125
|
+
|
117
126
|
# Add filter if provided
|
118
127
|
if self.filter_query:
|
119
128
|
search_query["query"] = {
|
120
129
|
"bool": {
|
121
130
|
"must": search_query["query"],
|
122
|
-
"filter": self.filter_query
|
131
|
+
"filter": self.filter_query,
|
123
132
|
}
|
124
133
|
}
|
125
|
-
|
134
|
+
|
126
135
|
# Execute the search
|
127
136
|
response = self.client.search(
|
128
137
|
index=self.index_name,
|
129
138
|
body=search_query,
|
130
139
|
)
|
131
|
-
|
140
|
+
|
132
141
|
# Process the search results
|
133
142
|
hits = response.get("hits", {}).get("hits", [])
|
134
|
-
|
143
|
+
|
135
144
|
# Format results as previews with basic information
|
136
145
|
previews = []
|
137
146
|
for hit in hits:
|
138
147
|
source = hit.get("_source", {})
|
139
148
|
highlight = hit.get("highlight", {})
|
140
|
-
|
149
|
+
|
141
150
|
# Extract highlighted snippets or fall back to original content
|
142
151
|
snippet = ""
|
143
152
|
for field in self.highlight_fields:
|
@@ -145,25 +154,30 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
145
154
|
# Join all highlights for this field
|
146
155
|
field_snippets = " ... ".join(highlight[field])
|
147
156
|
snippet += field_snippets + " "
|
148
|
-
|
157
|
+
|
149
158
|
# If no highlights, use a portion of the content
|
150
159
|
if not snippet and "content" in source:
|
151
160
|
content = source.get("content", "")
|
152
|
-
snippet =
|
153
|
-
|
161
|
+
snippet = (
|
162
|
+
content[:250] + "..." if len(content) > 250 else content
|
163
|
+
)
|
164
|
+
|
154
165
|
# Create preview object
|
155
166
|
preview = {
|
156
167
|
"id": hit.get("_id", ""),
|
157
168
|
"title": source.get("title", "Untitled Document"),
|
158
|
-
"link": source.get("url", "")
|
169
|
+
"link": source.get("url", "")
|
170
|
+
or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
|
159
171
|
"snippet": snippet.strip(),
|
160
172
|
"score": hit.get("_score", 0),
|
161
173
|
"_index": hit.get("_index", self.index_name),
|
162
174
|
}
|
163
|
-
|
175
|
+
|
164
176
|
previews.append(preview)
|
165
|
-
|
166
|
-
logger.info(
|
177
|
+
|
178
|
+
logger.info(
|
179
|
+
f"Found {len(previews)} preview results from Elasticsearch"
|
180
|
+
)
|
167
181
|
return previews
|
168
182
|
|
169
183
|
except Exception as e:
|
@@ -196,7 +210,7 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
196
210
|
for item in relevant_items:
|
197
211
|
# Start with the preview data
|
198
212
|
result = item.copy()
|
199
|
-
|
213
|
+
|
200
214
|
# Get the document ID
|
201
215
|
doc_id = item.get("id")
|
202
216
|
if not doc_id:
|
@@ -204,30 +218,34 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
204
218
|
logger.warning(f"Skipping item without ID: {item}")
|
205
219
|
results.append(result)
|
206
220
|
continue
|
207
|
-
|
221
|
+
|
208
222
|
try:
|
209
223
|
# Fetch the full document
|
210
224
|
doc_response = self.client.get(
|
211
225
|
index=self.index_name,
|
212
226
|
id=doc_id,
|
213
227
|
)
|
214
|
-
|
228
|
+
|
215
229
|
# Get the source document
|
216
230
|
source = doc_response.get("_source", {})
|
217
|
-
|
231
|
+
|
218
232
|
# Add full content to the result
|
219
|
-
result["content"] = source.get(
|
233
|
+
result["content"] = source.get(
|
234
|
+
"content", result.get("snippet", "")
|
235
|
+
)
|
220
236
|
result["full_content"] = source.get("content", "")
|
221
|
-
|
237
|
+
|
222
238
|
# Add metadata from source
|
223
239
|
for key, value in source.items():
|
224
240
|
if key not in result and key not in ["content"]:
|
225
241
|
result[key] = value
|
226
|
-
|
242
|
+
|
227
243
|
except Exception as e:
|
228
|
-
logger.error(
|
244
|
+
logger.error(
|
245
|
+
f"Error fetching full content for document {doc_id}: {str(e)}"
|
246
|
+
)
|
229
247
|
# Keep the preview data if we can't get the full content
|
230
|
-
|
248
|
+
|
231
249
|
results.append(result)
|
232
250
|
|
233
251
|
return results
|
@@ -235,10 +253,10 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
235
253
|
def search_by_query_string(self, query_string: str) -> List[Dict[str, Any]]:
|
236
254
|
"""
|
237
255
|
Perform a search using Elasticsearch Query String syntax.
|
238
|
-
|
256
|
+
|
239
257
|
Args:
|
240
258
|
query_string: The query in Elasticsearch Query String syntax
|
241
|
-
|
259
|
+
|
242
260
|
Returns:
|
243
261
|
List of search results
|
244
262
|
"""
|
@@ -258,28 +276,28 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
258
276
|
},
|
259
277
|
"size": self.max_results,
|
260
278
|
}
|
261
|
-
|
279
|
+
|
262
280
|
# Execute the search
|
263
281
|
response = self.client.search(
|
264
282
|
index=self.index_name,
|
265
283
|
body=search_query,
|
266
284
|
)
|
267
|
-
|
285
|
+
|
268
286
|
# Process and return the results
|
269
287
|
previews = self._process_es_response(response)
|
270
288
|
return self._get_full_content(previews)
|
271
|
-
|
289
|
+
|
272
290
|
except Exception as e:
|
273
291
|
logger.error(f"Error in query_string search: {str(e)}")
|
274
292
|
return []
|
275
|
-
|
293
|
+
|
276
294
|
def search_by_dsl(self, query_dsl: Dict[str, Any]) -> List[Dict[str, Any]]:
|
277
295
|
"""
|
278
296
|
Perform a search using Elasticsearch DSL (Query Domain Specific Language).
|
279
|
-
|
297
|
+
|
280
298
|
Args:
|
281
299
|
query_dsl: The query in Elasticsearch DSL format
|
282
|
-
|
300
|
+
|
283
301
|
Returns:
|
284
302
|
List of search results
|
285
303
|
"""
|
@@ -289,55 +307,60 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
|
|
289
307
|
index=self.index_name,
|
290
308
|
body=query_dsl,
|
291
309
|
)
|
292
|
-
|
310
|
+
|
293
311
|
# Process and return the results
|
294
312
|
previews = self._process_es_response(response)
|
295
313
|
return self._get_full_content(previews)
|
296
|
-
|
314
|
+
|
297
315
|
except Exception as e:
|
298
316
|
logger.error(f"Error in DSL search: {str(e)}")
|
299
317
|
return []
|
300
|
-
|
301
|
-
def _process_es_response(
|
318
|
+
|
319
|
+
def _process_es_response(
|
320
|
+
self, response: Dict[str, Any]
|
321
|
+
) -> List[Dict[str, Any]]:
|
302
322
|
"""
|
303
323
|
Process Elasticsearch response into preview dictionaries.
|
304
|
-
|
324
|
+
|
305
325
|
Args:
|
306
326
|
response: Elasticsearch response dictionary
|
307
|
-
|
327
|
+
|
308
328
|
Returns:
|
309
329
|
List of preview dictionaries
|
310
330
|
"""
|
311
331
|
hits = response.get("hits", {}).get("hits", [])
|
312
|
-
|
332
|
+
|
313
333
|
# Format results as previews
|
314
334
|
previews = []
|
315
335
|
for hit in hits:
|
316
336
|
source = hit.get("_source", {})
|
317
337
|
highlight = hit.get("highlight", {})
|
318
|
-
|
338
|
+
|
319
339
|
# Extract highlighted snippets or fall back to original content
|
320
340
|
snippet = ""
|
321
341
|
for field in self.highlight_fields:
|
322
342
|
if field in highlight and highlight[field]:
|
323
343
|
field_snippets = " ... ".join(highlight[field])
|
324
344
|
snippet += field_snippets + " "
|
325
|
-
|
345
|
+
|
326
346
|
# If no highlights, use a portion of the content
|
327
347
|
if not snippet and "content" in source:
|
328
348
|
content = source.get("content", "")
|
329
|
-
snippet =
|
330
|
-
|
349
|
+
snippet = (
|
350
|
+
content[:250] + "..." if len(content) > 250 else content
|
351
|
+
)
|
352
|
+
|
331
353
|
# Create preview object
|
332
354
|
preview = {
|
333
355
|
"id": hit.get("_id", ""),
|
334
356
|
"title": source.get("title", "Untitled Document"),
|
335
|
-
"link": source.get("url", "")
|
357
|
+
"link": source.get("url", "")
|
358
|
+
or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
|
336
359
|
"snippet": snippet.strip(),
|
337
360
|
"score": hit.get("_score", 0),
|
338
361
|
"_index": hit.get("_index", self.index_name),
|
339
362
|
}
|
340
|
-
|
363
|
+
|
341
364
|
previews.append(preview)
|
342
|
-
|
343
|
-
return previews
|
365
|
+
|
366
|
+
return previews
|
@@ -46,7 +46,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
46
46
|
"""
|
47
47
|
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
48
48
|
super().__init__(
|
49
|
-
llm=llm,
|
49
|
+
llm=llm,
|
50
|
+
max_filtered_results=max_filtered_results,
|
51
|
+
max_results=max_results,
|
50
52
|
)
|
51
53
|
self.api_key = api_key or os.getenv("GITHUB_API_KEY")
|
52
54
|
self.search_type = search_type
|
@@ -224,7 +226,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
224
226
|
|
225
227
|
# If no results, try to provide more guidance
|
226
228
|
if not results:
|
227
|
-
logger.warning(
|
229
|
+
logger.warning(
|
230
|
+
"No results found. Consider these search tips:"
|
231
|
+
)
|
228
232
|
logger.warning("1. Use shorter, more specific queries")
|
229
233
|
logger.warning(
|
230
234
|
"2. For repositories, try adding 'stars:>100' or 'language:python'"
|
@@ -255,7 +259,8 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
255
259
|
try:
|
256
260
|
# Get README
|
257
261
|
response = requests.get(
|
258
|
-
f"{self.api_base}/repos/{repo_full_name}/readme",
|
262
|
+
f"{self.api_base}/repos/{repo_full_name}/readme",
|
263
|
+
headers=self.headers,
|
259
264
|
)
|
260
265
|
|
261
266
|
# Check for rate limiting
|
@@ -267,7 +272,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
267
272
|
encoding = data.get("encoding", "")
|
268
273
|
|
269
274
|
if encoding == "base64" and content:
|
270
|
-
return base64.b64decode(content).decode(
|
275
|
+
return base64.b64decode(content).decode(
|
276
|
+
"utf-8", errors="replace"
|
277
|
+
)
|
271
278
|
return content
|
272
279
|
else:
|
273
280
|
logger.warning(
|
@@ -312,7 +319,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
312
319
|
|
313
320
|
if response.status_code == 200:
|
314
321
|
issues = response.json()
|
315
|
-
logger.info(
|
322
|
+
logger.info(
|
323
|
+
f"Got {len(issues)} recent issues for {repo_full_name}"
|
324
|
+
)
|
316
325
|
else:
|
317
326
|
logger.warning(
|
318
327
|
f"Could not get issues for {repo_full_name}: {response.status_code}"
|
@@ -346,17 +355,23 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
346
355
|
encoding = data.get("encoding", "")
|
347
356
|
|
348
357
|
if encoding == "base64" and content:
|
349
|
-
return base64.b64decode(content).decode(
|
358
|
+
return base64.b64decode(content).decode(
|
359
|
+
"utf-8", errors="replace"
|
360
|
+
)
|
350
361
|
return content
|
351
362
|
else:
|
352
|
-
logger.warning(
|
363
|
+
logger.warning(
|
364
|
+
f"Could not get file content: {response.status_code}"
|
365
|
+
)
|
353
366
|
return ""
|
354
367
|
|
355
368
|
except Exception as e:
|
356
369
|
logger.error(f"Error getting file content: {e}")
|
357
370
|
return ""
|
358
371
|
|
359
|
-
def _format_repository_preview(
|
372
|
+
def _format_repository_preview(
|
373
|
+
self, repo: Dict[str, Any]
|
374
|
+
) -> Dict[str, Any]:
|
360
375
|
"""Format repository search result as preview"""
|
361
376
|
return {
|
362
377
|
"id": str(repo.get("id", "")),
|
@@ -393,7 +408,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
393
408
|
def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]:
|
394
409
|
"""Format issue search result as preview"""
|
395
410
|
repo = (
|
396
|
-
issue.get("repository", {})
|
411
|
+
issue.get("repository", {})
|
412
|
+
if "repository" in issue
|
413
|
+
else {"full_name": ""}
|
397
414
|
)
|
398
415
|
return {
|
399
416
|
"id": f"issue_{issue.get('number', '')}",
|
@@ -503,7 +520,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
503
520
|
keywords[:5]
|
504
521
|
) # Add up to 5 keywords
|
505
522
|
|
506
|
-
logger.info(
|
523
|
+
logger.info(
|
524
|
+
f"Using specialized contribution query: {specialized_query}"
|
525
|
+
)
|
507
526
|
|
508
527
|
# Perform GitHub search with specialized query
|
509
528
|
results = self._search_github(specialized_query)
|
@@ -560,7 +579,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
560
579
|
logger.info("Snippet-only mode, skipping full content retrieval")
|
561
580
|
return relevant_items
|
562
581
|
|
563
|
-
logger.info(
|
582
|
+
logger.info(
|
583
|
+
f"Getting full content for {len(relevant_items)} GitHub results"
|
584
|
+
)
|
564
585
|
|
565
586
|
results = []
|
566
587
|
for item in relevant_items:
|
@@ -610,7 +631,10 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
610
631
|
f"Public repositories: {item.get('public_repos', 0)}\n"
|
611
632
|
)
|
612
633
|
|
613
|
-
if
|
634
|
+
if (
|
635
|
+
item.get("snippet")
|
636
|
+
and item.get("snippet") != "No bio provided"
|
637
|
+
):
|
614
638
|
profile_summary += f"\nBio: {item.get('snippet')}\n"
|
615
639
|
|
616
640
|
result["full_content"] = profile_summary
|
@@ -620,7 +644,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
620
644
|
|
621
645
|
return results
|
622
646
|
|
623
|
-
def search_repository(
|
647
|
+
def search_repository(
|
648
|
+
self, repo_owner: str, repo_name: str
|
649
|
+
) -> Dict[str, Any]:
|
624
650
|
"""
|
625
651
|
Get detailed information about a specific repository.
|
626
652
|
|
@@ -672,7 +698,10 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
672
698
|
return {}
|
673
699
|
|
674
700
|
def search_code(
|
675
|
-
self,
|
701
|
+
self,
|
702
|
+
query: str,
|
703
|
+
language: Optional[str] = None,
|
704
|
+
user: Optional[str] = None,
|
676
705
|
) -> List[Dict[str, Any]]:
|
677
706
|
"""
|
678
707
|
Search for code with more specific parameters.
|
@@ -769,7 +798,9 @@ class GitHubSearchEngine(BaseSearchEngine):
|
|
769
798
|
results = data.get("items", [])
|
770
799
|
|
771
800
|
# Format results
|
772
|
-
previews = [
|
801
|
+
previews = [
|
802
|
+
self._format_issue_preview(result) for result in results
|
803
|
+
]
|
773
804
|
|
774
805
|
# For issues, we don't need to get full content
|
775
806
|
return previews
|
@@ -51,7 +51,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
|
|
51
51
|
"""
|
52
52
|
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
53
53
|
super().__init__(
|
54
|
-
llm=llm,
|
54
|
+
llm=llm,
|
55
|
+
max_filtered_results=max_filtered_results,
|
56
|
+
max_results=max_results,
|
55
57
|
)
|
56
58
|
self.include_full_content = include_full_content
|
57
59
|
|
@@ -61,7 +63,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
|
|
61
63
|
|
62
64
|
# Rate limiting - keep track of last request time
|
63
65
|
self.last_request_time = 0
|
64
|
-
self.min_request_interval =
|
66
|
+
self.min_request_interval = (
|
67
|
+
0.5 # Minimum time between requests in seconds
|
68
|
+
)
|
65
69
|
|
66
70
|
# Language code mapping
|
67
71
|
language_code_mapping = {
|
@@ -92,7 +96,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
|
|
92
96
|
|
93
97
|
self.api_key = api_key
|
94
98
|
if not self.api_key:
|
95
|
-
self.api_key = get_db_setting(
|
99
|
+
self.api_key = get_db_setting(
|
100
|
+
"search.engine.web.google_pse.api_key"
|
101
|
+
)
|
96
102
|
|
97
103
|
self.search_engine_id = search_engine_id
|
98
104
|
if not self.search_engine_id:
|
@@ -187,7 +193,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
|
|
187
193
|
# Add jitter to retries after the first attempt
|
188
194
|
if attempt > 0:
|
189
195
|
jitter = random.uniform(0.5, 1.5)
|
190
|
-
sleep_time =
|
196
|
+
sleep_time = (
|
197
|
+
self.retry_delay * (2 ** (attempt - 1)) * jitter
|
198
|
+
)
|
191
199
|
logger.info(
|
192
200
|
"Retry attempt %s / %s for query '%s'. Waiting %s s",
|
193
201
|
attempt + 1,
|
@@ -272,7 +280,7 @@ class GooglePSESearchEngine(BaseSearchEngine):
|
|
272
280
|
{
|
273
281
|
"title": title,
|
274
282
|
"snippet": snippet,
|
275
|
-
"
|
283
|
+
"link": url,
|
276
284
|
"source": "Google Programmable Search",
|
277
285
|
}
|
278
286
|
)
|
@@ -296,7 +304,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
|
|
296
304
|
logger.error("Error getting search results: %s", str(e))
|
297
305
|
break
|
298
306
|
|
299
|
-
logger.info(
|
307
|
+
logger.info(
|
308
|
+
"Retrieved %s search results for query: '%s'", len(results), query
|
309
|
+
)
|
300
310
|
return results
|
301
311
|
|
302
312
|
def _get_full_content(
|
@@ -48,7 +48,9 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
48
48
|
"""
|
49
49
|
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
50
50
|
super().__init__(
|
51
|
-
llm=llm,
|
51
|
+
llm=llm,
|
52
|
+
max_filtered_results=max_filtered_results,
|
53
|
+
max_results=max_results,
|
52
54
|
)
|
53
55
|
self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
|
54
56
|
self.optimize_queries = optimize_queries
|
@@ -204,15 +206,19 @@ ONE WORD ONLY:"""
|
|
204
206
|
logger.info(
|
205
207
|
"Query classified as HISTORICAL - extending search timeframe"
|
206
208
|
)
|
207
|
-
ten_years_ago = (
|
208
|
-
|
209
|
-
)
|
209
|
+
ten_years_ago = (
|
210
|
+
datetime.now() - timedelta(days=3650)
|
211
|
+
).strftime("%Y-%m-%d")
|
210
212
|
self.from_date = ten_years_ago
|
211
213
|
|
212
214
|
elif "CURRENT" in answer:
|
213
215
|
# For current events, focus on recent content
|
214
|
-
logger.info(
|
215
|
-
|
216
|
+
logger.info(
|
217
|
+
"Query classified as CURRENT - focusing on recent content"
|
218
|
+
)
|
219
|
+
recent = (datetime.now() - timedelta(days=60)).strftime(
|
220
|
+
"%Y-%m-%d"
|
221
|
+
)
|
216
222
|
self.from_date = recent
|
217
223
|
self.order_by = "newest" # Prioritize newest for current events
|
218
224
|
|
@@ -246,7 +252,9 @@ ONE WORD ONLY:"""
|
|
246
252
|
|
247
253
|
# Strategy 1: Expand to 6 months
|
248
254
|
logger.info("Strategy 1: Expanding time range to 6 months")
|
249
|
-
six_months_ago = (datetime.now() - timedelta(days=180)).strftime(
|
255
|
+
six_months_ago = (datetime.now() - timedelta(days=180)).strftime(
|
256
|
+
"%Y-%m-%d"
|
257
|
+
)
|
250
258
|
self.from_date = six_months_ago
|
251
259
|
|
252
260
|
articles1 = self._get_all_data(query)
|
@@ -256,7 +264,9 @@ ONE WORD ONLY:"""
|
|
256
264
|
|
257
265
|
# Strategy 2: Expand to all time and try relevance order
|
258
266
|
if len(articles) < 3:
|
259
|
-
logger.info(
|
267
|
+
logger.info(
|
268
|
+
"Strategy 2: Expanding to all time with relevance ordering"
|
269
|
+
)
|
260
270
|
self.from_date = "2000-01-01" # Effectively "all time"
|
261
271
|
self.order_by = "relevance"
|
262
272
|
|
@@ -315,12 +325,15 @@ ONE WORD ONLY:"""
|
|
315
325
|
# Always request all fields for simplicity
|
316
326
|
# Ensure max_results is an integer to avoid comparison errors
|
317
327
|
page_size = min(
|
318
|
-
int(self.max_results) if self.max_results is not None else 10,
|
328
|
+
int(self.max_results) if self.max_results is not None else 10,
|
329
|
+
50,
|
319
330
|
)
|
320
331
|
|
321
332
|
# Log full parameters for debugging
|
322
333
|
logger.info(f"Guardian API search query: '{query}'")
|
323
|
-
logger.info(
|
334
|
+
logger.info(
|
335
|
+
f"Guardian API date range: {self.from_date} to {self.to_date}"
|
336
|
+
)
|
324
337
|
|
325
338
|
params = {
|
326
339
|
"q": query,
|
@@ -363,7 +376,9 @@ ONE WORD ONLY:"""
|
|
363
376
|
# Format the article with all fields
|
364
377
|
result = {
|
365
378
|
"id": article.get("id", ""),
|
366
|
-
"title": fields.get(
|
379
|
+
"title": fields.get(
|
380
|
+
"headline", article.get("webTitle", "")
|
381
|
+
),
|
367
382
|
"link": article.get("webUrl", ""),
|
368
383
|
"snippet": fields.get("trailText", ""),
|
369
384
|
"publication_date": article.get("webPublicationDate", ""),
|
@@ -399,7 +414,9 @@ ONE WORD ONLY:"""
|
|
399
414
|
Returns:
|
400
415
|
List of preview dictionaries
|
401
416
|
"""
|
402
|
-
logger.info(
|
417
|
+
logger.info(
|
418
|
+
f"Getting articles from The Guardian API for query: {query}"
|
419
|
+
)
|
403
420
|
|
404
421
|
# Step 1: Optimize the query using LLM
|
405
422
|
optimized_query = self._optimize_query_for_guardian(query)
|
@@ -471,7 +488,10 @@ ONE WORD ONLY:"""
|
|
471
488
|
article_id = item.get("id", "")
|
472
489
|
|
473
490
|
# Get the full article from our cache
|
474
|
-
if
|
491
|
+
if (
|
492
|
+
hasattr(self, "_full_articles")
|
493
|
+
and article_id in self._full_articles
|
494
|
+
):
|
475
495
|
results.append(self._full_articles[article_id])
|
476
496
|
else:
|
477
497
|
# If not found (shouldn't happen), just use the preview
|
@@ -502,7 +522,9 @@ ONE WORD ONLY:"""
|
|
502
522
|
|
503
523
|
# If no results, try one more time with a simplified query
|
504
524
|
if not previews:
|
505
|
-
simple_query = " ".join(
|
525
|
+
simple_query = " ".join(
|
526
|
+
[w for w in query.split() if len(w) > 3][:3]
|
527
|
+
)
|
506
528
|
logger.warning(
|
507
529
|
f"No Guardian articles found, trying simplified query: {simple_query}"
|
508
530
|
)
|
@@ -518,7 +540,9 @@ ONE WORD ONLY:"""
|
|
518
540
|
|
519
541
|
# If still no results after all attempts, return empty list
|
520
542
|
if not previews:
|
521
|
-
logger.warning(
|
543
|
+
logger.warning(
|
544
|
+
"No Guardian articles found after multiple attempts"
|
545
|
+
)
|
522
546
|
return []
|
523
547
|
|
524
548
|
# Filter for relevance if we have an LLM
|