local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_github.py

@@ -1,35 +1,40 @@
-import requests
-import logging
 import base64
+import json
+import logging
 import os
 import time
-from typing import Dict, List,
+from typing import Any, Dict, List, Optional
+
+import requests
 from langchain_core.language_models import BaseLLM

-from
-from
+from ...config import llm_config, search_config
+from ..search_engine_base import BaseSearchEngine

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class GitHubSearchEngine(BaseSearchEngine):
     """
     GitHub search engine implementation.
     Provides search across GitHub repositories, code, issues, and users.
     """
-
-    def __init__(
-
-
-
-
-
-
-
+
+    def __init__(
+        self,
+        max_results: int = 15,
+        api_key: Optional[str] = None,
+        search_type: str = "repositories",
+        include_readme: bool = True,
+        include_issues: bool = False,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+    ):
         """
         Initialize the GitHub search engine.
-
+
         Args:
             max_results: Maximum number of search results
             api_key: GitHub API token (can also be set in GITHUB_API_KEY env)

@@ -40,78 +45,144 @@ class GitHubSearchEngine(BaseSearchEngine):
             max_filtered_results: Maximum number of results to keep after filtering
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
         self.api_key = api_key or os.getenv("GITHUB_API_KEY")
         self.search_type = search_type
         self.include_readme = include_readme
         self.include_issues = include_issues
-
+
         # API endpoints
         self.api_base = "https://api.github.com"
         self.search_endpoint = f"{self.api_base}/search/{search_type}"
-
+
         # Set up API headers
         self.headers = {
             "Accept": "application/vnd.github.v3+json",
-            "User-Agent": "Local-Deep-Research-Agent"
+            "User-Agent": "Local-Deep-Research-Agent",
         }
-
+
         # Add authentication if API key provided
         if self.api_key:
             self.headers["Authorization"] = f"token {self.api_key}"
             logger.info("Using authenticated GitHub API requests")
         else:
-            logger.warning(
-
+            logger.warning(
+                "No GitHub API key provided. Rate limits will be restricted."
+            )
+
     def _handle_rate_limits(self, response):
         """Handle GitHub API rate limits by logging warnings and sleeping if necessary"""
         remaining = int(response.headers.get("X-RateLimit-Remaining", 60))
         reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
-
+
         if remaining < 5:
             current_time = time.time()
             wait_time = max(reset_time - current_time, 0)
-            logger.warning(
-
+            logger.warning(
+                f"GitHub API rate limit almost reached. {remaining} requests remaining."
+            )
+
             if wait_time > 0 and remaining == 0:
-                logger.warning(
+                logger.warning(
+                    f"GitHub API rate limit exceeded. Waiting {wait_time:.0f} seconds."
+                )
                 time.sleep(min(wait_time, 60))  # Wait at most 60 seconds
-
+
+    def _optimize_github_query(self, query: str) -> str:
+        """
+        Optimize the GitHub search query using LLM to improve search results.
+
+        Args:
+            query: Original search query
+
+        Returns:
+            Optimized GitHub search query
+        """
+        # Get LLM from config if not already set
+        if not self.llm:
+            try:
+                self.llm = llm_config.get_llm()
+                if not self.llm:
+                    logger.warning("No LLM available for query optimization")
+                    return query
+            except Exception as e:
+                logger.error(f"Error getting LLM from config: {e}")
+                return query
+
+        prompt = f"""Transform this GitHub search query into an optimized version for the GitHub search API. Follow these steps:
+1. Strip question words (e.g., 'what', 'are', 'is'), stop words (e.g., 'and', 'as', 'of', 'on'), and redundant terms (e.g., 'repositories', 'repos', 'github') since they're implied by the search context.
+2. Keep only domain-specific keywords and avoid using "-related" terms.
+3. Add GitHub-specific filters with dynamic thresholds based on query context:
+   - For stars: Use higher threshold (e.g., 'stars:>1000') for mainstream topics, lower (e.g., 'stars:>50') for specialized topics
+   - For language: Detect programming language from query or omit if unclear
+   - For search scope: Use 'in:name,description,readme' for general queries, 'in:file' for code-specific queries
+4. For date ranges, adapt based on query context:
+   - For emerging: Use 'created:>2024-01-01'
+   - For mature: Use 'pushed:>2023-01-01'
+   - For historical research: Use 'created:2020-01-01..2024-01-01'
+5. For excluding results, adapt based on query:
+   - Exclude irrelevant languages based on context
+   - Use 'NOT' to exclude competing terms
+6. Ensure the output is a concise, space-separated string with no punctuation or extra text beyond keywords and filters.
+
+
+Original query: "{query}"
+
+Return ONLY the optimized query, ready for GitHub's search API. Do not include explanations or additional text."""
+
+        try:
+            response = self.llm.invoke(prompt)
+
+            # Handle different response formats (string or object with content attribute)
+            if hasattr(response, "content"):
+                optimized_query = response.content.strip()
+            else:
+                # Handle string responses
+                optimized_query = str(response).strip()
+
+            # Validate the optimized query
+            if optimized_query and len(optimized_query) > 0:
+                logger.info(
+                    f"LLM optimized query from '{query}' to '{optimized_query}'"
+                )
+                return optimized_query
+            else:
+                logger.warning("LLM returned empty query, using original")
+                return query
+
+        except Exception as e:
+            logger.error(f"Error optimizing query with LLM: {e}")
+            return query
+
     def _search_github(self, query: str) -> List[Dict[str, Any]]:
         """
         Perform a GitHub search based on the configured search type.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of GitHub search result items
         """
         results = []
-
+
         try:
-            # Optimize GitHub query
-            github_query = query
-
-
-
-            # Extract key terms if it's a recommendation request
-            if "recommend" in query.lower() or "looking for" in query.lower():
-                github_query = "stars:>100 " + " ".join([
-                    word for word in query.split()
-                    if len(word) > 3 and word.lower() not in
-                    ["recommend", "recommended", "github", "repositories", "looking", "developers"]
-                ])
-
-            logger.info(f"Optimized GitHub query: {github_query}")
-
+            # Optimize GitHub query using LLM
+            github_query = self._optimize_github_query(query)
+
+            logger.info(f"Final GitHub query: {github_query}")
+
             # Construct search parameters
             params = {
                 "q": github_query,
-                "per_page": min(
-
+                "per_page": min(
+                    self.max_results, 100
+                ),  # GitHub API max is 100 per page
+                "page": 1,
             }
-
+
             # Add sort parameters based on search type
             if self.search_type == "repositories":
                 params["sort"] = "stars"

@@ -125,92 +196,104 @@ class GitHubSearchEngine(BaseSearchEngine):
             elif self.search_type == "users":
                 params["sort"] = "followers"
                 params["order"] = "desc"
-
+
             # Execute the API request
             response = requests.get(
-                self.search_endpoint,
-                headers=self.headers,
-                params=params
+                self.search_endpoint, headers=self.headers, params=params
             )
-
+
             # Check for rate limiting
             self._handle_rate_limits(response)
-
+
             # Handle response with detailed logging
             if response.status_code == 200:
                 data = response.json()
                 total_count = data.get("total_count", 0)
                 results = data.get("items", [])
-                logger.info(
-
+                logger.info(
+                    f"GitHub search returned {len(results)} results (total available: {total_count})"
+                )
+
                 # Log the rate limit information
-                rate_limit_remaining = response.headers.get(
-
-
-
+                rate_limit_remaining = response.headers.get(
+                    "X-RateLimit-Remaining", "unknown"
+                )
+                logger.info(
+                    f"GitHub API rate limit: {rate_limit_remaining} requests remaining"
+                )
+
                 # If no results, try to provide more guidance
                 if not results:
-                    logger.warning(
-                    logger.warning(
-                    logger.warning(
-
+                    logger.warning("No results found. Consider these search tips:")
+                    logger.warning("1. Use shorter, more specific queries")
+                    logger.warning(
+                        "2. For repositories, try adding 'stars:>100' or 'language:python'"
+                    )
+                    logger.warning(
+                        "3. For contribution opportunities, search for 'good-first-issue' or 'help-wanted'"
+                    )
             else:
-                logger.error(
-
+                logger.error(
+                    f"GitHub API error: {response.status_code} - {response.text}"
+                )
+
         except Exception as e:
             logger.error(f"Error searching GitHub: {e}")
-
+
         return results
-
+
     def _get_readme_content(self, repo_full_name: str) -> str:
         """
         Get README content for a repository.
-
+
         Args:
             repo_full_name: Full name of the repository (owner/repo)
-
+
         Returns:
             Decoded README content or empty string if not found
         """
         try:
             # Get README
             response = requests.get(
-                f"{self.api_base}/repos/{repo_full_name}/readme",
-                headers=self.headers
+                f"{self.api_base}/repos/{repo_full_name}/readme", headers=self.headers
             )
-
+
             # Check for rate limiting
             self._handle_rate_limits(response)
-
+
             if response.status_code == 200:
                 data = response.json()
                 content = data.get("content", "")
                 encoding = data.get("encoding", "")
-
+
                 if encoding == "base64" and content:
-                    return base64.b64decode(content).decode(
+                    return base64.b64decode(content).decode("utf-8", errors="replace")
                 return content
             else:
-                logger.warning(
+                logger.warning(
+                    f"Could not get README for {repo_full_name}: {response.status_code}"
+                )
                 return ""
-
+
         except Exception as e:
             logger.error(f"Error getting README for {repo_full_name}: {e}")
             return ""
-
-    def _get_recent_issues(
+
+    def _get_recent_issues(
+        self, repo_full_name: str, limit: int = 5
+    ) -> List[Dict[str, Any]]:
         """
         Get recent issues for a repository.
-
+
         Args:
             repo_full_name: Full name of the repository (owner/repo)
             limit: Maximum number of issues to return
-
+
         Returns:
             List of recent issues
         """
         issues = []
-
+
         try:
             # Get recent issues
             response = requests.get(

@@ -220,60 +303,59 @@ class GitHubSearchEngine(BaseSearchEngine):
                     "state": "all",
                     "per_page": limit,
                     "sort": "updated",
-                    "direction": "desc"
-                }
+                    "direction": "desc",
+                },
             )
-
+
             # Check for rate limiting
             self._handle_rate_limits(response)
-
+
             if response.status_code == 200:
                 issues = response.json()
                 logger.info(f"Got {len(issues)} recent issues for {repo_full_name}")
             else:
-                logger.warning(
-
+                logger.warning(
+                    f"Could not get issues for {repo_full_name}: {response.status_code}"
+                )
+
         except Exception as e:
             logger.error(f"Error getting issues for {repo_full_name}: {e}")
-
+
         return issues
-
+
     def _get_file_content(self, file_url: str) -> str:
         """
         Get content of a file from GitHub.
-
+
         Args:
             file_url: API URL for the file
-
+
         Returns:
             Decoded file content or empty string if not found
         """
         try:
             # Get file content
-            response = requests.get(
-
-                headers=self.headers
-            )
-
+            response = requests.get(file_url, headers=self.headers)
+
             # Check for rate limiting
             self._handle_rate_limits(response)
-
+
             if response.status_code == 200:
                 data = response.json()
                 content = data.get("content", "")
                 encoding = data.get("encoding", "")
-
+
                 if encoding == "base64" and content:
-                    return base64.b64decode(content).decode(
+                    return base64.b64decode(content).decode("utf-8", errors="replace")
                 return content
             else:
                 logger.warning(f"Could not get file content: {response.status_code}")
                 return ""
-
+
         except Exception as e:
             logger.error(f"Error getting file content: {e}")
             return ""
-
+
     def _format_repository_preview(self, repo: Dict[str, Any]) -> Dict[str, Any]:
         """Format repository search result as preview"""
         return {

@@ -290,9 +372,9 @@ class GitHubSearchEngine(BaseSearchEngine):
             "owner": repo.get("owner", {}).get("login", ""),
             "is_fork": repo.get("fork", False),
             "search_type": "repository",
-            "repo_full_name": repo.get("full_name", "")
+            "repo_full_name": repo.get("full_name", ""),
         }
-
+
     def _format_code_preview(self, code: Dict[str, Any]) -> Dict[str, Any]:
         """Format code search result as preview"""
         repo = code.get("repository", {})

@@ -305,26 +387,32 @@ class GitHubSearchEngine(BaseSearchEngine):
             "repo_name": repo.get("full_name", ""),
             "repo_url": repo.get("html_url", ""),
             "search_type": "code",
-            "file_url": code.get("url", "")
+            "file_url": code.get("url", ""),
         }
-
+
     def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]:
         """Format issue search result as preview"""
-        repo =
+        repo = (
+            issue.get("repository", {}) if "repository" in issue else {"full_name": ""}
+        )
         return {
             "id": f"issue_{issue.get('number', '')}",
             "title": issue.get("title", ""),
             "link": issue.get("html_url", ""),
-            "snippet":
+            "snippet": (
+                issue.get("body", "")[:200] + "..."
+                if len(issue.get("body", "")) > 200
+                else issue.get("body", "")
+            ),
             "state": issue.get("state", ""),
             "created_at": issue.get("created_at", ""),
             "updated_at": issue.get("updated_at", ""),
             "user": issue.get("user", {}).get("login", ""),
             "comments": issue.get("comments", 0),
             "search_type": "issue",
-            "repo_name": repo.get("full_name", "")
+            "repo_name": repo.get("full_name", ""),
         }
-
+
     def _format_user_preview(self, user: Dict[str, Any]) -> Dict[str, Any]:
         """Format user search result as preview"""
         return {

@@ -337,64 +425,100 @@ class GitHubSearchEngine(BaseSearchEngine):
             "public_repos": user.get("public_repos", 0),
             "location": user.get("location", ""),
             "search_type": "user",
-            "user_login": user.get("login", "")
+            "user_login": user.get("login", ""),
         }
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for GitHub search results.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting GitHub previews for query: {query}")
-
+
         # For contribution-focused queries, automatically adjust search type and add filters
-        if any(
+        if any(
+            term in query.lower()
+            for term in [
+                "contribute",
+                "contributing",
+                "contribution",
+                "beginner",
+                "newcomer",
+            ]
+        ):
             # Use repositories search with help-wanted or good-first-issue labels
             original_search_type = self.search_type
             self.search_type = "repositories"
             self.search_endpoint = f"{self.api_base}/search/repositories"
-
+
             # Create a specialized query for finding beginner-friendly projects
             specialized_query = "good-first-issues:>5 is:public archived:false"
-
+
             # Extract language preferences if present
             languages = []
-            for lang in [
+            for lang in [
+                "python",
+                "javascript",
+                "java",
+                "rust",
+                "go",
+                "typescript",
+                "c#",
+                "c++",
+                "ruby",
+            ]:
                 if lang in query.lower():
                     languages.append(lang)
-
+
             if languages:
                 specialized_query += f" language:{' language:'.join(languages)}"
-
+
             # Extract keywords
-            keywords = [
-
-
-
+            keywords = [
+                word
+                for word in query.split()
+                if len(word) > 3
+                and word.lower()
+                not in [
+                    "recommend",
+                    "recommended",
+                    "github",
+                    "repositories",
+                    "looking",
+                    "developers",
+                    "contribute",
+                    "contributing",
+                    "beginner",
+                    "newcomer",
+                ]
+            ]
+
             if keywords:
-                specialized_query += " " + " ".join(
-
+                specialized_query += " " + " ".join(
+                    keywords[:5]
+                )  # Add up to 5 keywords
+
             logger.info(f"Using specialized contribution query: {specialized_query}")
-
+
             # Perform GitHub search with specialized query
             results = self._search_github(specialized_query)
-
+
             # Restore original search type
             self.search_type = original_search_type
             self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
         else:
             # Perform standard GitHub search
             results = self._search_github(query)
-
+
         if not results:
             logger.warning(f"No GitHub results found for query: {query}")
             return []
-
+
         # Format results as previews
         previews = []
         for result in results:

@@ -410,34 +534,39 @@ class GitHubSearchEngine(BaseSearchEngine):
             else:
                 logger.warning(f"Unknown search type: {self.search_type}")
                 continue
-
+
             previews.append(preview)
-
+
         logger.info(f"Formatted {len(previews)} GitHub preview results")
         return previews
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant GitHub search results.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
+
         logger.info(f"Getting full content for {len(relevant_items)} GitHub results")
-
+
         results = []
         for item in relevant_items:
             result = item.copy()
             search_type = item.get("search_type", "")
-
+
             # Add content based on search type
             if search_type == "repository" and self.include_readme:
                 repo_full_name = item.get("repo_full_name", "")

@@ -446,12 +575,12 @@ class GitHubSearchEngine(BaseSearchEngine):
                     readme_content = self._get_readme_content(repo_full_name)
                     result["full_content"] = readme_content
                     result["content_type"] = "readme"
-
+
                     # Get recent issues if requested
                     if self.include_issues:
                         issues = self._get_recent_issues(repo_full_name)
                         result["recent_issues"] = issues
-
+
             elif search_type == "code":
                 file_url = item.get("file_url", "")
                 if file_url:

@@ -459,197 +588,206 @@ class GitHubSearchEngine(BaseSearchEngine):
                     file_content = self._get_file_content(file_url)
                     result["full_content"] = file_content
                     result["content_type"] = "file"
-
+
             elif search_type == "issue":
                 # For issues, the snippet usually contains a summary already
                 # We'll just keep it as is
                 result["full_content"] = item.get("snippet", "")
                 result["content_type"] = "issue"
-
+
             elif search_type == "user":
                 # For users, construct a profile summary
-                user_login = item.get("user_login", "")
                 profile_summary = f"GitHub user: {item.get('title', '')}\n"
-
+
                 if item.get("name"):
                     profile_summary += f"Name: {item.get('name')}\n"
-
+
                 if item.get("location"):
                     profile_summary += f"Location: {item.get('location')}\n"
-
+
                 profile_summary += f"Followers: {item.get('followers', 0)}\n"
-                profile_summary +=
-
+                profile_summary += (
+                    f"Public repositories: {item.get('public_repos', 0)}\n"
+                )
+
                 if item.get("snippet") and item.get("snippet") != "No bio provided":
                     profile_summary += f"\nBio: {item.get('snippet')}\n"
-
+
                 result["full_content"] = profile_summary
                 result["content_type"] = "user_profile"
-
+
             results.append(result)
-
+
         return results
-
+
     def search_repository(self, repo_owner: str, repo_name: str) -> Dict[str, Any]:
         """
         Get detailed information about a specific repository.
-
+
         Args:
             repo_owner: Owner of the repository
             repo_name: Name of the repository
-
+
         Returns:
             Dictionary with repository information
         """
         repo_full_name = f"{repo_owner}/{repo_name}"
         logger.info(f"Getting details for repository: {repo_full_name}")
-
+
         try:
             # Get repository details
             response = requests.get(
-                f"{self.api_base}/repos/{repo_full_name}",
-                headers=self.headers
+                f"{self.api_base}/repos/{repo_full_name}", headers=self.headers
             )
-
+
             # Check for rate limiting
             self._handle_rate_limits(response)
-
+
             if response.status_code == 200:
                 repo = response.json()
-
+
                 # Format as repository preview
                 result = self._format_repository_preview(repo)
-
+
                 # Add README content if requested
                 if self.include_readme:
                     readme_content = self._get_readme_content(repo_full_name)
                     result["full_content"] = readme_content
                     result["content_type"] = "readme"
-
+
                 # Add recent issues if requested
                 if self.include_issues:
                     issues = self._get_recent_issues(repo_full_name)
                     result["recent_issues"] = issues
-
+
                 return result
             else:
-                logger.error(
+                logger.error(
+                    f"Error getting repository details: {response.status_code} - {response.text}"
+                )
                 return {}
-
+
         except Exception as e:
             logger.error(f"Error getting repository details: {e}")
             return {}
-
-    def search_code(
+
+    def search_code(
+        self, query: str, language: Optional[str] = None, user: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for code with more specific parameters.
-
+
         Args:
             query: Code search query
             language: Filter by programming language
            user: Filter by GitHub username/organization
-
+
         Returns:
             List of code search results
         """
         # Build advanced query
         advanced_query = query
-
+
         if language:
             advanced_query += f" language:{language}"
-
+
         if user:
             advanced_query += f" user:{user}"
-
+
         # Save current search type
         original_search_type = self.search_type
-
+
         try:
             # Set search type to code
             self.search_type = "code"
             self.search_endpoint = f"{self.api_base}/search/code"
-
+
             # Perform search
             results = self._search_github(advanced_query)
-
+
             # Format results
             previews = [self._format_code_preview(result) for result in results]
-
+
             # Get full content if requested
-            if
+            if (
+                hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                and not search_config.SEARCH_SNIPPETS_ONLY
+            ):
                 return self._get_full_content(previews)
-
+
             return previews
-
+
         finally:
             # Restore original search type
             self.search_type = original_search_type
             self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
-
-    def search_issues(
+
+    def search_issues(
+        self, query: str, state: str = "open", sort: str = "updated"
+    ) -> List[Dict[str, Any]]:
         """
         Search for issues with more specific parameters.
-
+
         Args:
             query: Issue search query
             state: Filter by issue state ("open", "closed", "all")
             sort: Sort order ("updated", "created", "comments")
-
+
         Returns:
             List of issue search results
         """
         # Build advanced query
         advanced_query = query + f" state:{state}"
-
+
         # Save current search type
         original_search_type = self.search_type
-
+
         try:
             # Set search type to issues
             self.search_type = "issues"
             self.search_endpoint = f"{self.api_base}/search/issues"
-
+
             # Set sort parameter
             params = {
                 "q": advanced_query,
                 "per_page": min(self.max_results, 100),
                 "page": 1,
                 "sort": sort,
-                "order": "desc"
+                "order": "desc",
             }
-
+
             # Perform search
             response = requests.get(
-                self.search_endpoint,
-                headers=self.headers,
-                params=params
+                self.search_endpoint, headers=self.headers, params=params
             )
-
+
             # Check for rate limiting
             self._handle_rate_limits(response)
-
+
             if response.status_code == 200:
                 data = response.json()
                 results = data.get("items", [])
-
+
                 # Format results
                 previews = [self._format_issue_preview(result) for result in results]
-
+
                 # For issues, we don't need to get full content
                 return previews
             else:
-                logger.error(
+                logger.error(
+                    f"GitHub API error: {response.status_code} - {response.text}"
+                )
                 return []
-
+
         finally:
             # Restore original search type
             self.search_type = original_search_type
             self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
-
+
     def set_search_type(self, search_type: str):
         """
         Set the search type for subsequent searches.
-
+
         Args:
             search_type: Type of GitHub search ("repositories", "code", "issues", "users")
         """

@@ -658,4 +796,77 @@ class GitHubSearchEngine(BaseSearchEngine):
             self.search_endpoint = f"{self.api_base}/search/{search_type}"
             logger.info(f"Set GitHub search type to: {search_type}")
         else:
-            logger.error(f"Invalid GitHub search type: {search_type}")
+            logger.error(f"Invalid GitHub search type: {search_type}")
+
+    def _filter_for_relevance(
+        self, previews: List[Dict[str, Any]], query: str
+    ) -> List[Dict[str, Any]]:
+        """
+        Filter GitHub search results for relevance using LLM.
+
+        Args:
+            previews: List of preview dictionaries
+            query: Original search query
+
+        Returns:
+            List of relevant preview dictionaries
+        """
+        if not self.llm or not previews:
+            return previews
+
+        # Create a specialized prompt for GitHub results
+        prompt = f"""Analyze these GitHub search results and rank them by relevance to the query.
+Consider:
+1. Repository stars and activity (higher is better)
+2. Match between query intent and repository description
+3. Repository language and topics
+4. Last update time (more recent is better)
+5. Whether it's a fork (original repositories are preferred)
+
+Query: "{query}"
+
+Results:
+{json.dumps(previews, indent=2)}
+
+Return ONLY a JSON array of indices in order of relevance (most relevant first).
+Example: [0, 2, 1, 3]
+Do not include any other text or explanation."""
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_text = response.content.strip()
+
+            # Extract JSON array from response
+            start_idx = response_text.find("[")
+            end_idx = response_text.rfind("]")
+
+            if start_idx >= 0 and end_idx > start_idx:
+                array_text = response_text[start_idx : end_idx + 1]
+                ranked_indices = json.loads(array_text)
+
+                # Return the results in ranked order
+                ranked_results = []
+                for idx in ranked_indices:
+                    if idx < len(previews):
+                        ranked_results.append(previews[idx])
+
+                # Limit to max_filtered_results if specified
+                if (
+                    self.max_filtered_results
+                    and len(ranked_results) > self.max_filtered_results
+                ):
+                    logger.info(
+                        f"Limiting filtered results to top {self.max_filtered_results}"
+                    )
+                    return ranked_results[: self.max_filtered_results]
+
+                return ranked_results
+            else:
+                logger.info(
+                    "Could not find JSON array in response, returning no previews"
+                )
+                return []
+
+        except Exception as e:
+            logger.error(f"Error filtering GitHub results: {e}")
+            return []