local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. local_deep_research/__init__.py +23 -22
  2. local_deep_research/__main__.py +16 -0
  3. local_deep_research/advanced_search_system/__init__.py +7 -0
  4. local_deep_research/advanced_search_system/filters/__init__.py +8 -0
  5. local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
  6. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
  7. local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
  8. local_deep_research/advanced_search_system/findings/repository.py +452 -0
  9. local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
  10. local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
  11. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
  12. local_deep_research/advanced_search_system/questions/__init__.py +1 -0
  13. local_deep_research/advanced_search_system/questions/base_question.py +64 -0
  14. local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
  15. local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
  16. local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
  17. local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
  18. local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
  19. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
  20. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
  21. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
  22. local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
  23. local_deep_research/advanced_search_system/tools/__init__.py +1 -0
  24. local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
  25. local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
  26. local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
  27. local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
  28. local_deep_research/api/__init__.py +5 -5
  29. local_deep_research/api/research_functions.py +154 -160
  30. local_deep_research/app.py +8 -0
  31. local_deep_research/citation_handler.py +25 -16
  32. local_deep_research/{config.py → config/config_files.py} +102 -110
  33. local_deep_research/config/llm_config.py +472 -0
  34. local_deep_research/config/search_config.py +77 -0
  35. local_deep_research/defaults/__init__.py +10 -5
  36. local_deep_research/defaults/main.toml +2 -2
  37. local_deep_research/defaults/search_engines.toml +60 -34
  38. local_deep_research/main.py +121 -19
  39. local_deep_research/migrate_db.py +147 -0
  40. local_deep_research/report_generator.py +87 -45
  41. local_deep_research/search_system.py +153 -283
  42. local_deep_research/setup_data_dir.py +35 -0
  43. local_deep_research/test_migration.py +178 -0
  44. local_deep_research/utilities/__init__.py +0 -0
  45. local_deep_research/utilities/db_utils.py +49 -0
  46. local_deep_research/{utilties → utilities}/enums.py +2 -2
  47. local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
  48. local_deep_research/utilities/search_utilities.py +242 -0
  49. local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
  50. local_deep_research/web/__init__.py +0 -1
  51. local_deep_research/web/app.py +86 -1709
  52. local_deep_research/web/app_factory.py +289 -0
  53. local_deep_research/web/database/README.md +70 -0
  54. local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
  55. local_deep_research/web/database/migrations.py +447 -0
  56. local_deep_research/web/database/models.py +117 -0
  57. local_deep_research/web/database/schema_upgrade.py +107 -0
  58. local_deep_research/web/models/database.py +294 -0
  59. local_deep_research/web/models/settings.py +94 -0
  60. local_deep_research/web/routes/api_routes.py +559 -0
  61. local_deep_research/web/routes/history_routes.py +354 -0
  62. local_deep_research/web/routes/research_routes.py +715 -0
  63. local_deep_research/web/routes/settings_routes.py +1583 -0
  64. local_deep_research/web/services/research_service.py +947 -0
  65. local_deep_research/web/services/resource_service.py +149 -0
  66. local_deep_research/web/services/settings_manager.py +669 -0
  67. local_deep_research/web/services/settings_service.py +187 -0
  68. local_deep_research/web/services/socket_service.py +210 -0
  69. local_deep_research/web/static/css/custom_dropdown.css +277 -0
  70. local_deep_research/web/static/css/settings.css +1223 -0
  71. local_deep_research/web/static/css/styles.css +525 -48
  72. local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
  73. local_deep_research/web/static/js/components/detail.js +348 -0
  74. local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
  75. local_deep_research/web/static/js/components/fallback/ui.js +215 -0
  76. local_deep_research/web/static/js/components/history.js +487 -0
  77. local_deep_research/web/static/js/components/logpanel.js +949 -0
  78. local_deep_research/web/static/js/components/progress.js +1107 -0
  79. local_deep_research/web/static/js/components/research.js +1865 -0
  80. local_deep_research/web/static/js/components/results.js +766 -0
  81. local_deep_research/web/static/js/components/settings.js +3981 -0
  82. local_deep_research/web/static/js/components/settings_sync.js +106 -0
  83. local_deep_research/web/static/js/main.js +226 -0
  84. local_deep_research/web/static/js/services/api.js +253 -0
  85. local_deep_research/web/static/js/services/audio.js +31 -0
  86. local_deep_research/web/static/js/services/formatting.js +119 -0
  87. local_deep_research/web/static/js/services/pdf.js +622 -0
  88. local_deep_research/web/static/js/services/socket.js +882 -0
  89. local_deep_research/web/static/js/services/ui.js +546 -0
  90. local_deep_research/web/templates/base.html +72 -0
  91. local_deep_research/web/templates/components/custom_dropdown.html +47 -0
  92. local_deep_research/web/templates/components/log_panel.html +32 -0
  93. local_deep_research/web/templates/components/mobile_nav.html +22 -0
  94. local_deep_research/web/templates/components/settings_form.html +299 -0
  95. local_deep_research/web/templates/components/sidebar.html +21 -0
  96. local_deep_research/web/templates/pages/details.html +73 -0
  97. local_deep_research/web/templates/pages/history.html +51 -0
  98. local_deep_research/web/templates/pages/progress.html +57 -0
  99. local_deep_research/web/templates/pages/research.html +139 -0
  100. local_deep_research/web/templates/pages/results.html +59 -0
  101. local_deep_research/web/templates/settings_dashboard.html +78 -192
  102. local_deep_research/web/utils/__init__.py +0 -0
  103. local_deep_research/web/utils/formatters.py +76 -0
  104. local_deep_research/web_search_engines/engines/full_search.py +18 -16
  105. local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
  106. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
  107. local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
  108. local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
  109. local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
  110. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
  111. local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
  112. local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
  113. local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
  114. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
  115. local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
  116. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
  117. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
  118. local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
  119. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
  120. local_deep_research/web_search_engines/search_engine_base.py +174 -99
  121. local_deep_research/web_search_engines/search_engine_factory.py +192 -102
  122. local_deep_research/web_search_engines/search_engines_config.py +22 -15
  123. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
  124. local_deep_research-0.2.2.dist-info/RECORD +135 -0
  125. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
  126. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
  127. local_deep_research/defaults/llm_config.py +0 -338
  128. local_deep_research/utilties/search_utilities.py +0 -114
  129. local_deep_research/web/static/js/app.js +0 -3763
  130. local_deep_research/web/templates/api_keys_config.html +0 -82
  131. local_deep_research/web/templates/collections_config.html +0 -90
  132. local_deep_research/web/templates/index.html +0 -348
  133. local_deep_research/web/templates/llm_config.html +0 -120
  134. local_deep_research/web/templates/main_config.html +0 -89
  135. local_deep_research/web/templates/search_engines_config.html +0 -154
  136. local_deep_research/web/templates/settings.html +0 -519
  137. local_deep_research-0.1.26.dist-info/RECORD +0 -61
  138. local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
  139. /local_deep_research/{utilties → config}/__init__.py +0 -0
  140. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
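The file list above reflects two structural changes that most downstream imports will notice: the single module local_deep_research/config.py becomes a config/ package (config_files.py, llm_config.py, search_config.py), and the misspelled utilties package is renamed to utilities. The short sketch below only illustrates the import paths implied by that layout; it is not documented API. The module aliases and the get_llm() call are taken from the hunks further down, while the fallback default used for SEARCH_SNIPPETS_ONLY is an assumption.

# Illustrative only: import paths implied by the 0.2.2 file layout listed above.
from local_deep_research.config import llm_config, search_config  # formerly the single local_deep_research.config module
from local_deep_research.utilities import search_utilities  # formerly local_deep_research.utilties

llm = llm_config.get_llm()  # the GitHub engine calls this when no LLM is injected
# SEARCH_SNIPPETS_ONLY is probed with hasattr() in the engine; the default here is assumed
snippets_only = getattr(search_config, "SEARCH_SNIPPETS_ONLY", True)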
@@ -1,35 +1,40 @@
- import requests
- import logging
  import base64
+ import json
+ import logging
  import os
  import time
- from typing import Dict, List, Any, Optional, Union
+ from typing import Any, Dict, List, Optional
+
+ import requests
  from langchain_core.language_models import BaseLLM

- from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
- from local_deep_research import config
+ from ...config import llm_config, search_config
+ from ..search_engine_base import BaseSearchEngine

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+
  class GitHubSearchEngine(BaseSearchEngine):
      """
      GitHub search engine implementation.
      Provides search across GitHub repositories, code, issues, and users.
      """
-
-     def __init__(self,
-                  max_results: int = 15,
-                  api_key: Optional[str] = None,
-                  search_type: str = "repositories",
-                  include_readme: bool = True,
-                  include_issues: bool = False,
-                  llm: Optional[BaseLLM] = None,
-                  max_filtered_results: Optional[int] = None):
+
+     def __init__(
+         self,
+         max_results: int = 15,
+         api_key: Optional[str] = None,
+         search_type: str = "repositories",
+         include_readme: bool = True,
+         include_issues: bool = False,
+         llm: Optional[BaseLLM] = None,
+         max_filtered_results: Optional[int] = None,
+     ):
          """
          Initialize the GitHub search engine.
-
+
          Args:
              max_results: Maximum number of search results
              api_key: GitHub API token (can also be set in GITHUB_API_KEY env)
@@ -40,78 +45,144 @@ class GitHubSearchEngine(BaseSearchEngine):
              max_filtered_results: Maximum number of results to keep after filtering
          """
          # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-         super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+         super().__init__(
+             llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+         )
          self.api_key = api_key or os.getenv("GITHUB_API_KEY")
          self.search_type = search_type
          self.include_readme = include_readme
          self.include_issues = include_issues
-
+
          # API endpoints
          self.api_base = "https://api.github.com"
          self.search_endpoint = f"{self.api_base}/search/{search_type}"
-
+
          # Set up API headers
          self.headers = {
              "Accept": "application/vnd.github.v3+json",
-             "User-Agent": "Local-Deep-Research-Agent"
+             "User-Agent": "Local-Deep-Research-Agent",
          }
-
+
          # Add authentication if API key provided
          if self.api_key:
              self.headers["Authorization"] = f"token {self.api_key}"
              logger.info("Using authenticated GitHub API requests")
          else:
-             logger.warning("No GitHub API key provided. Rate limits will be restricted.")
-
+             logger.warning(
+                 "No GitHub API key provided. Rate limits will be restricted."
+             )
+
      def _handle_rate_limits(self, response):
          """Handle GitHub API rate limits by logging warnings and sleeping if necessary"""
          remaining = int(response.headers.get("X-RateLimit-Remaining", 60))
          reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
-
+
          if remaining < 5:
              current_time = time.time()
              wait_time = max(reset_time - current_time, 0)
-             logger.warning(f"GitHub API rate limit almost reached. {remaining} requests remaining.")
-
+             logger.warning(
+                 f"GitHub API rate limit almost reached. {remaining} requests remaining."
+             )
+
              if wait_time > 0 and remaining == 0:
-                 logger.warning(f"GitHub API rate limit exceeded. Waiting {wait_time:.0f} seconds.")
+                 logger.warning(
+                     f"GitHub API rate limit exceeded. Waiting {wait_time:.0f} seconds."
+                 )
                  time.sleep(min(wait_time, 60)) # Wait at most 60 seconds
-
+
+     def _optimize_github_query(self, query: str) -> str:
+         """
+         Optimize the GitHub search query using LLM to improve search results.
+
+         Args:
+             query: Original search query
+
+         Returns:
+             Optimized GitHub search query
+         """
+         # Get LLM from config if not already set
+         if not self.llm:
+             try:
+                 self.llm = llm_config.get_llm()
+                 if not self.llm:
+                     logger.warning("No LLM available for query optimization")
+                     return query
+             except Exception as e:
+                 logger.error(f"Error getting LLM from config: {e}")
+                 return query
+
+         prompt = f"""Transform this GitHub search query into an optimized version for the GitHub search API. Follow these steps:
+         1. Strip question words (e.g., 'what', 'are', 'is'), stop words (e.g., 'and', 'as', 'of', 'on'), and redundant terms (e.g., 'repositories', 'repos', 'github') since they're implied by the search context.
+         2. Keep only domain-specific keywords and avoid using "-related" terms.
+         3. Add GitHub-specific filters with dynamic thresholds based on query context:
+             - For stars: Use higher threshold (e.g., 'stars:>1000') for mainstream topics, lower (e.g., 'stars:>50') for specialized topics
+             - For language: Detect programming language from query or omit if unclear
+             - For search scope: Use 'in:name,description,readme' for general queries, 'in:file' for code-specific queries
+         4. For date ranges, adapt based on query context:
+             - For emerging: Use 'created:>2024-01-01'
+             - For mature: Use 'pushed:>2023-01-01'
+             - For historical research: Use 'created:2020-01-01..2024-01-01'
+         5. For excluding results, adapt based on query:
+             - Exclude irrelevant languages based on context
+             - Use 'NOT' to exclude competing terms
+         6. Ensure the output is a concise, space-separated string with no punctuation or extra text beyond keywords and filters.
+
+
+         Original query: "{query}"
+
+         Return ONLY the optimized query, ready for GitHub's search API. Do not include explanations or additional text."""
+
+         try:
+             response = self.llm.invoke(prompt)
+
+             # Handle different response formats (string or object with content attribute)
+             if hasattr(response, "content"):
+                 optimized_query = response.content.strip()
+             else:
+                 # Handle string responses
+                 optimized_query = str(response).strip()
+
+             # Validate the optimized query
+             if optimized_query and len(optimized_query) > 0:
+                 logger.info(
+                     f"LLM optimized query from '{query}' to '{optimized_query}'"
+                 )
+                 return optimized_query
+             else:
+                 logger.warning("LLM returned empty query, using original")
+                 return query
+
+         except Exception as e:
+             logger.error(f"Error optimizing query with LLM: {e}")
+             return query
+
      def _search_github(self, query: str) -> List[Dict[str, Any]]:
          """
          Perform a GitHub search based on the configured search type.
-
+
          Args:
              query: The search query
-
+
          Returns:
              List of GitHub search result items
          """
          results = []
-
+
          try:
-             # Optimize GitHub query format
-             github_query = query
-
-             # For long queries, focus on keywords and add filters for better results
-             if len(query) > 80:
-                 # Extract key terms if it's a recommendation request
-                 if "recommend" in query.lower() or "looking for" in query.lower():
-                     github_query = "stars:>100 " + " ".join([
-                         word for word in query.split()
-                         if len(word) > 3 and word.lower() not in
-                         ["recommend", "recommended", "github", "repositories", "looking", "developers"]
-                     ])
-
-             logger.info(f"Optimized GitHub query: {github_query}")
-
+             # Optimize GitHub query using LLM
+             github_query = self._optimize_github_query(query)
+
+             logger.info(f"Final GitHub query: {github_query}")
+
              # Construct search parameters
              params = {
                  "q": github_query,
-                 "per_page": min(self.max_results, 100), # GitHub API max is 100 per page
-                 "page": 1
+                 "per_page": min(
+                     self.max_results, 100
+                 ), # GitHub API max is 100 per page
+                 "page": 1,
              }
-
+
              # Add sort parameters based on search type
              if self.search_type == "repositories":
                  params["sort"] = "stars"
@@ -125,92 +196,104 @@ class GitHubSearchEngine(BaseSearchEngine):
              elif self.search_type == "users":
                  params["sort"] = "followers"
                  params["order"] = "desc"
-
+
              # Execute the API request
              response = requests.get(
-                 self.search_endpoint,
-                 headers=self.headers,
-                 params=params
+                 self.search_endpoint, headers=self.headers, params=params
              )
-
+
              # Check for rate limiting
              self._handle_rate_limits(response)
-
+
              # Handle response with detailed logging
              if response.status_code == 200:
                  data = response.json()
                  total_count = data.get("total_count", 0)
                  results = data.get("items", [])
-                 logger.info(f"GitHub search returned {len(results)} results (total available: {total_count})")
-
+                 logger.info(
+                     f"GitHub search returned {len(results)} results (total available: {total_count})"
+                 )
+
                  # Log the rate limit information
-                 rate_limit_remaining = response.headers.get("X-RateLimit-Remaining", "unknown")
-                 rate_limit_reset = response.headers.get("X-RateLimit-Reset", "unknown")
-                 logger.info(f"GitHub API rate limit: {rate_limit_remaining} requests remaining")
-
+                 rate_limit_remaining = response.headers.get(
+                     "X-RateLimit-Remaining", "unknown"
+                 )
+                 logger.info(
+                     f"GitHub API rate limit: {rate_limit_remaining} requests remaining"
+                 )
+
                  # If no results, try to provide more guidance
                  if not results:
-                     logger.warning(f"No results found. Consider these search tips:")
-                     logger.warning(f"1. Use shorter, more specific queries")
-                     logger.warning(f"2. For repositories, try adding 'stars:>100' or 'language:python'")
-                     logger.warning(f"3. For contribution opportunities, search for 'good-first-issue' or 'help-wanted'")
+                     logger.warning("No results found. Consider these search tips:")
+                     logger.warning("1. Use shorter, more specific queries")
+                     logger.warning(
+                         "2. For repositories, try adding 'stars:>100' or 'language:python'"
+                     )
+                     logger.warning(
+                         "3. For contribution opportunities, search for 'good-first-issue' or 'help-wanted'"
+                     )
              else:
-                 logger.error(f"GitHub API error: {response.status_code} - {response.text}")
-
+                 logger.error(
+                     f"GitHub API error: {response.status_code} - {response.text}"
+                 )
+
          except Exception as e:
              logger.error(f"Error searching GitHub: {e}")
-
+
          return results
-
+
      def _get_readme_content(self, repo_full_name: str) -> str:
          """
          Get README content for a repository.
-
+
          Args:
              repo_full_name: Full name of the repository (owner/repo)
-
+
          Returns:
              Decoded README content or empty string if not found
          """
          try:
              # Get README
              response = requests.get(
-                 f"{self.api_base}/repos/{repo_full_name}/readme",
-                 headers=self.headers
+                 f"{self.api_base}/repos/{repo_full_name}/readme", headers=self.headers
              )
-
+
              # Check for rate limiting
              self._handle_rate_limits(response)
-
+
              if response.status_code == 200:
                  data = response.json()
                  content = data.get("content", "")
                  encoding = data.get("encoding", "")
-
+
                  if encoding == "base64" and content:
-                     return base64.b64decode(content).decode('utf-8', errors='replace')
+                     return base64.b64decode(content).decode("utf-8", errors="replace")
                  return content
              else:
-                 logger.warning(f"Could not get README for {repo_full_name}: {response.status_code}")
+                 logger.warning(
+                     f"Could not get README for {repo_full_name}: {response.status_code}"
+                 )
                  return ""
-
+
          except Exception as e:
              logger.error(f"Error getting README for {repo_full_name}: {e}")
              return ""
-
-     def _get_recent_issues(self, repo_full_name: str, limit: int = 5) -> List[Dict[str, Any]]:
+
+     def _get_recent_issues(
+         self, repo_full_name: str, limit: int = 5
+     ) -> List[Dict[str, Any]]:
          """
          Get recent issues for a repository.
-
+
          Args:
              repo_full_name: Full name of the repository (owner/repo)
              limit: Maximum number of issues to return
-
+
          Returns:
              List of recent issues
          """
          issues = []
-
+
          try:
              # Get recent issues
              response = requests.get(
@@ -220,60 +303,59 @@ class GitHubSearchEngine(BaseSearchEngine):
                      "state": "all",
                      "per_page": limit,
                      "sort": "updated",
-                     "direction": "desc"
-                 }
+                     "direction": "desc",
+                 },
              )
-
+
              # Check for rate limiting
              self._handle_rate_limits(response)
-
+
              if response.status_code == 200:
                  issues = response.json()
                  logger.info(f"Got {len(issues)} recent issues for {repo_full_name}")
              else:
-                 logger.warning(f"Could not get issues for {repo_full_name}: {response.status_code}")
-
+                 logger.warning(
+                     f"Could not get issues for {repo_full_name}: {response.status_code}"
+                 )
+
          except Exception as e:
              logger.error(f"Error getting issues for {repo_full_name}: {e}")
-
+
          return issues
-
+
      def _get_file_content(self, file_url: str) -> str:
          """
          Get content of a file from GitHub.
-
+
          Args:
              file_url: API URL for the file
-
+
          Returns:
              Decoded file content or empty string if not found
          """
          try:
              # Get file content
-             response = requests.get(
-                 file_url,
-                 headers=self.headers
-             )
-
+             response = requests.get(file_url, headers=self.headers)
+
              # Check for rate limiting
              self._handle_rate_limits(response)
-
+
              if response.status_code == 200:
                  data = response.json()
                  content = data.get("content", "")
                  encoding = data.get("encoding", "")
-
+
                  if encoding == "base64" and content:
-                     return base64.b64decode(content).decode('utf-8', errors='replace')
+                     return base64.b64decode(content).decode("utf-8", errors="replace")
                  return content
              else:
                  logger.warning(f"Could not get file content: {response.status_code}")
                  return ""
-
+
          except Exception as e:
              logger.error(f"Error getting file content: {e}")
              return ""
-
+
      def _format_repository_preview(self, repo: Dict[str, Any]) -> Dict[str, Any]:
          """Format repository search result as preview"""
          return {
@@ -290,9 +372,9 @@ class GitHubSearchEngine(BaseSearchEngine):
              "owner": repo.get("owner", {}).get("login", ""),
              "is_fork": repo.get("fork", False),
              "search_type": "repository",
-             "repo_full_name": repo.get("full_name", "")
+             "repo_full_name": repo.get("full_name", ""),
          }
-
+
      def _format_code_preview(self, code: Dict[str, Any]) -> Dict[str, Any]:
          """Format code search result as preview"""
          repo = code.get("repository", {})
@@ -305,26 +387,32 @@ class GitHubSearchEngine(BaseSearchEngine):
              "repo_name": repo.get("full_name", ""),
              "repo_url": repo.get("html_url", ""),
              "search_type": "code",
-             "file_url": code.get("url", "")
+             "file_url": code.get("url", ""),
          }
-
+
      def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]:
          """Format issue search result as preview"""
-         repo = issue.get("repository", {}) if "repository" in issue else {"full_name": ""}
+         repo = (
+             issue.get("repository", {}) if "repository" in issue else {"full_name": ""}
+         )
          return {
              "id": f"issue_{issue.get('number', '')}",
              "title": issue.get("title", ""),
              "link": issue.get("html_url", ""),
-             "snippet": issue.get("body", "")[:200] + "..." if len(issue.get("body", "")) > 200 else issue.get("body", ""),
+             "snippet": (
+                 issue.get("body", "")[:200] + "..."
+                 if len(issue.get("body", "")) > 200
+                 else issue.get("body", "")
+             ),
              "state": issue.get("state", ""),
              "created_at": issue.get("created_at", ""),
              "updated_at": issue.get("updated_at", ""),
              "user": issue.get("user", {}).get("login", ""),
              "comments": issue.get("comments", 0),
              "search_type": "issue",
-             "repo_name": repo.get("full_name", "")
+             "repo_name": repo.get("full_name", ""),
          }
-
+
      def _format_user_preview(self, user: Dict[str, Any]) -> Dict[str, Any]:
          """Format user search result as preview"""
          return {
@@ -337,64 +425,100 @@ class GitHubSearchEngine(BaseSearchEngine):
              "public_repos": user.get("public_repos", 0),
              "location": user.get("location", ""),
              "search_type": "user",
-             "user_login": user.get("login", "")
+             "user_login": user.get("login", ""),
          }
-
+
      def _get_previews(self, query: str) -> List[Dict[str, Any]]:
          """
          Get preview information for GitHub search results.
-
+
          Args:
              query: The search query
-
+
          Returns:
              List of preview dictionaries
          """
          logger.info(f"Getting GitHub previews for query: {query}")
-
+
          # For contribution-focused queries, automatically adjust search type and add filters
-         if any(term in query.lower() for term in ["contribute", "contributing", "contribution", "beginner", "newcomer"]):
+         if any(
+             term in query.lower()
+             for term in [
+                 "contribute",
+                 "contributing",
+                 "contribution",
+                 "beginner",
+                 "newcomer",
+             ]
+         ):
              # Use repositories search with help-wanted or good-first-issue labels
              original_search_type = self.search_type
              self.search_type = "repositories"
              self.search_endpoint = f"{self.api_base}/search/repositories"
-
+
              # Create a specialized query for finding beginner-friendly projects
              specialized_query = "good-first-issues:>5 is:public archived:false"
-
+
              # Extract language preferences if present
              languages = []
-             for lang in ["python", "javascript", "java", "rust", "go", "typescript", "c#", "c++", "ruby"]:
+             for lang in [
+                 "python",
+                 "javascript",
+                 "java",
+                 "rust",
+                 "go",
+                 "typescript",
+                 "c#",
+                 "c++",
+                 "ruby",
+             ]:
                  if lang in query.lower():
                      languages.append(lang)
-
+
              if languages:
                  specialized_query += f" language:{' language:'.join(languages)}"
-
+
              # Extract keywords
-             keywords = [word for word in query.split() if len(word) > 3 and word.lower() not in
-                         ["recommend", "recommended", "github", "repositories", "looking",
-                          "developers", "contribute", "contributing", "beginner", "newcomer"]]
-
+             keywords = [
+                 word
+                 for word in query.split()
+                 if len(word) > 3
+                 and word.lower()
+                 not in [
+                     "recommend",
+                     "recommended",
+                     "github",
+                     "repositories",
+                     "looking",
+                     "developers",
+                     "contribute",
+                     "contributing",
+                     "beginner",
+                     "newcomer",
+                 ]
+             ]
+
              if keywords:
-                 specialized_query += " " + " ".join(keywords[:5]) # Add up to 5 keywords
-
+                 specialized_query += " " + " ".join(
+                     keywords[:5]
+                 ) # Add up to 5 keywords
+
              logger.info(f"Using specialized contribution query: {specialized_query}")
-
+
              # Perform GitHub search with specialized query
              results = self._search_github(specialized_query)
-
+
              # Restore original search type
              self.search_type = original_search_type
              self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
          else:
              # Perform standard GitHub search
              results = self._search_github(query)
-
+
          if not results:
              logger.warning(f"No GitHub results found for query: {query}")
              return []
-
+
          # Format results as previews
          previews = []
          for result in results:
@@ -410,34 +534,39 @@ class GitHubSearchEngine(BaseSearchEngine):
              else:
                  logger.warning(f"Unknown search type: {self.search_type}")
                  continue
-
+
              previews.append(preview)
-
+
          logger.info(f"Formatted {len(previews)} GitHub preview results")
          return previews
-
-     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+
+     def _get_full_content(
+         self, relevant_items: List[Dict[str, Any]]
+     ) -> List[Dict[str, Any]]:
          """
          Get full content for the relevant GitHub search results.
-
+
          Args:
              relevant_items: List of relevant preview dictionaries
-
+
          Returns:
              List of result dictionaries with full content
          """
          # Check if we should add full content
-         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+         if (
+             hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+             and search_config.SEARCH_SNIPPETS_ONLY
+         ):
              logger.info("Snippet-only mode, skipping full content retrieval")
              return relevant_items
-
+
          logger.info(f"Getting full content for {len(relevant_items)} GitHub results")
-
+
          results = []
          for item in relevant_items:
              result = item.copy()
              search_type = item.get("search_type", "")
-
+
              # Add content based on search type
              if search_type == "repository" and self.include_readme:
                  repo_full_name = item.get("repo_full_name", "")
@@ -446,12 +575,12 @@ class GitHubSearchEngine(BaseSearchEngine):
                      readme_content = self._get_readme_content(repo_full_name)
                      result["full_content"] = readme_content
                      result["content_type"] = "readme"
-
+
                      # Get recent issues if requested
                      if self.include_issues:
                          issues = self._get_recent_issues(repo_full_name)
                          result["recent_issues"] = issues
-
+
              elif search_type == "code":
                  file_url = item.get("file_url", "")
                  if file_url:
@@ -459,197 +588,206 @@ class GitHubSearchEngine(BaseSearchEngine):
                      file_content = self._get_file_content(file_url)
                      result["full_content"] = file_content
                      result["content_type"] = "file"
-
+
              elif search_type == "issue":
                  # For issues, the snippet usually contains a summary already
                  # We'll just keep it as is
                  result["full_content"] = item.get("snippet", "")
                  result["content_type"] = "issue"
-
+
              elif search_type == "user":
                  # For users, construct a profile summary
-                 user_login = item.get("user_login", "")
                  profile_summary = f"GitHub user: {item.get('title', '')}\n"
-
+
                  if item.get("name"):
                      profile_summary += f"Name: {item.get('name')}\n"
-
+
                  if item.get("location"):
                      profile_summary += f"Location: {item.get('location')}\n"
-
+
                  profile_summary += f"Followers: {item.get('followers', 0)}\n"
-                 profile_summary += f"Public repositories: {item.get('public_repos', 0)}\n"
-
+                 profile_summary += (
+                     f"Public repositories: {item.get('public_repos', 0)}\n"
+                 )
+
                  if item.get("snippet") and item.get("snippet") != "No bio provided":
                      profile_summary += f"\nBio: {item.get('snippet')}\n"
-
+
                  result["full_content"] = profile_summary
                  result["content_type"] = "user_profile"
-
+
              results.append(result)
-
+
          return results
-
+
      def search_repository(self, repo_owner: str, repo_name: str) -> Dict[str, Any]:
          """
          Get detailed information about a specific repository.
-
+
          Args:
              repo_owner: Owner of the repository
              repo_name: Name of the repository
-
+
          Returns:
              Dictionary with repository information
          """
          repo_full_name = f"{repo_owner}/{repo_name}"
          logger.info(f"Getting details for repository: {repo_full_name}")
-
+
          try:
              # Get repository details
              response = requests.get(
-                 f"{self.api_base}/repos/{repo_full_name}",
-                 headers=self.headers
+                 f"{self.api_base}/repos/{repo_full_name}", headers=self.headers
              )
-
+
              # Check for rate limiting
              self._handle_rate_limits(response)
-
+
              if response.status_code == 200:
                  repo = response.json()
-
+
                  # Format as repository preview
                  result = self._format_repository_preview(repo)
-
+
                  # Add README content if requested
                  if self.include_readme:
                      readme_content = self._get_readme_content(repo_full_name)
                      result["full_content"] = readme_content
                      result["content_type"] = "readme"
-
+
                  # Add recent issues if requested
                  if self.include_issues:
                      issues = self._get_recent_issues(repo_full_name)
                      result["recent_issues"] = issues
-
+
                  return result
              else:
-                 logger.error(f"Error getting repository details: {response.status_code} - {response.text}")
+                 logger.error(
+                     f"Error getting repository details: {response.status_code} - {response.text}"
+                 )
                  return {}
-
+
          except Exception as e:
              logger.error(f"Error getting repository details: {e}")
              return {}
-
-     def search_code(self, query: str, language: Optional[str] = None, user: Optional[str] = None) -> List[Dict[str, Any]]:
+
+     def search_code(
+         self, query: str, language: Optional[str] = None, user: Optional[str] = None
+     ) -> List[Dict[str, Any]]:
          """
          Search for code with more specific parameters.
-
+
          Args:
              query: Code search query
              language: Filter by programming language
              user: Filter by GitHub username/organization
-
+
          Returns:
              List of code search results
          """
          # Build advanced query
          advanced_query = query
-
+
          if language:
              advanced_query += f" language:{language}"
-
+
          if user:
              advanced_query += f" user:{user}"
-
+
          # Save current search type
          original_search_type = self.search_type
-
+
          try:
              # Set search type to code
              self.search_type = "code"
              self.search_endpoint = f"{self.api_base}/search/code"
-
+
              # Perform search
              results = self._search_github(advanced_query)
-
+
              # Format results
              previews = [self._format_code_preview(result) for result in results]
-
+
              # Get full content if requested
-             if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and not config.SEARCH_SNIPPETS_ONLY:
+             if (
+                 hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                 and not search_config.SEARCH_SNIPPETS_ONLY
+             ):
                  return self._get_full_content(previews)
-
+
              return previews
-
+
          finally:
              # Restore original search type
              self.search_type = original_search_type
              self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
-
-     def search_issues(self, query: str, state: str = "open", sort: str = "updated") -> List[Dict[str, Any]]:
+
+     def search_issues(
+         self, query: str, state: str = "open", sort: str = "updated"
+     ) -> List[Dict[str, Any]]:
          """
          Search for issues with more specific parameters.
-
+
          Args:
              query: Issue search query
              state: Filter by issue state ("open", "closed", "all")
              sort: Sort order ("updated", "created", "comments")
-
+
          Returns:
              List of issue search results
          """
          # Build advanced query
          advanced_query = query + f" state:{state}"
-
+
          # Save current search type
          original_search_type = self.search_type
-
+
          try:
              # Set search type to issues
              self.search_type = "issues"
              self.search_endpoint = f"{self.api_base}/search/issues"
-
+
              # Set sort parameter
              params = {
                  "q": advanced_query,
                  "per_page": min(self.max_results, 100),
                  "page": 1,
                  "sort": sort,
-                 "order": "desc"
+                 "order": "desc",
              }
-
+
              # Perform search
              response = requests.get(
-                 self.search_endpoint,
-                 headers=self.headers,
-                 params=params
+                 self.search_endpoint, headers=self.headers, params=params
              )
-
+
              # Check for rate limiting
              self._handle_rate_limits(response)
-
+
              if response.status_code == 200:
                  data = response.json()
                  results = data.get("items", [])
-
+
                  # Format results
                  previews = [self._format_issue_preview(result) for result in results]
-
+
                  # For issues, we don't need to get full content
                  return previews
              else:
-                 logger.error(f"GitHub API error: {response.status_code} - {response.text}")
+                 logger.error(
+                     f"GitHub API error: {response.status_code} - {response.text}"
+                 )
                  return []
-
+
          finally:
              # Restore original search type
              self.search_type = original_search_type
              self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
-
+
      def set_search_type(self, search_type: str):
          """
          Set the search type for subsequent searches.
-
+
          Args:
              search_type: Type of GitHub search ("repositories", "code", "issues", "users")
          """
@@ -658,4 +796,77 @@ class GitHubSearchEngine(BaseSearchEngine):
              self.search_endpoint = f"{self.api_base}/search/{search_type}"
              logger.info(f"Set GitHub search type to: {search_type}")
          else:
-             logger.error(f"Invalid GitHub search type: {search_type}")
+             logger.error(f"Invalid GitHub search type: {search_type}")
+
+     def _filter_for_relevance(
+         self, previews: List[Dict[str, Any]], query: str
+     ) -> List[Dict[str, Any]]:
+         """
+         Filter GitHub search results for relevance using LLM.
+
+         Args:
+             previews: List of preview dictionaries
+             query: Original search query
+
+         Returns:
+             List of relevant preview dictionaries
+         """
+         if not self.llm or not previews:
+             return previews
+
+         # Create a specialized prompt for GitHub results
+         prompt = f"""Analyze these GitHub search results and rank them by relevance to the query.
+         Consider:
+         1. Repository stars and activity (higher is better)
+         2. Match between query intent and repository description
+         3. Repository language and topics
+         4. Last update time (more recent is better)
+         5. Whether it's a fork (original repositories are preferred)
+
+         Query: "{query}"
+
+         Results:
+         {json.dumps(previews, indent=2)}
+
+         Return ONLY a JSON array of indices in order of relevance (most relevant first).
+         Example: [0, 2, 1, 3]
+         Do not include any other text or explanation."""
+
+         try:
+             response = self.llm.invoke(prompt)
+             response_text = response.content.strip()
+
+             # Extract JSON array from response
+             start_idx = response_text.find("[")
+             end_idx = response_text.rfind("]")
+
+             if start_idx >= 0 and end_idx > start_idx:
+                 array_text = response_text[start_idx : end_idx + 1]
+                 ranked_indices = json.loads(array_text)
+
+                 # Return the results in ranked order
+                 ranked_results = []
+                 for idx in ranked_indices:
+                     if idx < len(previews):
+                         ranked_results.append(previews[idx])
+
+                 # Limit to max_filtered_results if specified
+                 if (
+                     self.max_filtered_results
+                     and len(ranked_results) > self.max_filtered_results
+                 ):
+                     logger.info(
+                         f"Limiting filtered results to top {self.max_filtered_results}"
+                     )
+                     return ranked_results[: self.max_filtered_results]
+
+                 return ranked_results
+             else:
+                 logger.info(
+                     "Could not find JSON array in response, returning no previews"
+                 )
+                 return []
+
+         except Exception as e:
+             logger.error(f"Error filtering GitHub results: {e}")
+             return []
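To put the reworked engine in context, here is a minimal usage sketch based only on the signatures visible in this diff (the constructor arguments, search_code, and search_issues). It is illustrative rather than documented API: within the package the engine is normally constructed through the search engine factory, and the example query strings and argument values below are hypothetical.

import os

from local_deep_research.web_search_engines.engines.search_engine_github import (
    GitHubSearchEngine,
)

engine = GitHubSearchEngine(
    max_results=10,
    api_key=os.getenv("GITHUB_API_KEY"),  # optional; unauthenticated calls hit lower rate limits
    search_type="repositories",
    include_readme=True,
    include_issues=False,
    llm=None,  # _optimize_github_query falls back to llm_config.get_llm() when no LLM is injected
)

# Targeted helpers shown in the hunks above; the arguments are illustrative.
code_hits = engine.search_code("vector similarity search", language="python")
issue_hits = engine.search_issues("good first issue", state="open", sort="updated")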