voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/github_scraper.py — GitHub clearnet intelligence source for VoidAccess.
|
|
3
|
+
|
|
4
|
+
Searches GitHub code and repositories for security-relevant content that
|
|
5
|
+
matches an investigation query. Runs over CLEARNET — GitHub is public and
|
|
6
|
+
does not require Tor.
|
|
7
|
+
|
|
8
|
+
Typical high-signal content found on GitHub:
|
|
9
|
+
- Leaked configs (API keys, credentials, internal endpoints)
|
|
10
|
+
- Malware source code & proof-of-concept exploits
|
|
11
|
+
- C2 / beacon configuration files
|
|
12
|
+
- Threat actor tooling, dropper scripts, stealers
|
|
13
|
+
- Security research write-ups
|
|
14
|
+
|
|
15
|
+
Authentication is OPTIONAL:
|
|
16
|
+
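- Unauthenticated: 10 requests/minute (search API; GitHub's code search endpoint rejects unauthenticated requests, so only repository search works without a token)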
|
|
17
|
+
- Authenticated: 30 requests/minute — set GITHUB_TOKEN to enable
|
|
18
|
+
|
|
19
|
+
Public API:
|
|
20
|
+
async def scrape_github(
|
|
21
|
+
query: str,
|
|
22
|
+
refined_query: str = "",
|
|
23
|
+
max_results: int = 15,
|
|
24
|
+
) -> list[dict]
|
|
25
|
+
|
|
26
|
+
Returns page dicts compatible with the existing extraction pipeline:
|
|
27
|
+
{
|
|
28
|
+
"url": str,
|
|
29
|
+
"text_content": str,
|
|
30
|
+
"title": str,
|
|
31
|
+
"source_type": "github",
|
|
32
|
+
"source_name": "GitHub",
|
|
33
|
+
"github_repo": str,
|
|
34
|
+
"github_filename": str,
|
|
35
|
+
"github_stars": int,
|
|
36
|
+
"scraped_at": str,
|
|
37
|
+
"word_count": int,
|
|
38
|
+
"relevance": int,
|
|
39
|
+
}
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import asyncio
|
|
45
|
+
import base64
|
|
46
|
+
import logging
|
|
47
|
+
import os
|
|
48
|
+
import re
|
|
49
|
+
from datetime import datetime, timezone
|
|
50
|
+
from typing import Optional
|
|
51
|
+
|
|
52
|
+
import aiohttp
|
|
53
|
+
|
|
54
|
+
from utils.content_safety import (
|
|
55
|
+
is_blocked_query,
|
|
56
|
+
is_blocked_url,
|
|
57
|
+
sanitize_content,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
logger = logging.getLogger(__name__)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# Constants
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
# Base URLs for the GitHub REST API and for raw file downloads.
GITHUB_API_BASE = "https://api.github.com"
GITHUB_RAW_BASE = "https://raw.githubusercontent.com"

# Upper bound applied to fetched file content (200 * 1024).
MAX_FILE_SIZE = 200 * 1024

# Per-search-type caps on how many hits are requested and processed.
MAX_CODE_RESULTS = 10
MAX_REPO_RESULTS = 5

# Cap on the number of GitHub items returned for one investigation.
MAX_TOTAL_RESULTS = 15

# Pause (seconds) inserted between requests:
#   unauthenticated -> 10 requests/min -> 6s
#   authenticated   -> 30 requests/min -> 2s
RATE_LIMIT_DELAY_UNAUTH = 6.0
RATE_LIMIT_DELAY_AUTH = 2.0

# File extensions considered security-relevant enough to fetch.
SECURITY_EXTENSIONS = {
    # Source code
    ".py",
    ".js",
    ".ts",
    ".go",
    ".rs",
    ".c",
    ".cpp",
    ".cs",
    ".java",
    # Scripts
    ".sh",
    ".bash",
    ".ps1",
    ".bat",
    # Structured configuration
    ".yaml",
    ".yml",
    ".json",
    ".toml",
    ".conf",
    ".config",
    ".ini",
    ".env",
    # Plain text
    ".txt",
    ".md",
    ".log",
}

# Exact file names that are fetched regardless of extension and that earn a
# relevance bonus.
HIGH_VALUE_FILENAMES = {
    # Configuration
    "config.py",
    "config.js",
    "config.json",
    "settings.py",
    "settings.json",
    # Tooling / malware components
    "malware.py",
    "rat.py",
    "stealer.py",
    "c2.py",
    "c2.js",
    "server.py",
    "payload.py",
    "dropper.py",
    "keylogger.py",
    "ransomware.py",
    "exploit.py",
    "exploit.js",
    # Data dumps
    "credentials.txt",
    "passwords.txt",
    "victims.txt",
    "targets.txt",
    # Documentation
    "README.md",
    "README.txt",
}

# Regexes (applied to the lower-cased repository name) for repositories
# treated as noise: tutorials, awesome lists, and similar.
SKIP_REPO_PATTERNS = [
    r"awesome-.*",
    r".*-tutorial",
    r".*-course",
    r".*-book",
    r".*-cheatsheet",
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
# Scraper
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class GitHubScraper:
|
|
126
|
+
"""
|
|
127
|
+
Scrapes GitHub for security-relevant content using the GitHub Search API.
|
|
128
|
+
Works with or without authentication.
|
|
129
|
+
"""
|
|
130
|
+
|
|
131
|
+
def __init__(self):
    """Read the optional GitHub token and pick the matching request delay.

    The token comes from the ``GITHUB_TOKEN`` environment variable with
    surrounding whitespace removed. No HTTP session is created here; it is
    opened when the scraper is entered as an async context manager.
    """
    raw_token = os.environ.get("GITHUB_TOKEN", "")
    self._token = raw_token.strip()
    self._session: Optional[aiohttp.ClientSession] = None

    # A configured token selects the shorter, authenticated delay.
    if self._token:
        self._rate_limit_delay = RATE_LIMIT_DELAY_AUTH
    else:
        self._rate_limit_delay = RATE_LIMIT_DELAY_UNAUTH
|
|
137
|
+
|
|
138
|
+
@property
def _headers(self) -> dict:
    """Return the HTTP headers sent with GitHub API requests.

    The ``Authorization`` entry is only present when a token is configured.
    """
    auth_header = (
        {"Authorization": f"Bearer {self._token}"} if self._token else {}
    )
    return {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "VoidAccess-OSINT/1.1",
        **auth_header,
    }
|
|
148
|
+
|
|
149
|
+
async def __aenter__(self):
    """Open the aiohttp session used for all requests and return ``self``.

    The session carries the scraper's default headers and a 30-second
    total timeout.
    """
    default_headers = self._headers
    request_timeout = aiohttp.ClientTimeout(total=30)
    self._session = aiohttp.ClientSession(
        headers=default_headers,
        timeout=request_timeout,
    )
    return self
|
|
155
|
+
|
|
156
|
+
async def __aexit__(self, *args):
    """Close the HTTP session, if one is open, and forget it.

    The reference is cleared before closing so that the
    ``if not self._session`` guards in the search/fetch methods see the
    scraper as closed instead of trying to reuse a closed session. Calling
    this again after the session is gone is a no-op.

    Args:
        *args: Exception type, value and traceback passed by the
            ``async with`` protocol; they are ignored, so exceptions are
            never suppressed.
    """
    session, self._session = self._session, None
    if session is not None:
        await session.close()
|
|
159
|
+
|
|
160
|
+
async def search_and_fetch(
    self,
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_RESULTS,
) -> list[dict]:
    """Search GitHub and return page dicts for the extraction pipeline.

    Runs a code search and a repository search concurrently for the
    primary query, merges the hits, removes duplicate URLs, and returns
    the highest-relevance pages.

    Args:
        query: The investigation query as entered.
        refined_query: Optional refined query; when non-empty it is the
            text actually searched, so it is screened as well.
        max_results: Maximum number of pages to return. Values <= 0 yield
            an empty list without contacting GitHub.

    Returns:
        Page dicts sorted by descending ``relevance``; an empty list when
        the query is blocked, empty after sanitisation, or nothing was
        found.
    """
    if max_results <= 0:
        return []

    # Screen every text that can reach the outgoing search, not only the
    # raw query: the refined query takes precedence when building it.
    blocked, _ = is_blocked_query(query)
    if not blocked and refined_query:
        blocked, _ = is_blocked_query(refined_query)
    if blocked:
        logger.warning("GitHub scraping blocked — prohibited query")
        return []

    search_queries = self._build_search_queries(query, refined_query)
    primary_query = search_queries[0] if search_queries else ""
    if not primary_query.strip():
        # Nothing searchable is left once punctuation has been stripped.
        logger.info("GitHub scraping skipped: empty search query")
        return []

    auth_status = "authenticated" if self._token else "unauthenticated"
    logger.info(
        "GitHub scraping (%s): '%s'",
        auth_status,
        (query or "")[:50],
    )

    # NOTE: only the primary query is searched; any tool-specific variant
    # returned by _build_search_queries is not issued here.
    outcomes = await asyncio.gather(
        self._search_code(primary_query),
        self._search_repos(primary_query),
        return_exceptions=True,
    )

    merged: list[dict] = []
    seen_urls: set[str] = set()
    for label, outcome in zip(("code", "repo"), outcomes):
        if not isinstance(outcome, list):
            # gather() hands back the exception object instead of raising.
            logger.debug("GitHub %s search failed: %s", label, outcome)
            continue
        for page in outcome:
            url = page.get("url", "")
            if url and url not in seen_urls:
                seen_urls.add(url)
                merged.append(page)

    merged.sort(key=lambda page: page.get("relevance", 0), reverse=True)
    final = merged[:max_results]

    logger.info("GitHub scraping: %d results found", len(final))
    return final
|
|
215
|
+
|
|
216
|
+
def _build_search_queries(
    self,
    query: str,
    refined_query: str,
) -> list[str]:
    """Build up to two GitHub search strings for an investigation query.

    The first entry is always the sanitised base query (``refined_query``
    if given, otherwise ``query``): characters other than word characters,
    whitespace, ``-`` and ``.`` are replaced by spaces, whitespace runs are
    collapsed, and the result is limited to 100 characters. It may be an
    empty string when nothing searchable remains.

    If the original ``query`` mentions a known tool, a second
    tool-specific variant is appended unless it equals the base query.

    Args:
        query: The investigation query as entered.
        refined_query: Optional refined query that overrides ``query`` for
            the base search string.

    Returns:
        A list with one or two search strings; index 0 is the base query.
    """
    # Tool name -> extra search term or qualifier for its variant query.
    tool_modifiers = {
        "cobalt strike": "malleable",
        "metasploit": "language:ruby",
        "mimikatz": "language:c",
        "covenant": "language:csharp",
        "sliver": "language:go",
        "havoc": "language:c",
        "brute ratel": "config",
    }

    raw = refined_query or query or ""
    # Collapse whitespace before truncating so that the 100-character
    # budget is not spent on the gaps left by removed punctuation, and
    # strip again afterwards because the cut can land just after a space.
    collapsed = " ".join(re.sub(r"[^\w\s\-.]", " ", raw).split())
    base = collapsed[:100].rstrip()
    queries: list[str] = [base]

    query_lower = (query or "").lower()
    for tool, modifier in tool_modifiers.items():
        if tool in query_lower:
            variant = f"{tool} {modifier}"
            if variant != base:
                queries.append(variant)
            break  # at most one tool-specific variant

    return queries[:2]
|
|
249
|
+
|
|
250
|
+
async def _search_code(self, search_query: str) -> list[dict]:
    """Search GitHub code and fetch the content of matching files.

    Queries ``/search/code`` for ``search_query``, drops hits from noise
    repositories and hits whose file name or extension is not considered
    security relevant, then downloads the remaining files concurrently
    through ``_fetch_code_file``.

    Args:
        search_query: Search string to send to GitHub.

    Returns:
        Page dicts that have non-empty ``text_content``. An empty list is
        returned when no session is open, when GitHub answers with a
        non-200 status, or when an error occurs (errors are logged, never
        raised).
    """
    if not self._session:
        return []

    results: list[dict] = []

    try:
        params = {
            "q": search_query,
            "per_page": MAX_CODE_RESULTS,
            "sort": "indexed",
            "order": "desc",
        }

        async with self._session.get(
            f"{GITHUB_API_BASE}/search/code",
            params=params,
        ) as resp:
            status = resp.status

            # GitHub signals rate limiting with either 403 or 429.
            if status in (403, 429):
                retry_after = str(resp.headers.get("Retry-After", ""))
                reset_epoch = str(resp.headers.get("X-RateLimit-Reset", ""))
                if retry_after.isdigit():
                    # Retry-After is already a number of seconds.
                    wait_seconds = int(retry_after)
                elif reset_epoch.isdigit():
                    # X-RateLimit-Reset is a UTC epoch timestamp, not a
                    # duration, so convert it to "seconds from now".
                    now = int(datetime.now(timezone.utc).timestamp())
                    wait_seconds = max(0, int(reset_epoch) - now)
                else:
                    wait_seconds = 60
                logger.warning(
                    "GitHub rate limit hit. Reset in %ss",
                    wait_seconds,
                )
                return []

            if status == 401:
                logger.warning(
                    "GitHub code search unauthorized (HTTP 401): "
                    "set a valid GITHUB_TOKEN"
                )
                return []

            if status == 422:
                logger.debug(
                    "GitHub code search query invalid: %s",
                    search_query,
                )
                return []

            if status != 200:
                logger.debug("GitHub code search returned HTTP %s", status)
                return []

            data = await resp.json()

        items = data.get("items") or []

        await asyncio.sleep(self._rate_limit_delay)

        selected = []
        for item in items[:MAX_CODE_RESULTS]:
            repo_name = (item.get("repository") or {}).get("name", "")
            if self._is_noise_repo(repo_name):
                continue

            filename = item.get("name", "") or ""
            _, dot, suffix = filename.rpartition(".")
            ext = f".{suffix}".lower() if dot else ""

            if (
                ext not in SECURITY_EXTENSIONS
                and filename not in HIGH_VALUE_FILENAMES
            ):
                continue

            selected.append(item)

        if selected:
            # Coroutines are created inside the gather() call so that none
            # is left un-awaited if filtering fails part-way through.
            fetched = await asyncio.gather(
                *(self._fetch_code_file(item) for item in selected),
                return_exceptions=True,
            )
            results.extend(
                page
                for page in fetched
                if isinstance(page, dict) and page.get("text_content")
            )

    except Exception as e:
        logger.debug("GitHub code search error: %s", e)

    return results
|
|
323
|
+
|
|
324
|
+
async def _fetch_code_file(self, item: dict) -> dict:
    """Download one code-search hit and turn it into a page dict.

    The file body is requested from the hit's ``git_url``, base64-decoded,
    capped at ``MAX_FILE_SIZE`` characters and passed through
    ``sanitize_content``.

    Args:
        item: One entry of the code-search ``items`` list.

    Returns:
        A page dict, or ``{}`` when there is no session, the hit has no
        ``git_url``, its ``html_url`` is blocked, the request does not
        return 200, the content is empty, undecodable, flagged, or shorter
        than 30 characters after stripping, or any error occurs.
    """
    session = self._session
    if not session:
        return {}

    try:
        blob_url = item.get("git_url", "")
        page_url = item.get("html_url", "")
        if not blob_url:
            return {}

        url_blocked, _ = is_blocked_url(page_url)
        if url_blocked:
            return {}

        async with session.get(blob_url) as response:
            if response.status != 200:
                return {}
            payload = await response.json()

        await asyncio.sleep(self._rate_limit_delay / 2)

        # The base64 payload arrives wrapped across lines.
        encoded = payload.get("content", "").replace("\n", "")
        if not encoded:
            return {}

        try:
            raw_bytes = base64.b64decode(encoded)
            text = raw_bytes.decode("utf-8", errors="ignore")
        except Exception:
            return {}

        # Slicing is a no-op for content already within the limit.
        text = text[:MAX_FILE_SIZE]

        cleaned, flagged = sanitize_content(text)
        if flagged:
            return {}
        if not cleaned or len(cleaned.strip()) < 30:
            return {}

        repo_info = item.get("repository", {})
        repo_full_name = repo_info.get("full_name", "unknown")
        file_name = item.get("name", "")
        page_title = f"GitHub: {repo_full_name} — {file_name}"
        score = self._score_relevance(cleaned, file_name, repo_full_name)

        page = {
            "url": page_url,
            "text_content": cleaned,
            "title": page_title,
            "source_type": "github",
            "source_name": "GitHub",
            "github_repo": repo_full_name,
            "github_filename": file_name,
            "github_stars": repo_info.get("stargazers_count", 0),
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "word_count": len(cleaned.split()),
            "relevance": score,
        }
        return page

    except Exception as e:
        logger.debug("GitHub file fetch error: %s", e)
        return {}
|
|
393
|
+
|
|
394
|
+
async def _search_repos(self, search_query: str) -> list[dict]:
    """Search GitHub repositories and fetch the README of each match.

    Queries ``/search/repositories`` for ``search_query`` (most recently
    updated first), skips noise repositories, and downloads the READMEs of
    the rest concurrently through ``_fetch_repo_readme``.

    Args:
        search_query: Search string to send to GitHub.

    Returns:
        Page dicts that have non-empty ``text_content``. An empty list is
        returned when no session is open, when GitHub answers with a
        non-200 status, or when an error occurs (errors are logged, never
        raised).
    """
    if not self._session:
        return []

    results: list[dict] = []

    try:
        params = {
            "q": search_query,
            "per_page": MAX_REPO_RESULTS,
            "sort": "updated",
            "order": "desc",
        }

        async with self._session.get(
            f"{GITHUB_API_BASE}/search/repositories",
            params=params,
        ) as resp:
            status = resp.status

            # GitHub signals rate limiting with either 403 or 429.
            if status in (403, 429):
                logger.warning("GitHub rate limit on repo search")
                return []

            if status != 200:
                # Leave a trace instead of failing silently.
                logger.debug("GitHub repo search returned HTTP %s", status)
                return []

            data = await resp.json()

        items = data.get("items") or []

        await asyncio.sleep(self._rate_limit_delay)

        candidates = [
            item
            for item in items[:MAX_REPO_RESULTS]
            if not self._is_noise_repo(item.get("name", ""))
        ]

        if candidates:
            # Coroutines are created inside the gather() call so that none
            # is left un-awaited if filtering fails part-way through.
            fetched = await asyncio.gather(
                *(self._fetch_repo_readme(item) for item in candidates),
                return_exceptions=True,
            )
            results.extend(
                page
                for page in fetched
                if isinstance(page, dict) and page.get("text_content")
            )

    except Exception as e:
        logger.debug("GitHub repo search error: %s", e)

    return results
|
|
444
|
+
|
|
445
|
+
async def _fetch_repo_readme(self, repo: dict) -> dict:
    """Download a repository's README and turn it into a page dict.

    The README is requested from ``/repos/{full_name}/readme``,
    base64-decoded, capped at ``MAX_FILE_SIZE`` characters and passed
    through ``sanitize_content``.

    Args:
        repo: One entry of the repository-search ``items`` list.

    Returns:
        A page dict, or ``{}`` when there is no session, the repository
        has no ``full_name``, the request does not return 200, the content
        is empty, undecodable, flagged, or shorter than 50 characters
        after stripping, or any error occurs.
    """
    session = self._session
    if not session:
        return {}

    try:
        repo_full_name = repo.get("full_name", "")
        if not repo_full_name:
            return {}

        endpoint = f"{GITHUB_API_BASE}/repos/{repo_full_name}/readme"

        async with session.get(endpoint) as response:
            if response.status != 200:
                return {}
            payload = await response.json()

        await asyncio.sleep(self._rate_limit_delay / 2)

        # The base64 payload arrives wrapped across lines.
        encoded = payload.get("content", "").replace("\n", "")
        if not encoded:
            return {}

        try:
            raw_bytes = base64.b64decode(encoded)
            text = raw_bytes.decode("utf-8", errors="ignore")
        except Exception:
            return {}

        # Slicing is a no-op for content already within the limit.
        text = text[:MAX_FILE_SIZE]

        cleaned, flagged = sanitize_content(text)
        if not (not flagged and cleaned and len(cleaned.strip()) >= 50):
            return {}

        page_url = repo.get(
            "html_url", f"https://github.com/{repo_full_name}"
        )

        page = {
            "url": page_url,
            "text_content": cleaned,
            "title": f"GitHub: {repo_full_name} — README",
            "source_type": "github",
            "source_name": "GitHub",
            "github_repo": repo_full_name,
            "github_filename": "README",
            "github_stars": repo.get("stargazers_count", 0),
            "github_description": repo.get("description", ""),
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "word_count": len(cleaned.split()),
            "relevance": self._score_relevance(
                cleaned, "README", repo_full_name
            ),
        }
        return page

    except Exception as e:
        logger.debug("GitHub README fetch error: %s", e)
        return {}
|
|
510
|
+
|
|
511
|
+
def _is_noise_repo(self, repo_name: str) -> bool:
    """Return True if the repository name matches a skip pattern.

    The name is lower-cased and tested against every entry of
    ``SKIP_REPO_PATTERNS`` with ``re.match``, which anchors at the start of
    the name only. A falsy name is treated as the empty string.
    """
    candidate = repo_name.lower() if repo_name else ""
    return any(
        re.match(skip_pattern, candidate) is not None
        for skip_pattern in SKIP_REPO_PATTERNS
    )
|
|
518
|
+
|
|
519
|
+
def _score_relevance(
    self,
    content: str,
    filename: str,
    repo_name: str,
) -> int:
    """Score how security-relevant a fetched file is.

    Scoring:
        * +5 when ``filename`` is one of ``HIGH_VALUE_FILENAMES``;
        * +3 for each indicator pattern found in the content (MD5/SHA256
          hex strings, CVE ids, IPv4 addresses, .onion hosts, PGP blocks,
          AWS access key ids);
        * +2 for each security keyword found in the content.

    Args:
        content: File text; ``None`` or empty text scores no content points.
        filename: File name used for the high-value bonus.
        repo_name: Accepted for interface compatibility; not used in
            scoring.

    Returns:
        The accumulated integer score (0 when nothing matches).
    """
    text = content or ""
    text_lower = text.lower()
    score = 0

    if filename in HIGH_VALUE_FILENAMES:
        score += 5

    # (pattern, flags): case is ignored only where it carries no meaning.
    # AWS access key ids are upper-case, so matching them case-insensitively
    # would flag arbitrary lower-case words starting with "akia".
    ioc_patterns = [
        (r"\b[A-Fa-f0-9]{32}\b", 0),  # MD5
        (r"\b[A-Fa-f0-9]{64}\b", 0),  # SHA256
        (r"\bCVE-\d{4}-\d+\b", re.IGNORECASE),
        (r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", 0),  # IPv4
        (r"[a-zA-Z2-7]{16,56}\.onion", re.IGNORECASE),
        (r"-----BEGIN PGP", re.IGNORECASE),
        (r"AKIA[0-9A-Z]{16}", 0),  # AWS access key
    ]
    for pattern, flags in ioc_patterns:
        if re.search(pattern, text, flags):
            score += 3

    sec_keywords = [
        "malware", "ransomware", "c2",
        "command and control", "botnet",
        "stealer", "rat", "trojan",
        "exploit", "payload", "shellcode",
        "cobalt strike", "beacon",
        "mimikatz", "credential",
        "lateral movement", "persistence",
    ]
    for kw in sec_keywords:
        if len(kw) <= 3:
            # Very short keywords such as "rat" and "c2" occur inside
            # ordinary words ("generate", "configuration"), so require
            # that they are not glued to other letters or digits. An
            # underscore still counts as a separator ("c2_server").
            found = re.search(
                rf"(?<![a-z0-9]){re.escape(kw)}s?(?![a-z0-9])",
                text_lower,
            ) is not None
        else:
            found = kw in text_lower
        if found:
            score += 2

    return score
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
# ---------------------------------------------------------------------------
|
|
562
|
+
# Module-level helpers
|
|
563
|
+
# ---------------------------------------------------------------------------
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def _is_github_scraping_enabled() -> bool:
    """Return whether GitHub scraping is switched on.

    Reads ``GITHUB_SCRAPING_ENABLED`` at call time (so tests can
    monkey-patch the environment). The feature is enabled when the
    variable is unset or holds ``true``, ``1``, ``yes`` or ``on``,
    compared case-insensitively and ignoring surrounding whitespace; any
    other value disables it.
    """
    raw_value = os.getenv("GITHUB_SCRAPING_ENABLED", "true")
    return raw_value.strip().lower() in {"true", "1", "yes", "on"}
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
async def scrape_github(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_RESULTS,
) -> list[dict]:
    """Entry point for GitHub scraping.

    When scraping is enabled, opens a ``GitHubScraper`` for the duration of
    the call and delegates to ``search_and_fetch``; otherwise logs that
    scraping is disabled and returns an empty list.

    Returns:
        Page dicts compatible with the extraction pipeline.
    """
    if _is_github_scraping_enabled():
        async with GitHubScraper() as scraper:
            pages = await scraper.search_and_fetch(
                query=query,
                refined_query=refined_query,
                max_results=max_results,
            )
        return pages

    logger.info("GitHub scraping disabled")
    return []
|