voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,589 @@
1
+ """
2
+ sources/github_scraper.py — GitHub clearnet intelligence source for VoidAccess.
3
+
4
+ Searches GitHub code and repositories for security-relevant content that
5
+ matches an investigation query. Runs over CLEARNET — GitHub is public and
6
+ does not require Tor.
7
+
8
+ Typical high-signal content found on GitHub:
9
+ - Leaked configs (API keys, credentials, internal endpoints)
10
+ - Malware source code & proof-of-concept exploits
11
+ - C2 / beacon configuration files
12
+ - Threat actor tooling, dropper scripts, stealers
13
+ - Security research write-ups
14
+
15
+ Authentication is OPTIONAL:
16
+ - Unauthenticated: 10 requests/minute (search API)
17
+ - Authenticated: 30 requests/minute — set GITHUB_TOKEN to enable
18
+
19
+ Public API:
20
+ async def scrape_github(
21
+ query: str,
22
+ refined_query: str = "",
23
+ max_results: int = 15,
24
+ ) -> list[dict]
25
+
26
+ Returns page dicts compatible with the existing extraction pipeline:
27
+ {
28
+ "url": str,
29
+ "text_content": str,
30
+ "title": str,
31
+ "source_type": "github",
32
+ "source_name": "GitHub",
33
+ "github_repo": str,
34
+ "github_filename": str,
35
+ "github_stars": int,
36
+ "scraped_at": str,
37
+ "word_count": int,
38
+ "relevance": int,
39
+ }
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import asyncio
45
+ import base64
46
+ import logging
47
+ import os
48
+ import re
49
+ from datetime import datetime, timezone
50
+ from typing import Optional
51
+
52
+ import aiohttp
53
+
54
+ from utils.content_safety import (
55
+ is_blocked_query,
56
+ is_blocked_url,
57
+ sanitize_content,
58
+ )
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Constants
65
+ # ---------------------------------------------------------------------------
66
+
67
# Roots of the GitHub REST API and of the raw-file host.
GITHUB_API_BASE = "https://api.github.com"
GITHUB_RAW_BASE = "https://raw.githubusercontent.com"

# Decoded file content longer than this is truncated (200 * 1024).
MAX_FILE_SIZE = 200 * 1024

# Page sizes requested from each search endpoint.
MAX_CODE_RESULTS = 10
MAX_REPO_RESULTS = 5

# Upper bound on GitHub items returned for one investigation.
MAX_TOTAL_RESULTS = 15

# Pauses (seconds) inserted after requests to stay under the search limits:
#   unauthenticated -> 10 requests/minute -> 6s
#   authenticated   -> 30 requests/minute -> 2s
RATE_LIMIT_DELAY_UNAUTH = 6.0
RATE_LIMIT_DELAY_AUTH = 2.0

# Extensions of files worth fetching from code-search hits.
SECURITY_EXTENSIONS = {
    # source code
    ".py", ".js", ".ts", ".go", ".rs", ".c", ".cpp", ".cs", ".java",
    # scripts
    ".sh", ".bash", ".ps1", ".bat",
    # structured configuration
    ".yaml", ".yml", ".json", ".toml", ".conf", ".config", ".ini", ".env",
    # plain text
    ".txt", ".md", ".log",
}

# Exact file names that are fetched regardless of extension and that
# receive a relevance bonus.
HIGH_VALUE_FILENAMES = {
    "config.py",
    "config.js",
    "config.json",
    "settings.py",
    "settings.json",
    "malware.py",
    "rat.py",
    "stealer.py",
    "c2.py",
    "c2.js",
    "server.py",
    "payload.py",
    "dropper.py",
    "keylogger.py",
    "ransomware.py",
    "exploit.py",
    "exploit.js",
    "credentials.txt",
    "passwords.txt",
    "victims.txt",
    "targets.txt",
    "README.md",
    "README.txt",
}

# Regexes (applied with re.match to the lowercased repository name) that
# mark a repository as noise: awesome lists, tutorials, courses, and so on.
SKIP_REPO_PATTERNS = [
    r"awesome-.*",
    r".*-tutorial",
    r".*-course",
    r".*-book",
    r".*-cheatsheet",
]
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Scraper
122
+ # ---------------------------------------------------------------------------
123
+
124
+
125
class GitHubScraper:
    """
    Scrapes GitHub for security-relevant content using the GitHub Search API.

    Intended to be used as an async context manager: the HTTP session is
    created in ``__aenter__`` and closed in ``__aexit__``. Every search and
    fetch method returns an empty result when no session is open.

    A token (``GITHUB_TOKEN``) is optional.
    NOTE(review): GitHub's REST documentation states that ``/search/code``
    requires authentication, so without a token only repository results are
    to be expected — confirm against current API behaviour.
    """

    # Extra search terms for well-known tooling names (first match wins).
    _TOOL_QUERY_MODIFIERS = {
        "cobalt strike": "malleable",
        "metasploit": "language:ruby",
        "mimikatz": "language:c",
        "covenant": "language:csharp",
        "sliver": "language:go",
        "havoc": "language:c",
        "brute ratel": "config",
    }

    # Indicator-of-compromise patterns; each pattern that occurs adds 3.
    _IOC_REGEXES = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"\b[A-Fa-f0-9]{32}\b",  # MD5
            r"\b[A-Fa-f0-9]{64}\b",  # SHA256
            r"\bCVE-\d{4}-\d+\b",
            r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",  # IPv4
            r"[a-zA-Z2-7]{16,56}\.onion",
            r"-----BEGIN PGP",
            r"AKIA[0-9A-Z]{16}",  # AWS access key
        )
    )

    # Security keywords; each keyword that occurs adds 2. Keywords of three
    # characters or fewer ("rat", "c2") must match as whole words (optionally
    # plural), because as plain substrings they hit ordinary words such as
    # "generate" or "configuration". Longer keywords keep substring matching.
    _KEYWORD_REGEXES = tuple(
        re.compile(
            rf"\b{re.escape(kw)}s?\b" if len(kw) <= 3 else re.escape(kw)
        )
        for kw in (
            "malware", "ransomware", "c2",
            "command and control", "botnet",
            "stealer", "rat", "trojan",
            "exploit", "payload", "shellcode",
            "cobalt strike", "beacon",
            "mimikatz", "credential",
            "lateral movement", "persistence",
        )
    )

    def __init__(self):
        # Token is read once at construction; an empty value means
        # unauthenticated requests and the longer inter-request delay.
        self._token = os.getenv("GITHUB_TOKEN", "").strip()
        self._session: Optional[aiohttp.ClientSession] = None
        self._rate_limit_delay = (
            RATE_LIMIT_DELAY_AUTH if self._token else RATE_LIMIT_DELAY_UNAUTH
        )

    @property
    def _headers(self) -> dict:
        """Request headers; ``Authorization`` is added only with a token."""
        headers = {
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
            "User-Agent": "VoidAccess-OSINT/1.1",
        }
        if self._token:
            headers["Authorization"] = f"Bearer {self._token}"
        return headers

    async def __aenter__(self):
        """Open the shared HTTP session (30s total timeout per request)."""
        self._session = aiohttp.ClientSession(
            headers=self._headers,
            timeout=aiohttp.ClientTimeout(total=30),
        )
        return self

    async def __aexit__(self, *args):
        """Close the HTTP session and drop the reference to it."""
        if self._session:
            await self._session.close()
            # A closed session must not be reused; with None the search
            # methods short-circuit instead of failing on a closed session.
            self._session = None

    async def search_and_fetch(
        self,
        query: str,
        refined_query: str = "",
        max_results: int = MAX_TOTAL_RESULTS,
    ) -> list[dict]:
        """
        Search GitHub and return page dicts compatible with the extraction
        pipeline.

        Args:
            query: The investigation query.
            refined_query: Optional refined form; when non-empty it is the
                text actually sent to GitHub.
            max_results: Maximum number of page dicts to return.

        Returns:
            Page dicts sorted by descending ``relevance``, de-duplicated by
            ``url``. Empty when the query is blocked, sanitises to nothing,
            or both searches fail.
        """
        # Screen every string that may end up in the request, not only the
        # raw query: the refined query takes precedence when building the
        # search text.
        for candidate in (query, refined_query):
            if not candidate:
                continue
            blocked, _ = is_blocked_query(candidate)
            if blocked:
                logger.warning("GitHub scraping blocked — prohibited query")
                return []

        search_queries = self._build_search_queries(query, refined_query)
        # Only the primary query is searched; the tool-specific variant
        # built by _build_search_queries is currently not sent.
        primary_query = search_queries[0] if search_queries else ""
        if not primary_query:
            # Sanitisation can strip a query down to nothing; GitHub rejects
            # an empty "q", so skip the two wasted requests.
            logger.debug("GitHub scraping skipped — empty search query")
            return []

        auth_status = "authenticated" if self._token else "unauthenticated"
        logger.info(
            "GitHub scraping (%s): '%s'",
            auth_status,
            (query or "")[:50],
        )

        code_results, repo_results = await asyncio.gather(
            self._search_code(primary_query),
            self._search_repos(primary_query),
            return_exceptions=True,
        )

        all_results: list[dict] = []
        seen_urls: set[str] = set()
        # Code results are merged first so they win URL collisions.
        for batch in (code_results, repo_results):
            if not isinstance(batch, list):
                logger.debug("GitHub search task failed: %s", batch)
                continue
            for item in batch:
                url = item.get("url", "")
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_results.append(item)

        all_results.sort(key=lambda x: x.get("relevance", 0), reverse=True)
        # Clamp so a negative limit cannot slice from the end of the list.
        final = all_results[: max(0, max_results)]

        logger.info("GitHub scraping: %d results found", len(final))
        return final

    def _build_search_queries(
        self,
        query: str,
        refined_query: str,
    ) -> list[str]:
        """
        Build GitHub search queries. GitHub code search has specific syntax —
        keep queries short and reasonably safe.

        Returns at most two strings: the sanitised base query (characters
        other than word characters, whitespace, ``-`` and ``.`` replaced by
        spaces, capped at 100 characters) and, when ``query`` names a known
        tool, a tool-specific variant.
        """
        base = refined_query or query
        base = re.sub(r"[^\w\s\-.]", " ", base).strip()[:100]
        queries: list[str] = [base]

        query_lower = query.lower()
        for tool, modifier in self._TOOL_QUERY_MODIFIERS.items():
            if tool in query_lower:
                queries.append(f"{tool} {modifier}")
                break

        return queries[:2]

    @staticmethod
    def _seconds_until_reset(headers) -> int:
        """
        Return how many seconds remain until the rate limit lifts.

        Prefers ``Retry-After`` (already in seconds). Otherwise converts
        ``X-RateLimit-Reset`` — an epoch timestamp, not a duration — into
        seconds from now. Falls back to 60 when neither header is usable.
        """
        retry_after = headers.get("Retry-After")
        if retry_after is not None:
            try:
                return max(0, int(retry_after))
            except (TypeError, ValueError):
                # Not a plain number of seconds; try the reset header.
                pass

        try:
            reset_epoch = int(headers.get("X-RateLimit-Reset"))
        except (TypeError, ValueError):
            return 60
        now_epoch = int(datetime.now(timezone.utc).timestamp())
        return max(0, reset_epoch - now_epoch)

    @staticmethod
    def _is_fetchable_file(filename: str) -> bool:
        """True when the extension or exact file name is worth fetching."""
        # rsplit (not os.path.splitext) so dot-files such as ".env" keep
        # their name as the extension.
        ext = "." + filename.rsplit(".", 1)[-1] if "." in filename else ""
        return (
            ext.lower() in SECURITY_EXTENSIONS
            or filename in HIGH_VALUE_FILENAMES
        )

    @staticmethod
    def _decode_base64_text(payload: dict) -> str:
        """
        Decode the base64 ``content`` field of a GitHub API payload.

        Returns the text truncated to ``MAX_FILE_SIZE`` characters, or an
        empty string when the field is missing, null, or not valid base64.
        Undecodable bytes are dropped.
        """
        encoded = (payload.get("content") or "").replace("\n", "")
        if not encoded:
            return ""
        try:
            text = base64.b64decode(encoded).decode("utf-8", errors="ignore")
        except (TypeError, ValueError):
            # binascii.Error (bad padding/alphabet) is a ValueError.
            return ""
        return text[:MAX_FILE_SIZE]

    async def _search_code(self, search_query: str) -> list[dict]:
        """Search GitHub code and fetch file content."""
        if not self._session:
            return []

        results: list[dict] = []

        try:
            params = {
                "q": search_query,
                "per_page": MAX_CODE_RESULTS,
                "sort": "indexed",
                "order": "desc",
            }

            async with self._session.get(
                f"{GITHUB_API_BASE}/search/code",
                params=params,
            ) as resp:
                # 403 and 429 are both treated as rate limiting here.
                if resp.status in (403, 429):
                    logger.warning(
                        "GitHub rate limit hit. Reset in %ss",
                        self._seconds_until_reset(resp.headers),
                    )
                    return []

                if resp.status == 422:
                    logger.debug(
                        "GitHub code search query invalid: %s",
                        search_query,
                    )
                    return []

                if resp.status != 200:
                    logger.debug(
                        "GitHub code search returned HTTP %s", resp.status
                    )
                    return []

                data = await resp.json()
                items = data.get("items", [])

            await asyncio.sleep(self._rate_limit_delay)

            fetch_tasks = []
            for item in items[:MAX_CODE_RESULTS]:
                repo_name = (item.get("repository") or {}).get("name", "")
                if self._is_noise_repo(repo_name):
                    continue
                if not self._is_fetchable_file(item.get("name", "")):
                    continue
                fetch_tasks.append(self._fetch_code_file(item))

            if fetch_tasks:
                fetched = await asyncio.gather(
                    *fetch_tasks, return_exceptions=True
                )
                results.extend(
                    f
                    for f in fetched
                    if isinstance(f, dict) and f.get("text_content")
                )

        except Exception as e:
            # Best-effort source: a failure here must not abort the caller.
            logger.debug("GitHub code search error: %s", e)

        return results

    async def _fetch_code_file(self, item: dict) -> dict:
        """
        Fetch the raw content of a GitHub file.

        ``item`` is one entry of a code-search response. Returns a page
        dict, or ``{}`` when the file is blocked, flagged by the content
        sanitiser, too short (< 30 non-blank characters), or unavailable.
        """
        if not self._session:
            return {}

        try:
            git_url = item.get("git_url", "")
            html_url = item.get("html_url", "")

            if not git_url:
                return {}

            blocked, _ = is_blocked_url(html_url)
            if blocked:
                return {}

            async with self._session.get(git_url) as resp:
                if resp.status != 200:
                    return {}
                data = await resp.json()

            await asyncio.sleep(self._rate_limit_delay / 2)

            content = self._decode_base64_text(data)
            if not content:
                return {}

            clean_content, was_flagged = sanitize_content(content)
            if was_flagged:
                return {}

            if not clean_content or len(clean_content.strip()) < 30:
                return {}

            repo = item.get("repository") or {}
            repo_name = repo.get("full_name", "unknown")
            filename = item.get("name", "")

            return {
                "url": html_url,
                "text_content": clean_content,
                "title": f"GitHub: {repo_name} — {filename}",
                "source_type": "github",
                "source_name": "GitHub",
                "github_repo": repo_name,
                "github_filename": filename,
                "github_stars": repo.get("stargazers_count", 0),
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean_content.split()),
                "relevance": self._score_relevance(
                    clean_content, filename, repo_name
                ),
            }

        except Exception as e:
            logger.debug("GitHub file fetch error: %s", e)
            return {}

    async def _search_repos(self, search_query: str) -> list[dict]:
        """Search GitHub repositories and fetch README content."""
        if not self._session:
            return []

        results: list[dict] = []

        try:
            params = {
                "q": search_query,
                "per_page": MAX_REPO_RESULTS,
                "sort": "updated",
                "order": "desc",
            }

            async with self._session.get(
                f"{GITHUB_API_BASE}/search/repositories",
                params=params,
            ) as resp:
                if resp.status in (403, 429):
                    logger.warning(
                        "GitHub rate limit on repo search. Reset in %ss",
                        self._seconds_until_reset(resp.headers),
                    )
                    return []

                if resp.status != 200:
                    logger.debug(
                        "GitHub repo search returned HTTP %s", resp.status
                    )
                    return []

                data = await resp.json()
                items = data.get("items", [])

            await asyncio.sleep(self._rate_limit_delay)

            fetch_tasks = [
                self._fetch_repo_readme(item)
                for item in items[:MAX_REPO_RESULTS]
                if not self._is_noise_repo(item.get("name", ""))
            ]

            if fetch_tasks:
                fetched = await asyncio.gather(
                    *fetch_tasks, return_exceptions=True
                )
                results.extend(
                    f
                    for f in fetched
                    if isinstance(f, dict) and f.get("text_content")
                )

        except Exception as e:
            # Best-effort source: a failure here must not abort the caller.
            logger.debug("GitHub repo search error: %s", e)

        return results

    async def _fetch_repo_readme(self, repo: dict) -> dict:
        """
        Fetch README content for a repository.

        ``repo`` is one entry of a repository-search response. Returns a
        page dict, or ``{}`` when the repository is blocked, the README is
        flagged by the content sanitiser, too short (< 50 non-blank
        characters), or unavailable.
        """
        if not self._session:
            return {}

        try:
            full_name = repo.get("full_name", "")
            if not full_name:
                return {}

            html_url = repo.get(
                "html_url", f"https://github.com/{full_name}"
            )

            # Same URL screen the code-file path applies before fetching.
            blocked, _ = is_blocked_url(html_url)
            if blocked:
                return {}

            readme_url = f"{GITHUB_API_BASE}/repos/{full_name}/readme"

            async with self._session.get(readme_url) as resp:
                if resp.status != 200:
                    return {}
                data = await resp.json()

            await asyncio.sleep(self._rate_limit_delay / 2)

            content = self._decode_base64_text(data)
            if not content:
                return {}

            clean_content, was_flagged = sanitize_content(content)
            if (
                was_flagged
                or not clean_content
                or len(clean_content.strip()) < 50
            ):
                return {}

            return {
                "url": html_url,
                "text_content": clean_content,
                "title": f"GitHub: {full_name} — README",
                "source_type": "github",
                "source_name": "GitHub",
                "github_repo": full_name,
                "github_filename": "README",
                "github_stars": repo.get("stargazers_count", 0),
                "github_description": repo.get("description", ""),
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean_content.split()),
                "relevance": self._score_relevance(
                    clean_content, "README", full_name
                ),
            }

        except Exception as e:
            logger.debug("GitHub README fetch error: %s", e)
            return {}

    def _is_noise_repo(self, repo_name: str) -> bool:
        """Returns True if this repo should be skipped (tutorial, awesome list, etc.)."""
        name_lower = (repo_name or "").lower()
        return any(
            re.match(pattern, name_lower) for pattern in SKIP_REPO_PATTERNS
        )

    def _score_relevance(
        self,
        content: str,
        filename: str,
        repo_name: str,
    ) -> int:
        """
        Score how relevant this file is.

        +5 when ``filename`` is a high-value file name, +3 for every IOC
        pattern found in the content, +2 for every security keyword found.
        ``repo_name`` is accepted for interface compatibility and does not
        affect the score. ``None`` or empty content scores only the
        file-name bonus.
        """
        text = content or ""
        text_lower = text.lower()

        score = 5 if filename in HIGH_VALUE_FILENAMES else 0
        score += 3 * sum(1 for rx in self._IOC_REGEXES if rx.search(text))
        score += 2 * sum(
            1 for rx in self._KEYWORD_REGEXES if rx.search(text_lower)
        )
        return score
559
+
560
+
561
+ # ---------------------------------------------------------------------------
562
+ # Module-level helpers
563
+ # ---------------------------------------------------------------------------
564
+
565
+
566
+ def _is_github_scraping_enabled() -> bool:
567
+ """Read GITHUB_SCRAPING_ENABLED at call time so tests can monkey-patch it."""
568
+ return os.getenv("GITHUB_SCRAPING_ENABLED", "true").lower() == "true"
569
+
570
+
571
async def scrape_github(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_RESULTS,
) -> list[dict]:
    """
    Module-level entry point for GitHub scraping.

    Returns an empty list when scraping is disabled; otherwise opens a
    GitHubScraper for the duration of the call and returns the page dicts
    it produces, compatible with the extraction pipeline.
    """
    if _is_github_scraping_enabled():
        async with GitHubScraper() as scraper:
            pages = await scraper.search_and_fetch(
                query=query,
                refined_query=refined_query,
                max_results=max_results,
            )
        return pages

    logger.info("GitHub scraping disabled")
    return []