voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,624 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/gitlab_scraper.py — GitLab clearnet intelligence source for VoidAccess.
|
|
3
|
+
|
|
4
|
+
Searches GitLab code and repositories for security-relevant content that
|
|
5
|
+
matches an investigation query. Runs over CLEARNET — GitLab is public and
|
|
6
|
+
does not require Tor.
|
|
7
|
+
|
|
8
|
+
Typical high-signal content found on GitLab:
|
|
9
|
+
- Malware tooling and PoC exploits removed from GitHub but persisting here
|
|
10
|
+
- C2 / beacon configuration files
|
|
11
|
+
- Threat actor infrastructure configs
|
|
12
|
+
- Leaked credentials and internal endpoint configs
|
|
13
|
+
- Security research write-ups and proof-of-concept code
|
|
14
|
+
|
|
15
|
+
Authentication is OPTIONAL:
|
|
16
|
+
- Unauthenticated: ~15 requests/minute (search API)
|
|
17
|
+
- Authenticated: ~60 requests/minute — set GITLAB_TOKEN to enable
|
|
18
|
+
|
|
19
|
+
Public API:
|
|
20
|
+
async def scrape_gitlab(
|
|
21
|
+
query: str,
|
|
22
|
+
refined_query: str = "",
|
|
23
|
+
max_results: int = 15,
|
|
24
|
+
) -> list[dict]
|
|
25
|
+
|
|
26
|
+
Returns page dicts compatible with the existing extraction pipeline:
|
|
27
|
+
{
|
|
28
|
+
"url": str,
|
|
29
|
+
"text_content": str,
|
|
30
|
+
"title": str,
|
|
31
|
+
"source_type": "gitlab",
|
|
32
|
+
"source_name": "GitLab",
|
|
33
|
+
"gitlab_repo": str,
|
|
34
|
+
"gitlab_filename": str,
|
|
35
|
+
"gitlab_stars": int,
|
|
36
|
+
"scraped_at": str,
|
|
37
|
+
"word_count": int,
|
|
38
|
+
"relevance": int,
|
|
39
|
+
}
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import asyncio
|
|
45
|
+
import base64
|
|
46
|
+
import logging
|
|
47
|
+
import os
|
|
48
|
+
import re
|
|
49
|
+
from datetime import datetime, timezone
|
|
50
|
+
from typing import Optional
|
|
51
|
+
from urllib.parse import quote
|
|
52
|
+
|
|
53
|
+
import aiohttp
|
|
54
|
+
|
|
55
|
+
from utils.content_safety import (
|
|
56
|
+
is_blocked_query,
|
|
57
|
+
is_blocked_url,
|
|
58
|
+
sanitize_content,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
logger = logging.getLogger(__name__)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Base URL of the GitLab REST API (v4) on gitlab.com.
GITLAB_API_BASE = "https://gitlab.com/api/v4"

# Fetched file content is truncated to this many characters (200 * 1024).
MAX_FILE_SIZE = 1024 * 200

# Per-search-type caps, and the overall cap of GitLab items per investigation.
MAX_CODE_RESULTS = 10
MAX_REPO_RESULTS = 5
MAX_TOTAL_RESULTS = 15

# Pause (seconds) applied after API requests.
#   unauthenticated: ~15 requests/minute -> 4 s
#   authenticated:   ~60 requests/minute -> 1 s (conservative)
RATE_LIMIT_DELAY_UNAUTH = 4.0
RATE_LIMIT_DELAY_AUTH = 1.0

# File extensions considered worth fetching from code-search hits.
SECURITY_EXTENSIONS = {
    # source code
    ".py", ".js", ".ts", ".go", ".rs", ".c", ".cpp", ".cs", ".java",
    # scripts
    ".sh", ".bash", ".ps1", ".bat",
    # structured configuration
    ".yaml", ".yml", ".json", ".toml", ".conf", ".config", ".ini", ".env",
    # free text
    ".txt", ".md", ".log",
}

# Exact file names that are fetched regardless of extension and that earn a
# relevance bonus when scored.
HIGH_VALUE_FILENAMES = {
    "config.py",
    "config.js",
    "config.json",
    "settings.py",
    "settings.json",
    "malware.py",
    "rat.py",
    "stealer.py",
    "c2.py",
    "c2.js",
    "server.py",
    "payload.py",
    "dropper.py",
    "keylogger.py",
    "ransomware.py",
    "exploit.py",
    "exploit.js",
    "credentials.txt",
    "passwords.txt",
    "victims.txt",
    "targets.txt",
    "README.md",
    "README.txt",
}

# Regexes (applied with re.match to the lower-cased project name) for
# repositories treated as noise: awesome lists, tutorials, courses, etc.
SKIP_REPO_PATTERNS = [
    r"awesome-.*",
    r".*-tutorial",
    r".*-course",
    r".*-book",
    r".*-cheatsheet",
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
# Scraper
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class GitLabScraper:
    """
    Scrapes GitLab for security-relevant content using the GitLab Search API v4.
    Works with or without authentication.

    Use as an async context manager; the HTTP session only exists between
    ``__aenter__`` and ``__aexit__`` and every search method returns an empty
    result when called outside of it.
    """

    # Tool names recognised in the user query, mapped to the extra term used
    # to build a tool-specific follow-up query.
    _TOOL_VARIANTS = {
        "cobalt strike": "malleable",
        "metasploit": "meterpreter",
        "mimikatz": "sekurlsa",
        "covenant": "grunt",
        "sliver": "implant",
        "havoc": "demon",
        "brute ratel": "config",
    }

    # Indicator-of-compromise shapes; each pattern that matches adds 3 points.
    _IOC_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"\b[A-Fa-f0-9]{32}\b",  # MD5
            r"\b[A-Fa-f0-9]{64}\b",  # SHA256
            r"\bCVE-\d{4}-\d+\b",
            r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",  # IPv4
            r"[a-zA-Z2-7]{16,56}\.onion",
            r"-----BEGIN PGP",
            r"AKIA[0-9A-Z]{16}",  # AWS access key
        )
    )

    # Security vocabulary; each keyword present adds 2 points.
    _SEC_KEYWORDS = (
        "malware", "ransomware", "c2",
        "command and control", "botnet",
        "stealer", "rat", "trojan",
        "exploit", "payload", "shellcode",
        "cobalt strike", "beacon",
        "mimikatz", "credential",
        "lateral movement", "persistence",
    )

    # Very short keywords occur inside unrelated words ("rat" in "generate",
    # "c2" in "abc2"), so they must stand alone. Underscores and punctuation
    # count as separators, so "rat_client" and "c2.py" still match.
    _STANDALONE_KEYWORD_PATTERNS = {
        keyword: re.compile(rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])")
        for keyword in ("c2", "rat")
    }

    def __init__(self):
        # An empty token means unauthenticated access with the slower delay.
        self._token = os.getenv("GITLAB_TOKEN", "").strip()
        self._session: Optional[aiohttp.ClientSession] = None
        self._rate_limit_delay = (
            RATE_LIMIT_DELAY_AUTH if self._token else RATE_LIMIT_DELAY_UNAUTH
        )

    @property
    def _headers(self) -> dict:
        """Return the request headers, including PRIVATE-TOKEN when a token is set."""
        headers = {
            "User-Agent": "VoidAccess-OSINT/1.1",
        }
        if self._token:
            headers["PRIVATE-TOKEN"] = self._token
        return headers

    async def __aenter__(self):
        self._session = aiohttp.ClientSession(
            headers=self._headers,
            timeout=aiohttp.ClientTimeout(total=30),
        )
        return self

    async def __aexit__(self, *args):
        if self._session:
            await self._session.close()
            # Drop the closed session so later calls take the
            # "no session" early return instead of failing on a closed one.
            self._session = None

    async def search_and_fetch(
        self,
        query: str,
        refined_query: str = "",
        max_results: int = MAX_TOTAL_RESULTS,
    ) -> list[dict]:
        """
        Search GitLab and return page dicts compatible with the extraction
        pipeline.

        Args:
            query: Original investigation query.
            refined_query: Optional refined form; preferred over ``query``
                for the actual search when it is non-empty.
            max_results: Maximum number of page dicts returned.

        Returns:
            Page dicts de-duplicated by URL, sorted by descending
            ``relevance``; empty when the query is blocked or nothing
            searchable remains after sanitizing.
        """
        # The refined query is what actually gets sent to GitLab, so it has
        # to pass the content-safety screen as well as the original query.
        screened = [query]
        if refined_query:
            screened.append(refined_query)
        for candidate in screened:
            blocked, _ = is_blocked_query(candidate)
            if blocked:
                logger.warning("GitLab scraping blocked — prohibited query")
                return []

        search_queries = self._build_search_queries(query, refined_query)
        if not search_queries:
            logger.info("GitLab: query is empty after sanitizing, skipping")
            return []

        auth_status = "authenticated" if self._token else "unauthenticated"
        logger.info(
            "GitLab scraping (%s): '%s'",
            auth_status,
            query[:50],
        )

        all_results: list[dict] = []
        seen_urls: set[str] = set()

        primary = search_queries[0]
        code_results, repo_results = await asyncio.gather(
            self._search_code(primary),
            self._search_repos(primary),
            return_exceptions=True,
        )
        self._merge_unique(code_results, "code", all_results, seen_urls)
        self._merge_unique(repo_results, "project", all_results, seen_urls)

        # The tool-specific variant query is a fallback: it only runs when
        # the primary search did not fill the quota, so it adds no API
        # traffic in the common case.
        limit = max(max_results, 0)
        for variant in search_queries[1:]:
            if len(all_results) >= limit:
                break
            variant_results = await self._search_code(variant)
            self._merge_unique(variant_results, "code", all_results, seen_urls)

        all_results.sort(key=lambda page: page.get("relevance", 0), reverse=True)
        final = all_results[:limit]

        logger.info("GitLab: %d results found", len(final))
        return final

    @staticmethod
    def _merge_unique(
        outcome: object,
        label: str,
        collected: list[dict],
        seen_urls: set[str],
    ) -> None:
        """Append pages from one search outcome to *collected*, skipping seen URLs.

        *outcome* may be an exception object (from ``gather`` with
        ``return_exceptions=True``); it is logged and ignored.
        """
        if isinstance(outcome, BaseException):
            logger.debug("GitLab %s search failed: %s", label, outcome)
            return
        if not isinstance(outcome, list):
            return
        for page in outcome:
            url = page.get("url", "")
            if url and url not in seen_urls:
                seen_urls.add(url)
                collected.append(page)

    def _build_search_queries(
        self,
        query: str,
        refined_query: str,
    ) -> list[str]:
        """
        Build GitLab search queries. GitLab's search API accepts plain text;
        keep queries short and clean.

        Returns at most two queries: the sanitized base query (from
        ``refined_query``, falling back to ``query`` when the refined form
        sanitizes to nothing) and, for known tool names found in ``query``,
        one tool-specific variant. Empty queries are never returned.
        """
        queries: list[str] = []

        for candidate in (refined_query, query):
            # Replace everything except word chars, whitespace, '-' and '.'
            # with spaces, then cap the length at 100 characters.
            base = re.sub(r"[^\w\s\-.]", " ", candidate or "").strip()[:100].strip()
            if base:
                queries.append(base)
                break

        # Add tool-specific second query for known malware/tooling names.
        query_lower = (query or "").lower()
        for tool, modifier in self._TOOL_VARIANTS.items():
            if tool in query_lower:
                variant = f"{tool} {modifier}"
                if variant not in queries:
                    queries.append(variant)
                break

        return queries[:2]

    @staticmethod
    def _is_wanted_file(filename: object) -> bool:
        """Return True when a code-search hit is worth fetching.

        A file qualifies by exact basename (HIGH_VALUE_FILENAMES) or by
        extension (SECURITY_EXTENSIONS, case-insensitive). Non-string or
        empty names are rejected instead of raising.
        """
        if not isinstance(filename, str) or not filename:
            return False
        # Compare the last path segment so "dir/config.py" still counts.
        basename = filename.rsplit("/", 1)[-1]
        if basename in HIGH_VALUE_FILENAMES:
            return True
        _, dot, ext = basename.rpartition(".")
        return bool(dot) and f".{ext}".lower() in SECURITY_EXTENSIONS

    @staticmethod
    def _decode_file_payload(payload: object) -> str:
        """Decode the base64 ``content`` field of a repository-files response.

        Returns an empty string when the payload is not a dict, has no
        content, or cannot be decoded.
        """
        if not isinstance(payload, dict):
            return ""
        encoded = payload.get("content") or ""
        if not isinstance(encoded, str):
            return ""
        encoded = encoded.replace("\n", "")
        if not encoded:
            return ""
        try:
            return base64.b64decode(encoded).decode("utf-8", errors="ignore")
        except (ValueError, TypeError):
            # binascii.Error is a ValueError subclass.
            return ""

    @staticmethod
    def _sanitized_or_empty(raw: str, min_chars: int) -> str:
        """Run content-safety sanitizing on *raw*.

        Returns the cleaned text, or an empty string when the content was
        flagged or fewer than *min_chars* non-blank characters remain.
        """
        clean, flagged = sanitize_content(raw)
        if flagged or not clean or len(clean.strip()) < min_chars:
            return ""
        return clean

    def _code_page(
        self,
        html_url: str,
        text: str,
        project_id: object,
        filename: str,
    ) -> dict:
        """Assemble the page dict for a code-search hit."""
        repo = str(project_id)
        return {
            "url": html_url,
            "text_content": text,
            "title": f"GitLab: project/{project_id} — {filename}",
            "source_type": "gitlab",
            "source_name": "GitLab",
            "gitlab_repo": repo,
            "gitlab_filename": filename,
            "gitlab_stars": 0,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "word_count": len(text.split()),
            "relevance": self._score_relevance(text, filename, repo),
        }

    async def _search_code(self, search_query: str) -> list[dict]:
        """Search GitLab code (blobs) and fetch file content.

        Returns page dicts for hits that pass the file-name filter and the
        content-safety check. Any failure (rate limit, auth, network,
        malformed response) yields the results gathered so far, usually [].
        """
        if not self._session:
            return []

        results: list[dict] = []

        try:
            params = {
                "scope": "blobs",
                "search": search_query,
                "per_page": MAX_CODE_RESULTS,
            }

            async with self._session.get(
                f"{GITLAB_API_BASE}/search",
                params=params,
            ) as resp:
                if resp.status == 429:
                    reset_at = resp.headers.get("RateLimit-Reset", "")
                    logger.warning(
                        "GitLab rate limit hit. Resets at %s",
                        reset_at,
                    )
                    return []

                if resp.status == 401:
                    logger.debug(
                        "GitLab code search: authentication required for this query"
                    )
                    return []

                if resp.status != 200:
                    return []

                items = await resp.json()

            if not isinstance(items, list):
                return []

            await asyncio.sleep(self._rate_limit_delay)

            wanted = [
                item
                for item in items[:MAX_CODE_RESULTS]
                if isinstance(item, dict)
                and self._is_wanted_file(item.get("filename"))
            ]

            if wanted:
                # NOTE(review): these fetches start concurrently, so the
                # sleep inside each one does not space the requests out.
                fetched = await asyncio.gather(
                    *(self._fetch_code_file(item) for item in wanted),
                    return_exceptions=True,
                )
                results.extend(
                    page
                    for page in fetched
                    if isinstance(page, dict) and page.get("text_content")
                )

        except Exception as e:
            logger.debug("GitLab code search error: %s", e)

        return results

    async def _fetch_code_file(self, item: dict) -> dict:
        """Fetch the raw content of a GitLab file via the repository files API.

        Falls back to the snippet embedded in the search hit when the file
        request does not return 200. Returns {} when the hit is unusable,
        blocked, flagged by content safety, or too short.
        """
        if not self._session:
            return {}

        try:
            project_id = item.get("project_id")
            file_path = item.get("path", "")
            # ``or`` also covers an explicit null ref in the API response.
            ref = item.get("ref") or "main"
            filename = item.get("filename") or ""

            if not project_id or not file_path:
                return {}

            # Build a synthetic html_url for is_blocked_url and result storage
            html_url = (
                f"https://gitlab.com/projects/{project_id}/-/blob/{ref}/{file_path}"
            )
            blocked, _ = is_blocked_url(html_url)
            if blocked:
                return {}

            # URL-encode the path (slashes must become %2F for the GitLab files API)
            encoded_path = quote(file_path, safe="")
            file_url = (
                f"{GITLAB_API_BASE}/projects/{project_id}"
                f"/repository/files/{encoded_path}"
            )

            # ``ref`` goes through ``params`` so branch names containing
            # '&', '#', spaces, etc. are encoded instead of breaking the URL.
            async with self._session.get(file_url, params={"ref": ref}) as resp:
                found = resp.status == 200
                payload = await resp.json() if found else None

            if not found:
                # Fall back to the snippet GitLab included in the search result
                snippet = item.get("data") or ""
                if len(snippet.strip()) < 30:
                    return {}
                clean = self._sanitized_or_empty(snippet, 30)
                if not clean:
                    return {}
                return self._code_page(html_url, clean, project_id, filename)

            await asyncio.sleep(self._rate_limit_delay / 2)

            content = self._decode_file_payload(payload)
            if not content:
                return {}

            clean_content = self._sanitized_or_empty(content[:MAX_FILE_SIZE], 30)
            if not clean_content:
                return {}

            # The code search result carries no path_with_namespace, so the
            # title and URL are built from the numeric project_id.
            return self._code_page(html_url, clean_content, project_id, filename)

        except Exception as e:
            logger.debug("GitLab file fetch error: %s", e)
            return {}

    async def _search_repos(self, search_query: str) -> list[dict]:
        """Search GitLab projects and fetch README content.

        Returns one page dict per non-noise project whose README could be
        fetched and passed the content-safety check.
        """
        if not self._session:
            return []

        results: list[dict] = []

        try:
            params = {
                "scope": "projects",
                "search": search_query,
                "per_page": MAX_REPO_RESULTS,
                # NOTE(review): confirm the search endpoint accepts
                # "updated_at" here; its documentation may only list
                # "created_at" for order_by.
                "order_by": "updated_at",
            }

            async with self._session.get(
                f"{GITLAB_API_BASE}/search",
                params=params,
            ) as resp:
                if resp.status == 429:
                    logger.warning("GitLab rate limit on project search")
                    return []

                if resp.status != 200:
                    return []

                items = await resp.json()

            if not isinstance(items, list):
                return []

            await asyncio.sleep(self._rate_limit_delay)

            projects = [
                item
                for item in items[:MAX_REPO_RESULTS]
                if isinstance(item, dict)
                and not self._is_noise_repo(item.get("name", ""))
            ]

            if projects:
                fetched = await asyncio.gather(
                    *(self._fetch_repo_readme(project) for project in projects),
                    return_exceptions=True,
                )
                results.extend(
                    page
                    for page in fetched
                    if isinstance(page, dict) and page.get("text_content")
                )

        except Exception as e:
            logger.debug("GitLab project search error: %s", e)

        return results

    async def _fetch_repo_readme(self, project: dict) -> dict:
        """Fetch README content for a GitLab project.

        Returns a page dict, or {} when no README was found or it was
        flagged, empty, or shorter than 50 characters.
        """
        if not self._session:
            return {}

        try:
            project_id = project.get("id")
            if not project_id:
                return {}

            path_with_namespace = project.get("path_with_namespace") or ""
            default_branch = project.get("default_branch") or "main"

            # Try README.md, then readme.md, then README.txt; stop at the
            # first one that decodes to non-empty text.
            readme_content = ""
            for readme_name in ("README.md", "readme.md", "README.txt"):
                encoded_name = quote(readme_name, safe="")
                readme_url = (
                    f"{GITLAB_API_BASE}/projects/{project_id}"
                    f"/repository/files/{encoded_name}"
                )
                async with self._session.get(
                    readme_url, params={"ref": default_branch}
                ) as resp:
                    payload = await resp.json() if resp.status == 200 else None

                readme_content = self._decode_file_payload(payload)
                if readme_content:
                    break

                await asyncio.sleep(self._rate_limit_delay / 2)

            if not readme_content:
                return {}

            clean_content = self._sanitized_or_empty(
                readme_content[:MAX_FILE_SIZE], 50
            )
            if not clean_content:
                return {}

            web_url = (
                project.get("web_url")
                or f"https://gitlab.com/{path_with_namespace}"
            )
            display_name = path_with_namespace or str(project_id)

            return {
                "url": web_url,
                "text_content": clean_content,
                "title": f"GitLab: {display_name} — README",
                "source_type": "gitlab",
                "source_name": "GitLab",
                "gitlab_repo": display_name,
                "gitlab_filename": "README",
                "gitlab_stars": project.get("star_count") or 0,
                "gitlab_description": project.get("description") or "",
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean_content.split()),
                "relevance": self._score_relevance(
                    clean_content, "README", display_name
                ),
            }

        except Exception as e:
            logger.debug("GitLab README fetch error: %s", e)
            return {}

    def _is_noise_repo(self, repo_name: str) -> bool:
        """Returns True if this repo should be skipped (tutorial, awesome list, etc.)."""
        name_lower = (repo_name or "").lower()
        # re.match anchors at the start only, so a pattern such as
        # ".*-tutorial" also matches names that merely contain "-tutorial".
        return any(re.match(pattern, name_lower) for pattern in SKIP_REPO_PATTERNS)

    def _score_relevance(
        self,
        content: str,
        filename: str,
        repo_name: str,
    ) -> int:
        """Score how relevant this file is.

        +5 when the file's basename is in HIGH_VALUE_FILENAMES, +3 for each
        IOC pattern found in the content, +2 for each security keyword
        found. ``repo_name`` is accepted but does not affect the score.
        """
        text = content or ""
        text_lower = text.lower()
        score = 0

        basename = (filename or "").rsplit("/", 1)[-1]
        if basename in HIGH_VALUE_FILENAMES:
            score += 5

        score += 3 * sum(
            1 for pattern in self._IOC_PATTERNS if pattern.search(text)
        )

        for keyword in self._SEC_KEYWORDS:
            standalone = self._STANDALONE_KEYWORD_PATTERNS.get(keyword)
            if standalone is not None:
                hit = standalone.search(text_lower) is not None
            else:
                hit = keyword in text_lower
            if hit:
                score += 2

        return score
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
# ---------------------------------------------------------------------------
|
|
597
|
+
# Module-level helpers
|
|
598
|
+
# ---------------------------------------------------------------------------
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _is_gitlab_scraping_enabled() -> bool:
    """Tell whether GitLab scraping is switched on.

    The GITLAB_SCRAPING_ENABLED environment variable is looked up on every
    call (not cached at import), so tests can monkey-patch it. Scraping is
    enabled when the variable is unset or equals "true" in any letter case.
    """
    flag = os.environ.get("GITLAB_SCRAPING_ENABLED", "true")
    return flag.lower() == "true"
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
async def scrape_gitlab(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_RESULTS,
) -> list[dict]:
    """
    Main entry point for GitLab scraping.

    Opens a GitLabScraper for the duration of one search and forwards the
    arguments to its search_and_fetch method. Returns a list of page dicts
    compatible with the extraction pipeline, or an empty list when GitLab
    scraping is disabled.
    """
    if _is_gitlab_scraping_enabled():
        async with GitLabScraper() as scraper:
            pages = await scraper.search_and_fetch(
                query=query,
                refined_query=refined_query,
                max_results=max_results,
            )
            return pages

    logger.info("GitLab scraping disabled")
    return []
|