voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,624 @@
1
+ """
2
+ sources/gitlab_scraper.py — GitLab clearnet intelligence source for VoidAccess.
3
+
4
+ Searches GitLab code and repositories for security-relevant content that
5
+ matches an investigation query. Runs over CLEARNET — GitLab is public and
6
+ does not require Tor.
7
+
8
+ Typical high-signal content found on GitLab:
9
+ - Malware tooling and PoC exploits removed from GitHub but persisting here
10
+ - C2 / beacon configuration files
11
+ - Threat actor infrastructure configs
12
+ - Leaked credentials and internal endpoint configs
13
+ - Security research write-ups and proof-of-concept code
14
+
15
+ Authentication is OPTIONAL:
16
+ - Unauthenticated: ~15 requests/minute (search API)
17
+ - Authenticated: ~60 requests/minute — set GITLAB_TOKEN to enable
18
+
19
+ Public API:
20
+ async def scrape_gitlab(
21
+ query: str,
22
+ refined_query: str = "",
23
+ max_results: int = 15,
24
+ ) -> list[dict]
25
+
26
+ Returns page dicts compatible with the existing extraction pipeline:
27
+ {
28
+ "url": str,
29
+ "text_content": str,
30
+ "title": str,
31
+ "source_type": "gitlab",
32
+ "source_name": "GitLab",
33
+ "gitlab_repo": str,
34
+ "gitlab_filename": str,
35
+ "gitlab_stars": int,
36
+ "scraped_at": str,
37
+ "word_count": int,
38
+ "relevance": int,
39
+ }
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import asyncio
45
+ import base64
46
+ import logging
47
+ import os
48
+ import re
49
+ from datetime import datetime, timezone
50
+ from typing import Optional
51
+ from urllib.parse import quote
52
+
53
+ import aiohttp
54
+
55
+ from utils.content_safety import (
56
+ is_blocked_query,
57
+ is_blocked_url,
58
+ sanitize_content,
59
+ )
60
+
61
+ logger = logging.getLogger(__name__)
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Constants
66
+ # ---------------------------------------------------------------------------
67
+
68
# Root of the GitLab REST API (v4) on the public gitlab.com instance.
GITLAB_API_BASE = "https://gitlab.com/api/v4"

# Upper bound applied to fetched file content: 200 KB.
MAX_FILE_SIZE = 1024 * 200

# Page size requested from each search scope.
MAX_CODE_RESULTS = 10
MAX_REPO_RESULTS = 5

# Cap on the number of GitLab items returned for one investigation.
MAX_TOTAL_RESULTS = 15

# Pause between requests, in seconds, derived from the approximate
# per-minute quotas: ~15 requests/minute without a token, ~60 with one
# (treated conservatively).
RATE_LIMIT_DELAY_UNAUTH = 60 / 15
RATE_LIMIT_DELAY_AUTH = 60 / 60

# File extensions considered security-relevant enough to fetch.
SECURITY_EXTENSIONS = set(
    ".py .js .ts .go .rs "
    ".c .cpp .cs .java "
    ".sh .bash .ps1 .bat "
    ".yaml .yml .json .toml "
    ".conf .config .ini .env "
    ".txt .md .log".split()
)

# File names that are fetched (and scored higher) regardless of extension.
HIGH_VALUE_FILENAMES = set(
    "config.py config.js config.json "
    "settings.py settings.json "
    "malware.py rat.py stealer.py "
    "c2.py c2.js server.py "
    "payload.py dropper.py "
    "keylogger.py ransomware.py "
    "exploit.py exploit.js "
    "credentials.txt passwords.txt "
    "victims.txt targets.txt "
    "README.md README.txt".split()
)

# Regexes for repository names treated as noise (tutorials, awesome lists, ...).
SKIP_REPO_PATTERNS = list(
    (
        r"awesome-.*",
        r".*-tutorial",
        r".*-course",
        r".*-book",
        r".*-cheatsheet",
    )
)
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Scraper
122
+ # ---------------------------------------------------------------------------
123
+
124
+
125
class GitLabScraper:
    """
    Scrapes GitLab for security-relevant content using the GitLab Search API v4.

    Works with or without authentication: when the GITLAB_TOKEN environment
    variable is set it is sent as a PRIVATE-TOKEN header and the shorter
    inter-request delay is used.

    Use as an async context manager. The HTTP session only exists inside the
    ``async with`` block; the search/fetch methods return an empty result when
    called without one.
    """

    # Minimum amount of sanitised text (characters, after stripping) for a
    # code file / README to be worth returning.
    _MIN_CODE_CHARS = 30
    _MIN_README_CHARS = 50

    # README file names tried, in order, for each project.
    _README_NAMES = ("README.md", "readme.md", "README.txt")

    # Indicator-of-compromise patterns; each pattern that occurs adds 3 points.
    _IOC_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"\b[A-Fa-f0-9]{32}\b",  # MD5
            r"\b[A-Fa-f0-9]{64}\b",  # SHA256
            r"\bCVE-\d{4}-\d+\b",
            r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",  # IPv4
            r"[a-zA-Z2-7]{16,56}\.onion",
            r"-----BEGIN PGP",
            r"AKIA[0-9A-Z]{16}",  # AWS access key
        )
    )

    # Security keywords; each keyword that occurs adds 2 points.
    _SEC_KEYWORDS = (
        "malware", "ransomware", "c2",
        "command and control", "botnet",
        "stealer", "rat", "trojan",
        "exploit", "payload", "shellcode",
        "cobalt strike", "beacon",
        "mimikatz", "credential",
        "lateral movement", "persistence",
    )

    # Keywords this short occur inside ordinary words ("rat" in "generate"),
    # so they must appear as a standalone token to count.
    _SHORT_KEYWORD_MAX_LEN = 3

    def __init__(self):
        self._token = os.getenv("GITLAB_TOKEN", "").strip()
        self._session: Optional[aiohttp.ClientSession] = None
        self._rate_limit_delay = (
            RATE_LIMIT_DELAY_AUTH if self._token else RATE_LIMIT_DELAY_UNAUTH
        )

    @property
    def _headers(self) -> dict:
        """Default request headers; includes PRIVATE-TOKEN only when a token is set."""
        headers = {
            "User-Agent": "VoidAccess-OSINT/1.1",
        }
        if self._token:
            headers["PRIVATE-TOKEN"] = self._token
        return headers

    async def __aenter__(self):
        self._session = aiohttp.ClientSession(
            headers=self._headers,
            timeout=aiohttp.ClientTimeout(total=30),
        )
        return self

    async def __aexit__(self, *args):
        if self._session:
            await self._session.close()
            # Drop the closed session so the "no session" guards in the
            # search methods apply again after the context exits.
            self._session = None

    async def search_and_fetch(
        self,
        query: str,
        refined_query: str = "",
        max_results: int = MAX_TOTAL_RESULTS,
    ) -> list[dict]:
        """
        Search GitLab and return page dicts compatible with the extraction
        pipeline.

        Runs the code (blob) search and the project search concurrently,
        de-duplicates the results by URL, sorts them by descending
        ``relevance`` and returns at most ``max_results`` of them.

        Returns an empty list when ``max_results`` is not positive, when the
        query (or the refined query) is rejected by ``is_blocked_query``, or
        when nothing searchable is left after sanitising the query.
        """
        if max_results <= 0:
            # A negative bound would otherwise turn into a "drop the last N" slice.
            return []

        # The refined query, when given, is the text actually sent to GitLab,
        # so it has to pass the same content-safety gate as the raw query.
        gate_inputs = [query]
        if refined_query and refined_query != query:
            gate_inputs.append(refined_query)
        for candidate in gate_inputs:
            blocked, _ = is_blocked_query(candidate)
            if blocked:
                logger.warning("GitLab scraping blocked — prohibited query")
                return []

        search_queries = self._build_search_queries(query, refined_query)
        # NOTE(review): only the first query is searched; the tool-specific
        # variant produced by _build_search_queries is currently unused.
        primary_query = search_queries[0] if search_queries else ""
        if not primary_query:
            logger.debug("GitLab scraping skipped — empty query after sanitising")
            return []

        auth_status = "authenticated" if self._token else "unauthenticated"
        logger.info(
            "GitLab scraping (%s): '%s'",
            auth_status,
            query[:50],
        )

        outcomes = await asyncio.gather(
            self._search_code(primary_query),
            self._search_repos(primary_query),
            return_exceptions=True,
        )

        all_results: list[dict] = []
        seen_urls: set[str] = set()
        for label, outcome in zip(("code", "project"), outcomes):
            if not isinstance(outcome, list):
                # gather() hands back the exception object instead of raising.
                logger.debug("GitLab %s search failed: %s", label, outcome)
                continue
            for item in outcome:
                url = item.get("url", "")
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_results.append(item)

        all_results.sort(key=lambda x: x.get("relevance", 0), reverse=True)
        final = all_results[:max_results]

        logger.info("GitLab: %d results found", len(final))
        return final

    def _build_search_queries(
        self,
        query: str,
        refined_query: str,
    ) -> list[str]:
        """
        Build GitLab search queries. GitLab's search API accepts plain text;
        keep queries short and clean.

        The first entry is ``refined_query`` (or ``query`` when no refined
        query is given) with every character other than word characters,
        whitespace, ``-`` and ``.`` replaced by a space, stripped and cut to
        100 characters. When ``query`` mentions one of the known tools, a
        second "<tool> <modifier>" query is appended. At most two queries are
        returned.
        """
        queries: list[str] = []

        base = refined_query or query
        base = re.sub(r"[^\w\s\-.]", " ", base).strip()[:100]
        queries.append(base)

        # Add tool-specific second query for known malware/tooling names.
        TOOL_VARIANTS = {
            "cobalt strike": "malleable",
            "metasploit": "meterpreter",
            "mimikatz": "sekurlsa",
            "covenant": "grunt",
            "sliver": "implant",
            "havoc": "demon",
            "brute ratel": "config",
        }

        query_lower = query.lower()
        for tool, modifier in TOOL_VARIANTS.items():
            if tool in query_lower:
                queries.append(f"{tool} {modifier}")
                break

        return queries[:2]

    @staticmethod
    def _basename(path: str) -> str:
        """Return the last ``/``-separated component of a repository path."""
        return (path or "").rsplit("/", 1)[-1]

    @classmethod
    def _is_wanted_file(cls, filename: str) -> bool:
        """True when the file has a security-relevant extension or a high-value name."""
        ext = "." + filename.rsplit(".", 1)[-1] if "." in filename else ""
        if ext.lower() in SECURITY_EXTENSIONS:
            return True
        # The search result may carry a full repository path; compare the
        # bare file name as well so nested files can match.
        return (
            filename in HIGH_VALUE_FILENAMES
            or cls._basename(filename) in HIGH_VALUE_FILENAMES
        )

    @staticmethod
    def _decode_file_content(payload) -> str:
        """
        Decode the base64 ``content`` field of a repository-files API response.

        Returns the text (invalid UTF-8 bytes dropped) cut to MAX_FILE_SIZE
        characters, or "" when the payload has no usable content.
        """
        if not isinstance(payload, dict):
            return ""
        content_b64 = (payload.get("content") or "").replace("\n", "")
        if not content_b64:
            return ""
        try:
            text = base64.b64decode(content_b64).decode("utf-8", errors="ignore")
        except ValueError:
            # binascii.Error (malformed base64) is a ValueError subclass.
            return ""
        return text[:MAX_FILE_SIZE]

    def _build_code_page(
        self,
        raw_text: str,
        html_url: str,
        project_id,
        filename: str,
    ) -> dict:
        """
        Sanitise ``raw_text`` and wrap it in a pipeline page dict.

        Returns {} when the content is flagged by ``sanitize_content`` or the
        sanitised text is shorter than the minimum length.
        """
        clean, flagged = sanitize_content(raw_text)
        if flagged or not clean or len(clean.strip()) < self._MIN_CODE_CHARS:
            return {}

        repo = str(project_id)
        return {
            "url": html_url,
            "text_content": clean,
            "title": f"GitLab: project/{project_id} — {filename}",
            "source_type": "gitlab",
            "source_name": "GitLab",
            "gitlab_repo": repo,
            "gitlab_filename": filename,
            "gitlab_stars": 0,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "word_count": len(clean.split()),
            "relevance": self._score_relevance(clean, filename, repo),
        }

    async def _search_code(self, search_query: str) -> list[dict]:
        """
        Search GitLab code (blobs) and fetch file content.

        Returns the page dicts of the matching files that could be fetched;
        an empty list on rate limiting (429), missing authentication (401),
        any other non-200 status, an unexpected payload, or any error.
        """
        if not self._session:
            return []

        results: list[dict] = []

        try:
            params = {
                "scope": "blobs",
                "search": search_query,
                "per_page": MAX_CODE_RESULTS,
            }

            async with self._session.get(
                f"{GITLAB_API_BASE}/search",
                params=params,
            ) as resp:
                if resp.status == 429:
                    reset_at = resp.headers.get("RateLimit-Reset", "")
                    logger.warning(
                        "GitLab rate limit hit. Resets at %s",
                        reset_at,
                    )
                    return []

                if resp.status == 401:
                    logger.debug(
                        "GitLab code search: authentication required for this query"
                    )
                    return []

                if resp.status != 200:
                    return []

                items = await resp.json()

            if not isinstance(items, list):
                return []

            await asyncio.sleep(self._rate_limit_delay)

            # NOTE(review): these fetches are started together by gather(),
            # so the delay each one sleeps afterwards does not space the
            # requests out.
            fetch_tasks = [
                self._fetch_code_file(item)
                for item in items[:MAX_CODE_RESULTS]
                if isinstance(item, dict)
                and self._is_wanted_file(item.get("filename") or "")
            ]

            if fetch_tasks:
                fetched = await asyncio.gather(
                    *fetch_tasks, return_exceptions=True
                )
                results.extend(
                    page
                    for page in fetched
                    if isinstance(page, dict) and page.get("text_content")
                )

        except Exception as e:
            # Best-effort source: a failure here must not break the investigation.
            logger.debug("GitLab code search error: %s", e)

        return results

    async def _fetch_code_file(self, item: dict) -> dict:
        """
        Fetch the raw content of a GitLab file via the repository files API.

        ``item`` is one blob search result. When the file request does not
        return 200, the snippet GitLab included in the search result
        (``data``) is used instead. Returns a page dict, or {} when the item
        is incomplete, its URL is blocked, or no acceptable content remains.
        """
        if not self._session:
            return {}

        try:
            project_id = item.get("project_id")
            file_path = item.get("path") or ""
            # "ref" may be present but null; fall back the same way as a missing key.
            ref = item.get("ref") or "main"
            filename = item.get("filename") or ""

            if not project_id or not file_path:
                return {}

            # Build a synthetic html_url for is_blocked_url and result storage
            # (the code search result carries the project id, not its path).
            html_url = (
                f"https://gitlab.com/projects/{project_id}/-/blob/{ref}/{file_path}"
            )
            blocked, _ = is_blocked_url(html_url)
            if blocked:
                return {}

            # URL-encode the path (slashes must become %2F for the GitLab files API)
            encoded_path = quote(file_path, safe="")
            file_url = (
                f"{GITLAB_API_BASE}/projects/{project_id}"
                f"/repository/files/{encoded_path}"
            )

            # Pass ref via params so branch names with reserved characters
            # are encoded instead of corrupting the query string.
            async with self._session.get(file_url, params={"ref": ref}) as resp:
                if resp.status != 200:
                    # Fall back to the snippet GitLab included in the search result
                    snippet = item.get("data") or ""
                    if len(snippet.strip()) < self._MIN_CODE_CHARS:
                        return {}
                    return self._build_code_page(
                        snippet, html_url, project_id, filename
                    )
                data = await resp.json()

            await asyncio.sleep(self._rate_limit_delay / 2)

            content = self._decode_file_content(data)
            if not content:
                return {}

            return self._build_code_page(content, html_url, project_id, filename)

        except Exception as e:
            logger.debug("GitLab file fetch error: %s", e)
            return {}

    async def _search_repos(self, search_query: str) -> list[dict]:
        """
        Search GitLab projects and fetch README content.

        Projects whose name matches a noise pattern are skipped. Returns the
        README page dicts that could be fetched; an empty list on rate
        limiting, any non-200 status, an unexpected payload, or any error.
        """
        if not self._session:
            return []

        results: list[dict] = []

        try:
            params = {
                "scope": "projects",
                "search": search_query,
                "per_page": MAX_REPO_RESULTS,
                "order_by": "updated_at",
            }

            async with self._session.get(
                f"{GITLAB_API_BASE}/search",
                params=params,
            ) as resp:
                if resp.status == 429:
                    logger.warning("GitLab rate limit on project search")
                    return []

                if resp.status != 200:
                    return []

                items = await resp.json()

            if not isinstance(items, list):
                return []

            await asyncio.sleep(self._rate_limit_delay)

            fetch_tasks = [
                self._fetch_repo_readme(item)
                for item in items[:MAX_REPO_RESULTS]
                if isinstance(item, dict)
                and not self._is_noise_repo(item.get("name", ""))
            ]

            if fetch_tasks:
                fetched = await asyncio.gather(
                    *fetch_tasks, return_exceptions=True
                )
                results.extend(
                    page
                    for page in fetched
                    if isinstance(page, dict) and page.get("text_content")
                )

        except Exception as e:
            logger.debug("GitLab project search error: %s", e)

        return results

    async def _fetch_repo_readme(self, project: dict) -> dict:
        """
        Fetch README content for a GitLab project.

        Tries README.md, readme.md and README.txt on the project's default
        branch ("main" when none is reported) and uses the first one that
        decodes to non-empty text. Returns a page dict, or {} when the
        project has no id, its URL is blocked, no README is found, or the
        content is flagged or too short.
        """
        if not self._session:
            return {}

        try:
            project_id = project.get("id")
            if not project_id:
                return {}

            path_with_namespace = project.get("path_with_namespace") or ""
            default_branch = project.get("default_branch") or "main"
            # "web_url" may be present but null; "or" covers that as well as
            # the missing-key case.
            web_url = (
                project.get("web_url")
                or f"https://gitlab.com/{path_with_namespace}"
            )

            # Same content-safety gate the code-file path applies to its URL.
            blocked, _ = is_blocked_url(web_url)
            if blocked:
                return {}

            readme_content = ""
            for readme_name in self._README_NAMES:
                encoded_name = quote(readme_name, safe="")
                readme_url = (
                    f"{GITLAB_API_BASE}/projects/{project_id}"
                    f"/repository/files/{encoded_name}"
                )
                async with self._session.get(
                    readme_url, params={"ref": default_branch}
                ) as resp:
                    if resp.status != 200:
                        continue
                    readme_content = self._decode_file_content(await resp.json())
                if readme_content:
                    break

            await asyncio.sleep(self._rate_limit_delay / 2)

            if not readme_content:
                return {}

            clean_content, was_flagged = sanitize_content(readme_content)
            if (
                was_flagged
                or not clean_content
                or len(clean_content.strip()) < self._MIN_README_CHARS
            ):
                return {}

            display_name = path_with_namespace or str(project_id)

            return {
                "url": web_url,
                "text_content": clean_content,
                "title": f"GitLab: {display_name} — README",
                "source_type": "gitlab",
                "source_name": "GitLab",
                "gitlab_repo": display_name,
                "gitlab_filename": "README",
                "gitlab_stars": project.get("star_count", 0),
                "gitlab_description": project.get("description", ""),
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean_content.split()),
                "relevance": self._score_relevance(
                    clean_content, "README", display_name
                ),
            }

        except Exception as e:
            logger.debug("GitLab README fetch error: %s", e)
            return {}

    def _is_noise_repo(self, repo_name: str) -> bool:
        """Returns True if this repo should be skipped (tutorial, awesome list, etc.)."""
        name_lower = (repo_name or "").lower()
        # re.match anchors at the start only, so a pattern may match a prefix
        # of the name.
        return any(re.match(pattern, name_lower) for pattern in SKIP_REPO_PATTERNS)

    @classmethod
    def _contains_keyword(cls, text_lower: str, keyword: str) -> bool:
        """
        True when ``keyword`` occurs in the (already lower-cased) text.

        Long keywords are matched as substrings. Short ones must stand alone
        (optionally pluralised), delimited by anything that is not a letter
        or digit, so "rat" matches "rat.py" and "rats" but not "generate".
        """
        if len(keyword) > cls._SHORT_KEYWORD_MAX_LEN:
            return keyword in text_lower
        token = rf"(?<![a-z0-9]){re.escape(keyword)}s?(?![a-z0-9])"
        return re.search(token, text_lower) is not None

    def _score_relevance(
        self,
        content: str,
        filename: str,
        repo_name: str,
    ) -> int:
        """
        Score how relevant this file is.

        +5 when the file name is a high-value name, +3 for every IOC pattern
        found in the content, +2 for every security keyword found.
        ``repo_name`` is accepted for interface compatibility and is not
        used in the score.
        """
        content = content or ""
        content_lower = content.lower()
        score = 0

        if (
            filename in HIGH_VALUE_FILENAMES
            or self._basename(filename) in HIGH_VALUE_FILENAMES
        ):
            score += 5

        score += 3 * sum(
            1 for pattern in self._IOC_PATTERNS if pattern.search(content)
        )
        score += 2 * sum(
            1
            for keyword in self._SEC_KEYWORDS
            if self._contains_keyword(content_lower, keyword)
        )

        return score
594
+
595
+
596
+ # ---------------------------------------------------------------------------
597
+ # Module-level helpers
598
+ # ---------------------------------------------------------------------------
599
+
600
+
601
def _is_gitlab_scraping_enabled() -> bool:
    """
    Report whether GitLab scraping is switched on.

    Reads GITLAB_SCRAPING_ENABLED at call time so tests can monkey-patch it.
    The variable defaults to "true" when unset. Surrounding whitespace and
    letter case are ignored, and "true", "1", "yes" and "on" all enable the
    source; any other value (including an empty string) disables it.
    """
    raw = os.getenv("GITLAB_SCRAPING_ENABLED", "true")
    return raw.strip().lower() in {"true", "1", "yes", "on"}
604
+
605
+
606
async def scrape_gitlab(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_RESULTS,
) -> list[dict]:
    """
    Main entry point for GitLab scraping.

    Opens a GitLabScraper for the duration of the call and delegates to its
    ``search_and_fetch``. Returns the resulting list of page dicts, or an
    empty list (after logging) when GitLab scraping is disabled.
    """
    if _is_gitlab_scraping_enabled():
        async with GitLabScraper() as client:
            pages = await client.search_and_fetch(
                query=query,
                refined_query=refined_query,
                max_results=max_results,
            )
        return pages

    logger.info("GitLab scraping disabled")
    return []