voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/paste_scraper.py
ADDED
|
@@ -0,0 +1,484 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/paste_scraper.py — Clearnet paste site scraper for VoidAccess.
|
|
3
|
+
|
|
4
|
+
Searches public paste sites (Pastebin, dpaste, paste.ee, Rentry) for
|
|
5
|
+
intelligence relevant to an investigation query. Runs over CLEARNET — these
|
|
6
|
+
sites are public and do not require Tor.
|
|
7
|
+
|
|
8
|
+
Typical high-signal content found on paste sites:
|
|
9
|
+
- Stolen credentials & breach dumps
|
|
10
|
+
- Malware configs / C2 infrastructure
|
|
11
|
+
- IOC lists (hashes, IPs, domains)
|
|
12
|
+
- Ransomware negotiation logs
|
|
13
|
+
- Leaked private keys
|
|
14
|
+
|
|
15
|
+
Public API:
|
|
16
|
+
async def scrape_paste_sites(
|
|
17
|
+
query: str,
|
|
18
|
+
refined_query: str = "",
|
|
19
|
+
max_results: int = 15,
|
|
20
|
+
) -> list[dict]
|
|
21
|
+
|
|
22
|
+
Returns page dicts compatible with the existing scrape pipeline format:
|
|
23
|
+
{
|
|
24
|
+
"url": str,
|
|
25
|
+
"text_content": str,
|
|
26
|
+
"title": str,
|
|
27
|
+
"source_type": "paste_site",
|
|
28
|
+
"source_name": str,
|
|
29
|
+
"scraped_at": str,
|
|
30
|
+
"word_count": int,
|
|
31
|
+
"relevance": int,
|
|
32
|
+
}
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import asyncio
|
|
38
|
+
import logging
|
|
39
|
+
import os
|
|
40
|
+
import re
|
|
41
|
+
from datetime import datetime, timezone
|
|
42
|
+
from typing import Optional
|
|
43
|
+
from urllib.parse import quote_plus
|
|
44
|
+
|
|
45
|
+
import aiohttp
|
|
46
|
+
|
|
47
|
+
from utils.content_safety import is_blocked_query, sanitize_content
|
|
48
|
+
|
|
49
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Paste site configuration
# ---------------------------------------------------------------------------
#
# One entry per paste site:
#   search_url      URL template containing a ``{query}`` placeholder, or
#                   None when the site cannot be searched.
#   paste_url       URL template containing an ``{id}`` placeholder for the
#                   raw paste body.
#   result_pattern  regex whose single group captures paste IDs in the
#                   search result HTML.
#   rate_limit      delay in seconds associated with requests to the site.

PASTE_SOURCES = [
    dict(
        name="Pastebin",
        search_url="https://pastebin.com/search?q={query}",
        paste_url="https://pastebin.com/raw/{id}",
        result_pattern=r'href="/([a-zA-Z0-9]{8})"',
        requires_key=False,
        rate_limit=1.5,
    ),
    dict(
        name="Rentry",
        # Rentry has no public search endpoint — pastes are fetched via
        # direct URL when discovered through Tor results / enrichment.
        search_url=None,
        direct_urls=[],
        paste_url="https://rentry.co/{id}/raw",
        requires_key=False,
        rate_limit=1.0,
    ),
    dict(
        name="dpaste",
        search_url="https://dpaste.org/search/?q={query}",
        paste_url="https://dpaste.org/{id}/raw/",
        result_pattern=r'href="/([A-Z0-9]{5,8})/"',
        requires_key=False,
        rate_limit=1.0,
    ),
    dict(
        name="paste.ee",
        search_url="https://paste.ee/search?q={query}",
        paste_url="https://paste.ee/r/{id}",
        result_pattern=r'href="/p/([a-zA-Z0-9]+)"',
        requires_key=False,
        rate_limit=1.0,
    ),
]
|
|
92
|
+
|
|
93
|
+
# Default request headers sent with every paste-site request; they present
# the scraper as a desktop Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    # Only advertise encodings aiohttp can always decode. "br" is decodable
    # only when an optional Brotli package is installed; advertising it
    # unconditionally makes Brotli-encoded responses fail to decode.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}
|
|
107
|
+
|
|
108
|
+
# Scraping limits.
MAX_TOTAL_PASTES = 15        # default cap on the number of pastes returned
MAX_PASTES_PER_SOURCE = 5    # pastes fetched for a single source search
MAX_PASTE_SIZE = 1024 * 512  # largest paste body accepted (512 KiB)
|
|
111
|
+
|
|
112
|
+
# Bitcoin / IP / hash / email / onion / leak-keyword patterns. Pre-compiled
# once so the relevance scorer does not recompile on every paste.
_HIGH_VALUE_PATTERNS = [
    re.compile(r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b'),  # Bitcoin address (starts with 1 or 3)
    re.compile(r'\b[A-Fa-f0-9]{32}\b'),  # MD5
    re.compile(r'\b[A-Fa-f0-9]{64}\b'),  # SHA256
    re.compile(r'\bCVE-\d{4}-\d+\b', re.IGNORECASE),  # CVE identifier
    re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'),  # IPv4
    re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.'),  # Email
    re.compile(r'[a-zA-Z2-7]{16,56}\.onion', re.IGNORECASE),  # .onion host
    re.compile(r'-----BEGIN PGP'),  # PGP armour header
    # Leak / malware keywords. "c2" is anchored on word boundaries: left
    # unanchored it matches inside any hex string (e.g. "ac2f"), which would
    # flag nearly every paste that contains a hash.
    re.compile(
        r'password|passwd|credentials|leaked|dump|breach|config|\bc2\b|'
        r'command.control',
        re.IGNORECASE,
    ),
]
|
|
129
|
+
|
|
130
|
+
# Patterns used to pull specific technical terms out of a query. Each has
# exactly one capture group holding the extracted term.
_TECH_PATTERNS = [
    # CVE identifiers, any letter case.
    re.compile(r'\b(CVE-\d{4}-\d+)\b', flags=re.I),
    # CamelCase tool names
    re.compile(r'\b([A-Z][a-z]+[A-Z][a-z]+)\b'),
    # Named offensive tools and ransomware families, any letter case.
    re.compile(
        r'\b(cobalt strike|metasploit|mimikatz|lockbit|blackcat|alphv|'
        r'revil|conti|ryuk|maze|darkside)\b',
        flags=re.I,
    ),
]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _is_paste_scraping_enabled() -> bool:
    """
    Return True if PASTE_SCRAPING_ENABLED is unset or set to a truthy value.

    Truthy values are "true", "1", "yes" and "on", compared case-insensitively
    after stripping surrounding whitespace. Any other value, including the
    empty string, disables paste scraping.
    """
    flag = os.getenv("PASTE_SCRAPING_ENABLED", "true")
    return flag.strip().lower() in {"true", "1", "yes", "on"}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
# PasteScraper
|
|
148
|
+
# ---------------------------------------------------------------------------
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class PasteScraper:
    """
    Scrapes paste sites for intelligence relevant to an investigation query.

    Use as an async context manager so the underlying aiohttp session is
    created on entry and properly closed on exit. Outside the ``async with``
    block there is no session, and the search / fetch helpers return empty
    results.
    """

    # Chunk size used while streaming a paste body under the size cap.
    _READ_CHUNK_SIZE = 64 * 1024

    def __init__(self) -> None:
        # Set in __aenter__, cleared in __aexit__.
        self._session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self) -> "PasteScraper":
        self._session = aiohttp.ClientSession(
            headers=HEADERS,
            timeout=aiohttp.ClientTimeout(total=30),
        )
        return self

    async def __aexit__(self, *args) -> None:
        if self._session is not None:
            await self._session.close()
            self._session = None

    # -----------------------------------------------------------------------
    # Public entry point
    # -----------------------------------------------------------------------

    async def search_and_fetch(
        self,
        query: str,
        refined_query: str = "",
        max_results: int = MAX_TOTAL_PASTES,
    ) -> list[dict]:
        """
        Search all configured paste sources and fetch relevant content.

        Args:
            query: The investigation query.
            refined_query: Optional alternative phrasing of *query*; when it
                differs from *query* it is searched as well.
            max_results: Maximum number of page dicts to return.

        Returns:
            Page dicts (see module docstring for shape), de-duplicated by URL
            and sorted by descending ``relevance``. Empty when either query
            string is rejected by the content-safety check.
        """
        # Screen every caller-supplied string that will be sent to a search
        # endpoint; the extracted technical terms are derived from *query*.
        screened = [query]
        if refined_query and refined_query != query:
            screened.append(refined_query)
        for candidate in screened:
            blocked, _ = is_blocked_query(candidate)
            if blocked:
                logger.warning(
                    "Paste scraping blocked — prohibited query"
                )
                return []

        search_terms = self._build_search_terms(query, refined_query)

        logger.info(
            "Paste scraping: '%s' across %d sources",
            query[:50],
            len(PASTE_SOURCES),
        )

        # Run every (source, term) pair concurrently. Sources without a
        # search URL cannot be searched and are skipped.
        tasks = [
            self._scrape_source(source, term)
            for source in PASTE_SOURCES
            if source.get("search_url")
            for term in search_terms[:2]
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_results: list[dict] = []
        seen_urls: set[str] = set()
        for result in results:
            if isinstance(result, BaseException):
                logger.debug("Paste scrape task failed: %s", result)
                continue
            if not isinstance(result, list):
                continue
            for page in result:
                url = page.get("url", "")
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_results.append(page)

        all_results.sort(
            key=lambda x: x.get("relevance", 0),
            reverse=True,
        )

        final = all_results[:max_results]
        logger.info("Paste scraping: found %d pastes", len(final))
        return final

    # -----------------------------------------------------------------------
    # Internals
    # -----------------------------------------------------------------------

    def _build_search_terms(
        self,
        query: str,
        refined_query: str,
    ) -> list[str]:
        """
        Build up to three distinct, non-empty search terms.

        Order: the refined query (when given and different from *query*), the
        query itself, then the first match of each technical pattern found in
        *query*. Query strings are truncated to 100 characters. Blank terms
        and case-insensitive duplicates are dropped so no request is wasted
        on them.
        """
        candidates: list[str] = []

        if refined_query and refined_query != query:
            candidates.append(refined_query[:100])

        candidates.append(query[:100])

        for pattern in _TECH_PATTERNS:
            for m in pattern.findall(query)[:1]:
                # findall yields tuples when a pattern has several groups.
                candidates.append(m if isinstance(m, str) else m[0])

        terms: list[str] = []
        seen: set[str] = set()
        for candidate in candidates:
            term = candidate.strip()
            key = term.lower()
            if term and key not in seen:
                seen.add(key)
                terms.append(term)

        return terms[:3]

    async def _scrape_source(
        self,
        source: dict,
        search_term: str,
    ) -> list[dict]:
        """
        Search one paste source and fetch matching paste contents.

        Returns only pages whose relevance score is positive. Any error is
        logged at debug level and whatever was collected so far is returned.
        """
        results: list[dict] = []

        try:
            paste_ids = await self._search_source(source, search_term)
            if not paste_ids:
                return []

            fetch_tasks = [
                self._fetch_paste(source, paste_id)
                for paste_id in paste_ids[:MAX_PASTES_PER_SOURCE]
            ]
            pages = await asyncio.gather(
                *fetch_tasks,
                return_exceptions=True,
            )

            for page in pages:
                # Failed fetches come back as exceptions or empty dicts.
                if isinstance(page, dict) and page.get("text_content"):
                    page["relevance"] = self._score_relevance(
                        page["text_content"],
                        search_term,
                    )
                    if page["relevance"] > 0:
                        results.append(page)
        except Exception as exc:
            logger.debug(
                "Paste source %s error: %s",
                source.get("name", "?"),
                exc,
            )

        return results

    async def _search_source(
        self,
        source: dict,
        search_term: str,
    ) -> list[str]:
        """
        Issue a search request and extract paste IDs from the result HTML.

        Returns at most 10 unique IDs in order of first appearance, or an
        empty list when there is no session, the source has no search URL or
        result pattern, the response is not HTTP 200, or the request fails.
        """
        if self._session is None:
            return []

        search_url_template = source.get("search_url")
        if not search_url_template:
            return []

        encoded_term = quote_plus(search_term)
        search_url = search_url_template.format(query=encoded_term)

        try:
            async with self._session.get(
                search_url,
                allow_redirects=True,
            ) as resp:
                if resp.status != 200:
                    return []
                html = await resp.text(
                    encoding="utf-8",
                    errors="ignore",
                )

            pattern = source.get("result_pattern") or ""
            if not pattern:
                return []

            # dict.fromkeys de-duplicates while keeping first-seen order.
            unique_ids = list(dict.fromkeys(re.findall(pattern, html)))

            # NOTE(review): this pause only delays the paste fetches that
            # follow this search; searches for other terms run concurrently,
            # so it is not a per-site request-rate limit.
            await asyncio.sleep(source.get("rate_limit", 1.0))
            return unique_ids[:10]
        except Exception as exc:
            logger.debug(
                "Search failed for %s: %s",
                source.get("name", "?"),
                exc,
            )
            return []

    @staticmethod
    async def _read_capped(resp: aiohttp.ClientResponse) -> str:
        """Read at most MAX_PASTE_SIZE bytes of *resp* and decode as UTF-8."""
        body = bytearray()
        async for chunk in resp.content.iter_chunked(
            PasteScraper._READ_CHUNK_SIZE
        ):
            body.extend(chunk)
            if len(body) >= MAX_PASTE_SIZE:
                # Stop downloading: the rest would be discarded anyway.
                break
        # errors="ignore" also drops a multi-byte character cut by the cap.
        return bytes(body[:MAX_PASTE_SIZE]).decode("utf-8", errors="ignore")

    async def _fetch_paste(
        self,
        source: dict,
        paste_id: str,
    ) -> dict:
        """
        Fetch the raw content of a single paste.

        Returns a page dict without the ``relevance`` key, or an empty dict
        when there is no session, the response is not HTTP 200, the declared
        size exceeds MAX_PASTE_SIZE, the content is flagged by the safety
        filter, fewer than 50 non-blank characters remain, or the request
        fails. Bodies are streamed and truncated at MAX_PASTE_SIZE bytes, so
        a missing or wrong Content-Length cannot cause an unbounded download.
        """
        if self._session is None:
            return {}

        paste_url = source["paste_url"].format(id=paste_id)

        try:
            async with self._session.get(
                paste_url,
                allow_redirects=True,
            ) as resp:
                if resp.status != 200:
                    return {}

                # Cheap rejection when the server declares an oversized body.
                try:
                    declared_size = int(
                        resp.headers.get("content-length", "0")
                    )
                except ValueError:
                    declared_size = 0
                if declared_size > MAX_PASTE_SIZE:
                    return {}

                content = await self._read_capped(resp)

            clean_content, was_flagged = sanitize_content(content)
            if was_flagged:
                logger.info("Paste content blocked: %s", paste_url)
                return {}

            if not clean_content or len(clean_content.strip()) < 50:
                return {}

            return {
                "url": paste_url,
                "text_content": clean_content,
                "title": f"{source['name']} — {paste_id}",
                "source_type": "paste_site",
                "source_name": source["name"],
                "scraped_at": datetime.now(timezone.utc).isoformat(),
                "word_count": len(clean_content.split()),
            }
        except Exception as exc:
            logger.debug("Fetch failed %s: %s", paste_url, exc)
            return {}

    def _score_relevance(self, content: str, search_term: str) -> int:
        """
        Score how relevant a paste is. Higher = more relevant.

        Scoring (term matching is case-insensitive):
            +10 when the whole search term occurs in the content,
            +2  for each word of the term longer than 3 characters that
                occurs in the content,
            +3  for each high-value pattern that matches the content.

        Returns 0 when either argument is empty.
        """
        if not content or not search_term:
            return 0

        content_lower = content.lower()
        term_lower = search_term.lower()
        score = 0

        if term_lower in content_lower:
            score += 10

        for word in term_lower.split():
            if len(word) > 3 and word in content_lower:
                score += 2

        for pattern in _HIGH_VALUE_PATTERNS:
            if pattern.search(content):
                score += 3

        return score
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
# ---------------------------------------------------------------------------
|
|
429
|
+
# Module-level robots.txt check (safety helper)
|
|
430
|
+
# ---------------------------------------------------------------------------
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
async def check_robots_txt(
    session: aiohttp.ClientSession,
    base_url: str,
    path: str,
) -> bool:
    """
    Report whether the robots.txt served under *base_url* permits *path*.

    The check is evaluated for the wildcard ("*") user agent. It is
    deliberately fail-open: a non-200 robots.txt response, or any error while
    fetching or parsing the file, is treated as "allowed" and yields True.
    """
    try:
        from urllib.robotparser import RobotFileParser

        target = "{}/robots.txt".format(base_url.rstrip("/"))
        request_timeout = aiohttp.ClientTimeout(total=5)

        async with session.get(target, timeout=request_timeout) as response:
            if response.status == 200:
                robots_body = await response.text(
                    encoding="utf-8",
                    errors="ignore",
                )
                parser = RobotFileParser()
                parser.parse(robots_body.splitlines())
                return parser.can_fetch("*", path)
            # No usable robots.txt: nothing forbids the fetch.
            return True
    except Exception:
        return True
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
# ---------------------------------------------------------------------------
|
|
462
|
+
# Public entry point
|
|
463
|
+
# ---------------------------------------------------------------------------
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
async def scrape_paste_sites(
    query: str,
    refined_query: str = "",
    max_results: int = MAX_TOTAL_PASTES,
) -> list[dict]:
    """
    Public entry point — fetch paste site results for *query*.

    Opens a PasteScraper for the duration of the call and returns whatever
    its ``search_and_fetch`` produces. Returns [] without doing any network
    work when paste scraping is disabled via env var.
    """
    if _is_paste_scraping_enabled():
        async with PasteScraper() as scraper:
            pages = await scraper.search_and_fetch(
                query=query,
                refined_query=refined_query,
                max_results=max_results,
            )
        return pages

    logger.info("Paste scraping disabled via PASTE_SCRAPING_ENABLED=false")
    return []
|