urlscan-python 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- urlscan/__init__.py +10 -0
- urlscan/_version.py +1 -0
- urlscan/_version.pyi +1 -0
- urlscan/client.py +652 -0
- urlscan/error.py +27 -0
- urlscan/iterator.py +91 -0
- urlscan/types.py +6 -0
- urlscan/utils.py +6 -0
- urlscan_python-0.0.1.dist-info/METADATA +98 -0
- urlscan_python-0.0.1.dist-info/RECORD +12 -0
- urlscan_python-0.0.1.dist-info/WHEEL +4 -0
- urlscan_python-0.0.1.dist-info/licenses/LICENSE +21 -0
urlscan/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
try:
|
|
2
|
+
from ._version import version
|
|
3
|
+
|
|
4
|
+
__version__ = version
|
|
5
|
+
except ImportError:
|
|
6
|
+
__version__ = "0.0.0"
|
|
7
|
+
|
|
8
|
+
from .client import Client # noqa: F401
|
|
9
|
+
from .error import APIError, RateLimitError # noqa: F401
|
|
10
|
+
from .iterator import SearchIterator # noqa: F401
|
urlscan/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version = "0.0.1"
|
urlscan/_version.pyi
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version: str
|
urlscan/client.py
ADDED
|
@@ -0,0 +1,652 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import datetime
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from typing import Any, BinaryIO, TypedDict
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
from httpx._types import QueryParamTypes, RequestData, TimeoutTypes
|
|
13
|
+
|
|
14
|
+
from ._version import version
|
|
15
|
+
from .error import APIError, RateLimitError, RateLimitRemainingError
|
|
16
|
+
from .iterator import SearchIterator
|
|
17
|
+
from .types import ActionType, VisibilityType
|
|
18
|
+
from .utils import parse_datetime
|
|
19
|
+
|
|
20
|
+
# Package-wide logger, registered under the name "urlscan-python".
logger = logging.getLogger("urlscan-python")

# Default API origin; the URLSCAN_BASE_URL environment variable overrides it.
BASE_URL = os.getenv("URLSCAN_BASE_URL", "https://urlscan.io")
# Default User-Agent header value, tagged with the package version.
USER_AGENT = f"urlscan-py/{version}"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _compact(d: dict) -> dict:
    """Return a copy of ``d`` without the entries whose value is ``None``.

    Other falsy values (``0``, ``""``, ``False``, empty containers) are kept.
    """
    compacted = {}
    for key, value in d.items():
        if value is None:
            continue
        compacted[key] = value
    return compacted
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RetryTransport(httpx.HTTPTransport):
    """HTTP transport that waits and retries when the server answers 429.

    The wait time is read from the ``X-Rate-Limit-Reset-After`` response
    header. A 429 response without that header, or with a value that is not
    a number, is handed back to the caller untouched.
    """

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Send ``request``, re-sending it for as long as it is rate limited.

        Args:
            request (httpx.Request): Request to send.

        Returns:
            httpx.Response: The first response that is not a retryable 429.
        """
        # A loop (rather than recursion) keeps the stack flat however many
        # consecutive 429 responses are received.
        while True:
            res = super().handle_request(request)
            if res.status_code != 429:
                return res

            rate_limit_reset_after: str | None = res.headers.get(
                "X-Rate-Limit-Reset-After"
            )
            if rate_limit_reset_after is None:
                return res

            try:
                delay = float(rate_limit_reset_after)
            except ValueError:
                # Unusable header value: let the caller deal with the 429.
                return res

            logger.info(
                f"Rate limit error hit. Wait {rate_limit_reset_after} seconds before retrying..."
            )
            # This 429 response is discarded, so close it before sleeping
            # rather than leaving its connection checked out of the pool.
            res.close()
            # time.sleep() rejects negative values.
            time.sleep(max(delay, 0.0))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ClientResponse:
    """Thin read-only wrapper around the ``httpx.Response`` it is given."""

    def __init__(self, res: httpx.Response):
        self._res = res

    @property
    def basename(self) -> str:
        """Last segment of the path of the URL the response came from."""
        url_path = self._res.url.path
        return os.path.basename(url_path)

    @property
    def content(self) -> bytes:
        """Raw response body."""
        return self._res.content

    def json(self) -> Any:
        """Response body decoded as JSON."""
        return self._res.json()

    @property
    def text(self) -> str:
        """Response body decoded as text."""
        return self._res.text

    @property
    def headers(self):
        """Response headers, exactly as exposed by the wrapped response."""
        return self._res.headers

    @property
    def status_code(self) -> int:
        """HTTP status code."""
        return self._res.status_code

    def raise_for_status(self) -> None:
        """Delegate to the wrapped response's ``raise_for_status()``."""
        self._res.raise_for_status()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class RateLimit:
    """Rate-limit state last reported by the API for one action."""

    # Value of the X-Rate-Limit-Remaining response header.
    remaining: int
    # Parsed value of the X-Rate-Limit-Reset response header.
    reset: datetime.datetime
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class RateLimitMemo(TypedDict):
    """Most recent ``RateLimit`` per API action; ``None`` until one is seen."""

    # Scan submissions, one slot per visibility value.
    public: RateLimit | None
    private: RateLimit | None
    unlisted: RateLimit | None
    # GET requests under /api/v1/result/.
    retrieve: RateLimit | None
    # GET requests to /api/v1/search/.
    search: RateLimit | None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Client:
|
|
96
|
+
def __init__(
    self,
    api_key: str,
    base_url: str = BASE_URL,
    user_agent: str = USER_AGENT,
    trust_env: bool = False,
    timeout: TimeoutTypes = 60,
    proxy: str | None = None,
    verify: bool = True,
    retry: bool = False,
):
    """Store the client configuration; no connection is opened here.

    Args:
        api_key (str): Your urlscan.io API key.
        base_url (str, optional): Base URL. Defaults to BASE_URL.
        user_agent (str, optional): User agent. Defaults to USER_AGENT.
        trust_env (bool, optional): Enable or disable usage of environment variables for configuration. Defaults to False.
        timeout (TimeoutTypes, optional): Timeout configuration to use when sending requests. Defaults to 60.
        proxy (str | None, optional): Proxy URL where all the traffic should be routed. Defaults to None.
        verify (bool, optional): Either `True` to use an SSL context with the default CA bundle, `False` to disable verification. Defaults to True.
        retry (bool, optional): Whether to use automatic X-Rate-Limit-Reset-After HTTP header based retry. Defaults to False.
    """
    # Connection settings, consumed when the HTTP session is first created.
    self._base_url = base_url
    self._api_key = api_key
    self._user_agent = user_agent
    self._timeout = timeout
    self._proxy = proxy
    self._verify = verify
    self._trust_env = trust_env
    self._retry = retry

    # The HTTP session is created lazily by _get_session().
    self._session: httpx.Client | None = None

    # Last rate-limit state seen per action; nothing is known up front.
    self._rate_limit_memo: RateLimitMemo = RateLimitMemo(
        public=None,
        private=None,
        unlisted=None,
        retrieve=None,
        search=None,
    )

    # Scan UUID -> submission time (time.time()), used by wait_for_result().
    self._scan_uuid_timestamp_memo: dict[str, float] = {}
|
|
137
|
+
|
|
138
|
+
def __enter__(self):
    """Enter the ``with`` block; the client itself is the managed object."""
    return self
|
|
140
|
+
|
|
141
|
+
def __exit__(self, item_type: Any, value: Any, traceback: Any):
    """Leave the ``with`` block by closing the client.

    Nothing is returned, so an exception raised inside the block propagates.
    """
    self.close()
|
|
143
|
+
|
|
144
|
+
def close(self):
    """Close the HTTP session if one was opened, then forget it."""
    session = self._session
    if not session:
        return

    session.close()
    self._session = None
|
|
148
|
+
|
|
149
|
+
def _get_session(self) -> httpx.Client:
    """Return the shared ``httpx.Client``, creating it on first use.

    Returns:
        httpx.Client: Session configured from the constructor arguments.
    """
    if self._session:
        return self._session

    headers = _compact(
        {
            "User-Agent": self._user_agent,
            "API-Key": self._api_key,
        }
    )

    if self._retry:
        # httpx uses an explicitly supplied transport as-is: the client's
        # `verify` setting is not applied to it, and a client-level `proxy`
        # is mounted ahead of it (which would bypass the retry logic).
        # Both settings therefore have to be given to the transport itself.
        transport = RetryTransport(
            verify=self._verify,
            trust_env=self._trust_env,
            proxy=self._proxy,
        )
        self._session = httpx.Client(
            base_url=self._base_url,
            headers=headers,
            timeout=self._timeout,
            trust_env=self._trust_env,
            transport=transport,
        )
    else:
        self._session = httpx.Client(
            base_url=self._base_url,
            headers=headers,
            timeout=self._timeout,
            proxy=self._proxy,
            verify=self._verify,
            trust_env=self._trust_env,
        )

    return self._session
|
|
173
|
+
|
|
174
|
+
def _get_action(self, request: httpx.Request) -> ActionType | None:
    """Work out which rate-limited action a request belongs to.

    Args:
        request (httpx.Request): Request about to be sent.

    Returns:
        ActionType | None: "search" or "retrieve" for the matching GET
        endpoints, the "visibility" value of the JSON body for a scan
        submission, or None when no action can be determined.
    """
    path = request.url.path
    method = request.method

    if method == "GET":
        if path == "/api/v1/search/":
            return "search"

        if path.startswith("/api/v1/result/"):
            return "retrieve"

        return None

    if method == "POST":
        if path != "/api/v1/scan/":
            return None

        if request.headers.get("Content-Type") != "application/json":
            return None

        # ValueError covers both malformed JSON (json.JSONDecodeError) and
        # undecodable bytes (UnicodeDecodeError).
        with contextlib.suppress(ValueError):
            data = json.loads(request.content)
            # Only a JSON object can carry a visibility; any other JSON
            # value (list, string, number, ...) means "no action".
            if isinstance(data, dict):
                return data.get("visibility")

    return None
|
|
197
|
+
|
|
198
|
+
def _send_request(
    self, session: httpx.Client, request: httpx.Request
) -> ClientResponse:
    """Send a request, honouring and recording the API rate limits.

    With retry enabled the request is sent straight away and rate limits
    are left to the retrying transport. Otherwise the request is refused
    locally when the last known state for its action says no calls remain,
    and the rate-limit headers of the response are memoized afterwards.

    Args:
        session (httpx.Client): Session used to send the request.
        request (httpx.Request): Request to send.

    Returns:
        ClientResponse: Wrapped response.

    Raises:
        RateLimitRemainingError: If the memoized rate limit for the action
            is exhausted and has not been reset yet.
    """
    # let it automatic retry if retry is enabled
    if self._retry:
        return ClientResponse(session.send(request))

    action = self._get_action(request)
    if action:
        rate_limit: RateLimit | None = self._rate_limit_memo.get(action)
        if rate_limit:
            utcnow = datetime.datetime.now(datetime.timezone.utc)
            # NOTE(review): this comparison assumes parse_datetime() returns
            # a timezone-aware datetime - confirm against urlscan/utils.py.
            if rate_limit.remaining == 0 and rate_limit.reset > utcnow:
                # Tell the caller when the limit resets, not the current time.
                raise RateLimitRemainingError(
                    f"{action} is rate limited. Wait until {rate_limit.reset}."
                )

    res = ClientResponse(session.send(request))

    # use action in response headers
    action = res.headers.get("X-Rate-Limit-Action")
    if action:
        remaining = res.headers.get("X-Rate-Limit-Remaining")
        reset = res.headers.get("X-Rate-Limit-Reset")
        if remaining and reset:
            self._rate_limit_memo[action] = RateLimit(
                remaining=int(remaining),
                reset=parse_datetime(reset),
            )

    return res
|
|
229
|
+
|
|
230
|
+
def get(self, path: str, params: QueryParamTypes | None = None) -> ClientResponse:
    """Issue a GET request against an API endpoint.

    Args:
        path (str): Path to API endpoint.
        params (QueryParamTypes | None, optional): Query parameters. Defaults to None.

    Returns:
        ClientResponse: Response.
    """
    http = self._get_session()
    request = http.build_request("GET", path, params=params)
    return self._send_request(http, request)
|
|
243
|
+
|
|
244
|
+
def get_json(self, path: str, params: QueryParamTypes | None = None) -> dict:
    """GET an endpoint and return its JSON body, raising on an API error.

    Args:
        path (str): Path to API endpoint.
        params (QueryParamTypes | None, optional): Query parameters. Defaults to None.

    Returns:
        dict: Decoded JSON body.
    """
    return self._response_to_json(self.get(path, params=params))
|
|
247
|
+
|
|
248
|
+
def post(
    self,
    path: str,
    json: Any | None = None,
    data: RequestData | None = None,
) -> ClientResponse:
    """Issue a POST request against an API endpoint.

    Args:
        path (str): Path.
        json (Any | None, optional): Dict to send in request body as JSON. Defaults to None.
        data (RequestData | None, optional): Dict to send in request body. Defaults to None.

    Returns:
        ClientResponse: Response.
    """
    http = self._get_session()
    request = http.build_request("POST", path, json=json, data=data)
    return self._send_request(http, request)
|
|
267
|
+
|
|
268
|
+
def download(
    self,
    path: str,
    file: BinaryIO,
    params: QueryParamTypes | None = None,
) -> None:
    """Download a file from a given API endpoint.

    Args:
        path (str): Path to API endpoint.
        file (BinaryIO): File object to write to.
        params (QueryParamTypes | None, optional): Query parameters. Defaults to None.

    Raises:
        APIError: If the API answers with an error status. Nothing is
            written to ``file`` in that case.
    """
    res = self.get(path, params=params)
    # Check the response first so an error payload is never written to the
    # file as if it were the requested content (same as get_content()).
    file.write(self._response_to_content(res))
|
|
287
|
+
|
|
288
|
+
def get_content(self, path: str, params: QueryParamTypes | None = None) -> bytes:
    """GET an endpoint and return its raw body, raising on an API error.

    Args:
        path (str): Path to API endpoint.
        params (QueryParamTypes | None, optional): Query parameters. Defaults to None.

    Returns:
        bytes: Response body.
    """
    return self._response_to_content(self.get(path, params=params))
|
|
291
|
+
|
|
292
|
+
def get_text(self, path: str, params: QueryParamTypes | None = None) -> str:
    """GET an endpoint and return its body as text, raising on an API error.

    Args:
        path (str): Path to API endpoint.
        params (QueryParamTypes | None, optional): Query parameters. Defaults to None.

    Returns:
        str: Response body.
    """
    return self._response_to_str(self.get(path, params=params))
|
|
295
|
+
|
|
296
|
+
def get_result(self, uuid: str) -> dict:
    """Fetch the result of a scan.

    Args:
        uuid (str): UUID of the scan.

    Returns:
        dict: Scan result.

    Reference:
        https://urlscan.io/docs/api/#result
    """
    result_path = f"/api/v1/result/{uuid}/"
    return self.get_json(result_path)
|
|
309
|
+
|
|
310
|
+
def get_screenshot(self, uuid: str) -> BytesIO:
    """Get a screenshot of a scan by UUID.

    Args:
        uuid (str): UUID.

    Returns:
        BytesIO: Screenshot (img/png) as an in-memory file whose ``name``
        is the file name taken from the response URL.

    Raises:
        APIError: If the API answers with an error status.

    Reference:
        https://urlscan.io/docs/api/#screenshot
    """
    res = self.get(f"/screenshots/{uuid}.png")
    # Check the response first so an error payload is never handed back as
    # if it were image data.
    bio = BytesIO(self._response_to_content(res))
    bio.name = res.basename
    return bio
|
|
326
|
+
|
|
327
|
+
def get_dom(self, uuid: str) -> str:
    """Fetch the DOM captured for a scan.

    Args:
        uuid (str): UUID of the scan.

    Returns:
        str: DOM as a string.

    Reference:
        https://urlscan.io/docs/api/#dom
    """
    dom_path = f"/dom/{uuid}/"
    return self.get_text(dom_path)
|
|
340
|
+
|
|
341
|
+
def search(
    self,
    q: str = "",
    size: int = 100,
    limit: int | None = None,
    search_after: str | None = None,
) -> SearchIterator:
    """Build an iterator over the results of a search query.

    Args:
        q (str): Query term. Defaults to "".
        size (int, optional): Number of results returned in a search. Defaults to 100.
        limit (int | None, optional): Maximum number of results that will be returned by the iterator. Defaults to None.
        search_after (str | None, optional): Search after to retrieve next results. Defaults to None.

    Returns:
        SearchIterator: Search iterator.

    Reference:
        https://urlscan.io/docs/api/#search
    """
    iterator = SearchIterator(
        self,
        q=q,
        search_after=search_after,
        size=size,
        limit=limit,
    )
    return iterator
|
|
369
|
+
|
|
370
|
+
def scan(
    self,
    url: str,
    *,
    visibility: VisibilityType,
    tags: list[str] | None = None,
    customagent: str | None = None,
    referer: str | None = None,
    override_safety: Any = None,
    country: str | None = None,
) -> dict:
    """Submit a URL for scanning.

    Args:
        url (str): URL to scan.
        visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
        tags (list[str] | None, optional): Tags to be attached. Defaults to None.
        customagent (str | None, optional): Custom user agent. Defaults to None.
        referer (str | None, optional): Referer. Defaults to None.
        override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
        country (str | None, optional): Specify which country the scan should be performed from (2-letter ISO-3166-1 alpha-2 country code). Defaults to None.

    Returns:
        dict: Scan response.

    Reference:
        https://urlscan.io/docs/api/#scan
    """
    options = {
        "url": url,
        "tags": tags,
        "visibility": visibility,
        "customagent": customagent,
        "referer": referer,
        "overrideSafety": override_safety,
        "country": country,
    }
    # Options left at None are not sent at all.
    payload = {key: value for key, value in options.items() if value is not None}

    response = self.post("/api/v1/scan/", json=payload)
    json_res = self._response_to_json(response)

    # The response may report a different visibility than the one requested.
    enforced = json_res.get("visibility")
    if not (enforced is None or enforced == visibility):
        logger.warning(f"Visibility is enforced to {enforced}.")

    # Remember when this scan was submitted so wait_for_result() can take
    # the time already elapsed into account.
    uuid = json_res.get("uuid")
    if isinstance(uuid, str):
        self._scan_uuid_timestamp_memo[uuid] = time.time()

    return json_res
|
|
422
|
+
|
|
423
|
+
def bulk_scan(
    self,
    urls: list[str],
    *,
    visibility: VisibilityType,
    tags: list[str] | None = None,
    customagent: str | None = None,
    referer: str | None = None,
    override_safety: Any = None,
    country: str | None = None,
) -> list[tuple[str, dict | Exception]]:
    """Submit several URLs for scanning, one after the other.

    A failure for one URL does not stop the others: the exception is
    returned in place of that URL's scan response.

    Args:
        urls (list[str]): List of URLs to scan.
        visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
        tags (list[str] | None, optional): Tags to be attached. Defaults to None.
        customagent (str | None, optional): Custom user agent. Defaults to None.
        referer (str | None, optional): Referer. Defaults to None.
        override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
        country (str | None, optional): Specify which country the scan should be performed from (2-letter ISO-3166-1 alpha-2 country code). Defaults to None.

    Returns:
        list[tuple[str, dict | Exception]]: A list of tuples of (url, scan response or error).

    Reference:
        https://urlscan.io/docs/api/#scan
    """
    outcomes: list[tuple[str, dict | Exception]] = []
    for url in urls:
        outcome: dict | Exception
        try:
            outcome = self.scan(
                url,
                visibility=visibility,
                tags=tags,
                customagent=customagent,
                referer=referer,
                override_safety=override_safety,
                country=country,
            )
        except Exception as exc:
            # Collected, not raised, so the remaining URLs are still scanned.
            outcome = exc
        outcomes.append((url, outcome))
    return outcomes
|
|
467
|
+
|
|
468
|
+
def wait_for_result(
    self,
    uuid: str,
    timeout: float = 60.0,
    interval: float = 1.0,
    initial_wait: float | None = 10.0,
) -> None:
    """Block until the result of a scan can be retrieved.

    The result endpoint is polled with HEAD requests until it answers 200.

    Args:
        uuid (str): UUID of a result.
        timeout (float, optional): Timeout in seconds. Defaults to 60.0.
        interval (float, optional): Interval in seconds. Defaults to 1.0.
        initial_wait (float | None, optional): Initial wait time in seconds. Set None to disable. Defaults to 10.0.

    Raises:
        TimeoutError: If the result is still unavailable once more than
            ``timeout`` seconds have passed since polling started.
    """
    session = self._get_session()
    probe = session.build_request("HEAD", f"/api/v1/result/{uuid}/")

    # When this client submitted the scan itself, hold off until
    # `initial_wait` seconds have passed since the submission.
    scanned_at = self._scan_uuid_timestamp_memo.get(uuid)
    if scanned_at and initial_wait:
        elapsed = time.time() - scanned_at
        if elapsed < initial_wait:
            time.sleep(initial_wait - elapsed)

    # The timeout only covers the polling phase, not the initial wait.
    polling_started = time.time()
    while self._send_request(session, probe).status_code != 200:
        if time.time() - polling_started > timeout:
            raise TimeoutError("Timeout waiting for scan result.")

        time.sleep(interval)

    # The result is available: the submission timestamp is no longer needed.
    self._scan_uuid_timestamp_memo.pop(uuid, None)
|
|
503
|
+
|
|
504
|
+
def scan_and_get_result(
    self,
    url: str,
    visibility: VisibilityType,
    tags: list[str] | None = None,
    customagent: str | None = None,
    referer: str | None = None,
    override_safety: Any = None,
    country: str | None = None,
    timeout: float = 60.0,
    interval: float = 1.0,
    initial_wait: float | None = 10.0,
):
    """Submit a URL, wait until its result is available and return it.

    Args:
        url (str): URL to scan.
        visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
        tags (list[str] | None, optional): Tags to be attached. Defaults to None.
        customagent (str | None, optional): Custom user agent. Defaults to None.
        referer (str | None, optional): Referer. Defaults to None.
        override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
        country (str | None, optional): Specify which country the scan should be performed from (2-letter ISO-3166-1 alpha-2 country code). Defaults to None.
        timeout (float, optional): Timeout for waiting a result in seconds. Defaults to 60.0.
        interval (float, optional): Interval in seconds. Defaults to 1.0.
        initial_wait (float | None, optional): Initial wait time in seconds. Set None to disable. Defaults to 10.0.

    Returns:
        dict: Scan result.

    Reference:
        https://urlscan.io/docs/api/#scan
    """
    scan_options = {
        "visibility": visibility,
        "tags": tags,
        "customagent": customagent,
        "referer": referer,
        "override_safety": override_safety,
        "country": country,
    }
    submission = self.scan(url, **scan_options)

    uuid: str = submission["uuid"]
    self.wait_for_result(
        uuid,
        timeout=timeout,
        interval=interval,
        initial_wait=initial_wait,
    )
    return self.get_result(uuid)
|
|
551
|
+
|
|
552
|
+
def bulk_scan_and_get_results(
    self,
    urls: list[str],
    visibility: VisibilityType,
    tags: list[str] | None = None,
    customagent: str | None = None,
    referer: str | None = None,
    override_safety: Any = None,
    country: str | None = None,
    timeout: float = 60.0,
    interval: float = 1.0,
    initial_wait: float | None = 10.0,
) -> list[tuple[str, dict | Exception]]:
    """Scan URLs, wait for results and get them.

    Every URL gets its own outcome: an error while submitting, waiting for
    or fetching one URL is returned in place of that URL's result and does
    not prevent the other URLs from being processed.

    Args:
        urls (list[str]): URLs to scan.
        visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
        tags (list[str] | None, optional): Tags to be attached. Defaults to None.
        customagent (str | None, optional): Custom user agent. Defaults to None.
        referer (str | None, optional): Referer. Defaults to None.
        override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
        country (str | None, optional): Specify which country the scan should be performed from (2-letter ISO-3166-1 alpha-2 country code). Defaults to None.
        timeout (float, optional): Timeout for waiting a result in seconds. Defaults to 60.0.
        interval (float, optional): Interval in seconds. Defaults to 1.0.
        initial_wait (float | None, optional): Initial wait time in seconds. Set None to disable. Defaults to 10.0.

    Returns:
        list[tuple[str, dict | Exception]]: A list of tuples of (url, result or error).

    Reference:
        https://urlscan.io/docs/api/#scan
    """
    responses = self.bulk_scan(
        urls,
        visibility=visibility,
        tags=tags,
        customagent=customagent,
        referer=referer,
        override_safety=override_safety,
        country=country,
    )

    def mapping(res_or_error: dict | Exception) -> dict | Exception:
        # A submission that already failed is passed through unchanged.
        if isinstance(res_or_error, Exception):
            return res_or_error

        # Waiting for or fetching a result can fail too (timeout, API
        # error, response without a UUID). Collect the error for this URL
        # instead of aborting the whole batch.
        try:
            uuid: str = res_or_error["uuid"]
            self.wait_for_result(
                uuid, timeout=timeout, interval=interval, initial_wait=initial_wait
            )
            return self.get_result(uuid)
        except Exception as e:
            return e

    return [(url, mapping(res_or_error)) for url, res_or_error in responses]
|
|
607
|
+
|
|
608
|
+
def _get_error(self, res: ClientResponse) -> APIError | None:
    """Translate an error response into an exception object.

    The exception is returned, not raised, so callers decide what to do.

    Args:
        res (ClientResponse): Response to inspect.

    Returns:
        APIError | None: None when ``raise_for_status()`` does not raise,
        a RateLimitError when the reported status is 429, otherwise an
        APIError.
    """
    try:
        res.raise_for_status()
    except httpx.HTTPStatusError as exc:
        response = exc.response

        # The error details are read from a JSON object in the body. When
        # the body is not JSON, or not an object, fall back to the HTTP
        # status line rather than failing with an unrelated
        # decoding/KeyError exception.
        try:
            data = response.json()
        except ValueError:
            data = None
        if not isinstance(data, dict):
            data = {}

        message: str = (
            data.get("message")
            or response.reason_phrase
            or f"HTTP {response.status_code}"
        )
        description: str | None = data.get("description")
        status: int = data.get("status", response.status_code)

        # ref. https://urlscan.io/docs/api/#ratelimit
        if status == 429:
            rate_limit_reset_after = float(
                response.headers.get("X-Rate-Limit-Reset-After", 0)
            )
            return RateLimitError(
                message,
                description=description,
                status=status,
                rate_limit_reset_after=rate_limit_reset_after,
            )

        return APIError(message, description=description, status=status)

    return None
|
|
632
|
+
|
|
633
|
+
def _response_to_json(self, res: ClientResponse) -> dict:
    """Return the JSON body of ``res``, raising if it is an error response."""
    if error := self._get_error(res):
        raise error

    return res.json()
|
|
639
|
+
|
|
640
|
+
def _response_to_str(self, res: ClientResponse) -> str:
    """Return the text body of ``res``, raising if it is an error response."""
    if error := self._get_error(res):
        raise error

    return res.text
|
|
646
|
+
|
|
647
|
+
def _response_to_content(self, res: ClientResponse) -> bytes:
    """Return the raw body of ``res``, raising if it is an error response."""
    if error := self._get_error(res):
        raise error

    return res.content
|
urlscan/error.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
class URLScanError(Exception):
|
|
2
|
+
pass
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class APIError(URLScanError):
|
|
6
|
+
def __init__(self, message: str, *, status: int, description: str | None = None):
|
|
7
|
+
self.message = message
|
|
8
|
+
self.description = description
|
|
9
|
+
self.status = status
|
|
10
|
+
super().__init__(message)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RateLimitError(APIError):
|
|
14
|
+
def __init__(
|
|
15
|
+
self,
|
|
16
|
+
message: str,
|
|
17
|
+
*,
|
|
18
|
+
status: int,
|
|
19
|
+
rate_limit_reset_after: float,
|
|
20
|
+
description: str | None = None,
|
|
21
|
+
):
|
|
22
|
+
super().__init__(message, description=description, status=status)
|
|
23
|
+
self.rate_limit_reset_after = rate_limit_reset_after
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RateLimitRemainingError(URLScanError):
|
|
27
|
+
pass
|
urlscan/iterator.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from .client import Client
|
|
5
|
+
|
|
6
|
+
# A reported total equal to this value is not trusted as the real number of
# matches; see SearchIterator.__next__.
# NOTE(review): presumably the API caps the reported total at 10,000 -
# confirm against the urlscan.io search API documentation.
MAX_TOTAL = 10_000


class SearchIterator:
    """
    Search iterator.

    Pages through the results of a search query, requesting the next page
    only once the current one has been consumed.

    Examples:
        >>> from urlscan import Client
        >>> with Client("<your_api_key>") as client:
        ...     for result in client.search("page.domain:example.com"):
        ...         print(result["_id"], result["page"]["url"])
    """

    def __init__(
        self,
        client: "Client",
        *,
        q: str,
        search_after: str | None = None,
        size: int = 100,
        limit: int | None = None,
    ):
        """
        Args:
            client (Client): Client.
            q (str): Search query.
            search_after (str | None, optional): Search after to retrieve next results. Defaults to None.
            size (int, optional): Number of results returned in a search. Defaults to 100.
            limit (int | None, optional): Maximum number of results that will be returned by the iterator. Defaults to None.
        """
        self._client = client
        self._size = size
        self._q = q
        self._search_after = search_after

        # Results of the current page that have not been yielded yet.
        self._results: list[dict] = []
        self._limit = limit
        # Number of results yielded so far.
        self._count = 0
        # Total reported by the first response (see __next__).
        self._total: int | None = None
        # Whether requesting another page can still produce results.
        self._has_more: bool = True

    def _parse_response(self, data: dict) -> tuple[list[dict], int]:
        """Extract ``(results, total)`` from a search response body."""
        results: list[dict] = data["results"]
        total: int = data["total"]
        return results, total

    def _get(self):
        """Request the next page, starting after the last result seen."""
        data = self._client.get_json(
            "/api/v1/search/",
            params={
                "q": self._q,
                "size": self._size,
                "search_after": self._search_after,
            },
        )
        return self._parse_response(data)

    def __iter__(self):
        return self

    def __next__(self):
        # Compare against None so that limit=0 means "no results" instead of
        # being mistaken for "no limit".
        if self._limit is not None and self._count >= self._limit:
            raise StopIteration()

        if not self._results and self._has_more:
            self._results, total = self._get()

            # NOTE: total should be set only once (to ignore newly added results after the first request)
            self._total = self._total or total
            if self._total != MAX_TOTAL:
                self._has_more = self._total > (self._count + len(self._results))
            else:
                # The total cannot be relied on here: keep going for as long
                # as pages come back full.
                self._has_more = len(self._results) >= self._size

            if self._results:
                # The sort values of the last result of a page are the
                # cursor used to request the following page.
                last_result = self._results[-1]
                sort: list[str | int] = last_result["sort"]
                self._search_after = ",".join(str(x) for x in sort)
            else:
                # An empty page means the search is exhausted; a finished
                # iterator must not query the API again on later calls.
                self._has_more = False

        if not self._results:
            raise StopIteration()

        result = self._results.pop(0)
        self._count += 1
        return result
|
urlscan/types.py
ADDED
urlscan/utils.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: urlscan-python
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: The official Python API client for urlscan.io
|
|
5
|
+
Project-URL: Repository, https://github.com/urlscan/urlscan-python/
|
|
6
|
+
Project-URL: Homepage, https://github.com/urlscan/urlscan-python/
|
|
7
|
+
Project-URL: Documentation, https://urlscan.github.io/urlscan-python/
|
|
8
|
+
Project-URL: Issues, https://github.com/urlscan/urlscan-python/issues/
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Programming Language :: Python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Python: <4.0,>=3.10
|
|
16
|
+
Requires-Dist: httpx~=0.27
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# urlscan-python
|
|
20
|
+
|
|
21
|
+
The official Python API client for urlscan.io.
|
|
22
|
+
|
|
23
|
+
## Requirements
|
|
24
|
+
|
|
25
|
+
- Python 3.10+
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install urlscan-python
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quickstart
|
|
34
|
+
|
|
35
|
+
Start by importing the `urlscan` module:
|
|
36
|
+
|
|
37
|
+
```py
|
|
38
|
+
>>> import urlscan
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Create a client with your API key:
|
|
42
|
+
|
|
43
|
+
```py
|
|
44
|
+
>>> client = urlscan.Client("<your_api_key>")
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Scan a URL:
|
|
48
|
+
|
|
49
|
+
```py
|
|
50
|
+
>>> res = client.scan("<url>", visibility="public")
|
|
51
|
+
>>> uuid: str = res["uuid"]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Wait for a scan result:
|
|
55
|
+
|
|
56
|
+
```py
|
|
57
|
+
>>> client.wait_for_result(uuid)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Get a scan result:
|
|
61
|
+
|
|
62
|
+
```py
|
|
63
|
+
>>> result = client.get_result(uuid)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Bulk scan:
|
|
67
|
+
|
|
68
|
+
```py
|
|
69
|
+
>>> client.bulk_scan(["<url>", "<url>"], visibility="public")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Alternatively, you can use `_and_get_result(s)` suffixed methods to do scan, wait and get at once.
|
|
73
|
+
|
|
74
|
+
```py
|
|
75
|
+
>>> client.scan_and_get_result("<url>", visibility="public")
|
|
76
|
+
>>> client.bulk_scan_and_get_results(["<url>", "<url>"], visibility="public")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
`urlscan.Client.search()` returns an iterator to iterate search results:
|
|
80
|
+
|
|
81
|
+
```py
|
|
82
|
+
>>> for result in client.search("page.domain:example.com"):
|
|
83
|
+
>>> print(result["_id"])
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Examples
|
|
87
|
+
|
|
88
|
+
See [Examples](https://github.com/urlscan/urlscan-python/tree/main/examples/).
|
|
89
|
+
|
|
90
|
+
## References
|
|
91
|
+
|
|
92
|
+
- [Client](https://urlscan.github.io/urlscan-python/references/client/)
|
|
93
|
+
- [Iterator](https://urlscan.github.io/urlscan-python/references/iterator/)
|
|
94
|
+
- [Errors](https://urlscan.github.io/urlscan-python/references/errors/)
|
|
95
|
+
|
|
96
|
+
## Help Wanted?
|
|
97
|
+
|
|
98
|
+
Please feel free to [open an issue](https://github.com/urlscan/urlscan-python/issues/new) if you find a bug or some feature that you want to see implemented.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
urlscan/__init__.py,sha256=wPTsRLwf8pSoR-dApmXnPWTcqfWdkSjzntkdYJfcxdg,263
|
|
2
|
+
urlscan/_version.py,sha256=CactNZqrHHYTPrkHKccy2WKXmaiUdtTgPqSjFyVXnJk,18
|
|
3
|
+
urlscan/_version.pyi,sha256=GxQ4ZGLPQObN92QW_Hb8IJPEuYINNn186FjrRovM09g,13
|
|
4
|
+
urlscan/client.py,sha256=P6xGypSDPkSIls8XsJrm18ecaKDz8o73a8m72iQ4AG8,22124
|
|
5
|
+
urlscan/error.py,sha256=aKSV6WY0zSn9s_NES3egkZ3YJHpydb6ALpwYfW27QzY,688
|
|
6
|
+
urlscan/iterator.py,sha256=94w6VjA-sjv1GZh2Ml9MNZWmtbP9GytRoGH5psV87oE,2801
|
|
7
|
+
urlscan/types.py,sha256=zaguwOY_XGB87h7o7mo6kPywHZptHWqB84zB-M1NjYM,208
|
|
8
|
+
urlscan/utils.py,sha256=CFDhQLgyaaVBfPR8BygHMXDi0t-1lNCDq45bqGuZo5M,183
|
|
9
|
+
urlscan_python-0.0.1.dist-info/METADATA,sha256=uYx9iadfE8UN2jxYRX89begtFUyH0HnTrplNAY4fDAQ,2342
|
|
10
|
+
urlscan_python-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
11
|
+
urlscan_python-0.0.1.dist-info/licenses/LICENSE,sha256=3mQG9Twzsjb827NBj4AxQSxCcc1c8XArzDucw5lhM5s,1067
|
|
12
|
+
urlscan_python-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 urlscan.io
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|