urlscan-python 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
urlscan/__init__.py ADDED
@@ -0,0 +1,10 @@
+ try:
+     from ._version import version
+
+     __version__ = version
+ except ImportError:
+     __version__ = "0.0.0"
+
+ from .client import Client # noqa: F401
+ from .error import APIError, RateLimitError # noqa: F401
+ from .iterator import SearchIterator # noqa: F401
urlscan/_version.py ADDED
@@ -0,0 +1 @@
+ version = "0.0.1"
urlscan/_version.pyi ADDED
@@ -0,0 +1 @@
+ version: str
urlscan/client.py ADDED
@@ -0,0 +1,652 @@
+ import contextlib
+ import datetime
+ import json
+ import logging
+ import os
+ import time
+ from dataclasses import dataclass
+ from io import BytesIO
+ from typing import Any, BinaryIO, TypedDict
+
+ import httpx
+ from httpx._types import QueryParamTypes, RequestData, TimeoutTypes
+
+ from ._version import version
+ from .error import APIError, RateLimitError, RateLimitRemainingError
+ from .iterator import SearchIterator
+ from .types import ActionType, VisibilityType
+ from .utils import parse_datetime
+
+ logger = logging.getLogger("urlscan-python")
+
+ BASE_URL = os.environ.get("URLSCAN_BASE_URL", "https://urlscan.io")
+ USER_AGENT = f"urlscan-py/{version}"
+
+
+ def _compact(d: dict) -> dict:
+     """Remove empty values from a dictionary."""
+     return {k: v for k, v in d.items() if v is not None}
+
+
+ class RetryTransport(httpx.HTTPTransport):
+     def handle_request(self, request: httpx.Request) -> httpx.Response:
+         res = super().handle_request(request)
+         if res.status_code == 429:
+             rate_limit_reset_after: str | None = res.headers.get(
+                 "X-Rate-Limit-Reset-After"
+             )
+             if rate_limit_reset_after is None:
+                 return res
+
+             logger.info(
+                 f"Rate limit error hit. Wait {rate_limit_reset_after} seconds before retrying..."
+             )
+             time.sleep(float(rate_limit_reset_after))
+             return self.handle_request(request)
+
+         return res
+
+
+ class ClientResponse:
+     def __init__(self, res: httpx.Response):
+         self._res = res
+
+     @property
+     def basename(self) -> str:
+         return os.path.basename(self._res.url.path)
+
+     @property
+     def content(self) -> bytes:
+         return self._res.content
+
+     def json(self) -> Any:
+         return self._res.json()
+
+     @property
+     def text(self) -> str:
+         return self._res.text
+
+     @property
+     def headers(self):
+         return self._res.headers
+
+     @property
+     def status_code(self) -> int:
+         return self._res.status_code
+
+     def raise_for_status(self) -> None:
+         self._res.raise_for_status()
+
+
+ @dataclass
+ class RateLimit:
+     remaining: int
+     reset: datetime.datetime
+
+
+ class RateLimitMemo(TypedDict):
+     public: RateLimit | None
+     private: RateLimit | None
+     unlisted: RateLimit | None
+     retrieve: RateLimit | None
+     search: RateLimit | None
+
+
+ class Client:
+     def __init__(
+         self,
+         api_key: str,
+         base_url: str = BASE_URL,
+         user_agent: str = USER_AGENT,
+         trust_env: bool = False,
+         timeout: TimeoutTypes = 60,
+         proxy: str | None = None,
+         verify: bool = True,
+         retry: bool = False,
+     ):
+         """
+         Args:
+             api_key (str): Your urlscan.io API key.
+             base_url (str, optional): Base URL. Defaults to BASE_URL.
+             user_agent (str, optional): User agent. Defaults to USER_AGENT.
+             trust_env (bool, optional): Enable or disable usage of environment variables for configuration. Defaults to False.
+             timeout (TimeoutTypes, optional): Timeout configuration to use when sending requests. Defaults to 60.
+             proxy (str | None, optional): Proxy URL where all the traffic should be routed. Defaults to None.
+             verify (bool, optional): Either `True` to use an SSL context with the default CA bundle, or `False` to disable verification. Defaults to True.
+             retry (bool, optional): Whether to retry automatically based on the X-Rate-Limit-Reset-After HTTP header. Defaults to False.
+         """
+         self._api_key = api_key
+         self._base_url = base_url
+         self._user_agent = user_agent
+         self._trust_env = trust_env
+         self._timeout = timeout
+         self._proxy = proxy
+         self._verify = verify
+         self._retry = retry
+
+         self._session: httpx.Client | None = None
+         self._rate_limit_memo: RateLimitMemo = {
+             "public": None,
+             "private": None,
+             "unlisted": None,
+             "retrieve": None,
+             "search": None,
+         }
+
+         self._scan_uuid_timestamp_memo: dict[str, float] = {}
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, item_type: Any, value: Any, traceback: Any):
+         self.close()
+
+     def close(self):
+         if self._session:
+             self._session.close()
+             self._session = None
+
+     def _get_session(self) -> httpx.Client:
+         if self._session:
+             return self._session
+
+         headers = _compact(
+             {
+                 "User-Agent": self._user_agent,
+                 "API-Key": self._api_key,
+             }
+         )
+         transport: httpx.HTTPTransport | None = None
+         if self._retry:
+             transport = RetryTransport()
+
+         self._session = httpx.Client(
+             base_url=self._base_url,
+             headers=headers,
+             timeout=self._timeout,
+             proxy=self._proxy,
+             verify=self._verify,
+             trust_env=self._trust_env,
+             transport=transport,
+         )
+         return self._session
+
+     def _get_action(self, request: httpx.Request) -> ActionType | None:
+         path = request.url.path
+         if request.method == "GET":
+             if path == "/api/v1/search/":
+                 return "search"
+
+             if path.startswith("/api/v1/result/"):
+                 return "retrieve"
+
+             return None
+
+         if request.method == "POST":
+             if path != "/api/v1/scan/":
+                 return None
+
+             if request.headers.get("Content-Type") != "application/json":
+                 return None
+
+             with contextlib.suppress(json.JSONDecodeError):
+                 data: dict = json.loads(request.content)
+                 return data.get("visibility")
+
+         return None
+
+     def _send_request(
+         self, session: httpx.Client, request: httpx.Request
+     ) -> ClientResponse:
+         # let the transport retry automatically if retry is enabled
+         if self._retry:
+             return ClientResponse(session.send(request))
+
+         action = self._get_action(request)
+         if action:
+             rate_limit: RateLimit | None = self._rate_limit_memo.get(action)
+             if rate_limit:
+                 utcnow = datetime.datetime.now(datetime.timezone.utc)
+                 if rate_limit.remaining == 0 and rate_limit.reset > utcnow:
+                     raise RateLimitRemainingError(
+                         f"{action} is rate limited. Wait until {rate_limit.reset}."
+                     )
+
+         res = ClientResponse(session.send(request))
+
+         # use action in response headers
+         action = res.headers.get("X-Rate-Limit-Action")
+         if action:
+             remaining = res.headers.get("X-Rate-Limit-Remaining")
+             reset = res.headers.get("X-Rate-Limit-Reset")
+             if remaining and reset:
+                 self._rate_limit_memo[action] = RateLimit(
+                     remaining=int(remaining),
+                     reset=parse_datetime(reset),
+                 )
+
+         return res
+
+     def get(self, path: str, params: QueryParamTypes | None = None) -> ClientResponse:
+         """Send a GET request to a given API endpoint.
+
+         Args:
+             path (str): Path to API endpoint.
+             params (QueryParamTypes | None, optional): Query parameters. Defaults to None.
+
+         Returns:
+             ClientResponse: Response.
+         """
+         session = self._get_session()
+         req = session.build_request("GET", path, params=params)
+         return self._send_request(session, req)
+
+     def get_json(self, path: str, params: QueryParamTypes | None = None) -> dict:
+         res = self.get(path, params=params)
+         return self._response_to_json(res)
+
+     def post(
+         self,
+         path: str,
+         json: Any | None = None,
+         data: RequestData | None = None,
+     ) -> ClientResponse:
+         """Send a POST request to a given API endpoint.
+
+         Args:
+             path (str): Path to API endpoint.
+             json (Any | None, optional): Dict to send in request body as JSON. Defaults to None.
+             data (RequestData | None, optional): Dict to send in request body. Defaults to None.
+
+         Returns:
+             ClientResponse: Response.
+         """
+         session = self._get_session()
+         req = session.build_request("POST", path, json=json, data=data)
+         return self._send_request(session, req)
+
+     def download(
+         self,
+         path: str,
+         file: BinaryIO,
+         params: QueryParamTypes | None = None,
+     ) -> None:
+         """Download a file from a given API endpoint.
+
+         Args:
+             path (str): Path to API endpoint.
+             file (BinaryIO): File object to write to.
+             params (QueryParamTypes | None, optional): Query parameters. Defaults to None.
+
+         Returns:
+             None: The content is written to `file`.
+         """
+         res = self.get(path, params=params)
+         file.write(res.content)
+         return
+
+     def get_content(self, path: str, params: QueryParamTypes | None = None) -> bytes:
+         res = self.get(path, params=params)
+         return self._response_to_content(res)
+
+     def get_text(self, path: str, params: QueryParamTypes | None = None) -> str:
+         res = self.get(path, params=params)
+         return self._response_to_str(res)
+
+     def get_result(self, uuid: str) -> dict:
+         """Get a result of a scan by UUID.
+
+         Args:
+             uuid (str): UUID.
+
+         Returns:
+             dict: Scan result.
+
+         Reference:
+             https://urlscan.io/docs/api/#result
+         """
+         return self.get_json(f"/api/v1/result/{uuid}/")
+
+     def get_screenshot(self, uuid: str) -> BytesIO:
+         """Get a screenshot of a scan by UUID.
+
+         Args:
+             uuid (str): UUID.
+
+         Returns:
+             BytesIO: Screenshot (image/png) as a file-like object.
+
+         Reference:
+             https://urlscan.io/docs/api/#screenshot
+         """
+         res = self.get(f"/screenshots/{uuid}.png")
+         bio = BytesIO(res.content)
+         bio.name = res.basename
+         return bio
+
+     def get_dom(self, uuid: str) -> str:
+         """Get the DOM of a scan by UUID.
+
+         Args:
+             uuid (str): UUID.
+
+         Returns:
+             str: DOM as a string.
+
+         Reference:
+             https://urlscan.io/docs/api/#dom
+         """
+         return self.get_text(f"/dom/{uuid}/")
+
+     def search(
+         self,
+         q: str = "",
+         size: int = 100,
+         limit: int | None = None,
+         search_after: str | None = None,
+     ) -> SearchIterator:
+         """Search.
+
+         Args:
+             q (str): Query term. Defaults to "".
+             size (int, optional): Number of results returned in a search. Defaults to 100.
+             limit (int | None, optional): Maximum number of results that will be returned by the iterator. Defaults to None.
+             search_after (str | None, optional): Search-after value used to retrieve the next set of results. Defaults to None.
+
+         Returns:
+             SearchIterator: Search iterator.
+
+         Reference:
+             https://urlscan.io/docs/api/#search
+         """
+         return SearchIterator(
+             self,
+             q=q,
+             size=size,
+             limit=limit,
+             search_after=search_after,
+         )
+
+     def scan(
+         self,
+         url: str,
+         *,
+         visibility: VisibilityType,
+         tags: list[str] | None = None,
+         customagent: str | None = None,
+         referer: str | None = None,
+         override_safety: Any = None,
+         country: str | None = None,
+     ) -> dict:
+         """Scan a given URL.
+
+         Args:
+             url (str): URL to scan.
+             visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
+             tags (list[str] | None, optional): Tags to be attached. Defaults to None.
+             customagent (str | None, optional): Custom user agent. Defaults to None.
+             referer (str | None, optional): Referer. Defaults to None.
+             override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
+             country (str | None, optional): Country from which the scan should be performed (2-letter ISO 3166-1 alpha-2 country code). Defaults to None.
+
+         Returns:
+             dict: Scan response.
+
+         Reference:
+             https://urlscan.io/docs/api/#scan
+         """
+         data = _compact(
+             {
+                 "url": url,
+                 "tags": tags,
+                 "visibility": visibility,
+                 "customagent": customagent,
+                 "referer": referer,
+                 "overrideSafety": override_safety,
+                 "country": country,
+             }
+         )
+         res = self.post("/api/v1/scan/", json=data)
+         json_res = self._response_to_json(res)
+
+         json_visibility = json_res.get("visibility")
+         if json_visibility is not None and json_visibility != visibility:
+             logger.warning(f"Visibility is enforced to {json_visibility}.")
+
+         # memoize the scan UUID & timestamp
+         uuid = json_res.get("uuid")
+         if isinstance(uuid, str):
+             self._scan_uuid_timestamp_memo[uuid] = time.time()
+
+         return json_res
+
+     def bulk_scan(
+         self,
+         urls: list[str],
+         *,
+         visibility: VisibilityType,
+         tags: list[str] | None = None,
+         customagent: str | None = None,
+         referer: str | None = None,
+         override_safety: Any = None,
+         country: str | None = None,
+     ) -> list[tuple[str, dict | Exception]]:
+         """Scan multiple URLs in bulk.
+
+         Args:
+             urls (list[str]): List of URLs to scan.
+             visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
+             tags (list[str] | None, optional): Tags to be attached. Defaults to None.
+             customagent (str | None, optional): Custom user agent. Defaults to None.
+             referer (str | None, optional): Referer. Defaults to None.
+             override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
+             country (str | None, optional): Country from which the scan should be performed (2-letter ISO 3166-1 alpha-2 country code). Defaults to None.
+
+         Returns:
+             list[tuple[str, dict | Exception]]: A list of tuples of (url, scan response or error).
+
+         Reference:
+             https://urlscan.io/docs/api/#scan
+         """
+
+         def inner(url: str) -> dict | Exception:
+             try:
+                 return self.scan(
+                     url,
+                     visibility=visibility,
+                     tags=tags,
+                     customagent=customagent,
+                     referer=referer,
+                     override_safety=override_safety,
+                     country=country,
+                 )
+             except Exception as e:
+                 return e
+
+         return [(url, inner(url)) for url in urls]
+
+     def wait_for_result(
+         self,
+         uuid: str,
+         timeout: float = 60.0,
+         interval: float = 1.0,
+         initial_wait: float | None = 10.0,
+     ) -> None:
+         """Wait for a scan result to be available.
+
+         Args:
+             uuid (str): UUID of a result.
+             timeout (float, optional): Timeout in seconds. Defaults to 60.0.
+             interval (float, optional): Interval in seconds. Defaults to 1.0.
+             initial_wait (float | None, optional): Initial wait time in seconds. Set None to disable. Defaults to 10.0.
+         """
+         session = self._get_session()
+         req = session.build_request("HEAD", f"/api/v1/result/{uuid}/")
+
+         scanned_at = self._scan_uuid_timestamp_memo.get(uuid)
+         if scanned_at and initial_wait:
+             elapsed = time.time() - scanned_at
+             if elapsed < initial_wait:
+                 time.sleep(initial_wait - elapsed)
+
+         start_time = time.time()
+         while True:
+             res = self._send_request(session, req)
+             if res.status_code == 200:
+                 self._scan_uuid_timestamp_memo.pop(uuid, None)
+                 return
+
+             if time.time() - start_time > timeout:
+                 raise TimeoutError("Timeout waiting for scan result.")
+
+             time.sleep(interval)
+
+     def scan_and_get_result(
+         self,
+         url: str,
+         visibility: VisibilityType,
+         tags: list[str] | None = None,
+         customagent: str | None = None,
+         referer: str | None = None,
+         override_safety: Any = None,
+         country: str | None = None,
+         timeout: float = 60.0,
+         interval: float = 1.0,
+         initial_wait: float | None = 10.0,
+     ) -> dict:
+         """Scan a given URL, wait for a result and get it.
+
+         Args:
+             url (str): URL to scan.
+             visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
+             tags (list[str] | None, optional): Tags to be attached. Defaults to None.
+             customagent (str | None, optional): Custom user agent. Defaults to None.
+             referer (str | None, optional): Referer. Defaults to None.
+             override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
+             country (str | None, optional): Country from which the scan should be performed (2-letter ISO 3166-1 alpha-2 country code). Defaults to None.
+             timeout (float, optional): Timeout for waiting for a result, in seconds. Defaults to 60.0.
+             interval (float, optional): Interval in seconds. Defaults to 1.0.
+             initial_wait (float | None, optional): Initial wait time in seconds. Set None to disable. Defaults to 10.0.
+
+         Returns:
+             dict: Scan result.
+
+         Reference:
+             https://urlscan.io/docs/api/#scan
+         """
+         res = self.scan(
+             url,
+             visibility=visibility,
+             tags=tags,
+             customagent=customagent,
+             referer=referer,
+             override_safety=override_safety,
+             country=country,
+         )
+         uuid: str = res["uuid"]
+         self.wait_for_result(
+             uuid, timeout=timeout, interval=interval, initial_wait=initial_wait
+         )
+         return self.get_result(uuid)
+
+     def bulk_scan_and_get_results(
+         self,
+         urls: list[str],
+         visibility: VisibilityType,
+         tags: list[str] | None = None,
+         customagent: str | None = None,
+         referer: str | None = None,
+         override_safety: Any = None,
+         country: str | None = None,
+         timeout: float = 60.0,
+         interval: float = 1.0,
+         initial_wait: float | None = 10.0,
+     ) -> list[tuple[str, dict | Exception]]:
+         """Scan URLs, wait for results and get them.
+
+         Args:
+             urls (list[str]): URLs to scan.
+             visibility (VisibilityType): Visibility of the scan. Can be "public", "private", or "unlisted".
+             tags (list[str] | None, optional): Tags to be attached. Defaults to None.
+             customagent (str | None, optional): Custom user agent. Defaults to None.
+             referer (str | None, optional): Referer. Defaults to None.
+             override_safety (Any, optional): If set to any value, this will disable reclassification of URLs with potential PII in them. Defaults to None.
+             country (str | None, optional): Country from which the scan should be performed (2-letter ISO 3166-1 alpha-2 country code). Defaults to None.
+             timeout (float, optional): Timeout for waiting for a result, in seconds. Defaults to 60.0.
+             interval (float, optional): Interval in seconds. Defaults to 1.0.
+             initial_wait (float | None, optional): Initial wait time in seconds. Set None to disable. Defaults to 10.0.
+
+         Returns:
+             list[tuple[str, dict | Exception]]: A list of tuples of (url, result or error).
+
+         Reference:
+             https://urlscan.io/docs/api/#scan
+         """
+
+         responses = self.bulk_scan(
+             urls,
+             visibility=visibility,
+             tags=tags,
+             customagent=customagent,
+             referer=referer,
+             override_safety=override_safety,
+             country=country,
+         )
+
+         def mapping(res_or_error: dict | Exception) -> dict | Exception:
+             if isinstance(res_or_error, Exception):
+                 return res_or_error
+
+             uuid: str = res_or_error["uuid"]
+             self.wait_for_result(
+                 uuid, timeout=timeout, interval=interval, initial_wait=initial_wait
+             )
+             return self.get_result(uuid)
+
+         return [(url, mapping(res_or_error)) for url, res_or_error in responses]
+
+     def _get_error(self, res: ClientResponse) -> APIError | None:
+         try:
+             res.raise_for_status()
+         except httpx.HTTPStatusError as exc:
+             data: dict = exc.response.json()
+             message: str = data["message"]
+             description: str | None = data.get("description")
+             status: int = data["status"]
+
+             # ref. https://urlscan.io/docs/api/#ratelimit
+             if status == 429:
+                 rate_limit_reset_after = float(
+                     exc.response.headers.get("X-Rate-Limit-Reset-After", 0)
+                 )
+                 return RateLimitError(
+                     message,
+                     description=description,
+                     status=status,
+                     rate_limit_reset_after=rate_limit_reset_after,
+                 )
+
+             return APIError(message, description=description, status=status)
+
+         return None
+
+     def _response_to_json(self, res: ClientResponse) -> dict:
+         error = self._get_error(res)
+         if error:
+             raise error
+
+         return res.json()
+
+     def _response_to_str(self, res: ClientResponse) -> str:
+         error = self._get_error(res)
+         if error:
+             raise error
+
+         return res.text
+
+     def _response_to_content(self, res: ClientResponse) -> bytes:
+         error = self._get_error(res)
+         if error:
+             raise error
+
+         return res.content
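
Editor's note: a minimal usage sketch for the client above, not part of the package itself. It assumes the API key is supplied via a `URLSCAN_API_KEY` environment variable (a naming convention chosen for this sketch, not read by the library) and that full results expose the `page.url` field used elsewhere in this diff.

```py
import os

from urlscan import Client

# retry=True installs RetryTransport, which sleeps for X-Rate-Limit-Reset-After
# seconds on HTTP 429 responses and retries; the context manager calls close().
with Client(os.environ["URLSCAN_API_KEY"], retry=True) as client:
    result = client.scan_and_get_result("https://example.com", visibility="unlisted")
    print(result["page"]["url"])
```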
urlscan/error.py ADDED
@@ -0,0 +1,27 @@
+ class URLScanError(Exception):
+     pass
+
+
+ class APIError(URLScanError):
+     def __init__(self, message: str, *, status: int, description: str | None = None):
+         self.message = message
+         self.description = description
+         self.status = status
+         super().__init__(message)
+
+
+ class RateLimitError(APIError):
+     def __init__(
+         self,
+         message: str,
+         *,
+         status: int,
+         rate_limit_reset_after: float,
+         description: str | None = None,
+     ):
+         super().__init__(message, description=description, status=status)
+         self.rate_limit_reset_after = rate_limit_reset_after
+
+
+ class RateLimitRemainingError(URLScanError):
+     pass
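
Editor's note: a sketch, not part of the package, of handling these exceptions manually when `retry=False` (the default). It uses only the attributes defined above; the key and UUID are placeholders.

```py
import time

from urlscan import APIError, Client, RateLimitError

client = Client("<your_api_key>")
try:
    result = client.get_result("<uuid>")
except RateLimitError as e:
    # Raised on HTTP 429; carries the X-Rate-Limit-Reset-After value in seconds.
    # Catch it before APIError since RateLimitError subclasses APIError.
    time.sleep(e.rate_limit_reset_after)
    result = client.get_result("<uuid>")
except APIError as e:
    # Other non-2xx responses surface as APIError with message, status and description.
    print(e.status, e.message, e.description)
finally:
    client.close()
```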
urlscan/iterator.py ADDED
@@ -0,0 +1,91 @@
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from .client import Client
+
+ MAX_TOTAL = 10_000
+
+
+ class SearchIterator:
+     """
+     Search iterator.
+
+     Examples:
+         >>> from urlscan import Client
+         >>> with Client("<your_api_key>") as client:
+         >>>     for result in client.search("page.domain:example.com"):
+         >>>         print(result["_id"], result["page"]["url"])
+     """
+
+     def __init__(
+         self,
+         client: "Client",
+         *,
+         q: str,
+         search_after: str | None = None,
+         size: int = 100,
+         limit: int | None = None,
+     ):
+         """
+         Args:
+             client (Client): Client.
+             q (str): Search query.
+             search_after (str | None, optional): Search-after value used to retrieve the next set of results. Defaults to None.
+             size (int, optional): Number of results returned in a search. Defaults to 100.
+             limit (int | None, optional): Maximum number of results that will be returned by the iterator. Defaults to None.
+         """
+         self._client = client
+         self._size = size
+         self._q = q
+         self._search_after = search_after
+
+         self._results: list[dict] = []
+         self._limit = limit
+         self._count = 0
+         self._total: int | None = None
+         self._has_more: bool = True
+
+     def _parse_response(self, data: dict) -> tuple[list[dict], int]:
+         results: list[dict] = data["results"]
+         total: int = data["total"]
+         return results, total
+
+     def _get(self):
+         data = self._client.get_json(
+             "/api/v1/search/",
+             params={
+                 "q": self._q,
+                 "size": self._size,
+                 "search_after": self._search_after,
+             },
+         )
+         return self._parse_response(data)
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         if self._limit and self._count >= self._limit:
+             raise StopIteration()
+
+         if not self._results and (self._count == 0 or self._has_more):
+             self._results, total = self._get()
+
+             # NOTE: total should be set only once (to ignore newly added results after the first request)
+             self._total = self._total or total
+             if self._total != MAX_TOTAL:
+                 self._has_more = self._total > (self._count + len(self._results))
+             else:
+                 self._has_more = len(self._results) >= self._size
+
+             if len(self._results) > 0:
+                 last_result = self._results[-1]
+                 sort: list[str | int] = last_result["sort"]
+                 self._search_after = ",".join(str(x) for x in sort)
+
+         if not self._results:
+             raise StopIteration()
+
+         result = self._results.pop(0)
+         self._count += 1
+         return result
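
Editor's note: a sketch, not part of the package, of driving this iterator through `Client.search()` with a result cap; the key is a placeholder.

```py
from urlscan import Client

with Client("<your_api_key>") as client:
    # limit caps the number of yielded results; search_after is advanced
    # internally from each page's "sort" values, so pagination is transparent.
    for result in client.search("page.domain:example.com", size=100, limit=250):
        print(result["_id"], result["page"]["url"])
```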
urlscan/types.py ADDED
@@ -0,0 +1,6 @@
+ from typing import Literal
+
+ VisibilityType = Literal["public", "private", "unlisted"]
+ SearchType = Literal["search"]
+ RetrieveType = Literal["retrieve"]
+ ActionType = VisibilityType | SearchType | RetrieveType
urlscan/utils.py ADDED
@@ -0,0 +1,6 @@
+ import datetime
+
+
+ def parse_datetime(s: str) -> datetime.datetime:
+     dt = datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%S.%fZ")
+     return dt.replace(tzinfo=datetime.timezone.utc)
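
Editor's note: a minimal sketch of the timestamp format `parse_datetime` expects, the same format the client reads from the `X-Rate-Limit-Reset` header.

```py
import datetime

from urlscan.utils import parse_datetime

# "%Y-%m-%dT%H:%M:%S.%fZ" is parsed and an explicit UTC timezone is attached.
dt = parse_datetime("2025-01-01T12:34:56.789Z")
assert dt == datetime.datetime(2025, 1, 1, 12, 34, 56, 789000, tzinfo=datetime.timezone.utc)
```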
urlscan_python-0.0.1.dist-info/METADATA ADDED
@@ -0,0 +1,98 @@
+ Metadata-Version: 2.4
+ Name: urlscan-python
+ Version: 0.0.1
+ Summary: The official Python API client for urlscan.io
+ Project-URL: Repository, https://github.com/urlscan/urlscan-python/
+ Project-URL: Homepage, https://github.com/urlscan/urlscan-python/
+ Project-URL: Documentation, https://urlscan.github.io/urlscan-python/
+ Project-URL: Issues, https://github.com/urlscan/urlscan-python/issues/
+ License-File: LICENSE
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: <4.0,>=3.10
+ Requires-Dist: httpx~=0.27
+ Description-Content-Type: text/markdown
+
+ # urlscan-python
+
+ The official Python API client for urlscan.io.
+
+ ## Requirements
+
+ - Python 3.10+
+
+ ## Installation
+
+ ```bash
+ pip install urlscan-python
+ ```
+
+ ## Quickstart
+
+ Start by importing the `urlscan` module:
+
+ ```py
+ >>> import urlscan
+ ```
+
+ Create a client with your API key:
+
+ ```py
+ >>> client = urlscan.Client("<your_api_key>")
+ ```
+
+ Scan a URL:
+
+ ```py
+ >>> res = client.scan("<url>", visibility="public")
+ >>> uuid: str = res["uuid"]
+ ```
+
+ Wait for a scan result:
+
+ ```py
+ >>> client.wait_for_result(uuid)
+ ```
+
+ Get a scan result:
+
+ ```py
+ >>> result = client.get_result(uuid)
+ ```
+
+ Bulk scan:
+
+ ```py
+ >>> client.bulk_scan(["<url>", "<url>"], visibility="public")
+ ```
+
+ Alternatively, you can use the `_and_get_result(s)` suffixed methods to scan, wait, and fetch the result in a single call.
+
+ ```py
+ >>> client.scan_and_get_result("<url>", visibility="public")
+ >>> client.bulk_scan_and_get_results(["<url>", "<url>"], visibility="public")
+ ```
+
+ `urlscan.Client.search()` returns an iterator over search results:
+
+ ```py
+ >>> for result in client.search("page.domain:example.com"):
+ >>>     print(result["_id"])
+ ```
+
+ ## Examples
+
+ See [Examples](https://github.com/urlscan/urlscan-python/tree/main/examples/).
+
+ ## References
+
+ - [Client](https://urlscan.github.io/urlscan-python/references/client/)
+ - [Iterator](https://urlscan.github.io/urlscan-python/references/iterator/)
+ - [Errors](https://urlscan.github.io/urlscan-python/references/errors/)
+
+ ## Help Wanted?
+
+ Please feel free to [open an issue](https://github.com/urlscan/urlscan-python/issues/new) if you find a bug or have a feature you would like to see implemented.
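
Editor's note: a script-style variant of the quickstart above, offered as a sketch rather than package content; the key and URLs are placeholders, and the `page.url` field access is illustrative.

```py
from urlscan import Client

with Client("<your_api_key>") as client:
    results = client.bulk_scan_and_get_results(
        ["https://example.com", "https://example.org"],
        visibility="public",
    )
    for url, result_or_error in results:
        # Each entry is either the full scan result or the exception raised for that URL.
        if isinstance(result_or_error, Exception):
            print(url, "failed:", result_or_error)
        else:
            print(url, "->", result_or_error["page"]["url"])
```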
urlscan_python-0.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ urlscan/__init__.py,sha256=wPTsRLwf8pSoR-dApmXnPWTcqfWdkSjzntkdYJfcxdg,263
+ urlscan/_version.py,sha256=CactNZqrHHYTPrkHKccy2WKXmaiUdtTgPqSjFyVXnJk,18
+ urlscan/_version.pyi,sha256=GxQ4ZGLPQObN92QW_Hb8IJPEuYINNn186FjrRovM09g,13
+ urlscan/client.py,sha256=P6xGypSDPkSIls8XsJrm18ecaKDz8o73a8m72iQ4AG8,22124
+ urlscan/error.py,sha256=aKSV6WY0zSn9s_NES3egkZ3YJHpydb6ALpwYfW27QzY,688
+ urlscan/iterator.py,sha256=94w6VjA-sjv1GZh2Ml9MNZWmtbP9GytRoGH5psV87oE,2801
+ urlscan/types.py,sha256=zaguwOY_XGB87h7o7mo6kPywHZptHWqB84zB-M1NjYM,208
+ urlscan/utils.py,sha256=CFDhQLgyaaVBfPR8BygHMXDi0t-1lNCDq45bqGuZo5M,183
+ urlscan_python-0.0.1.dist-info/METADATA,sha256=uYx9iadfE8UN2jxYRX89begtFUyH0HnTrplNAY4fDAQ,2342
+ urlscan_python-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ urlscan_python-0.0.1.dist-info/licenses/LICENSE,sha256=3mQG9Twzsjb827NBj4AxQSxCcc1c8XArzDucw5lhM5s,1067
+ urlscan_python-0.0.1.dist-info/RECORD,,
urlscan_python-0.0.1.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
urlscan_python-0.0.1.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 urlscan.io
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.