sessemi 0.22.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sessemi-0.22.0/PKG-INFO +22 -0
- sessemi-0.22.0/README.md +2 -0
- sessemi-0.22.0/pyproject.toml +33 -0
- sessemi-0.22.0/sessemi/__init__.py +5 -0
- sessemi-0.22.0/sessemi/client.py +532 -0
- sessemi-0.22.0/sessemi.egg-info/PKG-INFO +22 -0
- sessemi-0.22.0/sessemi.egg-info/SOURCES.txt +9 -0
- sessemi-0.22.0/sessemi.egg-info/dependency_links.txt +1 -0
- sessemi-0.22.0/sessemi.egg-info/requires.txt +4 -0
- sessemi-0.22.0/sessemi.egg-info/top_level.txt +1 -0
- sessemi-0.22.0/setup.cfg +4 -0
sessemi-0.22.0/PKG-INFO
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sessemi
|
|
3
|
+
Version: 0.22.0
|
|
4
|
+
Summary: Python client for the Sessemi web scraping API
|
|
5
|
+
Author-email: Andrew Odiit <andrew@sessemi.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://sessemi.com
|
|
8
|
+
Project-URL: Documentation, https://sessemi.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/sessemi/sessemi-python
|
|
10
|
+
Project-URL: Issues, https://github.com/sessemi/sessemi-python/issues
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: requests>=2.28
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: python-dotenv; extra == "dev"
|
|
20
|
+
|
|
21
|
+
# sessemi-python
|
|
22
|
+
Python client for the Sessemi web scraping API.
|
sessemi-0.22.0/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sessemi"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Python client for the Sessemi web scraping API"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
dependencies = ["requests>=2.28"]
|
|
13
|
+
authors = [
|
|
14
|
+
{name = "Andrew Odiit", email = "andrew@sessemi.com"},
|
|
15
|
+
]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://sessemi.com"
|
|
25
|
+
Documentation = "https://sessemi.com/docs"
|
|
26
|
+
Repository = "https://github.com/sessemi/sessemi-python"
|
|
27
|
+
Issues = "https://github.com/sessemi/sessemi-python/issues"
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
dev = ["python-dotenv"]
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.dynamic]
|
|
33
|
+
version = {attr = "sessemi.__version__"}
|
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
# open sessemi
|
|
2
|
+
import os
|
|
3
|
+
import time
|
|
4
|
+
import logging
|
|
5
|
+
import base64
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Optional, List
|
|
8
|
+
|
|
9
|
+
import requests as _requests
|
|
10
|
+
|
|
11
|
+
# Package-wide logger: the client emits its debug/info/warning records under the name "sessemi".
logger = logging.getLogger("sessemi")
|
|
12
|
+
|
|
13
|
+
# Sentinel default: lets Sessemi.__init__ tell "argument not passed" (fall back to
# the environment) apart from any explicitly passed value, including None.
_UNSET = object()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ScrapeResult:
    """Outcome of one scrape request, built from the API's JSON payload.

    ``success`` and ``url`` are required; every other field falls back to the
    default declared below when the payload omits it. Use :meth:`from_json`
    to build an instance from a decoded response.
    """

    success: bool
    url: str
    content: str = ""  # response body (HTML or JSON depending on target Content-Type)
    body_size: int = 0  # size of content field
    cookies: list = field(default_factory=list)
    user_agent: str = ""
    worker_id: int = -1
    proxy_used: str = ""
    proxy_port: int = 0
    country: str = ""
    session: str = ""
    challenge_type: str = ""
    challenge_provider: str = ""
    wait_for_match: str = ""
    failure_type: str = ""
    status_code: int = 0
    duration_ms: int = 0
    queued_ms: int = 0
    retry_count: int = 0
    error: str = ""
    warning: str = ""
    pool: str = ""
    solved: bool = False
    credits_charged: int = 0
    credits_remaining: int = 0
    resolved_url: str = ""
    script_result: Optional[dict] = field(default=None, repr=False)  # JS script execution result
    screenshot: bytes = b""  # raw bytes decoded from the payload's base64 "screenshot" by from_json
    # Underlying HTTP response when one was supplied to from_json. The annotation is a
    # string so the class can be defined without evaluating the requests type.
    response: "Optional[_requests.Response]" = field(default=None, repr=False)

    @classmethod
    def from_json(cls, data: dict, response: "Optional[_requests.Response]" = None) -> "ScrapeResult":
        """Build a result from a decoded JSON payload.

        Keys that are not dataclass fields are ignored. A truthy ``screenshot``
        value is base64-decoded into bytes. ``data`` is left unmodified.

        Args:
            data: Decoded JSON object; must contain ``success`` and ``url``.
            response: HTTP response to attach as ``.response`` (optional).

        Raises:
            TypeError: If a required field is missing from ``data``.
            binascii.Error: If ``screenshot`` is not valid base64.
        """
        known = cls.__dataclass_fields__
        # The screenshot is handled separately below; reading it with .get() instead
        # of .pop() keeps the caller's dict intact.
        kwargs = {k: v for k, v in data.items() if k in known and k != "screenshot"}
        obj = cls(**kwargs)
        ss_b64 = data.get("screenshot")
        if ss_b64:
            obj.screenshot = base64.b64decode(ss_b64)
        obj.response = response
        return obj

    @property
    def ok(self) -> bool:
        """True when the scrape succeeded and reported a non-empty body."""
        # `or 0` tolerates a JSON null body_size instead of raising TypeError.
        return bool(self.success) and (self.body_size or 0) > 0

    @property
    def content_bytes(self) -> bytes:
        """Raw response bytes — drop-in for requests.Response.content"""
        if self.response is not None:
            return self.response.content
        return self.content.encode("utf-8") if self.content else b""

    @property
    def text(self) -> str:
        """Response text — drop-in for requests.Response.text"""
        return self.content
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class SessemiError(Exception):
    """Base class for the exceptions defined by the Sessemi client."""


class SessemiTimeout(SessemiError):
    """Raised when the HTTP request to the API times out."""


class SessemiUnavailable(SessemiError):
    """Raised when the API cannot be reached or cannot serve the request right now."""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Sessemi:
|
|
88
|
+
"""
|
|
89
|
+
All config from env vars or constructor args:
|
|
90
|
+
SESSEMI_URL - base URL (e.g. https://xxx.ngrok-free.app)
|
|
91
|
+
SESSEMI_KEY - API key
|
|
92
|
+
SESSEMI_TIMEOUT - default timeout per scrape (seconds)
|
|
93
|
+
SESSEMI_RETRIES - default retry count
|
|
94
|
+
SESSEMI_RETRY_ON - comma-separated failure types to retry on
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
def __init__(
    self,
    url: str = _UNSET,
    key: str = _UNSET,
    timeout: int = _UNSET,
    retries: int = _UNSET,
    retry_on: list = _UNSET,
):
    """Configure the client from explicit arguments or environment variables.

    Any argument left unset is read from the environment instead.

    Args:
        url: API base URL. Env: SESSEMI_URL (default "https://api.sessemi.com").
            Trailing slashes are stripped.
        key: API key, sent as the ``X-API-Key`` header when non-empty.
            Env: SESSEMI_KEY (default "").
        timeout: Default per-scrape timeout in seconds.
            Env: SESSEMI_TIMEOUT (default 60).
        retries: Default retry count. Env: SESSEMI_RETRIES (default 3).
        retry_on: Failure types to retry on. Env: SESSEMI_RETRY_ON, a
            comma-separated list (default "blocked").

    Raises:
        ValueError: If SESSEMI_TIMEOUT or SESSEMI_RETRIES is not an integer.
    """
    if url is _UNSET:
        url = os.environ.get("SESSEMI_URL", "https://api.sessemi.com")
    # Endpoint paths are appended as "/scrape", "/health", ...; a trailing slash on
    # the base URL would otherwise produce "//scrape".
    self.base_url = url.rstrip("/") if isinstance(url, str) else url

    self.api_key = key if key is not _UNSET else os.environ.get("SESSEMI_KEY", "")

    self.timeout = (
        timeout if timeout is not _UNSET
        else int(os.environ.get("SESSEMI_TIMEOUT", "60"))
    )

    self.retries = (
        retries if retries is not _UNSET
        else int(os.environ.get("SESSEMI_RETRIES", "3"))
    )

    if retry_on is _UNSET:
        raw = os.environ.get("SESSEMI_RETRY_ON", "blocked")
        # Drop empty entries so "a,,b" or a trailing comma does not yield "".
        retry_on = [x.strip() for x in raw.split(",") if x.strip()]
    self.retry_on = retry_on

    # One shared HTTP session so the API key header is sent on every request.
    self._http = _requests.Session()
    if self.api_key:
        self._http.headers["X-API-Key"] = self.api_key
|
|
133
|
+
|
|
134
|
+
def scrape(
    self,
    url: str,
    *,
    stealth: bool = None,
    pool: str = None,
    solve: bool = None,
    timeout: int = None,
    proxy: str = None,
    country: str = None,
    session: str = None,
    screenshot: bool = False,
    block_resources: bool = False,
    wait_for: str = None,
    wait_for_js: str = None,
    wait_timeout: int = None,
    retry: int = None,
    retry_on: list = None,
    render: bool = False,
    exclude_cookies: list = None,
    headers: dict = None,
    method: str = None,
    body: str = None,
) -> ScrapeResult:
    """
    Scrape a URL through the API and return the parsed result.

    Only ``url`` and ``timeout`` are always sent. Every other option is added
    to the request only when it is set (see the table in the body).

    Args:
        url:        Target URL to scrape.
        stealth:    Start fast, escalate only when challenged. Sent whenever
                    it is not None.
        pool:       Proxy pool: "datacenter" (1 credit) or "residential"
                    (10 credits, solving included). Default: datacenter.
        solve:      Enable challenge solving (Cloudflare, Akamai, DataDome).
                    Default: True for residential, False for datacenter.
                    Datacenter + solve = 6 credits (budget option).
        timeout:    Max seconds for the scrape (default: self.timeout).
        proxy:      Per-request proxy URL. Supports standard format
                    "http://user:pass@host:port" and colon format
                    "host:port:user:pass". Server expands {session_id}
                    and {country} automatically.
                    Use "none"/"direct" for no proxy, or omit for server default.
        country:    Proxy country code (e.g. "FR", "DE"). Only with
                    pool="residential". Upper-cased before sending.
        session:    Session ID — pins request to a specific worker so cookies
                    and IP persist across requests. Any string works.
        screenshot: If True, include base64 PNG screenshot in response.
        block_resources: Block images/fonts/media for speed.
        wait_for:   CSS selector(s) to wait for after page load. Comma-
                    separated for OR (e.g. ".products, .no-results").
                    Use for AJAX-loaded content.
        wait_for_js: JS expression that returns truthy when page is ready.
                    Use for text matching or complex conditions, e.g.
                    "document.querySelector('h1')?.textContent.includes('Résultat')"
                    Can be combined with wait_for — first match wins.
        wait_timeout: Max seconds to wait for selector/JS (default: 10).
        retry:      Max retries on failure (default: self.retries).
                    Sent only when greater than zero.
        retry_on:   Failure types to retry on (default: self.retry_on).
                    Options: "server_error", "challenge_timeout",
                    "challenge_unsolved", "navigate_failed", "blocked".
        render:     Force browser rendering.
        exclude_cookies: List forwarded to the server as ``exclude_cookies``
                    (semantics are defined server-side).
        headers:    Custom HTTP headers to send with the request.
                    Dict of {name: value}. Applied on both fast path
                    and browser path (render=True). Host and Connection
                    cannot be overridden.
        method:     HTTP method: "GET", "POST", "PUT", "PATCH", "DELETE".
                    Default: "GET". Only applies to the fast path.
                    Upper-cased; omitted from the request when it is GET.
        body:       Request body for POST/PUT/PATCH. Typically URL-encoded
                    form data or a JSON string. Set Content-Type via
                    the headers parameter.

    Returns:
        ScrapeResult with:
            .solved         — True if a challenge was detected and solved
            .warning        — Non-fatal advisory (e.g. DC solve less reliable)
            .pool           — Pool used: "datacenter", "residential", "custom"
            .challenge_type — "clear", "solved", "blocked", "timeout", etc.
            .wait_for_match — "css", "js", "timeout", or ""
    """
    max_retries = self.retries if retry is None else retry
    retry_kinds = self.retry_on if retry_on is None else retry_on
    verb = method.upper() if method else None

    # (payload key, value to send, include?) — listed in the order the keys are added.
    candidates = (
        ("stealth", stealth, stealth is not None),
        ("pool", pool, pool),
        ("solve", solve, solve is not None),
        ("proxy", proxy, proxy),
        ("country", country.upper() if country else None, country),
        ("session", session, session),
        ("screenshot", True, screenshot),
        ("block_resources", True, block_resources),
        ("wait_for", wait_for, wait_for),
        ("wait_for_js", wait_for_js, wait_for_js),
        ("wait_timeout", wait_timeout, wait_timeout is not None),
        ("render", True, render),
        ("exclude_cookies", exclude_cookies, exclude_cookies),
        ("headers", headers, headers),
        ("method", verb, verb is not None and verb != "GET"),
        ("body", body, body),
        ("retry", max_retries, max_retries > 0),
        ("retry_on", retry_kinds, retry_kinds),
    )

    payload = {"url": url, "timeout": timeout or self.timeout}
    for name, value, wanted in candidates:
        if wanted:
            payload[name] = value

    data, resp = self._post("/scrape", payload)
    return ScrapeResult.from_json(data, response=resp)
|
|
254
|
+
|
|
255
|
+
def script_exec(self, script: str, *, session: str, timeout: int = None) -> ScrapeResult:
    """Execute JavaScript inside an already-open browser session.

    This is script-only mode: no URL is sent, so nothing is navigated. The
    script runs against whatever page the session last loaded and can use
    the browser's cookies (including validated _abck, etc.).

    The script body is wrapped in an async IIFE, so ``await`` may be used
    freely; ``return`` the value you want back (it is JSON-serialized).

    Args:
        script:  JavaScript code to execute.
        session: Session ID (must already exist from a prior scrape).
        timeout: Max seconds for script execution (default: self.timeout).

    Returns:
        ScrapeResult with ``script_result`` populated.
        The engine returns ``{"value": "<json_string>"}`` via Marionette;
        use :meth:`parse_script_result` to unwrap.
    """
    limit = timeout or self.timeout
    payload = dict(script=script, session=session, timeout=limit)
    data, resp = self._post("/scrape", payload)
    return ScrapeResult.from_json(data, response=resp)
|
|
288
|
+
|
|
289
|
+
@staticmethod
def parse_script_result(script_result):
    """Unwrap a ``script_result`` value into a plain Python object.

    The engine wraps results as ``{"value": "<json_string>"}`` via Marionette.
    A dict with a ``"value"`` key is unwrapped one level; a string (wrapped
    or bare) is decoded as JSON; anything else, including None, is returned
    unchanged.

    Raises:
        json.JSONDecodeError: If a string value is not valid JSON.
    """
    import json as _json

    is_wrapped = isinstance(script_result, dict) and "value" in script_result
    inner = script_result["value"] if is_wrapped else script_result
    if isinstance(inner, str):
        return _json.loads(inner)
    return inner
|
|
305
|
+
|
|
306
|
+
def scrape_batch(
    self,
    urls: list,
    *,
    country: str = None,
    render: bool = None,
    solve: bool = None,
    stealth: bool = None,
    block_resources: bool = None,
    headers: dict = None,
    timeout: int = 300,
    poll_interval: float = 2.0,
) -> list:
    """Scrape multiple URLs concurrently via async tasks.

    Submits every URL as an async task (``POST /scrape?async=true``), then
    polls ``GET /tasks/<id>`` until all tasks report "done"/"failed" or the
    timeout is reached. Results are returned in the same order as ``urls``.

    Not compatible with the ``session`` parameter (server rejects it).

    Args:
        urls:            List of URLs to scrape.
        country:         Two-letter country code for proxy geolocation.
                         Upper-cased before sending, as in ``scrape``.
        render:          Force browser rendering.
        solve:           Attempt anti-bot challenge solving.
        stealth:         Start fast, escalate only when challenged.
        block_resources: Block images/fonts/media for speed.
        headers:         Custom HTTP headers to forward.
        timeout:         Max seconds to wait for all tasks (default 300).
        poll_interval:   Seconds between poll cycles (default 2.0).

    Returns:
        List of ScrapeResult, one per URL, in input order.
        Failed tasks have success=False with error details. This covers
        submission failures, timeouts, and missing or malformed results.

    Example::

        results = client.scrape_batch(
            ["https://example.com/1", "https://example.com/2"],
            stealth=True,
            country="FR",
        )
        for r in results:
            print(f"{r.url} — {'OK' if r.ok else r.error}")
    """
    if not urls:
        return []

    def failure(index, message):
        # Synthetic result for a URL that never produced a usable server result.
        return ScrapeResult.from_json({
            "success": False,
            "url": urls[index],
            "error": message,
        })

    # Shared params (everything except url)
    shared = {}
    if country is not None:
        # scrape() upper-cases the country code; do the same here for consistency.
        shared["country"] = country.upper() if isinstance(country, str) else country
    if render is not None:
        shared["render"] = render
    if solve is not None:
        shared["solve"] = solve
    if stealth is not None:
        shared["stealth"] = stealth
    if block_resources is not None:
        shared["block_resources"] = block_resources
    if headers is not None:
        shared["headers"] = headers

    # ── Submit all as async tasks ──
    task_ids = []  # parallel to urls; None marks a failed submission
    for url in urls:
        tid = None
        try:
            resp = self._http.post(
                f"{self.base_url}/scrape?async=true",
                json={"url": url, **shared},
                timeout=(10, 30),
            )
            if resp.status_code == 202:
                tid = resp.json().get("task_id")
            else:
                logger.warning("batch: submit failed for %s (HTTP %d)", url[:60], resp.status_code)
        except Exception as exc:
            logger.warning("batch: submit error for %s: %s", url[:60], exc)
        # Appending in exactly one place keeps task_ids aligned with urls.
        task_ids.append(tid)

    submitted = sum(1 for t in task_ids if t is not None)
    logger.info("batch: submitted %d/%d async tasks", submitted, len(urls))

    # Pre-fill failures for tasks that never submitted
    results = [None] * len(urls)
    for i, tid in enumerate(task_ids):
        if tid is None:
            results[i] = failure(i, "async task submission failed")

    # ── Poll until all tasks resolve or timeout ──
    pending = {i: tid for i, tid in enumerate(task_ids) if tid is not None}
    deadline = time.monotonic() + timeout

    while pending:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the last cycle still polls once.
        time.sleep(min(poll_interval, remaining))

        for i, tid in list(pending.items()):
            try:
                resp = self._http.get(
                    f"{self.base_url}/tasks/{tid}",
                    timeout=(10, 30),
                )
                if resp.status_code != 200:
                    logger.debug("batch: poll for task %s returned HTTP %d", tid, resp.status_code)
                    continue
                data = resp.json()
                status = data.get("status")
            except Exception as exc:
                # Best effort: a failed poll is retried on the next cycle.
                logger.debug("batch: poll error for task %s: %s", tid, exc)
                continue

            if status not in ("done", "failed"):
                continue

            result_data = data.get("result")
            if result_data and isinstance(result_data, dict):
                try:
                    # Defaults for the required fields mean a sparse result still
                    # parses, instead of raising and being re-polled until timeout.
                    results[i] = ScrapeResult.from_json(
                        {"success": False, "url": urls[i], **result_data}
                    )
                except (TypeError, ValueError) as exc:
                    results[i] = failure(i, f"task {status} with unparseable result: {exc}")
            else:
                results[i] = failure(i, f"task {status} with no result")
            del pending[i]

        if pending:
            logger.debug("batch: %d/%d tasks still pending", len(pending), len(urls))

    # Timeout stragglers
    for i in pending:
        results[i] = failure(i, f"task timed out after {timeout}s (task_id: {task_ids[i]})")
        logger.warning("batch: task %s timed out for %s", task_ids[i], urls[i][:60])

    ok_count = sum(1 for r in results if r.success)
    logger.info("batch: complete — %d/%d succeeded", ok_count, len(urls))
    return results
|
|
450
|
+
|
|
451
|
+
def screenshot(self, url: str, *, timeout: int = None) -> bytes:
    """Request a screenshot of ``url`` and return the raw response body.

    Args:
        url: Page to capture.
        timeout: Seconds sent to the server as the capture timeout
            (default: self.timeout). The HTTP request itself is allowed
            30 seconds more than that.

    Returns:
        The response body as bytes (presumably the image data; the format
        is decided by the server).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    limit = timeout or self.timeout
    endpoint = f"{self.base_url}/screenshot"
    reply = self._http.post(
        endpoint,
        json={"url": url, "timeout": limit},
        timeout=limit + 30,
    )
    reply.raise_for_status()
    return reply.content
|
|
459
|
+
|
|
460
|
+
def health(self) -> dict:
    """Fetch ``/health`` (10 s timeout) and return its decoded JSON body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    endpoint = f"{self.base_url}/health"
    reply = self._http.get(endpoint, timeout=10)
    reply.raise_for_status()
    return reply.json()
|
|
464
|
+
|
|
465
|
+
def ping(self) -> bool:
    """Return True when the health endpoint reports a usable status.

    A status of "healthy", "busy" or "queuing" counts as reachable. Any
    exception raised while checking (network error, bad response, ...) is
    swallowed and reported as False.
    """
    reachable_states = ("healthy", "busy", "queuing")
    try:
        status = self.health().get("status")
        return status in reachable_states
    except Exception:
        return False
|
|
471
|
+
|
|
472
|
+
def _post(self, path: str, body: dict) -> tuple:
    """POST ``body`` as JSON to ``path`` and return ``(data, response)``.

    The HTTP timeout is the body's ``timeout`` (or self.timeout) plus 60
    seconds, so the server-side scrape timeout fires first.

    Args:
        path: Endpoint path appended to the base URL (e.g. "/scrape").
        body: JSON-serializable request payload.

    Returns:
        Tuple of the decoded JSON object and the requests response.

    Raises:
        SessemiTimeout: The HTTP request timed out.
        SessemiUnavailable: Connection failure, HTTP 429, or a non-JSON
            5xx/404 reply (typically a proxy or tunnel error page).
        SessemiError: Any other non-JSON reply, a 4xx reply (with the
            server's error and hint), or a JSON reply that is not an object.
        requests.HTTPError: A 5xx reply that carries a JSON body. It is
            raised by ``raise_for_status`` and kept for backward compatibility.
    """
    req_timeout = body.get("timeout", self.timeout) + 60
    t0 = time.monotonic()
    try:
        resp = self._http.post(
            f"{self.base_url}{path}",
            json=body,
            timeout=req_timeout,
        )
    except _requests.exceptions.Timeout as e:
        # Checked first: connect timeouts are also ConnectionErrors in requests.
        raise SessemiTimeout(f"request timed out after {req_timeout}s") from e
    except (
        _requests.exceptions.ConnectionError,
        _requests.exceptions.ChunkedEncodingError,
    ) as e:
        raise SessemiUnavailable(f"cannot reach {self.base_url}: {e}") from e

    # Parse JSON safely — ngrok/proxies may return HTML error pages
    try:
        data = resp.json()
    except (ValueError, _requests.exceptions.JSONDecodeError):
        # Not JSON — likely ngrok 502/504 HTML page or tunnel expired
        snippet = resp.text[:200].strip()
        if resp.status_code >= 500:
            raise SessemiUnavailable(
                f"upstream error (HTTP {resp.status_code}): {snippet}")
        elif resp.status_code == 404:
            raise SessemiUnavailable(
                f"endpoint not found (HTTP 404) — check SESSEMI_URL: {snippet}")
        else:
            raise SessemiError(
                f"non-JSON response (HTTP {resp.status_code}): {snippet}")

    # Valid JSON is not necessarily an object (could be a list, string or null).
    # Use an empty dict for the error lookups so they cannot hit AttributeError.
    fields = data if isinstance(data, dict) else {}

    if resp.status_code == 429:
        raise SessemiUnavailable(fields.get("error", "queue full"))

    # Surface the server's error message for 4xx responses.
    # Without this, raise_for_status() gives a generic "400 Bad Request"
    # and the actual explanation (e.g. "Country targeting requires
    # pool=residential") is lost.
    if 400 <= resp.status_code < 500:
        msg = fields.get("error", resp.text[:200])
        hint = fields.get("hint", "")
        detail = f"{msg} ({hint})" if hint else msg
        raise SessemiError(f"HTTP {resp.status_code}: {detail}")

    # Only 5xx statuses can still raise here; 4xx was handled above.
    resp.raise_for_status()

    if not isinstance(data, dict):
        snippet = resp.text[:200].strip()
        raise SessemiError(
            f"unexpected JSON response (HTTP {resp.status_code}): {snippet}")

    elapsed = int((time.monotonic() - t0) * 1000)
    url_short = body.get("url", "?")[:60]  # script-only requests carry no url
    if data.get("success"):
        logger.debug("✓ %s (%dms w%s)", url_short, elapsed, data.get("worker_id", "?"))
    else:
        logger.warning("✗ %s → %s (%dms)", url_short, data.get("failure_type", "?"), elapsed)

    if data.get("warning"):
        logger.warning("⚠ %s: %s", url_short, data["warning"])

    return data, resp
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sessemi
|
|
3
|
+
Version: 0.22.0
|
|
4
|
+
Summary: Python client for the Sessemi web scraping API
|
|
5
|
+
Author-email: Andrew Odiit <andrew@sessemi.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://sessemi.com
|
|
8
|
+
Project-URL: Documentation, https://sessemi.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/sessemi/sessemi-python
|
|
10
|
+
Project-URL: Issues, https://github.com/sessemi/sessemi-python/issues
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: requests>=2.28
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: python-dotenv; extra == "dev"
|
|
20
|
+
|
|
21
|
+
# sessemi-python
|
|
22
|
+
Python client for the Sessemi web scraping API.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sessemi
|
sessemi-0.22.0/setup.cfg
ADDED