thordata-sdk 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thordata_sdk
-Version: 0.2.2
+Version: 0.2.3
 Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
 Home-page: https://github.com/Thordata/thordata-python-sdk
 Author: Thordata Developer Team
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='thordata_sdk',
-    version='0.2.2', # Bump version due to breaking auth changes
+    version='0.2.3', # Bump version due to breaking auth changes
     packages=find_packages(include=['thordata_sdk', 'thordata_sdk.*']),
     install_requires=[
        'requests>=2.25.0', # Standard synchronous HTTP
@@ -3,6 +3,6 @@ from .client import ThordataClient
 from .async_client import AsyncThordataClient
 
 # Version of the thordata-sdk package
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 
 __all__ = ["ThordataClient", "AsyncThordataClient"]
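The package root re-exports both clients, so the bump is observable at runtime. A quick check against the __init__.py shown above:

    import thordata_sdk
    from thordata_sdk import ThordataClient, AsyncThordataClient

    print(thordata_sdk.__version__)  # "0.2.3"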
@@ -1,7 +1,8 @@
 import aiohttp
 import logging
 import json
-from typing import Optional, Dict, Any
+import base64
+from typing import Optional, Dict, Any, Union
 
 logger = logging.getLogger(__name__)
 
@@ -9,11 +10,6 @@ logger = logging.getLogger(__name__)
 class AsyncThordataClient:
     """
     Thordata Asynchronous Client (built on aiohttp).
-    Designed for high-concurrency and low-latency data collection tasks.
-
-    Usage:
-        async with AsyncThordataClient(...) as client:
-            await client.get("http://example.com")
     """
 
     def __init__(
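This hunk removes the usage example from the class docstring; the pattern it documented is unchanged. A minimal sketch of it (token values are placeholders, and the full constructor signature is only partially visible in this diff):

    import asyncio
    from thordata_sdk import AsyncThordataClient

    async def main():
        async with AsyncThordataClient(scraper_token="YOUR_TOKEN") as client:
            resp = await client.get("http://example.com")
            print(resp.status)

    asyncio.run(main())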
@@ -24,22 +20,19 @@ class AsyncThordataClient:
         proxy_host: str = "gate.thordata.com",
         proxy_port: int = 22225
     ):
-        """
-        Initialize the asynchronous client.
-        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Authentication
         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
         self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
-        # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
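The proxy settings above are plain aiohttp constructs: the scraper token is the proxy username and the password is empty. The same hop can be reproduced without the SDK; a sketch assuming the defaults shown (gate.thordata.com:22225) and an arbitrary echo endpoint:

    import aiohttp
    import asyncio

    async def main():
        auth = aiohttp.BasicAuth(login="YOUR_SCRAPER_TOKEN", password="")
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://httpbin.org/ip",
                proxy="http://gate.thordata.com:22225",
                proxy_auth=auth,
            ) as resp:
                print(await resp.text())

    asyncio.run(main())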
@@ -55,21 +48,14 @@ class AsyncThordataClient:
         await self.close()
 
     async def close(self):
-        """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
         self._session = None
 
-    # --- Proxy Usage ---
-
+    # --- Proxy ---
     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-        """
-        Send an asynchronous GET request through the Thordata Proxy.
-        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
-
-        logger.debug(f"Async Proxy Request: {url}")
         try:
             return await self._session.get(
                 url,
@@ -81,21 +67,16 @@
             logger.error(f"Async Request failed: {e}")
             raise
 
-    # --- SERP API ---
-
+    # --- SERP ---
     async def serp_search(
         self, query: str, engine: str = "google", num: int = 10, **kwargs
     ) -> Dict[str, Any]:
-        """Async SERP search."""
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
         payload = {
-            "q": query,
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
+            "q": query, "num": str(num), "json": "1",
+            "engine": engine.lower(), **kwargs
         }
         if engine.lower() == 'yandex':
             payload['text'] = payload.pop('q')
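serp_search posts the query as form fields and normalizes engine-specific names: for Yandex it moves q into text before sending. A usage sketch, assuming an open client context as above:

    results = await client.serp_search("thordata sdk", engine="google", num=10)
    # For Yandex the client renames the query field itself:
    results = await client.serp_search("thordata sdk", engine="yandex")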
@@ -117,7 +98,6 @@
         ) as response:
             response.raise_for_status()
             data = await response.json()
-            # Handle double-encoding
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
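The isinstance branch guards against double-encoded JSON, where the body is a JSON string that itself contains JSON, so the first decode yields a str rather than a dict. A standalone illustration:

    import json

    raw = json.dumps(json.dumps({"organic": []}))  # simulate double encoding
    data = json.loads(raw)       # -> '{"organic": []}' (still a str)
    if isinstance(data, str):
        data = json.loads(data)  # -> {'organic': []} (now a dict)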
@@ -125,8 +105,65 @@
                 pass
         return data
 
-    # --- Web Scraper API ---
+    # --- Universal ---
+    async def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        if self._session is None:
+            raise RuntimeError("Client session not initialized.")
+
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        async with self._session.post(
+            self.UNIVERSAL_API_URL, data=payload, headers=headers
+        ) as response:
+            response.raise_for_status()
+
+            try:
+                resp_json = await response.json()
+            except Exception:
+                if output_format.upper() == "PNG":
+                    return await response.read()
+                return await response.text()
+
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
 
+    # --- Web Scraper ---
     async def create_scraper_task(
         self,
         file_name: str,
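Per the Union[str, bytes] annotation, the new universal_scrape returns decoded HTML as str and screenshot bytes for PNG. A usage sketch inside an open async client context (target URL arbitrary):

    html = await client.universal_scrape("https://example.com", js_render=True)

    png = await client.universal_scrape("https://example.com", output_format="PNG")
    with open("page.png", "wb") as f:
        f.write(png)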
@@ -135,7 +172,6 @@ class AsyncThordataClient:
         spider_name: str = "youtube.com",
         universal_params: Dict[str, Any] = None
     ) -> str:
-        """Create an async scraping task."""
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
@@ -164,7 +200,6 @@
         return data["data"]["task_id"]
 
     async def get_task_status(self, task_id: str) -> str:
-        """Check task status."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
@@ -183,7 +218,6 @@
         return "Unknown"
 
     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        """Get download link."""
        headers = {
             "token": self.public_token,
             "key": self.public_key,
@@ -1,7 +1,8 @@
 import requests
 import logging
 import json
-from typing import Dict, Any
+import base64
+from typing import Dict, Any, Union
 
 # Configure a library-specific logger
 logger = logging.getLogger(__name__)
@@ -14,7 +15,8 @@ class ThordataClient:
     Handles authentication for:
     1. Proxy Network (HTTP/HTTPS)
     2. SERP API (Real-time Search)
-    3. Web Scraper API (Async Task Management)
+    3. Universal Scraping API (Single Page)
+    4. Web Scraper API (Async Task Management)
     """
 
     def __init__(
@@ -39,16 +41,18 @@
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Configuration (User: Scraper Token, Pass: Empty)
+        # Proxy Configuration
         self.proxy_url = (
             f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
         )
 
         # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -62,13 +66,6 @@
     def get(self, url: str, **kwargs) -> requests.Response:
         """
         Send a GET request through the Thordata Proxy Network.
-
-        Args:
-            url (str): The target URL.
-            **kwargs: Additional arguments passed to requests.get().
-
-        Returns:
-            requests.Response: The HTTP response.
         """
         logger.debug(f"Proxy Request: {url}")
         kwargs.setdefault("timeout", 30)
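The synchronous client mirrors the async one on top of requests; get routes through the proxy URL built in __init__ and defaults the timeout to 30 seconds. A minimal sketch (keyword names inferred from the attributes assigned in __init__; values are placeholders):

    from thordata_sdk import ThordataClient

    client = ThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    )
    resp = client.get("https://httpbin.org/ip")
    print(resp.status_code, resp.text)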
@@ -88,7 +85,6 @@
             **kwargs
         }
 
-        # Engine-specific parameter adjustments
         if engine.lower() == 'yandex':
             payload['text'] = payload.pop('q')
             if 'url' not in payload:
@@ -117,7 +113,6 @@
             response.raise_for_status()
             data = response.json()
 
-            # Handle potential double-encoded JSON strings
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
@@ -128,6 +123,79 @@
             logger.error(f"SERP Request Failed: {e}")
             raise
 
+    def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        """
+        Unlock target pages via the Universal Scraping API.
+        """
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        logger.info(f"Universal Scrape: {url}")
+
+        try:
+            response = self.session.post(
+                self.UNIVERSAL_API_URL,
+                data=payload,
+                headers=headers,
+                timeout=60
+            )
+            response.raise_for_status()
+
+            # Parse JSON wrapper
+            try:
+                resp_json = response.json()
+            except json.JSONDecodeError:
+                # Fallback for raw response
+                if output_format.upper() == "PNG":
+                    return response.content
+                return response.text
+
+            # Check API errors
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            # Extract HTML
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            # Extract PNG (Base64 decoding with padding fix)
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
+
+        except Exception as e:
+            logger.error(f"Universal Scrape Failed: {e}")
+            raise
+
     def create_scraper_task(
         self,
         file_name: str,
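The padding fix exists because base64 input must be a multiple of 4 characters long, and the API may hand back screenshots with the trailing = stripped or broken across lines. After removing newlines, the code appends 4 - len % 4 equals signs. A worked example with the 8-byte PNG file signature:

    import base64

    s = "iVBORw0KGgo"            # 11 chars; 11 % 4 == 3, so padding is missing
    s += "=" * (4 - len(s) % 4)  # -> "iVBORw0KGgo="
    print(base64.b64decode(s))   # b'\x89PNG\r\n\x1a\n'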
@@ -174,7 +242,6 @@
     def get_task_status(self, task_id: str) -> str:
         """
         Check the status of a task.
-        Returns: 'Running', 'Ready', 'Failed', or 'Unknown'.
         """
         headers = {
             "token": self.public_token,
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thordata_sdk
-Version: 0.2.2
+Version: 0.2.3
 Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
 Home-page: https://github.com/Thordata/thordata-python-sdk
 Author: Thordata Developer Team
File without changes
File without changes
File without changes