thordata-sdk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/models.py CHANGED
@@ -1,725 +1,840 @@
1
- """
2
- Data models for the Thordata Python SDK.
3
-
4
- This module provides type-safe dataclasses for configuring proxy requests,
5
- SERP API calls, and Universal Scraping requests. Using these models enables
6
- IDE autocomplete and reduces parameter errors.
7
-
8
- Example:
9
- >>> from thordata.models import ProxyConfig, SerpRequest
10
- >>>
11
- >>> # Build a proxy URL with geo-targeting
12
- >>> proxy = ProxyConfig(
13
- ... username="myuser",
14
- ... password="mypass",
15
- ... country="us",
16
- ... city="seattle"
17
- ... )
18
- >>> print(proxy.build_proxy_url())
19
-
20
- >>> # Configure a SERP request
21
- >>> serp = SerpRequest(query="python tutorial", engine="google", num=20)
22
- >>> print(serp.to_payload())
23
- """
24
-
25
- from __future__ import annotations
26
-
27
- import json
28
- import re
29
- import uuid
30
- from dataclasses import dataclass, field
31
- from enum import Enum
32
- from typing import Any, Dict, List, Optional, Union
33
-
34
- # =============================================================================
35
- # Proxy Product Types
36
- # =============================================================================
37
-
38
-
39
- class ProxyProduct(str, Enum):
40
- """
41
- Thordata proxy product types with their default ports.
42
-
43
- Each product type has a specific port on the proxy gateway.
44
- """
45
-
46
- RESIDENTIAL = "residential"
47
- MOBILE = "mobile"
48
- DATACENTER = "datacenter"
49
- ISP = "isp"
50
-
51
- @property
52
- def default_port(self) -> int:
53
- """Get the default port for this proxy product."""
54
- ports = {
55
- "residential": 9999,
56
- "mobile": 5555,
57
- "datacenter": 7777,
58
- "isp": 6666,
59
- }
60
- return ports[self.value]
61
-
62
-
63
- # =============================================================================
64
- # Proxy Configuration Models
65
- # =============================================================================
66
-
67
-
68
- @dataclass
69
- class ProxyConfig:
70
- """
71
- Configuration for building a Thordata proxy URL.
72
-
73
- This class handles the complex username format required by Thordata proxies,
74
- where geo-targeting and session parameters are embedded in the username.
75
-
76
- Args:
77
- username: Your Thordata account username (the part after 'td-customer-').
78
- password: Your Thordata account password.
79
- product: Proxy product type (residential, mobile, datacenter, isp).
80
- host: Proxy gateway host. If None, uses default based on product.
81
- port: Proxy gateway port. If None, uses default based on product.
82
- protocol: Proxy protocol - 'http' or 'https'.
83
-
84
- # Geo-targeting (all optional)
85
- continent: Target continent code (af/an/as/eu/na/oc/sa).
86
- country: Target country code in ISO 3166-1 alpha-2 format.
87
- state: Target state name in lowercase.
88
- city: Target city name in lowercase.
89
- asn: Target ASN code (e.g., 'AS12322'). Must be used with country.
90
-
91
- # Session control (optional)
92
- session_id: Session identifier for sticky sessions.
93
- session_duration: Session duration in minutes (1-90).
94
-
95
- Example:
96
- >>> config = ProxyConfig(
97
- ... username="GnrqUwwu3obt",
98
- ... password="PkCSzvt30iww",
99
- ... product=ProxyProduct.RESIDENTIAL,
100
- ... country="us",
101
- ... state="california",
102
- ... session_id="mysession123",
103
- ... session_duration=10
104
- ... )
105
- >>> print(config.build_proxy_url())
106
- http://td-customer-GnrqUwwu3obt-country-us-state-california-sessid-mysession123-sesstime-10:PkCSzvt30iww@....pr.thordata.net:9999
107
- """
108
-
109
- username: str
110
- password: str
111
- product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL
112
- host: Optional[str] = None
113
- port: Optional[int] = None
114
- protocol: str = "http"
115
-
116
- # Geo-targeting
117
- continent: Optional[str] = None
118
- country: Optional[str] = None
119
- state: Optional[str] = None
120
- city: Optional[str] = None
121
- asn: Optional[str] = None
122
-
123
- # Session control
124
- session_id: Optional[str] = None
125
- session_duration: Optional[int] = None # minutes, 1-90
126
-
127
- # Valid continent codes
128
- VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
129
-
130
- def __post_init__(self) -> None:
131
- """Validate configuration after initialization."""
132
- # Normalize product to enum
133
- if isinstance(self.product, str):
134
- self.product = ProxyProduct(self.product.lower())
135
-
136
- # Set default host and port based on product
137
- if self.host is None:
138
- # Extract user prefix from username if available
139
- # Default to generic host
140
- self.host = "pr.thordata.net"
141
-
142
- if self.port is None:
143
- self.port = self.product.default_port
144
-
145
- self._validate()
146
-
147
- def _validate(self) -> None:
148
- """Validate the proxy configuration."""
149
- # Validate protocol
150
- if self.protocol not in ("http", "https"):
151
- raise ValueError(
152
- f"Invalid protocol: {self.protocol}. Must be 'http' or 'https'."
153
- )
154
-
155
- # Validate session duration
156
- if self.session_duration is not None:
157
- if not 1 <= self.session_duration <= 90:
158
- raise ValueError(
159
- f"session_duration must be between 1 and 90 minutes, "
160
- f"got {self.session_duration}"
161
- )
162
- if not self.session_id:
163
- raise ValueError("session_duration requires session_id to be set")
164
-
165
- # Validate ASN requires country
166
- if self.asn and not self.country:
167
- raise ValueError("ASN targeting requires country to be specified")
168
-
169
- # Validate continent code
170
- if self.continent and self.continent.lower() not in self.VALID_CONTINENTS:
171
- raise ValueError(
172
- f"Invalid continent code: {self.continent}. "
173
- f"Must be one of: {', '.join(sorted(self.VALID_CONTINENTS))}"
174
- )
175
-
176
- # Validate country code format (2 letters)
177
- if self.country and not re.match(r"^[a-zA-Z]{2}$", self.country):
178
- raise ValueError(
179
- f"Invalid country code: {self.country}. "
180
- "Must be a 2-letter ISO 3166-1 alpha-2 code."
181
- )
182
-
183
- def build_username(self) -> str:
184
- """
185
- Build the complete username string with embedded parameters.
186
-
187
- Returns:
188
- The formatted username string for proxy authentication.
189
- """
190
- parts = [f"td-customer-{self.username}"]
191
-
192
- # Add geo-targeting parameters (order matters)
193
- if self.continent:
194
- parts.append(f"continent-{self.continent.lower()}")
195
-
196
- if self.country:
197
- parts.append(f"country-{self.country.lower()}")
198
-
199
- if self.state:
200
- parts.append(f"state-{self.state.lower()}")
201
-
202
- if self.city:
203
- parts.append(f"city-{self.city.lower()}")
204
-
205
- if self.asn:
206
- # Ensure ASN has correct format
207
- asn_value = self.asn.upper()
208
- if not asn_value.startswith("AS"):
209
- asn_value = f"AS{asn_value}"
210
- parts.append(f"asn-{asn_value}")
211
-
212
- # Add session parameters
213
- if self.session_id:
214
- parts.append(f"sessid-{self.session_id}")
215
-
216
- if self.session_duration:
217
- parts.append(f"sesstime-{self.session_duration}")
218
-
219
- return "-".join(parts)
220
-
221
- def build_proxy_url(self) -> str:
222
- """
223
- Build the complete proxy URL.
224
-
225
- Returns:
226
- The formatted proxy URL for use with requests/aiohttp.
227
- """
228
- username = self.build_username()
229
- return f"{self.protocol}://{username}:{self.password}@{self.host}:{self.port}"
230
-
231
- def to_proxies_dict(self) -> Dict[str, str]:
232
- """
233
- Build a proxies dict suitable for the requests library.
234
-
235
- Returns:
236
- Dict with 'http' and 'https' keys pointing to the proxy URL.
237
- """
238
- url = self.build_proxy_url()
239
- return {"http": url, "https": url}
240
-
241
- def to_aiohttp_config(self) -> tuple:
242
- """
243
- Get proxy configuration for aiohttp.
244
-
245
- Returns:
246
- Tuple of (proxy_url, proxy_auth) for aiohttp.
247
- """
248
- try:
249
- import aiohttp
250
-
251
- proxy_url = f"{self.protocol}://{self.host}:{self.port}"
252
- proxy_auth = aiohttp.BasicAuth(
253
- login=self.build_username(), password=self.password
254
- )
255
- return proxy_url, proxy_auth
256
- except ImportError:
257
- raise ImportError("aiohttp is required for async proxy configuration")
258
-
259
-
260
- @dataclass
261
- class StickySession(ProxyConfig):
262
- """
263
- Convenience class for creating sticky session proxy configurations.
264
-
265
- A sticky session keeps the same IP address for a specified duration,
266
- useful for multi-step operations that require IP consistency.
267
-
268
- Args:
269
- duration_minutes: How long to keep the same IP (1-90 minutes).
270
- auto_session_id: If True, automatically generates a unique session ID.
271
-
272
- Example:
273
- >>> session = StickySession(
274
- ... username="myuser",
275
- ... password="mypass",
276
- ... country="us",
277
- ... duration_minutes=15
278
- ... )
279
- >>> # Each call to build_proxy_url() uses the same session
280
- >>> url = session.build_proxy_url()
281
- """
282
-
283
- duration_minutes: int = 10
284
- auto_session_id: bool = True
285
-
286
- def __post_init__(self) -> None:
287
- # Auto-generate session ID if requested and not provided
288
- if self.auto_session_id and not self.session_id:
289
- self.session_id = uuid.uuid4().hex[:12]
290
-
291
- # Set session_duration from duration_minutes
292
- self.session_duration = self.duration_minutes
293
-
294
- # Call parent post_init
295
- super().__post_init__()
296
-
297
-
298
- # =============================================================================
299
- # SERP API Models
300
- # =============================================================================
301
-
302
-
303
- @dataclass
304
- class SerpRequest:
305
- """
306
- Configuration for a SERP API request.
307
-
308
- Supports Google, Bing, Yandex, DuckDuckGo, and Baidu search engines.
309
-
310
- Args:
311
- query: The search query string (required).
312
- engine: Search engine to use (default: 'google').
313
- num: Number of results per page (default: 10).
314
- start: Result offset for pagination (default: 0).
315
-
316
- # Localization
317
- country: Country code for results (gl parameter for Google).
318
- language: Language code for interface (hl parameter for Google).
319
- google_domain: Google domain to use (e.g., 'google.co.uk').
320
-
321
- # Geo-targeting
322
- location: Location name for geo-targeting.
323
- uule: Encoded location parameter (use with location).
324
-
325
- # Search type
326
- search_type: Type of search (images, news, shopping, videos, etc.).
327
-
328
- # Filters
329
- safe_search: Enable safe search filtering.
330
- time_filter: Time range filter (hour, day, week, month, year).
331
- no_autocorrect: Disable automatic spelling correction (nfpr).
332
- filter_duplicates: Enable/disable duplicate filtering.
333
-
334
- # Device & Rendering
335
- device: Device type ('desktop', 'mobile', 'tablet').
336
- render_js: Enable JavaScript rendering in SERP (render_js=True/False).
337
- no_cache: Disable internal caching (no_cache=True/False).
338
-
339
- # Output
340
- output_format: 'json' (default) or 'html'.
341
-
342
- # Advanced
343
- ludocid: Google Place ID.
344
- kgmid: Google Knowledge Graph ID.
345
-
346
- # Extra
347
- extra_params: Additional parameters to pass through (ibp, lsig, si, uds, ...).
348
- """
349
-
350
- query: str
351
- engine: str = "google"
352
- num: int = 10
353
- start: int = 0
354
-
355
- # Localization
356
- country: Optional[str] = None # 'gl' for Google
357
- language: Optional[str] = None # 'hl' for Google
358
- google_domain: Optional[str] = None
359
- countries_filter: Optional[str] = None # 'cr' parameter
360
- languages_filter: Optional[str] = None # 'lr' parameter
361
-
362
- # Geo-targeting
363
- location: Optional[str] = None
364
- uule: Optional[str] = None # Encoded location
365
-
366
- # Search type
367
- search_type: Optional[str] = None # tbm parameter (isch, shop, nws, vid, ...)
368
-
369
- # Filters
370
- safe_search: Optional[bool] = None
371
- time_filter: Optional[str] = None # tbs parameter (time part)
372
- no_autocorrect: bool = False # nfpr parameter
373
- filter_duplicates: Optional[bool] = None # filter parameter
374
-
375
- # Device & Rendering
376
- device: Optional[str] = None # 'desktop', 'mobile', 'tablet'
377
- render_js: Optional[bool] = None # render_js parameter
378
- no_cache: Optional[bool] = None # no_cache parameter
379
-
380
- # Output format
381
- output_format: str = "json" # 'json' or 'html'
382
-
383
- # Advanced Google parameters
384
- ludocid: Optional[str] = None # Google Place ID
385
- kgmid: Optional[str] = None # Knowledge Graph ID
386
-
387
- # Pass-through
388
- extra_params: Dict[str, Any] = field(default_factory=dict)
389
-
390
- # Search type mappings for tbm parameter
391
- SEARCH_TYPE_MAP = {
392
- "images": "isch",
393
- "shopping": "shop",
394
- "news": "nws",
395
- "videos": "vid",
396
- # Direct values also work
397
- "isch": "isch",
398
- "shop": "shop",
399
- "nws": "nws",
400
- "vid": "vid",
401
- }
402
-
403
- # Time filter mappings for tbs parameter
404
- TIME_FILTER_MAP = {
405
- "hour": "qdr:h",
406
- "day": "qdr:d",
407
- "week": "qdr:w",
408
- "month": "qdr:m",
409
- "year": "qdr:y",
410
- }
411
-
412
- # Engine URL defaults
413
- ENGINE_URLS = {
414
- "google": "google.com",
415
- "bing": "bing.com",
416
- "yandex": "yandex.com",
417
- "duckduckgo": "duckduckgo.com",
418
- "baidu": "baidu.com",
419
- }
420
-
421
- def to_payload(self) -> Dict[str, Any]:
422
- """
423
- Convert to API request payload.
424
-
425
- Returns:
426
- Dictionary ready to be sent to the SERP API.
427
- """
428
- engine = self.engine.lower()
429
-
430
- payload: Dict[str, Any] = {
431
- "engine": engine,
432
- "num": str(self.num),
433
- # output_format: json=1 for JSON, json=0 for raw HTML
434
- "json": "1" if self.output_format.lower() == "json" else "0",
435
- }
436
-
437
- # Handle query parameter (Yandex uses 'text', others use 'q')
438
- if engine == "yandex":
439
- payload["text"] = self.query
440
- else:
441
- payload["q"] = self.query
442
-
443
- # Set URL / domain based on google_domain or engine default
444
- if self.google_domain:
445
- # 显式设置 google_domain 参数,同时设置 url
446
- payload["google_domain"] = self.google_domain
447
- payload["url"] = self.google_domain
448
- elif engine in self.ENGINE_URLS:
449
- payload["url"] = self.ENGINE_URLS[engine]
450
-
451
- # Pagination
452
- if self.start > 0:
453
- payload["start"] = str(self.start)
454
-
455
- # Localization
456
- if self.country:
457
- payload["gl"] = self.country.lower()
458
-
459
- if self.language:
460
- payload["hl"] = self.language.lower()
461
-
462
- if self.countries_filter:
463
- payload["cr"] = self.countries_filter
464
-
465
- if self.languages_filter:
466
- payload["lr"] = self.languages_filter
467
-
468
- # Geo-targeting
469
- if self.location:
470
- payload["location"] = self.location
471
-
472
- if self.uule:
473
- payload["uule"] = self.uule
474
-
475
- # Search type (tbm)
476
- if self.search_type:
477
- search_type_lower = self.search_type.lower()
478
- tbm_value = self.SEARCH_TYPE_MAP.get(search_type_lower, search_type_lower)
479
- payload["tbm"] = tbm_value
480
-
481
- # Filters
482
- if self.safe_search is not None:
483
- payload["safe"] = "active" if self.safe_search else "off"
484
-
485
- if self.time_filter:
486
- time_lower = self.time_filter.lower()
487
- tbs_value = self.TIME_FILTER_MAP.get(time_lower, time_lower)
488
- payload["tbs"] = tbs_value
489
-
490
- if self.no_autocorrect:
491
- payload["nfpr"] = "1"
492
-
493
- if self.filter_duplicates is not None:
494
- payload["filter"] = "1" if self.filter_duplicates else "0"
495
-
496
- # Device
497
- if self.device:
498
- payload["device"] = self.device.lower()
499
-
500
- # Rendering & cache control
501
- if self.render_js is not None:
502
- payload["render_js"] = "True" if self.render_js else "False"
503
-
504
- if self.no_cache is not None:
505
- payload["no_cache"] = "True" if self.no_cache else "False"
506
-
507
- # Advanced Google parameters
508
- if self.ludocid:
509
- payload["ludocid"] = self.ludocid
510
-
511
- if self.kgmid:
512
- payload["kgmid"] = self.kgmid
513
-
514
- # Extra parameters (ibp, lsig, si, uds, etc.)
515
- payload.update(self.extra_params)
516
-
517
- return payload
518
-
519
-
520
- # =============================================================================
521
- # Universal Scraper (Web Unlocker) Models
522
- # =============================================================================
523
-
524
-
525
- @dataclass
526
- class UniversalScrapeRequest:
527
- """
528
- Configuration for a Universal Scraping API (Web Unlocker) request.
529
-
530
- This API bypasses anti-bot protections like Cloudflare, CAPTCHAs, etc.
531
-
532
- Args:
533
- url: Target URL to scrape (required).
534
- js_render: Enable JavaScript rendering with headless browser.
535
- output_format: Output format - 'html' or 'png' (screenshot).
536
- country: Country code for geo-targeting the request.
537
- block_resources: Block specific resources (e.g., 'script', 'image').
538
- clean_content: Remove JS/CSS from returned content (e.g., 'js,css').
539
- wait: Wait time in milliseconds after page load (max 100000).
540
- wait_for: CSS selector to wait for before returning.
541
- headers: Custom request headers as list of {name, value} dicts.
542
- cookies: Custom cookies as list of {name, value} dicts.
543
- extra_params: Additional parameters to pass through.
544
-
545
- Example:
546
- >>> req = UniversalScrapeRequest(
547
- ... url="https://example.com",
548
- ... js_render=True,
549
- ... output_format="html",
550
- ... country="us",
551
- ... wait=5000,
552
- ... wait_for=".content"
553
- ... )
554
- >>> payload = req.to_payload()
555
- """
556
-
557
- url: str
558
- js_render: bool = False
559
- output_format: str = "html" # 'html' or 'png'
560
- country: Optional[str] = None
561
- block_resources: Optional[str] = None # e.g., 'script', 'image', 'script,image'
562
- clean_content: Optional[str] = None # e.g., 'js', 'css', 'js,css'
563
- wait: Optional[int] = None # Milliseconds, max 100000
564
- wait_for: Optional[str] = None # CSS selector
565
- headers: Optional[List[Dict[str, str]]] = None # [{"name": "...", "value": "..."}]
566
- cookies: Optional[List[Dict[str, str]]] = None # [{"name": "...", "value": "..."}]
567
- extra_params: Dict[str, Any] = field(default_factory=dict) # 这个必须用 field()
568
-
569
- def __post_init__(self) -> None:
570
- """Validate configuration."""
571
- valid_formats = {"html", "png"}
572
- if self.output_format.lower() not in valid_formats:
573
- raise ValueError(
574
- f"Invalid output_format: {self.output_format}. "
575
- f"Must be one of: {', '.join(valid_formats)}"
576
- )
577
-
578
- if self.wait is not None and (self.wait < 0 or self.wait > 100000):
579
- raise ValueError(
580
- f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
581
- )
582
-
583
- def to_payload(self) -> Dict[str, Any]:
584
- """
585
- Convert to API request payload.
586
-
587
- Returns:
588
- Dictionary ready to be sent to the Universal API.
589
- """
590
- payload: Dict[str, Any] = {
591
- "url": self.url,
592
- "js_render": "True" if self.js_render else "False",
593
- "type": self.output_format.lower(),
594
- }
595
-
596
- if self.country:
597
- payload["country"] = self.country.lower()
598
-
599
- if self.block_resources:
600
- payload["block_resources"] = self.block_resources
601
-
602
- if self.clean_content:
603
- payload["clean_content"] = self.clean_content
604
-
605
- if self.wait is not None:
606
- payload["wait"] = str(self.wait)
607
-
608
- if self.wait_for:
609
- payload["wait_for"] = self.wait_for
610
-
611
- if self.headers:
612
- payload["headers"] = json.dumps(self.headers)
613
-
614
- if self.cookies:
615
- payload["cookies"] = json.dumps(self.cookies)
616
-
617
- payload.update(self.extra_params)
618
-
619
- return payload
620
-
621
-
622
- # =============================================================================
623
- # Web Scraper Task Models
624
- # =============================================================================
625
-
626
-
627
- @dataclass
628
- class ScraperTaskConfig:
629
- """
630
- Configuration for creating a Web Scraper API task.
631
-
632
- Note: You must get spider_id and spider_name from the Thordata Dashboard.
633
-
634
- Args:
635
- file_name: Name for the output file.
636
- spider_id: Spider identifier from Dashboard.
637
- spider_name: Spider name (usually the target domain).
638
- parameters: Spider-specific parameters.
639
- universal_params: Global spider settings.
640
- include_errors: Include error details in output.
641
-
642
- Example:
643
- >>> config = ScraperTaskConfig(
644
- ... file_name="youtube_data",
645
- ... spider_id="youtube_video-post_by-url",
646
- ... spider_name="youtube.com",
647
- ... parameters={
648
- ... "url": "https://youtube.com/@channel/videos",
649
- ... "num_of_posts": "50"
650
- ... }
651
- ... )
652
- >>> payload = config.to_payload()
653
- """
654
-
655
- file_name: str
656
- spider_id: str
657
- spider_name: str
658
- parameters: Dict[str, Any]
659
- universal_params: Optional[Dict[str, Any]] = None
660
- include_errors: bool = True
661
-
662
- def to_payload(self) -> Dict[str, Any]:
663
- """
664
- Convert to API request payload.
665
-
666
- Returns:
667
- Dictionary ready to be sent to the Web Scraper API.
668
- """
669
- payload: Dict[str, Any] = {
670
- "file_name": self.file_name,
671
- "spider_id": self.spider_id,
672
- "spider_name": self.spider_name,
673
- "spider_parameters": json.dumps([self.parameters]),
674
- "spider_errors": "true" if self.include_errors else "false",
675
- }
676
-
677
- if self.universal_params:
678
- payload["spider_universal"] = json.dumps(self.universal_params)
679
-
680
- return payload
681
-
682
-
683
- # =============================================================================
684
- # Response Models
685
- # =============================================================================
686
-
687
-
688
- @dataclass
689
- class TaskStatusResponse:
690
- """
691
- Response from task status check.
692
-
693
- Attributes:
694
- task_id: The task identifier.
695
- status: Current task status.
696
- progress: Optional progress percentage.
697
- message: Optional status message.
698
- """
699
-
700
- task_id: str
701
- status: str
702
- progress: Optional[int] = None
703
- message: Optional[str] = None
704
-
705
- def is_complete(self) -> bool:
706
- """Check if the task has completed (success or failure)."""
707
- terminal_statuses = {
708
- "ready",
709
- "success",
710
- "finished",
711
- "failed",
712
- "error",
713
- "cancelled",
714
- }
715
- return self.status.lower() in terminal_statuses
716
-
717
- def is_success(self) -> bool:
718
- """Check if the task completed successfully."""
719
- success_statuses = {"ready", "success", "finished"}
720
- return self.status.lower() in success_statuses
721
-
722
- def is_failed(self) -> bool:
723
- """Check if the task failed."""
724
- failure_statuses = {"failed", "error"}
725
- return self.status.lower() in failure_statuses
1
+ """
2
+ Data models for the Thordata Python SDK.
3
+
4
+ This module provides type-safe dataclasses for configuring proxy requests,
5
+ SERP API calls, and Universal Scraping requests. Using these models enables
6
+ IDE autocomplete and reduces parameter errors.
7
+
8
+ Example:
9
+ >>> from thordata.models import ProxyConfig, SerpRequest
10
+ >>>
11
+ >>> # Build a proxy URL with geo-targeting
12
+ >>> proxy = ProxyConfig(
13
+ ... username="myuser",
14
+ ... password="mypass",
15
+ ... country="us",
16
+ ... city="seattle"
17
+ ... )
18
+ >>> print(proxy.build_proxy_url())
19
+
20
+ >>> # Configure a SERP request
21
+ >>> serp = SerpRequest(query="python tutorial", engine="google", num=20)
22
+ >>> print(serp.to_payload())
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import re
29
+ import uuid
30
+ from dataclasses import dataclass, field
31
+ from enum import Enum
32
+ from typing import Any, Dict, List, Optional, Union
33
+
34
+ # =============================================================================
35
+ # Proxy Product Types
36
+ # =============================================================================
37
+
38
+
39
+ class ProxyProduct(str, Enum):
40
+ """
41
+ Thordata proxy product types with their default ports.
42
+
43
+ Each product type has a specific port on the proxy gateway.
44
+ """
45
+
46
+ RESIDENTIAL = "residential"
47
+ MOBILE = "mobile"
48
+ DATACENTER = "datacenter"
49
+ ISP = "isp"
50
+
51
+ @property
52
+ def default_port(self) -> int:
53
+ """Get the default port for this proxy product."""
54
+ ports = {
55
+ "residential": 9999,
56
+ "mobile": 5555,
57
+ "datacenter": 7777,
58
+ "isp": 6666,
59
+ }
60
+ return ports[self.value]
61
+
62
+
63
+ # =============================================================================
64
+ # Proxy Configuration Models
65
+ # =============================================================================
66
+
67
+
68
+ @dataclass
69
+ class ProxyConfig:
70
+ """
71
+ Configuration for building a Thordata proxy URL.
72
+
73
+ This class handles the complex username format required by Thordata proxies,
74
+ where geo-targeting and session parameters are embedded in the username.
75
+
76
+ Args:
77
+ username: Your Thordata account username (the part after 'td-customer-').
78
+ password: Your Thordata account password.
79
+ product: Proxy product type (residential, mobile, datacenter, isp).
80
+ host: Proxy gateway host. If None, uses default based on product.
81
+ port: Proxy gateway port. If None, uses default based on product.
82
+ protocol: Proxy protocol - 'http' or 'https'.
83
+
84
+ # Geo-targeting (all optional)
85
+ continent: Target continent code (af/an/as/eu/na/oc/sa).
86
+ country: Target country code in ISO 3166-1 alpha-2 format.
87
+ state: Target state name in lowercase.
88
+ city: Target city name in lowercase.
89
+ asn: Target ASN code (e.g., 'AS12322'). Must be used with country.
90
+
91
+ # Session control (optional)
92
+ session_id: Session identifier for sticky sessions.
93
+ session_duration: Session duration in minutes (1-90).
94
+
95
+ Example:
96
+ >>> config = ProxyConfig(
97
+ ... username="GnrqUwwu3obt",
98
+ ... password="PkCSzvt30iww",
99
+ ... product=ProxyProduct.RESIDENTIAL,
100
+ ... country="us",
101
+ ... state="california",
102
+ ... session_id="mysession123",
103
+ ... session_duration=10
104
+ ... )
105
+ >>> print(config.build_proxy_url())
106
+ http://td-customer-GnrqUwwu3obt-country-us-state-california-sessid-mysession123-sesstime-10:PkCSzvt30iww@....pr.thordata.net:9999
107
+ """
108
+
109
+ username: str
110
+ password: str
111
+ product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL
112
+ host: Optional[str] = None
113
+ port: Optional[int] = None
114
+ protocol: str = "http"
115
+
116
+ # Geo-targeting
117
+ continent: Optional[str] = None
118
+ country: Optional[str] = None
119
+ state: Optional[str] = None
120
+ city: Optional[str] = None
121
+ asn: Optional[str] = None
122
+
123
+ # Session control
124
+ session_id: Optional[str] = None
125
+ session_duration: Optional[int] = None # minutes, 1-90
126
+
127
+ # Valid continent codes
128
+ VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
129
+
130
+ def __post_init__(self) -> None:
131
+ """Validate configuration after initialization."""
132
+ # Normalize product to enum
133
+ if isinstance(self.product, str):
134
+ self.product = ProxyProduct(self.product.lower())
135
+
136
+ # Set default host and port based on product
137
+ if self.host is None:
138
+ # Set host based on product type
139
+ host_map = {
140
+ ProxyProduct.RESIDENTIAL: "t.pr.thordata.net",
141
+ ProxyProduct.DATACENTER: "dc.pr.thordata.net",
142
+ ProxyProduct.MOBILE: "m.pr.thordata.net",
143
+ ProxyProduct.ISP: "isp.pr.thordata.net",
144
+ }
145
+ self.host = host_map.get(self.product, "pr.thordata.net")
146
+
147
+ if self.port is None:
148
+ self.port = self.product.default_port
149
+
150
+ self._validate()
151
+
152
+ def _validate(self) -> None:
153
+ """Validate the proxy configuration."""
154
+ # Validate protocol
155
+ if self.protocol not in ("http", "https"):
156
+ raise ValueError(
157
+ f"Invalid protocol: {self.protocol}. Must be 'http' or 'https'."
158
+ )
159
+
160
+ # Validate session duration
161
+ if self.session_duration is not None:
162
+ if not 1 <= self.session_duration <= 90:
163
+ raise ValueError(
164
+ f"session_duration must be between 1 and 90 minutes, "
165
+ f"got {self.session_duration}"
166
+ )
167
+ if not self.session_id:
168
+ raise ValueError("session_duration requires session_id to be set")
169
+
170
+ # Validate ASN requires country
171
+ if self.asn and not self.country:
172
+ raise ValueError("ASN targeting requires country to be specified")
173
+
174
+ # Validate continent code
175
+ if self.continent and self.continent.lower() not in self.VALID_CONTINENTS:
176
+ raise ValueError(
177
+ f"Invalid continent code: {self.continent}. "
178
+ f"Must be one of: {', '.join(sorted(self.VALID_CONTINENTS))}"
179
+ )
180
+
181
+ # Validate country code format (2 letters)
182
+ if self.country and not re.match(r"^[a-zA-Z]{2}$", self.country):
183
+ raise ValueError(
184
+ f"Invalid country code: {self.country}. "
185
+ "Must be a 2-letter ISO 3166-1 alpha-2 code."
186
+ )
187
+
188
+ def build_username(self) -> str:
189
+ """
190
+ Build the complete username string with embedded parameters.
191
+
192
+ Returns:
193
+ The formatted username string for proxy authentication.
194
+ """
195
+ parts = [f"td-customer-{self.username}"]
196
+
197
+ # Add geo-targeting parameters (order matters)
198
+ if self.continent:
199
+ parts.append(f"continent-{self.continent.lower()}")
200
+
201
+ if self.country:
202
+ parts.append(f"country-{self.country.lower()}")
203
+
204
+ if self.state:
205
+ parts.append(f"state-{self.state.lower()}")
206
+
207
+ if self.city:
208
+ parts.append(f"city-{self.city.lower()}")
209
+
210
+ if self.asn:
211
+ # Ensure ASN has correct format
212
+ asn_value = self.asn.upper()
213
+ if not asn_value.startswith("AS"):
214
+ asn_value = f"AS{asn_value}"
215
+ parts.append(f"asn-{asn_value}")
216
+
217
+ # Add session parameters
218
+ if self.session_id:
219
+ parts.append(f"sessid-{self.session_id}")
220
+
221
+ if self.session_duration:
222
+ parts.append(f"sesstime-{self.session_duration}")
223
+
224
+ return "-".join(parts)
225
+
226
+ def build_proxy_url(self) -> str:
227
+ """
228
+ Build the complete proxy URL.
229
+
230
+ Returns:
231
+ The formatted proxy URL for use with requests/aiohttp.
232
+ """
233
+ username = self.build_username()
234
+ return f"{self.protocol}://{username}:{self.password}@{self.host}:{self.port}"
235
+
236
+ def to_proxies_dict(self) -> Dict[str, str]:
237
+ """
238
+ Build a proxies dict suitable for the requests library.
239
+
240
+ Returns:
241
+ Dict with 'http' and 'https' keys pointing to the proxy URL.
242
+ """
243
+ url = self.build_proxy_url()
244
+ return {"http": url, "https": url}
245
+
246
def to_aiohttp_config(self) -> tuple:
    """
    Produce the proxy settings in the form aiohttp expects.

    The credentials are not embedded in the URL; they are returned
    separately as an ``aiohttp.BasicAuth`` object whose login is the value
    of ``build_username()``.

    Returns:
        Tuple of (proxy_url, proxy_auth) for aiohttp.

    Raises:
        ImportError: If aiohttp cannot be imported.
    """
    # aiohttp is imported lazily, inside the method, and a failed import
    # is re-raised with a clearer message.
    try:
        import aiohttp
    except ImportError as e:
        raise ImportError(
            "aiohttp is required for async proxy configuration"
        ) from e

    endpoint = f"{self.protocol}://{self.host}:{self.port}"
    credentials = aiohttp.BasicAuth(
        login=self.build_username(), password=self.password
    )
    return endpoint, credentials
265
+
266
+
267
@dataclass
class StaticISPProxy:
    """
    Configuration for static ISP proxy with direct IP connection.

    Static ISP proxies connect directly to a purchased IP address,
    not through the gateway.

    Args:
        host: The static IP address you purchased.
        username: Your ISP proxy username.
        password: Your ISP proxy password.
        port: Port number (default: 6666).
        protocol: Proxy protocol - 'http' or 'https'.

    Raises:
        ValueError: If ``protocol`` is neither 'http' nor 'https'.

    Example:
        >>> proxy = StaticISPProxy(
        ...     host="xx.xxx.xxx.xxx",
        ...     username="myuser",
        ...     password="mypass"
        ... )
        >>> print(proxy.build_proxy_url())
        http://myuser:mypass@xx.xxx.xxx.xxx:6666
    """

    host: str
    username: str
    password: str
    port: int = 6666
    protocol: str = "http"

    def __post_init__(self) -> None:
        """Validate configuration."""
        if self.protocol not in ("http", "https"):
            raise ValueError(
                f"Invalid protocol: {self.protocol}. Must be 'http' or 'https'."
            )

    def build_proxy_url(self) -> str:
        """
        Build the complete proxy URL for direct connection.

        Username and password are percent-encoded so that reserved
        characters (``@``, ``:``, ``/`` ...) in the credentials cannot break
        the URL. Credentials without such characters are emitted unchanged.

        Returns:
            The formatted proxy URL.
        """
        # Local import keeps this method self-contained.
        from urllib.parse import quote

        # safe="" so that "/" is escaped too (quote() leaves it alone by default).
        username = quote(self.username, safe="")
        password = quote(self.password, safe="")
        return f"{self.protocol}://{username}:{password}@{self.host}:{self.port}"

    def to_proxies_dict(self) -> Dict[str, str]:
        """
        Build a proxies dict suitable for the requests library.

        Returns:
            Dict with 'http' and 'https' keys pointing to the proxy URL.
        """
        url = self.build_proxy_url()
        return {"http": url, "https": url}

    def to_aiohttp_config(self) -> tuple:
        """
        Get proxy configuration for aiohttp.

        Credentials are returned as a separate ``aiohttp.BasicAuth`` object
        rather than being embedded in the URL.

        Returns:
            Tuple of (proxy_url, proxy_auth) for aiohttp.

        Raises:
            ImportError: If aiohttp cannot be imported.
        """
        # aiohttp is imported lazily, inside the method.
        try:
            import aiohttp
        except ImportError as e:
            raise ImportError(
                "aiohttp is required for async proxy configuration"
            ) from e

        proxy_url = f"{self.protocol}://{self.host}:{self.port}"
        proxy_auth = aiohttp.BasicAuth(login=self.username, password=self.password)
        return proxy_url, proxy_auth

    @classmethod
    def from_env(cls) -> "StaticISPProxy":
        """
        Create StaticISPProxy from environment variables.

        Required env vars:
            - THORDATA_ISP_HOST
            - THORDATA_ISP_USERNAME
            - THORDATA_ISP_PASSWORD

        Port and protocol are not read from the environment; the class
        defaults are used.

        Returns:
            Configured StaticISPProxy instance.

        Raises:
            ValueError: If required environment variables are missing
                or empty.
        """
        import os

        host = os.getenv("THORDATA_ISP_HOST")
        username = os.getenv("THORDATA_ISP_USERNAME")
        password = os.getenv("THORDATA_ISP_PASSWORD")

        # An unset variable (None) and an empty string are both rejected.
        if not all([host, username, password]):
            raise ValueError(
                "THORDATA_ISP_HOST, THORDATA_ISP_USERNAME, and "
                "THORDATA_ISP_PASSWORD are required"
            )

        return cls(host=host, username=username, password=password)
373
+
374
+
375
@dataclass
class StickySession(ProxyConfig):
    """
    ProxyConfig variant pre-wired for sticky (same-IP) sessions.

    A sticky session keeps the same IP address for a specified duration,
    which helps multi-step operations that need a consistent exit IP.

    Args:
        duration_minutes: How long to keep the same IP (1-90 minutes).
            Copied into ``session_duration`` during initialisation.
        auto_session_id: If True and no ``session_id`` was supplied, a
            12-character hexadecimal session ID is generated.

    Example:
        >>> session = StickySession(
        ...     username="myuser",
        ...     password="mypass",
        ...     country="us",
        ...     duration_minutes=15
        ... )
        >>> # Each call to build_proxy_url() uses the same session
        >>> url = session.build_proxy_url()
    """

    duration_minutes: int = 10
    auto_session_id: bool = True

    def __post_init__(self) -> None:
        """Fill in the session fields, then run the parent's ``__post_init__``."""
        # Only generate an ID when asked to and when the caller gave none.
        needs_generated_id = self.auto_session_id and not self.session_id
        if needs_generated_id:
            self.session_id = uuid.uuid4().hex[:12]

        # duration_minutes is the user-facing knob; the parent class works
        # with session_duration.
        self.session_duration = self.duration_minutes

        # Must come last so the parent sees the populated session fields.
        super().__post_init__()
411
+
412
+
413
+ # =============================================================================
414
+ # SERP API Models
415
+ # =============================================================================
416
+
417
+
418
@dataclass
class SerpRequest:
    """
    Parameters for one SERP API request.

    Default target URLs are defined for Google, Bing, Yandex, DuckDuckGo
    and Baidu (see ``ENGINE_URLS``).

    Args:
        query: The search query string (required).
        engine: Search engine to use (default: 'google').
        num: Number of results per page (default: 10).
        start: Result offset for pagination (default: 0).

        # Localization
        country: Country code for results (sent as 'gl').
        language: Language code for interface (sent as 'hl').
        google_domain: Google domain to use (e.g., 'google.co.uk').
        countries_filter: Value sent as the 'cr' parameter.
        languages_filter: Value sent as the 'lr' parameter.

        # Geo-targeting
        location: Location name for geo-targeting.
        uule: Encoded location parameter (use with location).

        # Search type
        search_type: Type of search (images, news, shopping, videos, etc.);
            sent as 'tbm'.

        # Filters
        safe_search: Enable safe search filtering.
        time_filter: Time range filter (hour, day, week, month, year);
            sent as 'tbs'.
        no_autocorrect: Disable automatic spelling correction (nfpr).
        filter_duplicates: Enable/disable duplicate filtering.

        # Device & Rendering
        device: Device type ('desktop', 'mobile', 'tablet').
        render_js: Enable JavaScript rendering in SERP (render_js=True/False).
        no_cache: Disable internal caching (no_cache=True/False).

        # Output
        output_format: 'json' (default) or 'html'.

        # Advanced
        ludocid: Google Place ID.
        kgmid: Google Knowledge Graph ID.

        # Extra
        extra_params: Additional parameters to pass through (ibp, lsig, si, uds, ...).
            Merged last, so they override generated keys of the same name.
    """

    query: str
    engine: str = "google"
    num: int = 10
    start: int = 0

    # Localization
    country: Optional[str] = None  # sent as 'gl'
    language: Optional[str] = None  # sent as 'hl'
    google_domain: Optional[str] = None
    countries_filter: Optional[str] = None  # sent as 'cr'
    languages_filter: Optional[str] = None  # sent as 'lr'

    # Geo-targeting
    location: Optional[str] = None
    uule: Optional[str] = None  # encoded location

    # Search type
    search_type: Optional[str] = None  # sent as 'tbm' (isch, shop, nws, vid, ...)

    # Filters
    safe_search: Optional[bool] = None
    time_filter: Optional[str] = None  # sent as 'tbs' (time part)
    no_autocorrect: bool = False  # sent as 'nfpr'
    filter_duplicates: Optional[bool] = None  # sent as 'filter'

    # Device & Rendering
    device: Optional[str] = None  # 'desktop', 'mobile', 'tablet'
    render_js: Optional[bool] = None
    no_cache: Optional[bool] = None

    # Output format
    output_format: str = "json"  # 'json' or 'html'

    # Advanced Google parameters
    ludocid: Optional[str] = None  # Google Place ID
    kgmid: Optional[str] = None  # Knowledge Graph ID

    # Pass-through
    extra_params: Dict[str, Any] = field(default_factory=dict)

    # Friendly search-type names -> 'tbm' values. The raw values map to
    # themselves so either spelling is accepted.
    SEARCH_TYPE_MAP = {
        "images": "isch",
        "shopping": "shop",
        "news": "nws",
        "videos": "vid",
        "isch": "isch",
        "shop": "shop",
        "nws": "nws",
        "vid": "vid",
    }

    # Friendly time-range names -> 'tbs' values.
    TIME_FILTER_MAP = {
        "hour": "qdr:h",
        "day": "qdr:d",
        "week": "qdr:w",
        "month": "qdr:m",
        "year": "qdr:y",
    }

    # Default 'url' value per engine.
    ENGINE_URLS = {
        "google": "google.com",
        "bing": "bing.com",
        "yandex": "yandex.com",
        "duckduckgo": "duckduckgo.com",
        "baidu": "baidu.com",
    }

    def to_payload(self) -> Dict[str, Any]:
        """
        Render this request as the flat parameter dict sent to the SERP API.

        Unset optional fields are omitted. Keys are inserted in a fixed
        order, and ``extra_params`` is merged last.

        Returns:
            Dictionary ready to be sent to the SERP API.
        """
        engine_name = self.engine.lower()
        wants_json = self.output_format.lower() == "json"

        payload: Dict[str, Any] = {
            "engine": engine_name,
            "num": str(self.num),
            # json=1 requests parsed JSON, json=0 requests raw HTML.
            "json": "1" if wants_json else "0",
        }

        # Yandex takes the query under 'text'; every other engine uses 'q'.
        query_key = "text" if engine_name == "yandex" else "q"
        payload[query_key] = self.query

        # An explicit google_domain is sent both as 'google_domain' and as
        # 'url'; otherwise fall back to the engine's default URL, if known.
        if self.google_domain:
            payload["google_domain"] = self.google_domain
            payload["url"] = self.google_domain
        elif engine_name in self.ENGINE_URLS:
            payload["url"] = self.ENGINE_URLS[engine_name]

        # Pagination: only a positive offset is sent.
        if self.start > 0:
            payload["start"] = str(self.start)

        # Localization codes are lower-cased.
        for key, value in (("gl", self.country), ("hl", self.language)):
            if value:
                payload[key] = value.lower()

        # Region/language filters and geo-targeting are passed verbatim.
        verbatim = (
            ("cr", self.countries_filter),
            ("lr", self.languages_filter),
            ("location", self.location),
            ("uule", self.uule),
        )
        for key, value in verbatim:
            if value:
                payload[key] = value

        # Search type: translate friendly names, pass unknown values through.
        if self.search_type:
            kind = self.search_type.lower()
            payload["tbm"] = self.SEARCH_TYPE_MAP.get(kind, kind)

        # Filters
        if self.safe_search is not None:
            payload["safe"] = "active" if self.safe_search else "off"

        if self.time_filter:
            period = self.time_filter.lower()
            payload["tbs"] = self.TIME_FILTER_MAP.get(period, period)

        if self.no_autocorrect:
            payload["nfpr"] = "1"

        if self.filter_duplicates is not None:
            payload["filter"] = "1" if self.filter_duplicates else "0"

        # Device
        if self.device:
            payload["device"] = self.device.lower()

        # Tri-state flags: None means "do not send".
        for key, flag in (("render_js", self.render_js), ("no_cache", self.no_cache)):
            if flag is not None:
                payload[key] = "True" if flag else "False"

        # Advanced Google identifiers
        for key, value in (("ludocid", self.ludocid), ("kgmid", self.kgmid)):
            if value:
                payload[key] = value

        # Extra parameters (ibp, lsig, si, uds, etc.) are applied last.
        payload.update(self.extra_params)

        return payload
633
+
634
+
635
+ # =============================================================================
636
+ # Universal Scraper (Web Unlocker) Models
637
+ # =============================================================================
638
+
639
+
640
@dataclass
class UniversalScrapeRequest:
    """
    Configuration for a Universal Scraping API (Web Unlocker) request.

    This API bypasses anti-bot protections like Cloudflare, CAPTCHAs, etc.

    Args:
        url: Target URL to scrape (required).
        js_render: Enable JavaScript rendering with headless browser.
        output_format: Output format - 'html' or 'png' (screenshot).
        country: Country code for geo-targeting the request.
        block_resources: Block specific resources (e.g., 'script', 'image').
        clean_content: Remove JS/CSS from returned content (e.g., 'js,css').
        wait: Wait time in milliseconds after page load (max 100000).
        wait_for: CSS selector to wait for before returning.
        headers: Custom request headers as list of {name, value} dicts.
        cookies: Custom cookies as list of {name, value} dicts.
        extra_params: Additional parameters to pass through.

    Raises:
        ValueError: If ``output_format`` is not 'html' or 'png', or if
            ``wait`` is outside 0..100000.

    Example:
        >>> req = UniversalScrapeRequest(
        ...     url="https://example.com",
        ...     js_render=True,
        ...     output_format="html",
        ...     country="us",
        ...     wait=5000,
        ...     wait_for=".content"
        ... )
        >>> payload = req.to_payload()
    """

    url: str
    js_render: bool = False
    output_format: str = "html"  # 'html' or 'png'
    country: Optional[str] = None
    block_resources: Optional[str] = None  # e.g., 'script', 'image', 'script,image'
    clean_content: Optional[str] = None  # e.g., 'js', 'css', 'js,css'
    wait: Optional[int] = None  # Milliseconds, max 100000
    wait_for: Optional[str] = None  # CSS selector
    headers: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
    cookies: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
    # Mutable default: must be created through field(default_factory=...).
    extra_params: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate configuration."""
        # A tuple (not a set) so the formats are listed in a stable order
        # in the error message on every run.
        valid_formats = ("html", "png")
        if self.output_format.lower() not in valid_formats:
            raise ValueError(
                f"Invalid output_format: {self.output_format}. "
                f"Must be one of: {', '.join(valid_formats)}"
            )

        if self.wait is not None and (self.wait < 0 or self.wait > 100000):
            raise ValueError(
                f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
            )

    def to_payload(self) -> Dict[str, Any]:
        """
        Convert to API request payload.

        Unset optional fields are omitted; ``headers`` and ``cookies`` are
        JSON-encoded; ``extra_params`` is merged last and may override
        generated keys.

        Returns:
            Dictionary ready to be sent to the Universal API.
        """
        payload: Dict[str, Any] = {
            "url": self.url,
            "js_render": "True" if self.js_render else "False",
            "type": self.output_format.lower(),
        }

        if self.country:
            payload["country"] = self.country.lower()

        if self.block_resources:
            payload["block_resources"] = self.block_resources

        if self.clean_content:
            payload["clean_content"] = self.clean_content

        # "is not None" so that an explicit wait of 0 is still sent.
        if self.wait is not None:
            payload["wait"] = str(self.wait)

        if self.wait_for:
            payload["wait_for"] = self.wait_for

        if self.headers:
            payload["headers"] = json.dumps(self.headers)

        if self.cookies:
            payload["cookies"] = json.dumps(self.cookies)

        payload.update(self.extra_params)

        return payload
735
+
736
+
737
+ # =============================================================================
738
+ # Web Scraper Task Models
739
+ # =============================================================================
740
+
741
+
742
@dataclass
class ScraperTaskConfig:
    """
    Settings used to create a Web Scraper API task.

    Note: You must get spider_id and spider_name from the Thordata Dashboard.

    Args:
        file_name: Name for the output file.
        spider_id: Spider identifier from Dashboard.
        spider_name: Spider name (usually the target domain).
        parameters: Spider-specific parameters.
        universal_params: Global spider settings.
        include_errors: Include error details in output.

    Example:
        >>> config = ScraperTaskConfig(
        ...     file_name="youtube_data",
        ...     spider_id="youtube_video-post_by-url",
        ...     spider_name="youtube.com",
        ...     parameters={
        ...         "url": "https://youtube.com/@channel/videos",
        ...         "num_of_posts": "50"
        ...     }
        ... )
        >>> payload = config.to_payload()
    """

    file_name: str
    spider_id: str
    spider_name: str
    parameters: Dict[str, Any]
    universal_params: Optional[Dict[str, Any]] = None
    include_errors: bool = True

    def to_payload(self) -> Dict[str, Any]:
        """
        Serialise the task configuration into the API's request payload.

        ``parameters`` is wrapped in a one-element list and JSON-encoded;
        ``universal_params`` is JSON-encoded and included only when set.

        Returns:
            Dictionary ready to be sent to the Web Scraper API.
        """
        errors_flag = "true" if self.include_errors else "false"
        encoded_parameters = json.dumps([self.parameters])

        body: Dict[str, Any] = {
            "file_name": self.file_name,
            "spider_id": self.spider_id,
            "spider_name": self.spider_name,
            "spider_parameters": encoded_parameters,
            "spider_errors": errors_flag,
        }

        # Nothing more to add when no global spider settings were given.
        if not self.universal_params:
            return body

        body["spider_universal"] = json.dumps(self.universal_params)
        return body
796
+
797
+
798
+ # =============================================================================
799
+ # Response Models
800
+ # =============================================================================
801
+
802
+
803
@dataclass
class TaskStatusResponse:
    """
    Result of a task status check.

    Status comparisons in the helper methods are case-insensitive.

    Attributes:
        task_id: The task identifier.
        status: Current task status.
        progress: Optional progress percentage.
        message: Optional status message.
    """

    task_id: str
    status: str
    progress: Optional[int] = None
    message: Optional[str] = None

    def is_complete(self) -> bool:
        """Return True when the status is terminal (success, failure or cancelled)."""
        current = self.status.lower()
        return current in (
            "ready",
            "success",
            "finished",
            "failed",
            "error",
            "cancelled",
        )

    def is_success(self) -> bool:
        """Return True when the status is 'ready', 'success' or 'finished'."""
        current = self.status.lower()
        return current in ("ready", "success", "finished")

    def is_failed(self) -> bool:
        """Return True when the status is 'failed' or 'error'."""
        current = self.status.lower()
        return current in ("failed", "error")