thordata-sdk 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/models.py ADDED
@@ -0,0 +1,698 @@
+"""
+Data models for the Thordata Python SDK.
+
+This module provides type-safe dataclasses for configuring proxy requests,
+SERP API calls, and Universal Scraping requests. Using these models enables
+IDE autocomplete and reduces parameter errors.
+
+Example:
+    >>> from thordata.models import ProxyConfig, SerpRequest
+    >>>
+    >>> # Build a proxy URL with geo-targeting
+    >>> proxy = ProxyConfig(
+    ...     username="myuser",
+    ...     password="mypass",
+    ...     country="us",
+    ...     city="seattle"
+    ... )
+    >>> print(proxy.build_proxy_url())
+
+    >>> # Configure a SERP request
+    >>> serp = SerpRequest(query="python tutorial", engine="google", num=20)
+    >>> print(serp.to_payload())
+"""
+
+from __future__ import annotations
+
+import re
+import uuid
+import json
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any, List, Union
+from enum import Enum
+
+
+# =============================================================================
+# Proxy Product Types
+# =============================================================================
+
+class ProxyProduct(str, Enum):
+    """
+    Thordata proxy product types with their default ports.
+
+    Each product type has a specific port on the proxy gateway.
+    """
+    RESIDENTIAL = "residential"
+    MOBILE = "mobile"
+    DATACENTER = "datacenter"
+    ISP = "isp"
+
+    @property
+    def default_port(self) -> int:
+        """Get the default port for this proxy product."""
+        ports = {
+            "residential": 9999,
+            "mobile": 5555,
+            "datacenter": 7777,
+            "isp": 6666,
+        }
+        return ports[self.value]
+
+
+# =============================================================================
+# Proxy Configuration Models
+# =============================================================================
+
+@dataclass
+class ProxyConfig:
+    """
+    Configuration for building a Thordata proxy URL.
+
+    This class handles the complex username format required by Thordata proxies,
+    where geo-targeting and session parameters are embedded in the username.
+
+    Args:
+        username: Your Thordata account username (the part after 'td-customer-').
+        password: Your Thordata account password.
+        product: Proxy product type (residential, mobile, datacenter, isp).
+        host: Proxy gateway host. If None, uses default based on product.
+        port: Proxy gateway port. If None, uses default based on product.
+        protocol: Proxy protocol - 'http' or 'https'.
+
+        # Geo-targeting (all optional)
+        continent: Target continent code (af/an/as/eu/na/oc/sa).
+        country: Target country code in ISO 3166-1 alpha-2 format.
+        state: Target state name in lowercase.
+        city: Target city name in lowercase.
+        asn: Target ASN code (e.g., 'AS12322'). Must be used with country.
+
+        # Session control (optional)
+        session_id: Session identifier for sticky sessions.
+        session_duration: Session duration in minutes (1-90).
+
+    Example:
+        >>> config = ProxyConfig(
+        ...     username="GnrqUwwu3obt",
+        ...     password="PkCSzvt30iww",
+        ...     product=ProxyProduct.RESIDENTIAL,
+        ...     country="us",
+        ...     state="california",
+        ...     session_id="mysession123",
+        ...     session_duration=10
+        ... )
+        >>> print(config.build_proxy_url())
+        http://td-customer-GnrqUwwu3obt-country-us-state-california-sessid-mysession123-sesstime-10:PkCSzvt30iww@....pr.thordata.net:9999
+    """
+
+    username: str
+    password: str
+    product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL
+    host: Optional[str] = None
+    port: Optional[int] = None
+    protocol: str = "http"
+
+    # Geo-targeting
+    continent: Optional[str] = None
+    country: Optional[str] = None
+    state: Optional[str] = None
+    city: Optional[str] = None
+    asn: Optional[str] = None
+
+    # Session control
+    session_id: Optional[str] = None
+    session_duration: Optional[int] = None  # minutes, 1-90
+
+    # Valid continent codes
+    VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
+
+    def __post_init__(self) -> None:
+        """Validate configuration after initialization."""
+        # Normalize product to enum
+        if isinstance(self.product, str):
+            self.product = ProxyProduct(self.product.lower())
+
+        # Set default host and port based on product
+        if self.host is None:
+            # No custom host was provided:
+            # fall back to the generic gateway host
+            self.host = "pr.thordata.net"
+
+        if self.port is None:
+            self.port = self.product.default_port
+
+        self._validate()
+
+    def _validate(self) -> None:
+        """Validate the proxy configuration."""
+        # Validate protocol
+        if self.protocol not in ("http", "https"):
+            raise ValueError(
+                f"Invalid protocol: {self.protocol}. Must be 'http' or 'https'."
+            )
+
+        # Validate session duration
+        if self.session_duration is not None:
+            if not 1 <= self.session_duration <= 90:
+                raise ValueError(
+                    f"session_duration must be between 1 and 90 minutes, "
+                    f"got {self.session_duration}"
+                )
+            if not self.session_id:
+                raise ValueError("session_duration requires session_id to be set")
+
+        # Validate ASN requires country
+        if self.asn and not self.country:
+            raise ValueError("ASN targeting requires country to be specified")
+
+        # Validate continent code
+        if self.continent and self.continent.lower() not in self.VALID_CONTINENTS:
+            raise ValueError(
+                f"Invalid continent code: {self.continent}. "
+                f"Must be one of: {', '.join(sorted(self.VALID_CONTINENTS))}"
+            )
+
+        # Validate country code format (2 letters)
+        if self.country and not re.match(r"^[a-zA-Z]{2}$", self.country):
+            raise ValueError(
+                f"Invalid country code: {self.country}. "
+                "Must be a 2-letter ISO 3166-1 alpha-2 code."
+            )
+
+    def build_username(self) -> str:
+        """
+        Build the complete username string with embedded parameters.
+
+        Returns:
+            The formatted username string for proxy authentication.
+        """
+        parts = [f"td-customer-{self.username}"]
+
+        # Add geo-targeting parameters (order matters)
+        if self.continent:
+            parts.append(f"continent-{self.continent.lower()}")
+
+        if self.country:
+            parts.append(f"country-{self.country.lower()}")
+
+        if self.state:
+            parts.append(f"state-{self.state.lower()}")
+
+        if self.city:
+            parts.append(f"city-{self.city.lower()}")
+
+        if self.asn:
+            # Ensure ASN has correct format
+            asn_value = self.asn.upper()
+            if not asn_value.startswith("AS"):
+                asn_value = f"AS{asn_value}"
+            parts.append(f"asn-{asn_value}")
+
+        # Add session parameters
+        if self.session_id:
+            parts.append(f"sessid-{self.session_id}")
+
+        if self.session_duration:
+            parts.append(f"sesstime-{self.session_duration}")
+
+        return "-".join(parts)
+
+    def build_proxy_url(self) -> str:
+        """
+        Build the complete proxy URL.
+
+        Returns:
+            The formatted proxy URL for use with requests/aiohttp.
+        """
+        username = self.build_username()
+        return f"{self.protocol}://{username}:{self.password}@{self.host}:{self.port}"
+
+    def to_proxies_dict(self) -> Dict[str, str]:
+        """
+        Build a proxies dict suitable for the requests library.
+
+        Returns:
+            Dict with 'http' and 'https' keys pointing to the proxy URL.
+        """
+        url = self.build_proxy_url()
+        return {"http": url, "https": url}
+
+    def to_aiohttp_config(self) -> tuple:
+        """
+        Get proxy configuration for aiohttp.
+
+        Returns:
+            Tuple of (proxy_url, proxy_auth) for aiohttp.
+        """
+        try:
+            import aiohttp
+            proxy_url = f"{self.protocol}://{self.host}:{self.port}"
+            proxy_auth = aiohttp.BasicAuth(
+                login=self.build_username(),
+                password=self.password
+            )
+            return proxy_url, proxy_auth
+        except ImportError:
+            raise ImportError("aiohttp is required for async proxy configuration")
+
+
+@dataclass
+class StickySession(ProxyConfig):
+    """
+    Convenience class for creating sticky session proxy configurations.
+
+    A sticky session keeps the same IP address for a specified duration,
+    useful for multi-step operations that require IP consistency.
+
+    Args:
+        duration_minutes: How long to keep the same IP (1-90 minutes).
+        auto_session_id: If True, automatically generates a unique session ID.
+
+    Example:
+        >>> session = StickySession(
+        ...     username="myuser",
+        ...     password="mypass",
+        ...     country="us",
+        ...     duration_minutes=15
+        ... )
+        >>> # Each call to build_proxy_url() uses the same session
+        >>> url = session.build_proxy_url()
+    """
+
+    duration_minutes: int = 10
+    auto_session_id: bool = True
+
+    def __post_init__(self) -> None:
+        # Auto-generate session ID if requested and not provided
+        if self.auto_session_id and not self.session_id:
+            self.session_id = uuid.uuid4().hex[:12]
+
+        # Set session_duration from duration_minutes
+        self.session_duration = self.duration_minutes
+
+        # Call parent post_init
+        super().__post_init__()
+
+
+# =============================================================================
+# SERP API Models
+# =============================================================================
+
+@dataclass
+class SerpRequest:
+    """
+    Configuration for a SERP API request.
+
+    Supports Google, Bing, Yandex, DuckDuckGo, and Baidu search engines.
+
+    Args:
+        query: The search query string (required).
+        engine: Search engine to use (default: 'google').
+        num: Number of results per page (default: 10).
+        start: Result offset for pagination (default: 0).
+
+        # Localization
+        country: Country code for results (gl parameter for Google).
+        language: Language code for interface (hl parameter for Google).
+        google_domain: Google domain to use (e.g., 'google.co.uk').
+
+        # Geo-targeting
+        location: Location name for geo-targeting.
+        uule: Encoded location parameter (use with location).
+
+        # Search type
+        search_type: Type of search (images, news, shopping, videos, etc.).
+
+        # Filters
+        safe_search: Enable safe search filtering.
+        time_filter: Time range filter (hour, day, week, month, year).
+
+        # Advanced
+        device: Device type (desktop, mobile, tablet).
+        extra_params: Additional parameters to pass through.
+
+    Example:
+        >>> req = SerpRequest(
+        ...     query="python programming",
+        ...     engine="google",
+        ...     num=20,
+        ...     country="us",
+        ...     language="en",
+        ...     search_type="news"
+        ... )
+        >>> payload = req.to_payload()
+    """
+
+    query: str
+    engine: str = "google"
+    num: int = 10
+    start: int = 0
+
+    # Localization
+    country: Optional[str] = None  # 'gl' for Google
+    language: Optional[str] = None  # 'hl' for Google
+    google_domain: Optional[str] = None
+    countries_filter: Optional[str] = None  # 'cr' parameter
+    languages_filter: Optional[str] = None  # 'lr' parameter
+
+    # Geo-targeting
+    location: Optional[str] = None
+    uule: Optional[str] = None  # Encoded location
+
+    # Search type
+    search_type: Optional[str] = None  # tbm parameter (isch, shop, nws, vid)
+
+    # Filters
+    safe_search: Optional[bool] = None
+    time_filter: Optional[str] = None  # tbs parameter
+    no_autocorrect: bool = False  # nfpr parameter
+    filter_duplicates: Optional[bool] = None
+
+    # Advanced
+    device: Optional[str] = None
+
+    # Advanced Google parameters
+    ludocid: Optional[str] = None  # Google Place ID
+    kgmid: Optional[str] = None  # Knowledge Graph ID
+
+    # Pass-through
+    extra_params: Dict[str, Any] = field(default_factory=dict)
+
+    # Search type mappings for tbm parameter
+    SEARCH_TYPE_MAP = {
+        "images": "isch",
+        "shopping": "shop",
+        "news": "nws",
+        "videos": "vid",
+        # Direct values also work
+        "isch": "isch",
+        "shop": "shop",
+        "nws": "nws",
+        "vid": "vid",
+    }
+
+    # Time filter mappings for tbs parameter
+    TIME_FILTER_MAP = {
+        "hour": "qdr:h",
+        "day": "qdr:d",
+        "week": "qdr:w",
+        "month": "qdr:m",
+        "year": "qdr:y",
+    }
+
+    # Engine URL defaults
+    ENGINE_URLS = {
+        "google": "google.com",
+        "bing": "bing.com",
+        "yandex": "yandex.com",
+        "duckduckgo": "duckduckgo.com",
+        "baidu": "baidu.com",
+    }
+
+    def to_payload(self) -> Dict[str, Any]:
+        """
+        Convert to API request payload.
+
+        Returns:
+            Dictionary ready to be sent to the SERP API.
+        """
+        engine = self.engine.lower()
+
+        payload: Dict[str, Any] = {
+            "engine": engine,
+            "num": str(self.num),
+            "json": "1",
+        }
+
+        # Handle query parameter (Yandex uses 'text', others use 'q')
+        if engine == "yandex":
+            payload["text"] = self.query
+        else:
+            payload["q"] = self.query
+
+        # Set URL based on google_domain or engine default
+        if self.google_domain:
+            payload["url"] = self.google_domain
+        elif engine in self.ENGINE_URLS:
+            payload["url"] = self.ENGINE_URLS[engine]
+
+        # Pagination
+        if self.start > 0:
+            payload["start"] = str(self.start)
+
+        # Localization
+        if self.country:
+            payload["gl"] = self.country.lower()
+
+        if self.language:
+            payload["hl"] = self.language.lower()
+
+        if self.countries_filter:
+            payload["cr"] = self.countries_filter
+
+        if self.languages_filter:
+            payload["lr"] = self.languages_filter
+
+        # Geo-targeting
+        if self.location:
+            payload["location"] = self.location
+
+        if self.uule:
+            payload["uule"] = self.uule
+
+        # Search type
+        if self.search_type:
+            search_type_lower = self.search_type.lower()
+            tbm_value = self.SEARCH_TYPE_MAP.get(search_type_lower, search_type_lower)
+            payload["tbm"] = tbm_value
+
+        # Filters
+        if self.safe_search is not None:
+            payload["safe"] = "active" if self.safe_search else "off"
+
+        if self.time_filter:
+            time_lower = self.time_filter.lower()
+            tbs_value = self.TIME_FILTER_MAP.get(time_lower, time_lower)
+            payload["tbs"] = tbs_value
+
+        if self.no_autocorrect:
+            payload["nfpr"] = "1"
+
+        if self.filter_duplicates is not None:
+            payload["filter"] = "1" if self.filter_duplicates else "0"
+
+        # Device
+        if self.device:
+            payload["device"] = self.device.lower()
+
+        # Advanced Google parameters
+        if self.ludocid:
+            payload["ludocid"] = self.ludocid
+
+        if self.kgmid:
+            payload["kgmid"] = self.kgmid
+
+        # Extra parameters
+        payload.update(self.extra_params)
+
+        return payload
+
+
+# =============================================================================
+# Universal Scraper (Web Unlocker) Models
+# =============================================================================
+
+@dataclass
+class UniversalScrapeRequest:
+    """
+    Configuration for a Universal Scraping API (Web Unlocker) request.
+
+    This API bypasses anti-bot protections like Cloudflare, CAPTCHAs, etc.
+
+    Args:
+        url: Target URL to scrape (required).
+        js_render: Enable JavaScript rendering with headless browser.
+        output_format: Output format - 'html' or 'png' (screenshot).
+        country: Country code for geo-targeting the request.
+        block_resources: Block specific resources (e.g., 'script', 'image').
+        clean_content: Remove JS/CSS from returned content (e.g., 'js,css').
+        wait: Wait time in milliseconds after page load (max 100000).
+        wait_for: CSS selector to wait for before returning.
+        headers: Custom request headers as list of {name, value} dicts.
+        cookies: Custom cookies as list of {name, value} dicts.
+        extra_params: Additional parameters to pass through.
+
+    Example:
+        >>> req = UniversalScrapeRequest(
+        ...     url="https://example.com",
+        ...     js_render=True,
+        ...     output_format="html",
+        ...     country="us",
+        ...     wait=5000,
+        ...     wait_for=".content"
+        ... )
+        >>> payload = req.to_payload()
+    """
+
+    url: str
+    js_render: bool = False
+    output_format: str = "html"  # 'html' or 'png'
+    country: Optional[str] = None
+    block_resources: Optional[str] = None  # e.g., 'script', 'image', 'script,image'
+    clean_content: Optional[str] = None  # e.g., 'js', 'css', 'js,css'
+    wait: Optional[int] = None  # Milliseconds, max 100000
+    wait_for: Optional[str] = None  # CSS selector
+    headers: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
+    cookies: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
+    extra_params: Dict[str, Any] = field(default_factory=dict)  # mutable default must use field()
+
+    def __post_init__(self) -> None:
+        """Validate configuration."""
+        valid_formats = {"html", "png"}
+        if self.output_format.lower() not in valid_formats:
+            raise ValueError(
+                f"Invalid output_format: {self.output_format}. "
+                f"Must be one of: {', '.join(valid_formats)}"
+            )
+
+        if self.wait is not None and (self.wait < 0 or self.wait > 100000):
+            raise ValueError(
+                f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
+            )
+
+    def to_payload(self) -> Dict[str, Any]:
+        """
+        Convert to API request payload.
+
+        Returns:
+            Dictionary ready to be sent to the Universal API.
+        """
+        payload: Dict[str, Any] = {
+            "url": self.url,
+            "js_render": "True" if self.js_render else "False",
+            "type": self.output_format.lower(),
+        }
+
+        if self.country:
+            payload["country"] = self.country.lower()
+
+        if self.block_resources:
+            payload["block_resources"] = self.block_resources
+
+        if self.clean_content:
+            payload["clean_content"] = self.clean_content
+
+        if self.wait is not None:
+            payload["wait"] = str(self.wait)
+
+        if self.wait_for:
+            payload["wait_for"] = self.wait_for
+
+        if self.headers:
+            payload["headers"] = json.dumps(self.headers)
+
+        if self.cookies:
+            payload["cookies"] = json.dumps(self.cookies)
+
+        payload.update(self.extra_params)
+
+        return payload
+
+
+# =============================================================================
+# Web Scraper Task Models
+# =============================================================================
+
+@dataclass
+class ScraperTaskConfig:
+    """
+    Configuration for creating a Web Scraper API task.
+
+    Note: You must get spider_id and spider_name from the Thordata Dashboard.
+
+    Args:
+        file_name: Name for the output file.
+        spider_id: Spider identifier from Dashboard.
+        spider_name: Spider name (usually the target domain).
+        parameters: Spider-specific parameters.
+        universal_params: Global spider settings.
+        include_errors: Include error details in output.
+
+    Example:
+        >>> config = ScraperTaskConfig(
+        ...     file_name="youtube_data",
+        ...     spider_id="youtube_video-post_by-url",
+        ...     spider_name="youtube.com",
+        ...     parameters={
+        ...         "url": "https://youtube.com/@channel/videos",
+        ...         "num_of_posts": "50"
+        ...     }
+        ... )
+        >>> payload = config.to_payload()
+    """
+
+    file_name: str
+    spider_id: str
+    spider_name: str
+    parameters: Dict[str, Any]
+    universal_params: Optional[Dict[str, Any]] = None
+    include_errors: bool = True
+
+    def to_payload(self) -> Dict[str, Any]:
+        """
+        Convert to API request payload.
+
+        Returns:
+            Dictionary ready to be sent to the Web Scraper API.
+        """
+        payload: Dict[str, Any] = {
+            "file_name": self.file_name,
+            "spider_id": self.spider_id,
+            "spider_name": self.spider_name,
+            "spider_parameters": json.dumps([self.parameters]),
+            "spider_errors": "true" if self.include_errors else "false",
+        }
+
+        if self.universal_params:
+            payload["spider_universal"] = json.dumps(self.universal_params)
+
+        return payload
+
+
+# =============================================================================
+# Response Models
+# =============================================================================
+
+@dataclass
+class TaskStatusResponse:
+    """
+    Response from task status check.
+
+    Attributes:
+        task_id: The task identifier.
+        status: Current task status.
+        progress: Optional progress percentage.
+        message: Optional status message.
+    """
+
+    task_id: str
+    status: str
+    progress: Optional[int] = None
+    message: Optional[str] = None
+
+    def is_complete(self) -> bool:
+        """Check if the task has completed (success or failure)."""
+        terminal_statuses = {
+            "ready", "success", "finished",
+            "failed", "error", "cancelled"
+        }
+        return self.status.lower() in terminal_statuses
+
+    def is_success(self) -> bool:
+        """Check if the task completed successfully."""
+        success_statuses = {"ready", "success", "finished"}
+        return self.status.lower() in success_statuses
+
+    def is_failed(self) -> bool:
+        """Check if the task failed."""
+        failure_statuses = {"failed", "error"}
+        return self.status.lower() in failure_statuses
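For orientation, a minimal usage sketch for the models introduced in this release. It assumes the thordata.models import path shown in the module docstring and the third-party requests package; the test URL and credentials below are placeholders, not values taken from this diff.

    import requests

    from thordata.models import ProxyConfig, SerpRequest

    # Route a request through a residential proxy with country targeting
    # (placeholder credentials).
    proxy = ProxyConfig(username="myuser", password="mypass", country="us")
    response = requests.get(
        "https://httpbin.org/ip",  # placeholder test URL
        proxies=proxy.to_proxies_dict(),
        timeout=30,
    )
    print(response.json())

    # Build a SERP API payload; actually sending it to the SERP endpoint is
    # handled by the SDK's client classes, which are outside this file.
    serp = SerpRequest(query="python tutorial", num=20, country="us", language="en")
    print(serp.to_payload())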