thordata-sdk 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/retry.py CHANGED
@@ -186,7 +186,7 @@ def with_retry(
186
186
  if isinstance(e, ThordataRateLimitError) and e.retry_after:
187
187
  delay = max(delay, e.retry_after)
188
188
 
189
- logger.warning(
189
+ logger.info(
190
190
  f"Retry attempt {attempt + 1}/{config.max_retries} "
191
191
  f"after {delay:.2f}s due to: {e}"
192
192
  )
@@ -0,0 +1,38 @@
1
+ """
2
+ Web Scraper Tool Registry.
3
+ High-level abstractions for specific scraping targets.
4
+ """
5
+
6
+ from .base import ToolRequest, VideoToolRequest
7
+ from .code import GitHub
8
+ from .ecommerce import Amazon, Walmart, eBay
9
+ from .professional import Crunchbase, Glassdoor, Indeed
10
+ from .search import GoogleMaps, GooglePlay, GoogleShopping
11
+ from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
12
+ from .travel import Airbnb, Booking, Zillow
13
+ from .video import YouTube
14
+
15
# Public API of the tools registry, grouped by vertical.
# Order is preserved from the original declaration.
__all__ = [
    # Base request types
    "ToolRequest",
    "VideoToolRequest",
    # E-commerce
    "Amazon",
    "eBay",
    "Walmart",
    # Search / maps / app stores
    "GoogleMaps",
    "GoogleShopping",
    "GooglePlay",
    # Social media
    "TikTok",
    "Facebook",
    "Instagram",
    "Twitter",
    "LinkedIn",
    "Reddit",
    # Video
    "YouTube",
    # Code hosting
    "GitHub",
    # Professional / business data
    "Indeed",
    "Glassdoor",
    "Crunchbase",
    # Travel & real estate
    "Booking",
    "Zillow",
    "Airbnb",
]
thordata/tools/base.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ Base classes for Web Scraper Tools.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import asdict, dataclass
8
+ from typing import Any, ClassVar
9
+
10
+
11
@dataclass
class ToolRequest:
    """Base request payload for standard Web Scraper tools.

    Subclasses declare which spider handles them via the two
    class-level constants below, and add their own dataclass fields;
    those fields become the task parameters sent to the API.
    """

    # Subclasses are expected to provide concrete values for these.
    SPIDER_ID: ClassVar[str]
    SPIDER_NAME: ClassVar[str]

    def to_task_parameters(self) -> dict[str, Any]:
        """Serialize the dataclass fields into an API parameter dict.

        Unset (``None``) values are dropped, as is the internal
        ``common_settings`` field used by video tools.
        """
        params: dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value is None or key == "common_settings":
                continue
            params[key] = value
        return params

    def get_spider_id(self) -> str:
        """Return the spider identifier declared by the subclass."""
        return self.SPIDER_ID

    def get_spider_name(self) -> str:
        """Return the spider (site) name declared by the subclass."""
        return self.SPIDER_NAME
33
+
34
+
35
@dataclass
class VideoToolRequest(ToolRequest):
    """Marker base for Video/Audio tools served by the /video_builder endpoint.

    Concrete subclasses must declare a ``common_settings`` field; that
    field is excluded from ``to_task_parameters`` output by the base class.
    """
thordata/tools/code.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ Code Repository Scraper Tools (GitHub, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
12
class GitHub:
    """Namespace grouping the GitHub scraper request types."""

    @dataclass
    class Repository(ToolRequest):
        """Github Repository Scraper by Repo URL"""

        SPIDER_ID = "github_repository_by-repo-url"
        SPIDER_NAME = "github.com"

        # Full URL of the repository to scrape.
        repo_url: str

    @dataclass
    class RepositoryBySearchUrl(ToolRequest):
        """Github Repository Scraper by Search URL"""

        SPIDER_ID = "github_repository_by-search-url"
        SPIDER_NAME = "github.com"

        search_url: str
        # Optional pagination / result-count limits.
        page_turning: int | None = None
        max_num: int | None = None

    @dataclass
    class RepositoryByUrl(ToolRequest):
        """Github Repository Scraper by URL"""

        SPIDER_ID = "github_repository_by-url"
        SPIDER_NAME = "github.com"

        url: str
@@ -0,0 +1,251 @@
1
+ """
2
+ E-Commerce Scraper Tools (Amazon, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
12
class Amazon:
    """Namespace grouping the Amazon scraper request types."""

    # --- Product Details (5 methods) ---

    @dataclass
    class ProductByAsin(ToolRequest):
        """Amazon Product Details Scraper by ASIN."""

        SPIDER_ID = "amazon_product_by-asin"
        SPIDER_NAME = "amazon.com"

        asin: str
        domain: str = "amazon.com"

    # Backward compatible alias
    Product = ProductByAsin

    @dataclass
    class ProductByUrl(ToolRequest):
        """Amazon Product Details Scraper by URL."""

        SPIDER_ID = "amazon_product_by-url"
        SPIDER_NAME = "amazon.com"

        url: str
        zip_code: str | None = None

    @dataclass
    class ProductByKeywords(ToolRequest):
        """Amazon Product Details Scraper by Keywords."""

        SPIDER_ID = "amazon_product_by-keywords"
        SPIDER_NAME = "amazon.com"

        keyword: str
        page_turning: int | None = None
        lowest_price: float | None = None
        highest_price: float | None = None

    @dataclass
    class ProductByCategoryUrl(ToolRequest):
        """Amazon Product Details Scraper by Category URL."""

        SPIDER_ID = "amazon_product_by-category-url"
        SPIDER_NAME = "amazon.com"

        url: str
        sort_by: str | None = None
        page_turning: int | None = None

    @dataclass
    class ProductByBestSellers(ToolRequest):
        """Amazon Product Details Scraper by Best Sellers URL."""

        SPIDER_ID = "amazon_product_by-best-sellers"
        SPIDER_NAME = "amazon.com"

        url: str
        page_turning: int | None = None

    # --- Other Amazon Tools ---

    @dataclass
    class GlobalProductByUrl(ToolRequest):
        """Amazon Global Product Details Scraper by URL"""

        SPIDER_ID = "amazon_global-product_by-url"
        SPIDER_NAME = "amazon.com"

        url: str

    # Backward compatible alias
    GlobalProduct = GlobalProductByUrl

    @dataclass
    class GlobalProductByCategoryUrl(ToolRequest):
        """Amazon Global Product Details Scraper by Category URL"""

        SPIDER_ID = "amazon_global-product_by-category-url"
        SPIDER_NAME = "amazon.com"

        url: str
        sort_by: str | None = None
        get_sponsored: str | None = None
        maximum: int | None = None

    @dataclass
    class GlobalProductBySellerUrl(ToolRequest):
        """Amazon Global Product Details Scraper by Seller URL"""

        SPIDER_ID = "amazon_global-product_by-seller-url"
        SPIDER_NAME = "amazon.com"

        url: str
        maximum: int | None = None

    @dataclass
    class GlobalProductByKeywords(ToolRequest):
        """Amazon Global Product Details Scraper by Keywords"""

        SPIDER_ID = "amazon_global-product_by-keywords"
        SPIDER_NAME = "amazon.com"

        keyword: str
        domain: str = "https://www.amazon.com"
        lowest_price: str | None = None
        highest_price: str | None = None
        page_turning: int | None = None

    @dataclass
    class GlobalProductByKeywordsBrand(ToolRequest):
        """Amazon Global Product Details Scraper by Keywords and Brand"""

        SPIDER_ID = "amazon_global-product_by-keywords-brand"
        SPIDER_NAME = "amazon.com"

        keyword: str
        brands: str
        page_turning: int | None = None

    @dataclass
    class Review(ToolRequest):
        """Amazon Product Review Scraper"""

        SPIDER_ID = "amazon_comment_by-url"
        SPIDER_NAME = "amazon.com"

        url: str
        page_turning: int = 1

    @dataclass
    class Seller(ToolRequest):
        """Amazon Seller Information Scraper"""

        SPIDER_ID = "amazon_seller_by-url"
        SPIDER_NAME = "amazon.com"

        url: str

    @dataclass
    class Search(ToolRequest):
        """Amazon Product Listing Scraper"""

        SPIDER_ID = "amazon_product-list_by-keywords-domain"
        SPIDER_NAME = "amazon.com"

        keyword: str
        domain: str = "https://www.amazon.com/"
        page_turning: int = 1
161
+
162
+
163
class eBay:
    """Namespace grouping the eBay scraper request types."""

    @dataclass
    class ProductByUrl(ToolRequest):
        """eBay Information Scraper by URL"""

        SPIDER_ID = "ebay_ebay_by-url"
        SPIDER_NAME = "ebay.com"

        url: str

    @dataclass
    class ProductByCategoryUrl(ToolRequest):
        """eBay Information Scraper by Category URL"""

        SPIDER_ID = "ebay_ebay_by-category-url"
        SPIDER_NAME = "ebay.com"

        url: str
        count: str | None = None

    @dataclass
    class ProductByKeywords(ToolRequest):
        """eBay Information Scraper by Keywords"""

        SPIDER_ID = "ebay_ebay_by-keywords"
        SPIDER_NAME = "ebay.com"

        keywords: str
        count: str | None = None

    @dataclass
    class ProductByListUrl(ToolRequest):
        """eBay Information Scraper by List URL"""

        SPIDER_ID = "ebay_ebay_by-listurl"
        SPIDER_NAME = "ebay.com"

        url: str
        count: str | None = None
200
+
201
+
202
class Walmart:
    """Namespace grouping the Walmart scraper request types."""

    @dataclass
    class ProductByUrl(ToolRequest):
        """Walmart Product Information Scraper by URL"""

        SPIDER_ID = "walmart_product_by-url"
        SPIDER_NAME = "walmart.com"

        url: str
        all_variations: str | None = None

    @dataclass
    class ProductByCategoryUrl(ToolRequest):
        """Walmart Product Information Scraper by Category URL"""

        SPIDER_ID = "walmart_product_by-category-url"
        SPIDER_NAME = "walmart.com"

        category_url: str
        all_variations: str | None = None
        page_turning: int | None = None

    @dataclass
    class ProductBySku(ToolRequest):
        """Walmart Product Information Scraper by SKU"""

        SPIDER_ID = "walmart_product_by-sku"
        SPIDER_NAME = "walmart.com"

        sku: str
        all_variations: str | None = None

    @dataclass
    class ProductByKeywords(ToolRequest):
        """Walmart Product Information Scraper by Keywords"""

        SPIDER_ID = "walmart_product_by-keywords"
        SPIDER_NAME = "walmart.com"

        keyword: str
        domain: str = "https://www.walmart.com/"
        all_variations: str | None = None
        page_turning: int | None = None

    @dataclass
    class ProductByZipcodes(ToolRequest):
        """Walmart Product Information Scraper by Zipcodes"""

        SPIDER_ID = "walmart_product_by-zipcodes"
        SPIDER_NAME = "walmart.com"

        url: str
        zip_code: str | None = None
@@ -0,0 +1,155 @@
1
+ """
2
+ Professional Platform Scraper Tools (Indeed, Glassdoor, Crunchbase, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
12
class Indeed:
    """Namespace grouping the Indeed scraper request types."""

    @dataclass
    class JobByUrl(ToolRequest):
        """Indeed Job Listings Scraper by Job URL"""

        SPIDER_ID = "indeed_job-listings_by-job-url"
        SPIDER_NAME = "indeed.com"

        job_url: str

    @dataclass
    class JobByKeyword(ToolRequest):
        """Indeed Job Listings Scraper by Keyword"""

        SPIDER_ID = "indeed_job-listings_by-keyword"
        SPIDER_NAME = "indeed.com"

        keyword: str
        location: str
        # Optional search filters.
        country: str | None = None
        domain: str | None = None
        date_posted: str | None = None
        posted_by: str | None = None
        pay: str | None = None
        location_radius: str | None = None

    @dataclass
    class CompanyByListUrl(ToolRequest):
        """Indeed Companies Info Scraper by Company List URL"""

        SPIDER_ID = "indeed_companies-info_by-company-list-url"
        SPIDER_NAME = "indeed.com"

        company_list_url: str

    @dataclass
    class CompanyByKeyword(ToolRequest):
        """Indeed Companies Info Scraper by Keyword"""

        SPIDER_ID = "indeed_companies-info_by-keyword"
        SPIDER_NAME = "indeed.com"

        keyword: str

    @dataclass
    class CompanyByIndustryAndState(ToolRequest):
        """Indeed Companies Info Scraper by Industry and State"""

        SPIDER_ID = "indeed_companies-info_by-industry-and-state"
        SPIDER_NAME = "indeed.com"

        industry: str
        state: str | None = None

    @dataclass
    class CompanyByUrl(ToolRequest):
        """Indeed Companies Info Scraper by Company URL"""

        SPIDER_ID = "indeed_companies-info_by-company-url"
        SPIDER_NAME = "indeed.com"

        company_url: str
70
+
71
+
72
class Glassdoor:
    """Namespace grouping the Glassdoor scraper request types."""

    @dataclass
    class CompanyByUrl(ToolRequest):
        """Glassdoor Company Overview Information Scraper by URL"""

        SPIDER_ID = "glassdoor_company_by-url"
        SPIDER_NAME = "glassdoor.com"

        url: str

    @dataclass
    class CompanyByInputFilter(ToolRequest):
        """Glassdoor Company Overview Information Scraper by Input Filter"""

        SPIDER_ID = "glassdoor_company_by-inputfilter"
        SPIDER_NAME = "glassdoor.com"

        company_name: str
        location: str | None = None
        industries: str | None = None
        Job_title: str | None = None  # Note: capital J in API

    @dataclass
    class CompanyByKeywords(ToolRequest):
        """Glassdoor Company Overview Information Scraper by Keywords"""

        SPIDER_ID = "glassdoor_company_by-keywords"
        SPIDER_NAME = "glassdoor.com"

        search_url: str
        max_search_results: int | None = None

    @dataclass
    class CompanyByListUrl(ToolRequest):
        """Glassdoor Company Overview Information Scraper by List URL"""

        SPIDER_ID = "glassdoor_company_by-listurl"
        SPIDER_NAME = "glassdoor.com"

        url: str

    @dataclass
    class JobByUrl(ToolRequest):
        """Glassdoor Job Information Scraper by URL"""

        SPIDER_ID = "glassdoor_joblistings_by-url"
        SPIDER_NAME = "glassdoor.com"

        url: str

    @dataclass
    class JobByKeywords(ToolRequest):
        """Glassdoor Job Information Scraper by Keywords"""

        SPIDER_ID = "glassdoor_joblistings_by-keywords"
        SPIDER_NAME = "glassdoor.com"

        keyword: str
        location: str
        country: str | None = None

    @dataclass
    class JobByListUrl(ToolRequest):
        """Glassdoor Job Information Scraper by List URL"""

        SPIDER_ID = "glassdoor_joblistings_by-listurl"
        SPIDER_NAME = "glassdoor.com"

        url: str
136
+
137
+
138
class Crunchbase:
    """Namespace grouping the Crunchbase scraper request types."""

    @dataclass
    class CompanyByUrl(ToolRequest):
        """Crunchbase Company Information Scraper by URL"""

        SPIDER_ID = "crunchbase_company_by-url"
        SPIDER_NAME = "crunchbase.com"

        url: str

    @dataclass
    class CompanyByKeywords(ToolRequest):
        """Crunchbase Company Information Scraper by Keywords"""

        SPIDER_ID = "crunchbase_company_by-keywords"
        SPIDER_NAME = "crunchbase.com"

        keyword: str
@@ -0,0 +1,115 @@
1
+ """
2
+ Search Engine & Map Scraper Tools (Google, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
12
class GoogleMaps:
    """Namespace grouping the Google Maps scraper request types."""

    @dataclass
    class DetailsByUrl(ToolRequest):
        """Google Maps Details Scraper by URL."""

        SPIDER_ID = "google_map-details_by-url"
        SPIDER_NAME = "google.com"

        url: str

    @dataclass
    class DetailsByCid(ToolRequest):
        """Google Maps Details Scraper by CID."""

        SPIDER_ID = "google_map-details_by-cid"
        SPIDER_NAME = "google.com"

        # Field name matches the API parameter exactly (upper-case).
        CID: str

    @dataclass
    class DetailsByLocation(ToolRequest):
        """Google Maps Details Scraper by location keyword + country."""

        SPIDER_ID = "google_map-details_by-location"
        SPIDER_NAME = "google.com"

        country: str
        keyword: str
        # Optional coordinates and zoom refinement.
        lat: str | None = None
        long: str | None = None
        zoom_level: str | None = None

    @dataclass
    class DetailsByPlaceId(ToolRequest):
        """Google Maps Details Scraper by Place ID."""

        SPIDER_ID = "google_map-details_by-placeid"
        SPIDER_NAME = "google.com"

        place_id: str

    # Backward compatible alias: keep old name working
    Details = DetailsByUrl

    @dataclass
    class Reviews(ToolRequest):
        """Google Maps Review Information Scraper"""

        SPIDER_ID = "google_comment_by-url"
        SPIDER_NAME = "google.com"

        url: str
        days_limit: int | None = None  # Crawl reviews within X days
67
+
68
+
69
class GoogleShopping:
    """Namespace grouping the Google Shopping scraper request types."""

    @dataclass
    class Product(ToolRequest):
        """Google Shopping Information Scraper by URL"""

        SPIDER_ID = "google_shopping_by-url"
        SPIDER_NAME = "google.com"

        url: str
        country: str | None = None  # e.g. "US"

    @dataclass
    class ProductByKeywords(ToolRequest):
        """Google Shopping Information Scraper by Keywords"""

        SPIDER_ID = "google_shopping_by-keywords"
        SPIDER_NAME = "google.com"

        keyword: str
        country: str | None = None  # e.g. "US"
89
+
90
+
91
class GooglePlay:
    """Namespace grouping the Google Play Store scraper request types."""

    @dataclass
    class AppInfo(ToolRequest):
        """Google Play Store Information Scraper"""

        SPIDER_ID = "google-play-store_information_by-url"
        SPIDER_NAME = "google.com"

        app_url: str
        country: str | None = None

    @dataclass
    class Reviews(ToolRequest):
        """Google Play Store Reviews Scraper"""

        SPIDER_ID = "google-play-store_reviews_by-url"
        SPIDER_NAME = "google.com"

        app_url: str
        num_of_reviews: int | None = None
        start_date: str | None = None  # yyyy-mm-dd
        end_date: str | None = None
        country: str | None = None