thordata-sdk 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +503 -1796
- thordata/client.py +444 -1322
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/enums.py +41 -380
- thordata/exceptions.py +70 -19
- thordata/models.py +37 -1193
- thordata/retry.py +1 -1
- thordata/tools/__init__.py +38 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +39 -0
- thordata/tools/ecommerce.py +251 -0
- thordata/tools/professional.py +155 -0
- thordata/tools/search.py +115 -0
- thordata/tools/social.py +374 -0
- thordata/tools/travel.py +100 -0
- thordata/tools/video.py +154 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +156 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +67 -0
- thordata_sdk-1.6.0.dist-info/METADATA +287 -0
- thordata_sdk-1.6.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/WHEEL +1 -1
- thordata/_example_utils.py +0 -77
- thordata/demo.py +0 -138
- thordata_sdk-1.4.0.dist-info/METADATA +0 -208
- thordata_sdk-1.4.0.dist-info/RECORD +0 -18
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/top_level.txt +0 -0
thordata/retry.py
CHANGED
|
@@ -186,7 +186,7 @@ def with_retry(
|
|
|
186
186
|
if isinstance(e, ThordataRateLimitError) and e.retry_after:
|
|
187
187
|
delay = max(delay, e.retry_after)
|
|
188
188
|
|
|
189
|
-
logger.
|
|
189
|
+
logger.info(
|
|
190
190
|
f"Retry attempt {attempt + 1}/{config.max_retries} "
|
|
191
191
|
f"after {delay:.2f}s due to: {e}"
|
|
192
192
|
)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Web Scraper Tool Registry.
|
|
3
|
+
High-level abstractions for specific scraping targets.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .base import ToolRequest, VideoToolRequest
|
|
7
|
+
from .code import GitHub
|
|
8
|
+
from .ecommerce import Amazon, Walmart, eBay
|
|
9
|
+
from .professional import Crunchbase, Glassdoor, Indeed
|
|
10
|
+
from .search import GoogleMaps, GooglePlay, GoogleShopping
|
|
11
|
+
from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
|
|
12
|
+
from .travel import Airbnb, Booking, Zillow
|
|
13
|
+
from .video import YouTube
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"ToolRequest",
|
|
17
|
+
"VideoToolRequest",
|
|
18
|
+
"Amazon",
|
|
19
|
+
"eBay",
|
|
20
|
+
"Walmart",
|
|
21
|
+
"GoogleMaps",
|
|
22
|
+
"GoogleShopping",
|
|
23
|
+
"GooglePlay",
|
|
24
|
+
"TikTok",
|
|
25
|
+
"Facebook",
|
|
26
|
+
"Instagram",
|
|
27
|
+
"Twitter",
|
|
28
|
+
"LinkedIn",
|
|
29
|
+
"Reddit",
|
|
30
|
+
"YouTube",
|
|
31
|
+
"GitHub",
|
|
32
|
+
"Indeed",
|
|
33
|
+
"Glassdoor",
|
|
34
|
+
"Crunchbase",
|
|
35
|
+
"Booking",
|
|
36
|
+
"Zillow",
|
|
37
|
+
"Airbnb",
|
|
38
|
+
]
|
thordata/tools/base.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base classes for Web Scraper Tools.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import asdict, dataclass
|
|
8
|
+
from typing import Any, ClassVar
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class ToolRequest:
    """Base class for standard scraping tools.

    Concrete tools subclass this as dataclasses; their fields become the
    task parameters sent to the API.
    """

    # Subclasses are required to define both identifiers.
    SPIDER_ID: ClassVar[str]
    SPIDER_NAME: ClassVar[str]

    def to_task_parameters(self) -> dict[str, Any]:
        """Convert dataclass fields to API parameters dict.

        Unset (None) fields and the internal ``common_settings`` field are
        omitted from the result.
        """
        params: dict[str, Any] = {}
        for name, value in asdict(self).items():
            # Drop optional fields left at None and the internal
            # common_settings field (handled separately by video tools).
            if value is None or name == "common_settings":
                continue
            params[name] = value
        return params

    def get_spider_id(self) -> str:
        """Return the API spider identifier for this tool."""
        return self.SPIDER_ID

    def get_spider_name(self) -> str:
        """Return the target site name for this tool."""
        return self.SPIDER_NAME
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class VideoToolRequest(ToolRequest):
    """
    Marker class for Video/Audio tools that use the /video_builder endpoint.
    Concrete classes must define a 'common_settings' field.
    """

    # Intentionally empty: this class only serves as a type marker so that
    # callers can route video/audio requests to the /video_builder endpoint.
    # Note that ToolRequest.to_task_parameters() excludes 'common_settings',
    # so subclasses' common_settings fields are transmitted separately.
    pass
|
thordata/tools/code.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code Repository Scraper Tools (GitHub, etc.)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from .base import ToolRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GitHub:
    """Namespace for GitHub tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class Repository(ToolRequest):
        """Github Repository Scraper by Repo URL"""

        SPIDER_ID = "github_repository_by-repo-url"
        SPIDER_NAME = "github.com"
        repo_url: str  # direct URL of the repository to scrape

    @dataclass
    class RepositoryBySearchUrl(ToolRequest):
        """Github Repository Scraper by Search URL"""

        SPIDER_ID = "github_repository_by-search-url"
        SPIDER_NAME = "github.com"
        search_url: str  # GitHub search-results URL to crawl
        page_turning: int | None = None  # optional: number of pages to follow
        max_num: int | None = None  # optional: cap on results collected

    @dataclass
    class RepositoryByUrl(ToolRequest):
        """Github Repository Scraper by URL"""

        SPIDER_ID = "github_repository_by-url"
        SPIDER_NAME = "github.com"
        url: str
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""
|
|
2
|
+
E-Commerce Scraper Tools (Amazon, etc.)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from .base import ToolRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Amazon:
    """Namespace for Amazon tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    # --- Product Details (5 methods) ---
    @dataclass
    class ProductByAsin(ToolRequest):
        """Amazon Product Details Scraper by ASIN."""

        SPIDER_ID = "amazon_product_by-asin"
        SPIDER_NAME = "amazon.com"

        asin: str  # Amazon Standard Identification Number of the product
        domain: str = "amazon.com"  # marketplace domain; defaults to the US site

    # Backward compatible alias
    Product = ProductByAsin

    @dataclass
    class ProductByUrl(ToolRequest):
        """Amazon Product Details Scraper by URL."""

        SPIDER_ID = "amazon_product_by-url"
        SPIDER_NAME = "amazon.com"

        url: str
        zip_code: str | None = None  # optional delivery location context

    @dataclass
    class ProductByKeywords(ToolRequest):
        """Amazon Product Details Scraper by Keywords."""

        SPIDER_ID = "amazon_product_by-keywords"
        SPIDER_NAME = "amazon.com"

        keyword: str
        page_turning: int | None = None  # optional: number of pages to follow
        lowest_price: float | None = None  # optional price-range filter (lower bound)
        highest_price: float | None = None  # optional price-range filter (upper bound)

    @dataclass
    class ProductByCategoryUrl(ToolRequest):
        """Amazon Product Details Scraper by Category URL."""

        SPIDER_ID = "amazon_product_by-category-url"
        SPIDER_NAME = "amazon.com"

        url: str
        sort_by: str | None = None  # optional sort order for category listings
        page_turning: int | None = None

    @dataclass
    class ProductByBestSellers(ToolRequest):
        """Amazon Product Details Scraper by Best Sellers URL."""

        SPIDER_ID = "amazon_product_by-best-sellers"
        SPIDER_NAME = "amazon.com"

        url: str
        page_turning: int | None = None

    # --- Other Amazon Tools ---

    @dataclass
    class GlobalProductByUrl(ToolRequest):
        """Amazon Global Product Details Scraper by URL"""

        SPIDER_ID = "amazon_global-product_by-url"
        SPIDER_NAME = "amazon.com"

        url: str

    # Backward compatible alias
    GlobalProduct = GlobalProductByUrl

    @dataclass
    class GlobalProductByCategoryUrl(ToolRequest):
        """Amazon Global Product Details Scraper by Category URL"""

        SPIDER_ID = "amazon_global-product_by-category-url"
        SPIDER_NAME = "amazon.com"

        url: str
        sort_by: str | None = None
        get_sponsored: str | None = None  # NOTE(review): string flag per API; verify accepted values
        maximum: int | None = None  # optional cap on results collected

    @dataclass
    class GlobalProductBySellerUrl(ToolRequest):
        """Amazon Global Product Details Scraper by Seller URL"""

        SPIDER_ID = "amazon_global-product_by-seller-url"
        SPIDER_NAME = "amazon.com"

        url: str
        maximum: int | None = None

    @dataclass
    class GlobalProductByKeywords(ToolRequest):
        """Amazon Global Product Details Scraper by Keywords"""

        SPIDER_ID = "amazon_global-product_by-keywords"
        SPIDER_NAME = "amazon.com"

        keyword: str
        # NOTE(review): full URL here, unlike ProductByAsin.domain which is a
        # bare hostname — confirm this asymmetry matches the API contract.
        domain: str = "https://www.amazon.com"
        lowest_price: str | None = None  # string-typed here (cf. float in ProductByKeywords)
        highest_price: str | None = None
        page_turning: int | None = None

    @dataclass
    class GlobalProductByKeywordsBrand(ToolRequest):
        """Amazon Global Product Details Scraper by Keywords and Brand"""

        SPIDER_ID = "amazon_global-product_by-keywords-brand"
        SPIDER_NAME = "amazon.com"

        keyword: str
        brands: str
        page_turning: int | None = None

    @dataclass
    class Review(ToolRequest):
        """Amazon Product Review Scraper"""

        SPIDER_ID = "amazon_comment_by-url"
        SPIDER_NAME = "amazon.com"

        url: str
        page_turning: int = 1  # defaults to a single page of reviews

    @dataclass
    class Seller(ToolRequest):
        """Amazon Seller Information Scraper"""

        SPIDER_ID = "amazon_seller_by-url"
        SPIDER_NAME = "amazon.com"

        url: str

    @dataclass
    class Search(ToolRequest):
        """Amazon Product Listing Scraper"""

        SPIDER_ID = "amazon_product-list_by-keywords-domain"
        SPIDER_NAME = "amazon.com"

        keyword: str
        domain: str = "https://www.amazon.com/"
        page_turning: int = 1
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class eBay:
    """Namespace for eBay tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class ProductByUrl(ToolRequest):
        """eBay Information Scraper by URL"""

        SPIDER_ID = "ebay_ebay_by-url"
        SPIDER_NAME = "ebay.com"
        url: str

    @dataclass
    class ProductByCategoryUrl(ToolRequest):
        """eBay Information Scraper by Category URL"""

        SPIDER_ID = "ebay_ebay_by-category-url"
        SPIDER_NAME = "ebay.com"
        url: str
        # NOTE(review): string-typed count, presumably a result limit —
        # confirm accepted format against the API.
        count: str | None = None

    @dataclass
    class ProductByKeywords(ToolRequest):
        """eBay Information Scraper by Keywords"""

        SPIDER_ID = "ebay_ebay_by-keywords"
        SPIDER_NAME = "ebay.com"
        keywords: str  # NB: plural, unlike the 'keyword' field used elsewhere
        count: str | None = None

    @dataclass
    class ProductByListUrl(ToolRequest):
        """eBay Information Scraper by List URL"""

        SPIDER_ID = "ebay_ebay_by-listurl"
        SPIDER_NAME = "ebay.com"
        url: str
        count: str | None = None
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class Walmart:
    """Namespace for Walmart tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class ProductByUrl(ToolRequest):
        """Walmart Product Information Scraper by URL"""

        SPIDER_ID = "walmart_product_by-url"
        SPIDER_NAME = "walmart.com"
        url: str
        # NOTE(review): string flag, presumably "true"/"false" per API —
        # confirm accepted values.
        all_variations: str | None = None

    @dataclass
    class ProductByCategoryUrl(ToolRequest):
        """Walmart Product Information Scraper by Category URL"""

        SPIDER_ID = "walmart_product_by-category-url"
        SPIDER_NAME = "walmart.com"
        category_url: str
        all_variations: str | None = None
        page_turning: int | None = None  # optional: number of pages to follow

    @dataclass
    class ProductBySku(ToolRequest):
        """Walmart Product Information Scraper by SKU"""

        SPIDER_ID = "walmart_product_by-sku"
        SPIDER_NAME = "walmart.com"
        sku: str  # Walmart stock-keeping-unit identifier
        all_variations: str | None = None

    @dataclass
    class ProductByKeywords(ToolRequest):
        """Walmart Product Information Scraper by Keywords"""

        SPIDER_ID = "walmart_product_by-keywords"
        SPIDER_NAME = "walmart.com"
        keyword: str
        domain: str = "https://www.walmart.com/"  # full site URL, defaulting to the US store
        all_variations: str | None = None
        page_turning: int | None = None

    @dataclass
    class ProductByZipcodes(ToolRequest):
        """Walmart Product Information Scraper by Zipcodes"""

        SPIDER_ID = "walmart_product_by-zipcodes"
        SPIDER_NAME = "walmart.com"
        url: str
        zip_code: str | None = None  # optional delivery location context
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Professional Platform Scraper Tools (Indeed, Glassdoor, Crunchbase, etc.)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from .base import ToolRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Indeed:
    """Namespace for Indeed tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class JobByUrl(ToolRequest):
        """Indeed Job Listings Scraper by Job URL"""

        SPIDER_ID = "indeed_job-listings_by-job-url"
        SPIDER_NAME = "indeed.com"
        job_url: str

    @dataclass
    class JobByKeyword(ToolRequest):
        """Indeed Job Listings Scraper by Keyword"""

        SPIDER_ID = "indeed_job-listings_by-keyword"
        SPIDER_NAME = "indeed.com"
        keyword: str  # search term, e.g. a job title
        location: str  # required location filter
        country: str | None = None
        domain: str | None = None  # optional country-specific Indeed domain
        date_posted: str | None = None  # optional recency filter
        posted_by: str | None = None
        pay: str | None = None  # optional salary filter
        location_radius: str | None = None  # optional distance around 'location'

    @dataclass
    class CompanyByListUrl(ToolRequest):
        """Indeed Companies Info Scraper by Company List URL"""

        SPIDER_ID = "indeed_companies-info_by-company-list-url"
        SPIDER_NAME = "indeed.com"
        company_list_url: str

    @dataclass
    class CompanyByKeyword(ToolRequest):
        """Indeed Companies Info Scraper by Keyword"""

        SPIDER_ID = "indeed_companies-info_by-keyword"
        SPIDER_NAME = "indeed.com"
        keyword: str

    @dataclass
    class CompanyByIndustryAndState(ToolRequest):
        """Indeed Companies Info Scraper by Industry and State"""

        SPIDER_ID = "indeed_companies-info_by-industry-and-state"
        SPIDER_NAME = "indeed.com"
        industry: str
        state: str | None = None  # optional; despite the class name, state may be omitted

    @dataclass
    class CompanyByUrl(ToolRequest):
        """Indeed Companies Info Scraper by Company URL"""

        SPIDER_ID = "indeed_companies-info_by-company-url"
        SPIDER_NAME = "indeed.com"
        company_url: str
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class Glassdoor:
    """Namespace for Glassdoor tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class CompanyByUrl(ToolRequest):
        """Glassdoor Company Overview Information Scraper by URL"""

        SPIDER_ID = "glassdoor_company_by-url"
        SPIDER_NAME = "glassdoor.com"
        url: str

    @dataclass
    class CompanyByInputFilter(ToolRequest):
        """Glassdoor Company Overview Information Scraper by Input Filter"""

        SPIDER_ID = "glassdoor_company_by-inputfilter"
        SPIDER_NAME = "glassdoor.com"
        company_name: str
        location: str | None = None
        industries: str | None = None
        # Deliberately non-PEP8 field name: the upstream API expects the
        # capitalized key, and asdict() serializes field names verbatim.
        Job_title: str | None = None  # Note: capital J in API

    @dataclass
    class CompanyByKeywords(ToolRequest):
        """Glassdoor Company Overview Information Scraper by Keywords"""

        SPIDER_ID = "glassdoor_company_by-keywords"
        SPIDER_NAME = "glassdoor.com"
        # NOTE(review): named 'by-keywords' but takes a search URL — field
        # name reflects the actual API parameter.
        search_url: str
        max_search_results: int | None = None

    @dataclass
    class CompanyByListUrl(ToolRequest):
        """Glassdoor Company Overview Information Scraper by List URL"""

        SPIDER_ID = "glassdoor_company_by-listurl"
        SPIDER_NAME = "glassdoor.com"
        url: str

    @dataclass
    class JobByUrl(ToolRequest):
        """Glassdoor Job Information Scraper by URL"""

        SPIDER_ID = "glassdoor_joblistings_by-url"
        SPIDER_NAME = "glassdoor.com"
        url: str

    @dataclass
    class JobByKeywords(ToolRequest):
        """Glassdoor Job Information Scraper by Keywords"""

        SPIDER_ID = "glassdoor_joblistings_by-keywords"
        SPIDER_NAME = "glassdoor.com"
        keyword: str
        location: str
        country: str | None = None

    @dataclass
    class JobByListUrl(ToolRequest):
        """Glassdoor Job Information Scraper by List URL"""

        SPIDER_ID = "glassdoor_joblistings_by-listurl"
        SPIDER_NAME = "glassdoor.com"
        url: str
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class Crunchbase:
    """Namespace for Crunchbase tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class CompanyByUrl(ToolRequest):
        """Crunchbase Company Information Scraper by URL"""

        SPIDER_ID = "crunchbase_company_by-url"
        SPIDER_NAME = "crunchbase.com"
        url: str

    @dataclass
    class CompanyByKeywords(ToolRequest):
        """Crunchbase Company Information Scraper by Keywords"""

        SPIDER_ID = "crunchbase_company_by-keywords"
        SPIDER_NAME = "crunchbase.com"
        keyword: str
|
thordata/tools/search.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Search Engine & Map Scraper Tools (Google, etc.)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from .base import ToolRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GoogleMaps:
    """Namespace for Google Maps tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class DetailsByUrl(ToolRequest):
        """Google Maps Details Scraper by URL."""

        SPIDER_ID = "google_map-details_by-url"
        SPIDER_NAME = "google.com"

        url: str

    @dataclass
    class DetailsByCid(ToolRequest):
        """Google Maps Details Scraper by CID."""

        SPIDER_ID = "google_map-details_by-cid"
        SPIDER_NAME = "google.com"

        # Upper-case field name kept deliberately: asdict() serializes
        # field names verbatim, and the API expects the key "CID".
        CID: str

    @dataclass
    class DetailsByLocation(ToolRequest):
        """Google Maps Details Scraper by Location keyword + country (+ optional lat/long/zoom).""" # noqa: E501

        SPIDER_ID = "google_map-details_by-location"
        SPIDER_NAME = "google.com"

        country: str
        keyword: str
        lat: str | None = None  # optional latitude (string, as the API expects)
        long: str | None = None  # optional longitude; shadows builtin name to match API key
        zoom_level: str | None = None

    @dataclass
    class DetailsByPlaceId(ToolRequest):
        """Google Maps Details Scraper by Place ID."""

        SPIDER_ID = "google_map-details_by-placeid"
        SPIDER_NAME = "google.com"

        place_id: str

    # Backward compatible alias: keep old name working
    Details = DetailsByUrl

    @dataclass
    class Reviews(ToolRequest):
        """Google Maps Review Information Scraper"""

        SPIDER_ID = "google_comment_by-url"
        SPIDER_NAME = "google.com"

        url: str
        days_limit: int | None = None  # Crawl reviews within X days
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class GoogleShopping:
    """Namespace for Google Shopping tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class Product(ToolRequest):
        """Google Shopping Information Scraper by URL"""

        SPIDER_ID = "google_shopping_by-url"
        SPIDER_NAME = "google.com"
        url: str
        country: str | None = None  # e.g. "US"

    @dataclass
    class ProductByKeywords(ToolRequest):
        """Google Shopping Information Scraper by Keywords"""

        SPIDER_ID = "google_shopping_by-keywords"
        SPIDER_NAME = "google.com"
        keyword: str
        country: str | None = None  # e.g. "US"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class GooglePlay:
    """Namespace for Google Play Store tools.

    Each inner dataclass is a ToolRequest variant; its fields become the
    task parameters. SPIDER_ID / SPIDER_NAME are unannotated class
    attributes, so they are constants rather than dataclass fields.
    """

    @dataclass
    class AppInfo(ToolRequest):
        """Google Play Store Information Scraper"""

        SPIDER_ID = "google-play-store_information_by-url"
        SPIDER_NAME = "google.com"

        app_url: str
        country: str | None = None

    @dataclass
    class Reviews(ToolRequest):
        """Google Play Store Reviews Scraper"""

        SPIDER_ID = "google-play-store_reviews_by-url"
        SPIDER_NAME = "google.com"

        app_url: str
        num_of_reviews: int | None = None  # optional cap on reviews collected
        start_date: str | None = None  # yyyy-mm-dd
        end_date: str | None = None  # yyyy-mm-dd, same format as start_date
        country: str | None = None
|