thordata-sdk 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +492 -1790
- thordata/client.py +432 -1315
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/enums.py +41 -380
- thordata/models.py +37 -1193
- thordata/tools/__init__.py +28 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +26 -0
- thordata/tools/ecommerce.py +67 -0
- thordata/tools/search.py +73 -0
- thordata/tools/social.py +190 -0
- thordata/tools/video.py +81 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +144 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +67 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.5.0.dist-info}/METADATA +73 -50
- thordata_sdk-1.5.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.5.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.4.0.dist-info/RECORD +0 -18
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.5.0.dist-info}/top_level.txt +0 -0

thordata/tools/__init__.py
ADDED

@@ -0,0 +1,28 @@
+"""
+Web Scraper Tool Registry.
+High-level abstractions for specific scraping targets.
+"""
+
+from .base import ToolRequest, VideoToolRequest
+from .code import GitHub
+from .ecommerce import Amazon
+from .search import GoogleMaps, GooglePlay, GoogleShopping
+from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
+from .video import YouTube
+
+__all__ = [
+    "ToolRequest",
+    "VideoToolRequest",
+    "Amazon",
+    "GoogleMaps",
+    "GoogleShopping",
+    "GooglePlay",
+    "TikTok",
+    "Facebook",
+    "Instagram",
+    "Twitter",
+    "LinkedIn",
+    "Reddit",
+    "YouTube",
+    "GitHub",
+]

thordata/tools/base.py
ADDED

@@ -0,0 +1,42 @@
+"""
+Base classes for Web Scraper Tools.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass
+from typing import Any, ClassVar
+
+
+@dataclass
+class ToolRequest:
+    """Base class for standard scraping tools."""
+
+    # These must be defined in subclasses
+    SPIDER_ID: ClassVar[str]
+    SPIDER_NAME: ClassVar[str]
+
+    def to_task_parameters(self) -> dict[str, Any]:
+        """Convert dataclass fields to API parameters dict."""
+        # Filter out internal fields and None values
+        return {
+            k: v
+            for k, v in asdict(self).items()
+            if v is not None and k != "common_settings"
+        }
+
+    def get_spider_id(self) -> str:
+        return self.SPIDER_ID
+
+    def get_spider_name(self) -> str:
+        return self.SPIDER_NAME
+
+
+@dataclass
+class VideoToolRequest(ToolRequest):
+    """
+    Marker class for Video/Audio tools that use the /video_builder endpoint.
+    Concrete classes must define a 'common_settings' field.
+    """
+
+    pass
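
The filter in to_task_parameters() is what lets the tools below declare optional fields freely: anything left as None never reaches the API. A minimal sketch of that behavior, assuming only what this hunk defines; ExampleTool is a hypothetical subclass invented here for illustration, not part of the package:

    from __future__ import annotations

    from dataclasses import dataclass

    from thordata.tools import ToolRequest


    @dataclass
    class ExampleTool(ToolRequest):  # hypothetical subclass, for illustration only
        SPIDER_ID = "example_by-url"
        SPIDER_NAME = "example.com"

        url: str
        page_turning: int | None = None


    req = ExampleTool(url="https://example.com")
    print(req.to_task_parameters())  # {'url': 'https://example.com'} -- page_turning=None is dropped
    print(req.get_spider_id())       # example_by-url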

thordata/tools/code.py
ADDED

@@ -0,0 +1,26 @@
+"""
+Code Repository Scraper Tools (GitHub, etc.)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .base import ToolRequest
+
+
+class GitHub:
+    """Namespace for GitHub tools."""
+
+    @dataclass
+    class Repository(ToolRequest):
+        """Github Repository Scraper"""
+
+        SPIDER_ID = "github_repository_by-repo-url"
+        SPIDER_NAME = "github.com"
+
+        repo_url: str
+        search_url: str | None = None
+        url: str | None = None  # The generic URL param
+        page_turning: int | None = None
+        max_num: int | None = None
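
Each site module follows this namespace pattern: a plain class grouping request dataclasses. A short usage sketch; the repository URL is an example value chosen here:

    from thordata.tools import GitHub

    req = GitHub.Repository(repo_url="https://github.com/psf/requests")
    print(req.get_spider_id())       # github_repository_by-repo-url
    print(req.get_spider_name())     # github.com
    print(req.to_task_parameters())  # {'repo_url': 'https://github.com/psf/requests'}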

thordata/tools/ecommerce.py
ADDED

@@ -0,0 +1,67 @@
+"""
+E-Commerce Scraper Tools (Amazon, etc.)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .base import ToolRequest
+
+
+class Amazon:
+    """Namespace for Amazon tools."""
+
+    @dataclass
+    class Product(ToolRequest):
+        """Amazon Product Details Scraper"""
+
+        SPIDER_ID = "amazon_product_by-asin"
+        SPIDER_NAME = "amazon.com"
+
+        asin: str
+        domain: str = "amazon.com"
+
+    @dataclass
+    class GlobalProduct(ToolRequest):
+        """Amazon Global Product Details Scraper"""
+
+        SPIDER_ID = "amazon_global-product_by-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+        zip_code: str | None = None
+
+    @dataclass
+    class Review(ToolRequest):
+        """Amazon Product Review Scraper"""
+
+        SPIDER_ID = "amazon_comment_by-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+        page_turning: int = 1
+
+    @dataclass
+    class Seller(ToolRequest):
+        """Amazon Seller Information Scraper"""
+
+        SPIDER_ID = "amazon_seller_by-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+
+    @dataclass
+    class Search(ToolRequest):
+        """Amazon Product Listing Scraper"""
+
+        SPIDER_ID = "amazon_product-list_by-keywords-domain"
+        SPIDER_NAME = "amazon.com"
+
+        keyword: str
+        domain: str = "amazon.com"
+        page_turning: int = 1
+        sort_by: str | None = None  # Best Sellers, Newest Arrivals, etc.
+        min_price: float | None = None
+        max_price: float | None = None
+        get_sponsored: bool | None = None
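
A hedged sketch of a listing search built from the fields above; the keyword and price bounds are example values chosen here:

    from thordata.tools import Amazon

    req = Amazon.Search(keyword="mechanical keyboard", min_price=50.0, max_price=150.0)
    print(req.to_task_parameters())
    # {'keyword': 'mechanical keyboard', 'domain': 'amazon.com', 'page_turning': 1,
    #  'min_price': 50.0, 'max_price': 150.0}
    # sort_by and get_sponsored stay None and are omitted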

thordata/tools/search.py
ADDED

@@ -0,0 +1,73 @@
+"""
+Search Engine & Map Scraper Tools (Google, etc.)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .base import ToolRequest
+
+
+class GoogleMaps:
+    """Namespace for Google Maps tools."""
+
+    @dataclass
+    class Details(ToolRequest):
+        """Google Maps Details Information Scraper"""
+
+        SPIDER_ID = "google_map-details_by-url"
+        SPIDER_NAME = "google.com"
+
+        url: str  # Google Maps URL
+
+    @dataclass
+    class Reviews(ToolRequest):
+        """Google Maps Review Information Scraper"""
+
+        SPIDER_ID = "google_comment_by-url"
+        SPIDER_NAME = "google.com"
+
+        url: str
+        days_limit: int | None = None  # Crawl reviews within X days
+
+
+class GoogleShopping:
+    """Namespace for Google Shopping tools."""
+
+    @dataclass
+    class Product(ToolRequest):
+        """Google Shopping Information Scraper"""
+
+        SPIDER_ID = "google_shopping_by-url"
+        SPIDER_NAME = "google.com"
+
+        url: str
+        country: str | None = None  # e.g. "US"
+
+
+class GooglePlay:
+    """Namespace for Google Play Store tools."""
+
+    @dataclass
+    class AppInfo(ToolRequest):
+        """Google Play Store Information Scraper"""
+
+        SPIDER_ID = "google-play-store_information_by-url"
+        SPIDER_NAME = "google.com"
+
+        app_url: str
+        country: str | None = None
+
+    @dataclass
+    class Reviews(ToolRequest):
+        """Google Play Store Reviews Scraper"""
+
+        SPIDER_ID = "google-play-store_reviews_by-url"
+        SPIDER_NAME = "google.com"
+
+        app_url: str
+        num_of_reviews: int | None = None
+        start_date: str | None = None  # yyyy-mm-dd
+        end_date: str | None = None
+        country: str | None = None
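
A hedged sketch of a date-bounded Play Store review request; the app URL and dates are placeholder values chosen here:

    from thordata.tools import GooglePlay

    req = GooglePlay.Reviews(
        app_url="https://play.google.com/store/apps/details?id=com.example.app",
        num_of_reviews=100,
        start_date="2025-01-01",  # yyyy-mm-dd, per the field comment
        end_date="2025-03-31",
    )
    print(req.to_task_parameters())  # country is None and is omitted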

thordata/tools/social.py
ADDED

@@ -0,0 +1,190 @@
+"""
+Social Media Scraper Tools.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .base import ToolRequest
+
+
+class TikTok:
+    @dataclass
+    class Post(ToolRequest):
+        """TikTok Post Information Scraper"""
+
+        SPIDER_ID = "tiktok_posts_by-url"
+        SPIDER_NAME = "tiktok.com"
+        url: str
+        page_turning: int | None = None
+
+    @dataclass
+    class Comment(ToolRequest):
+        """TikTok Comment Scraper"""
+
+        SPIDER_ID = "tiktok_comment_by-url"
+        SPIDER_NAME = "tiktok.com"
+        url: str
+        page_turning: int | None = None
+
+    @dataclass
+    class Profile(ToolRequest):
+        """TikTok Profile Information Scraper"""
+
+        SPIDER_ID = "tiktok_profiles_by-url"
+        SPIDER_NAME = "tiktok.com"
+
+        url: str  # Profile URL (e.g. https://www.tiktok.com/@user)
+        search_url: str | None = None
+
+        country: str | None = None
+        page_turning: int | None = None
+
+    @dataclass
+    class Shop(ToolRequest):
+        """TikTok Shop Information Scraper"""
+
+        SPIDER_ID = "tiktok_shop_by-url"
+        SPIDER_NAME = "tiktok.com"
+        url: str
+        category_url: str | None = None
+        keyword: str | None = None
+        page_turning: int | None = None
+
+
+class Facebook:
+    @dataclass
+    class Posts(ToolRequest):
+        """Facebook Posts Scraper"""
+
+        SPIDER_ID = "facebook_post_by-keywords"
+        SPIDER_NAME = "facebook.com"
+        keyword: str
+        recent_posts: bool | None = None
+        date: str | None = None  # Year 2025 etc.
+        number: int = 10
+
+    @dataclass
+    class PostDetails(ToolRequest):
+        """Facebook Post Details Scraper"""
+
+        SPIDER_ID = "facebook_post_by-posts-url"
+        SPIDER_NAME = "facebook.com"
+        url: str
+
+
+class Instagram:
+    @dataclass
+    class Profile(ToolRequest):
+        """Instagram Profile Scraper"""
+
+        SPIDER_ID = "ins_profiles_by-username"
+        SPIDER_NAME = "instagram.com"
+        username: str
+        profileurl: str | None = None
+
+    @dataclass
+    class Post(ToolRequest):
+        """Instagram Post Information Scraper"""
+
+        SPIDER_ID = "ins_posts_by-profileurl"
+        SPIDER_NAME = "instagram.com"
+        profileurl: str
+        resultsLimit: int = 10
+        start_date: str | None = None
+        end_date: str | None = None
+        post_type: str | None = None  # Post or Reel
+
+    @dataclass
+    class Reel(ToolRequest):
+        """Instagram Reel Information Scraper"""
+
+        SPIDER_ID = "ins_reel_by-url"
+        SPIDER_NAME = "instagram.com"
+        url: str
+        num_of_posts: int | None = None
+
+    @dataclass
+    class Comment(ToolRequest):
+        """Instagram Post Comment Scraper"""
+
+        SPIDER_ID = "ins_comment_by-posturl"
+        SPIDER_NAME = "instagram.com"
+        posturl: str
+
+
+class Twitter:
+    @dataclass
+    class Profile(ToolRequest):
+        """Twitter(X) Profile Scraper"""
+
+        SPIDER_ID = "twitter_profiles_by-url"
+        SPIDER_NAME = "twitter.com"
+        url: str
+        max_number_of_posts: int | None = None
+        user_name: str | None = None
+
+    @dataclass
+    class Post(ToolRequest):
+        """
+        Twitter(X) Post Information Scraper
+        Updates based on integration snippet:
+        - SPIDER_NAME is 'x.com'
+        - Only 'url' is required.
+        """
+
+        SPIDER_ID = "twitter_by-posturl_by-url"
+        SPIDER_NAME = "x.com"  # Updated from snippet
+
+        url: str  # Post URL (e.g. https://x.com/user/status/123)
+
+        start_date: str | None = None
+        end_date: str | None = None
+
+
+class LinkedIn:
+    @dataclass
+    class Company(ToolRequest):
+        """LinkedIn Company Information Scraper"""
+
+        SPIDER_ID = "linkedin_company_information_by-url"
+        SPIDER_NAME = "linkedin.com"
+        url: str
+
+    @dataclass
+    class Jobs(ToolRequest):
+        """LinkedIn Job Listing Scraper"""
+
+        SPIDER_ID = "linkedin_job_listings_information_by-job-listing-url"
+        SPIDER_NAME = "linkedin.com"
+        job_listing_url: str
+        location: str
+        job_url: str | None = None
+        page_turning: int | None = None
+        keyword: str | None = None
+        remote: str | None = None  # On_site, Remote, Hybrid
+
+
+class Reddit:
+    @dataclass
+    class Posts(ToolRequest):
+        """Reddit Post Information Scraper"""
+
+        SPIDER_ID = "reddit_posts_by-url"
+        SPIDER_NAME = "reddit.com"
+        url: str
+        keyword: str | None = None
+        subreddit_url: str | None = None
+        num_of_posts: int | None = None
+        sort_by: str | None = None  # Relevance, Hot, Top, New
+
+    @dataclass
+    class Comment(ToolRequest):
+        """Reddit Post Comment Scraper"""
+
+        SPIDER_ID = "reddit_comment_by-url"
+        SPIDER_NAME = "reddit.com"
+        url: str
+        days_back: int | None = None
+        load_all_replies: bool | None = None
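
A hedged sketch using the Reddit tools above; the subreddit URL and options are example values chosen here:

    from thordata.tools import Reddit

    req = Reddit.Posts(
        url="https://www.reddit.com/r/Python/",
        num_of_posts=50,
        sort_by="Top",  # Relevance, Hot, Top or New, per the field comment
    )
    print(req.to_task_parameters())  # keyword and subreddit_url are None, so omitted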

thordata/tools/video.py
ADDED

@@ -0,0 +1,81 @@
+"""
+Video & Audio Scraper Tools (YouTube, etc.)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from ..types.common import CommonSettings
+from .base import ToolRequest, VideoToolRequest
+
+
+class YouTube:
+    """Namespace for YouTube tools."""
+
+    @dataclass
+    class VideoDownload(VideoToolRequest):
+        """YouTube Video File Scraper (Download). Uses video_builder."""
+
+        SPIDER_ID = "youtube_video_by-url"
+        SPIDER_NAME = "youtube.com"
+
+        url: str  # Video URL
+        common_settings: CommonSettings = field(default_factory=CommonSettings)
+
+    @dataclass
+    class AudioDownload(VideoToolRequest):
+        """YouTube Audio File Scraper (Download). Uses video_builder."""
+
+        SPIDER_ID = "youtube_audio_by-url"
+        SPIDER_NAME = "youtube.com"
+
+        url: str
+        common_settings: CommonSettings = field(default_factory=CommonSettings)
+
+    @dataclass
+    class SubtitleDownload(VideoToolRequest):
+        """YouTube Subtitle File Scraper. Uses video_builder."""
+
+        SPIDER_ID = "youtube_transcript_by-id"
+        SPIDER_NAME = "youtube.com"
+
+        video_id: str
+        subtitles_type: str | None = None  # Auto generated / user uploaded
+        common_settings: CommonSettings = field(default_factory=CommonSettings)
+
+    @dataclass
+    class Profile(VideoToolRequest):
+        """YouTube Profile Scraper. Uses video_builder."""
+
+        SPIDER_ID = "youtube_profiles_by-keyword"
+        SPIDER_NAME = "youtube.com"
+
+        url: str  # Channel URL
+        page_turning: int = 1
+        keyword: str | None = None
+        common_settings: CommonSettings = field(default_factory=CommonSettings)
+
+    @dataclass
+    class Comments(VideoToolRequest):
+        """YouTube Comment Information Scraper. Uses video_builder."""
+
+        SPIDER_ID = "youtube_comment_by-id"
+        SPIDER_NAME = "youtube.com"
+
+        video_id: str
+        num_of_comments: int | None = None
+        sort_by: str | None = None  # Top comments / Newest first
+        common_settings: CommonSettings = field(default_factory=CommonSettings)
+
+    @dataclass
+    class VideoInfo(ToolRequest):
+        """YouTube Video Post Scraper (Metadata only). Standard builder."""
+
+        # Note: This one does NOT inherit from VideoToolRequest because it uses the standard builder
+        # and doesn't support common_settings in the same way.
+        SPIDER_ID = "youtube_video-post_by-url"
+        SPIDER_NAME = "youtube.com"
+
+        url: str  # Channel Video URL
+        num_of_posts: str | None = None
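
Note how these classes interact with the base: to_task_parameters() explicitly drops common_settings, so the per-download settings travel alongside the task parameters rather than inside them. A sketch with a placeholder video URL:

    from thordata.tools import YouTube

    req = YouTube.VideoDownload(url="https://www.youtube.com/watch?v=VIDEO_ID")
    print(req.to_task_parameters())  # {'url': ...} -- common_settings is filtered out by the base class
    settings = req.common_settings   # default CommonSettings(), presumably sent to /video_builder separately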

thordata/types/__init__.py
ADDED

@@ -0,0 +1,77 @@
+"""
+Thordata Data Types and Models.
+"""
+
+from .common import (
+    CommonSettings,
+    Continent,
+    Country,
+    Device,
+    OutputFormat,
+    ThordataBaseConfig,
+    normalize_enum_value,
+)
+from .proxy import (
+    ProxyConfig,
+    ProxyHost,
+    ProxyPort,
+    ProxyProduct,
+    ProxyServer,
+    ProxyType,
+    ProxyUser,
+    ProxyUserList,
+    SessionType,
+    StaticISPProxy,
+    StickySession,
+)
+from .serp import (
+    BingSearchType,
+    Engine,
+    GoogleSearchType,
+    GoogleTbm,
+    SerpRequest,
+    TimeRange,
+)
+from .task import (
+    DataFormat,
+    ScraperTaskConfig,
+    TaskStatus,
+    TaskStatusResponse,
+    UsageStatistics,
+    VideoTaskConfig,
+)
+from .universal import UniversalScrapeRequest
+
+__all__ = [
+    "CommonSettings",
+    "Device",
+    "OutputFormat",
+    "ThordataBaseConfig",
+    "Continent",
+    "Country",
+    "normalize_enum_value",
+    "ProxyConfig",
+    "ProxyProduct",
+    "ProxyServer",
+    "ProxyType",
+    "ProxyUser",
+    "ProxyUserList",
+    "SessionType",
+    "StaticISPProxy",
+    "StickySession",
+    "ProxyHost",
+    "ProxyPort",
+    "BingSearchType",
+    "Engine",
+    "GoogleSearchType",
+    "GoogleTbm",
+    "SerpRequest",
+    "TimeRange",
+    "DataFormat",
+    "ScraperTaskConfig",
+    "TaskStatus",
+    "TaskStatusResponse",
+    "UsageStatistics",
+    "VideoTaskConfig",
+    "UniversalScrapeRequest",
+]