thordata-sdk 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/tools/__init__.py ADDED
@@ -0,0 +1,28 @@
"""
Web Scraper Tool Registry.
High-level abstractions for specific scraping targets.
"""

from .base import ToolRequest, VideoToolRequest
from .code import GitHub
from .ecommerce import Amazon
from .search import GoogleMaps, GooglePlay, GoogleShopping
from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
from .video import YouTube

# Public API of the tools package, grouped by scraping category.
__all__ = [
    # Base request types
    "ToolRequest",
    "VideoToolRequest",
    # E-commerce
    "Amazon",
    # Search / maps / app store
    "GoogleMaps",
    "GoogleShopping",
    "GooglePlay",
    # Social media
    "TikTok",
    "Facebook",
    "Instagram",
    "Twitter",
    "LinkedIn",
    "Reddit",
    # Video
    "YouTube",
    # Code repositories
    "GitHub",
]
thordata/tools/base.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ Base classes for Web Scraper Tools.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import asdict, dataclass
8
+ from typing import Any, ClassVar
9
+
10
+
@dataclass
class ToolRequest:
    """Common behaviour shared by all standard scraping tool requests.

    Subclasses declare the spider identity as class-level constants and
    add their own dataclass fields, which become the task parameters.
    """

    # Subclasses are expected to provide both of these identifiers.
    SPIDER_ID: ClassVar[str]
    SPIDER_NAME: ClassVar[str]

    def to_task_parameters(self) -> dict[str, Any]:
        """Serialize the dataclass fields into an API parameter mapping.

        Unset (None) values are dropped, as is the internal
        'common_settings' field used by video tools.
        """
        params: dict[str, Any] = {}
        for field_name, value in asdict(self).items():
            if value is None or field_name == "common_settings":
                continue
            params[field_name] = value
        return params

    def get_spider_id(self) -> str:
        """Return the spider identifier declared by the subclass."""
        return self.SPIDER_ID

    def get_spider_name(self) -> str:
        """Return the spider (site) name declared by the subclass."""
        return self.SPIDER_NAME
33
+
34
+
@dataclass
class VideoToolRequest(ToolRequest):
    """
    Marker base class for Video/Audio tools served by the /video_builder
    endpoint. Concrete subclasses must define a 'common_settings' field.
    """
thordata/tools/code.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ Code Repository Scraper Tools (GitHub, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
class GitHub:
    """Namespace for GitHub tools."""

    @dataclass
    class Repository(ToolRequest):
        """Github Repository Scraper"""

        # Spider identity (unannotated on purpose: plain class attributes,
        # not dataclass fields).
        SPIDER_ID = "github_repository_by-repo-url"
        SPIDER_NAME = "github.com"

        repo_url: str  # Target repository URL (required)
        search_url: str | None = None  # presumably a GitHub search-results URL - TODO confirm
        url: str | None = None  # The generic URL param
        page_turning: int | None = None  # presumably number of pages to fetch - TODO confirm
        max_num: int | None = None  # presumably max number of results - TODO confirm
thordata/tools/ecommerce.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ E-Commerce Scraper Tools (Amazon, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
class Amazon:
    """Namespace for Amazon tools."""

    @dataclass
    class Product(ToolRequest):
        """Amazon Product Details Scraper"""

        SPIDER_ID = "amazon_product_by-asin"
        SPIDER_NAME = "amazon.com"

        asin: str  # Amazon Standard Identification Number (required)
        domain: str = "amazon.com"  # Marketplace domain, e.g. "amazon.co.uk"

    @dataclass
    class GlobalProduct(ToolRequest):
        """Amazon Global Product Details Scraper"""

        SPIDER_ID = "amazon_global-product_by-url"
        SPIDER_NAME = "amazon.com"

        url: str  # Product page URL (required)
        zip_code: str | None = None  # presumably delivery-location zip code - TODO confirm

    @dataclass
    class Review(ToolRequest):
        """Amazon Product Review Scraper"""

        SPIDER_ID = "amazon_comment_by-url"
        SPIDER_NAME = "amazon.com"

        url: str  # Product page URL (required)
        page_turning: int = 1  # presumably number of review pages to crawl - TODO confirm

    @dataclass
    class Seller(ToolRequest):
        """Amazon Seller Information Scraper"""

        SPIDER_ID = "amazon_seller_by-url"
        SPIDER_NAME = "amazon.com"

        url: str  # Seller page URL (required)

    @dataclass
    class Search(ToolRequest):
        """Amazon Product Listing Scraper"""

        SPIDER_ID = "amazon_product-list_by-keywords-domain"
        SPIDER_NAME = "amazon.com"

        keyword: str  # Search keywords (required)
        domain: str = "amazon.com"  # Marketplace domain to search
        page_turning: int = 1  # presumably number of result pages to crawl - TODO confirm
        sort_by: str | None = None  # Best Sellers, Newest Arrivals, etc.
        min_price: float | None = None  # Price range lower bound
        max_price: float | None = None  # Price range upper bound
        get_sponsored: bool | None = None  # presumably include sponsored listings - TODO confirm
thordata/tools/search.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Search Engine & Map Scraper Tools (Google, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
class GoogleMaps:
    """Namespace for Google Maps tools."""

    @dataclass
    class Details(ToolRequest):
        """Google Maps Details Information Scraper"""

        SPIDER_ID = "google_map-details_by-url"
        SPIDER_NAME = "google.com"

        url: str  # Google Maps URL

    @dataclass
    class Reviews(ToolRequest):
        """Google Maps Review Information Scraper"""

        SPIDER_ID = "google_comment_by-url"
        SPIDER_NAME = "google.com"

        url: str  # Place URL whose reviews should be collected
        days_limit: int | None = None  # Crawl reviews within X days
33
+
34
+
class GoogleShopping:
    """Namespace for Google Shopping tools."""

    @dataclass
    class Product(ToolRequest):
        """Google Shopping Information Scraper"""

        SPIDER_ID = "google_shopping_by-url"
        SPIDER_NAME = "google.com"

        url: str  # Product listing URL (required)
        country: str | None = None  # e.g. "US"
47
+
48
+
class GooglePlay:
    """Namespace for Google Play Store tools."""

    @dataclass
    class AppInfo(ToolRequest):
        """Google Play Store Information Scraper"""

        SPIDER_ID = "google-play-store_information_by-url"
        SPIDER_NAME = "google.com"

        app_url: str  # Play Store app page URL (required)
        country: str | None = None  # presumably store country code - TODO confirm

    @dataclass
    class Reviews(ToolRequest):
        """Google Play Store Reviews Scraper"""

        SPIDER_ID = "google-play-store_reviews_by-url"
        SPIDER_NAME = "google.com"

        app_url: str  # Play Store app page URL (required)
        num_of_reviews: int | None = None  # Maximum number of reviews to collect
        start_date: str | None = None  # yyyy-mm-dd
        end_date: str | None = None  # presumably yyyy-mm-dd like start_date - TODO confirm
        country: str | None = None  # presumably store country code - TODO confirm
thordata/tools/social.py ADDED
@@ -0,0 +1,190 @@
1
+ """
2
+ Social Media Scraper Tools.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
class TikTok:
    """Namespace for TikTok tools."""

    @dataclass
    class Post(ToolRequest):
        """TikTok Post Information Scraper"""

        SPIDER_ID = "tiktok_posts_by-url"
        SPIDER_NAME = "tiktok.com"
        url: str  # Post URL (required)
        page_turning: int | None = None  # presumably number of pages to fetch - TODO confirm

    @dataclass
    class Comment(ToolRequest):
        """TikTok Comment Scraper"""

        SPIDER_ID = "tiktok_comment_by-url"
        SPIDER_NAME = "tiktok.com"
        url: str  # Post URL whose comments should be collected
        page_turning: int | None = None

    @dataclass
    class Profile(ToolRequest):
        """TikTok Profile Information Scraper"""

        SPIDER_ID = "tiktok_profiles_by-url"
        SPIDER_NAME = "tiktok.com"

        url: str  # Profile URL (e.g. https://www.tiktok.com/@user)
        search_url: str | None = None  # presumably a search-results URL - TODO confirm

        country: str | None = None  # presumably country code filter - TODO confirm
        page_turning: int | None = None

    @dataclass
    class Shop(ToolRequest):
        """TikTok Shop Information Scraper"""

        SPIDER_ID = "tiktok_shop_by-url"
        SPIDER_NAME = "tiktok.com"
        url: str  # Shop URL (required)
        category_url: str | None = None  # Optional category page URL
        keyword: str | None = None  # Optional search keyword
        page_turning: int | None = None
54
+
55
+
class Facebook:
    """Namespace for Facebook tools."""

    @dataclass
    class Posts(ToolRequest):
        """Facebook Posts Scraper"""

        SPIDER_ID = "facebook_post_by-keywords"
        SPIDER_NAME = "facebook.com"
        keyword: str  # Search keywords (required)
        recent_posts: bool | None = None  # presumably restrict to recent posts - TODO confirm
        date: str | None = None  # Year 2025 etc.
        number: int = 10  # presumably number of posts to collect - TODO confirm

    @dataclass
    class PostDetails(ToolRequest):
        """Facebook Post Details Scraper"""

        SPIDER_ID = "facebook_post_by-posts-url"
        SPIDER_NAME = "facebook.com"
        url: str  # Post URL (required)
75
+
76
+
class Instagram:
    """Namespace for Instagram tools."""

    @dataclass
    class Profile(ToolRequest):
        """Instagram Profile Scraper"""

        SPIDER_ID = "ins_profiles_by-username"
        SPIDER_NAME = "instagram.com"
        username: str  # Instagram username (required)
        profileurl: str | None = None  # Optional full profile URL

    @dataclass
    class Post(ToolRequest):
        """Instagram Post Information Scraper"""

        SPIDER_ID = "ins_posts_by-profileurl"
        SPIDER_NAME = "instagram.com"
        profileurl: str  # Profile URL to collect posts from (required)
        resultsLimit: int = 10  # camelCase presumably mirrors the upstream API parameter name - TODO confirm
        start_date: str | None = None  # presumably yyyy-mm-dd - TODO confirm
        end_date: str | None = None
        post_type: str | None = None  # Post or Reel

    @dataclass
    class Reel(ToolRequest):
        """Instagram Reel Information Scraper"""

        SPIDER_ID = "ins_reel_by-url"
        SPIDER_NAME = "instagram.com"
        url: str  # Reel or profile URL (required)
        num_of_posts: int | None = None  # Maximum number of reels to collect

    @dataclass
    class Comment(ToolRequest):
        """Instagram Post Comment Scraper"""

        SPIDER_ID = "ins_comment_by-posturl"
        SPIDER_NAME = "instagram.com"
        posturl: str  # Post URL whose comments should be collected (required)
115
+
116
+
class Twitter:
    """Namespace for Twitter/X tools."""

    @dataclass
    class Profile(ToolRequest):
        """Twitter(X) Profile Scraper"""

        SPIDER_ID = "twitter_profiles_by-url"
        SPIDER_NAME = "twitter.com"
        url: str  # Profile URL (required)
        max_number_of_posts: int | None = None  # Cap on posts collected from the profile
        user_name: str | None = None  # Optional handle, presumably an alternative to url - TODO confirm

    @dataclass
    class Post(ToolRequest):
        """
        Twitter(X) Post Information Scraper
        Updates based on integration snippet:
        - SPIDER_NAME is 'x.com'
        - Only 'url' is required.
        """

        SPIDER_ID = "twitter_by-posturl_by-url"
        SPIDER_NAME = "x.com"  # Updated from snippet

        url: str  # Post URL (e.g. https://x.com/user/status/123)

        start_date: str | None = None  # presumably yyyy-mm-dd - TODO confirm
        end_date: str | None = None
144
+
145
+
class LinkedIn:
    """Namespace for LinkedIn tools."""

    @dataclass
    class Company(ToolRequest):
        """LinkedIn Company Information Scraper"""

        SPIDER_ID = "linkedin_company_information_by-url"
        SPIDER_NAME = "linkedin.com"
        url: str  # Company page URL (required)

    @dataclass
    class Jobs(ToolRequest):
        """LinkedIn Job Listing Scraper"""

        SPIDER_ID = "linkedin_job_listings_information_by-job-listing-url"
        SPIDER_NAME = "linkedin.com"
        job_listing_url: str  # Job listing URL (required)
        location: str  # Location filter (required)
        job_url: str | None = None  # Optional individual job URL
        page_turning: int | None = None  # presumably number of pages to fetch - TODO confirm
        keyword: str | None = None  # Optional search keyword
        remote: str | None = None  # On_site, Remote, Hybrid
167
+
168
+
class Reddit:
    """Namespace for Reddit tools."""

    @dataclass
    class Posts(ToolRequest):
        """Reddit Post Information Scraper"""

        SPIDER_ID = "reddit_posts_by-url"
        SPIDER_NAME = "reddit.com"
        url: str  # Post or listing URL (required)
        keyword: str | None = None  # Optional search keyword
        subreddit_url: str | None = None  # Optional subreddit URL to scope the crawl
        num_of_posts: int | None = None  # Maximum number of posts to collect
        sort_by: str | None = None  # Relevance, Hot, Top, New

    @dataclass
    class Comment(ToolRequest):
        """Reddit Post Comment Scraper"""

        SPIDER_ID = "reddit_comment_by-url"
        SPIDER_NAME = "reddit.com"
        url: str  # Post URL whose comments should be collected
        days_back: int | None = None  # presumably only comments from the last N days - TODO confirm
        load_all_replies: bool | None = None  # presumably expand nested reply threads - TODO confirm
thordata/tools/video.py ADDED
@@ -0,0 +1,81 @@
1
+ """
2
+ Video & Audio Scraper Tools (YouTube, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+ from ..types.common import CommonSettings
10
+ from .base import ToolRequest, VideoToolRequest
11
+
12
+
class YouTube:
    """Namespace for YouTube tools."""

    @dataclass
    class VideoDownload(VideoToolRequest):
        """YouTube Video File Scraper (Download). Uses video_builder."""

        SPIDER_ID = "youtube_video_by-url"
        SPIDER_NAME = "youtube.com"

        url: str  # Video URL
        # Shared video-task settings; excluded from task parameters by
        # ToolRequest.to_task_parameters.
        common_settings: CommonSettings = field(default_factory=CommonSettings)

    @dataclass
    class AudioDownload(VideoToolRequest):
        """YouTube Audio File Scraper (Download). Uses video_builder."""

        SPIDER_ID = "youtube_audio_by-url"
        SPIDER_NAME = "youtube.com"

        url: str  # Video URL to extract audio from
        common_settings: CommonSettings = field(default_factory=CommonSettings)

    @dataclass
    class SubtitleDownload(VideoToolRequest):
        """YouTube Subtitle File Scraper. Uses video_builder."""

        SPIDER_ID = "youtube_transcript_by-id"
        SPIDER_NAME = "youtube.com"

        video_id: str  # YouTube video id (required)
        subtitles_type: str | None = None  # Auto generated / user uploaded
        common_settings: CommonSettings = field(default_factory=CommonSettings)

    @dataclass
    class Profile(VideoToolRequest):
        """YouTube Profile Scraper. Uses video_builder."""

        SPIDER_ID = "youtube_profiles_by-keyword"
        SPIDER_NAME = "youtube.com"

        url: str  # Channel URL
        page_turning: int = 1  # presumably number of pages to fetch - TODO confirm
        keyword: str | None = None  # Optional search keyword
        common_settings: CommonSettings = field(default_factory=CommonSettings)

    @dataclass
    class Comments(VideoToolRequest):
        """YouTube Comment Information Scraper. Uses video_builder."""

        SPIDER_ID = "youtube_comment_by-id"
        SPIDER_NAME = "youtube.com"

        video_id: str  # YouTube video id (required)
        num_of_comments: int | None = None  # Maximum number of comments to collect
        sort_by: str | None = None  # Top comments / Newest first
        common_settings: CommonSettings = field(default_factory=CommonSettings)

    @dataclass
    class VideoInfo(ToolRequest):
        """YouTube Video Post Scraper (Metadata only). Standard builder."""

        # Note: This one does NOT inherit from VideoToolRequest because it uses the standard builder
        # and doesn't support common_settings in the same way.
        SPIDER_ID = "youtube_video-post_by-url"
        SPIDER_NAME = "youtube.com"

        url: str  # Channel Video URL
        # NOTE(review): annotated str here while similar counters elsewhere
        # (e.g. num_of_comments above) are int - confirm the API expects a string.
        num_of_posts: str | None = None
thordata/types/__init__.py ADDED
@@ -0,0 +1,77 @@
"""
Thordata Data Types and Models.
"""

from .common import (
    CommonSettings,
    Continent,
    Country,
    Device,
    OutputFormat,
    ThordataBaseConfig,
    normalize_enum_value,
)
from .proxy import (
    ProxyConfig,
    ProxyHost,
    ProxyPort,
    ProxyProduct,
    ProxyServer,
    ProxyType,
    ProxyUser,
    ProxyUserList,
    SessionType,
    StaticISPProxy,
    StickySession,
)
from .serp import (
    BingSearchType,
    Engine,
    GoogleSearchType,
    GoogleTbm,
    SerpRequest,
    TimeRange,
)
from .task import (
    DataFormat,
    ScraperTaskConfig,
    TaskStatus,
    TaskStatusResponse,
    UsageStatistics,
    VideoTaskConfig,
)
from .universal import UniversalScrapeRequest

# Public API, grouped by the submodule each name comes from.
__all__ = [
    # common
    "CommonSettings",
    "Device",
    "OutputFormat",
    "ThordataBaseConfig",
    "Continent",
    "Country",
    "normalize_enum_value",
    # proxy
    "ProxyConfig",
    "ProxyProduct",
    "ProxyServer",
    "ProxyType",
    "ProxyUser",
    "ProxyUserList",
    "SessionType",
    "StaticISPProxy",
    "StickySession",
    "ProxyHost",
    "ProxyPort",
    # serp
    "BingSearchType",
    "Engine",
    "GoogleSearchType",
    "GoogleTbm",
    "SerpRequest",
    "TimeRange",
    # task
    "DataFormat",
    "ScraperTaskConfig",
    "TaskStatus",
    "TaskStatusResponse",
    "UsageStatistics",
    "VideoTaskConfig",
    # universal
    "UniversalScrapeRequest",
]