thordata-sdk 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ """
2
+ Professional Platform Scraper Tools (Indeed, Glassdoor, Crunchbase, etc.)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from .base import ToolRequest
10
+
11
+
12
+ class Indeed:
13
+ """Namespace for Indeed tools."""
14
+
15
+ @dataclass
16
+ class JobByUrl(ToolRequest):
17
+ """Indeed Job Listings Scraper by Job URL"""
18
+
19
+ SPIDER_ID = "indeed_job-listings_by-job-url"
20
+ SPIDER_NAME = "indeed.com"
21
+ job_url: str
22
+
23
+ @dataclass
24
+ class JobByKeyword(ToolRequest):
25
+ """Indeed Job Listings Scraper by Keyword"""
26
+
27
+ SPIDER_ID = "indeed_job-listings_by-keyword"
28
+ SPIDER_NAME = "indeed.com"
29
+ keyword: str
30
+ location: str
31
+ country: str | None = None
32
+ domain: str | None = None
33
+ date_posted: str | None = None
34
+ posted_by: str | None = None
35
+ pay: str | None = None
36
+ location_radius: str | None = None
37
+
38
+ @dataclass
39
+ class CompanyByListUrl(ToolRequest):
40
+ """Indeed Companies Info Scraper by Company List URL"""
41
+
42
+ SPIDER_ID = "indeed_companies-info_by-company-list-url"
43
+ SPIDER_NAME = "indeed.com"
44
+ company_list_url: str
45
+
46
+ @dataclass
47
+ class CompanyByKeyword(ToolRequest):
48
+ """Indeed Companies Info Scraper by Keyword"""
49
+
50
+ SPIDER_ID = "indeed_companies-info_by-keyword"
51
+ SPIDER_NAME = "indeed.com"
52
+ keyword: str
53
+
54
+ @dataclass
55
+ class CompanyByIndustryAndState(ToolRequest):
56
+ """Indeed Companies Info Scraper by Industry and State"""
57
+
58
+ SPIDER_ID = "indeed_companies-info_by-industry-and-state"
59
+ SPIDER_NAME = "indeed.com"
60
+ industry: str
61
+ state: str | None = None
62
+
63
+ @dataclass
64
+ class CompanyByUrl(ToolRequest):
65
+ """Indeed Companies Info Scraper by Company URL"""
66
+
67
+ SPIDER_ID = "indeed_companies-info_by-company-url"
68
+ SPIDER_NAME = "indeed.com"
69
+ company_url: str
70
+
71
+
72
+ class Glassdoor:
73
+ """Namespace for Glassdoor tools."""
74
+
75
+ @dataclass
76
+ class CompanyByUrl(ToolRequest):
77
+ """Glassdoor Company Overview Information Scraper by URL"""
78
+
79
+ SPIDER_ID = "glassdoor_company_by-url"
80
+ SPIDER_NAME = "glassdoor.com"
81
+ url: str
82
+
83
+ @dataclass
84
+ class CompanyByInputFilter(ToolRequest):
85
+ """Glassdoor Company Overview Information Scraper by Input Filter"""
86
+
87
+ SPIDER_ID = "glassdoor_company_by-inputfilter"
88
+ SPIDER_NAME = "glassdoor.com"
89
+ company_name: str
90
+ location: str | None = None
91
+ industries: str | None = None
92
+ Job_title: str | None = None # Note: capital J in API
93
+
94
+ @dataclass
95
+ class CompanyByKeywords(ToolRequest):
96
+ """Glassdoor Company Overview Information Scraper by Keywords"""
97
+
98
+ SPIDER_ID = "glassdoor_company_by-keywords"
99
+ SPIDER_NAME = "glassdoor.com"
100
+ search_url: str
101
+ max_search_results: int | None = None
102
+
103
+ @dataclass
104
+ class CompanyByListUrl(ToolRequest):
105
+ """Glassdoor Company Overview Information Scraper by List URL"""
106
+
107
+ SPIDER_ID = "glassdoor_company_by-listurl"
108
+ SPIDER_NAME = "glassdoor.com"
109
+ url: str
110
+
111
+ @dataclass
112
+ class JobByUrl(ToolRequest):
113
+ """Glassdoor Job Information Scraper by URL"""
114
+
115
+ SPIDER_ID = "glassdoor_joblistings_by-url"
116
+ SPIDER_NAME = "glassdoor.com"
117
+ url: str
118
+
119
+ @dataclass
120
+ class JobByKeywords(ToolRequest):
121
+ """Glassdoor Job Information Scraper by Keywords"""
122
+
123
+ SPIDER_ID = "glassdoor_joblistings_by-keywords"
124
+ SPIDER_NAME = "glassdoor.com"
125
+ keyword: str
126
+ location: str
127
+ country: str | None = None
128
+
129
+ @dataclass
130
+ class JobByListUrl(ToolRequest):
131
+ """Glassdoor Job Information Scraper by List URL"""
132
+
133
+ SPIDER_ID = "glassdoor_joblistings_by-listurl"
134
+ SPIDER_NAME = "glassdoor.com"
135
+ url: str
136
+
137
+
138
+ class Crunchbase:
139
+ """Namespace for Crunchbase tools."""
140
+
141
+ @dataclass
142
+ class CompanyByUrl(ToolRequest):
143
+ """Crunchbase Company Information Scraper by URL"""
144
+
145
+ SPIDER_ID = "crunchbase_company_by-url"
146
+ SPIDER_NAME = "crunchbase.com"
147
+ url: str
148
+
149
+ @dataclass
150
+ class CompanyByKeywords(ToolRequest):
151
+ """Crunchbase Company Information Scraper by Keywords"""
152
+
153
+ SPIDER_ID = "crunchbase_company_by-keywords"
154
+ SPIDER_NAME = "crunchbase.com"
155
+ keyword: str
thordata/tools/search.py CHANGED
@@ -13,13 +13,47 @@ class GoogleMaps:
13
13
  """Namespace for Google Maps tools."""
14
14
 
15
15
  @dataclass
16
- class Details(ToolRequest):
17
- """Google Maps Details Information Scraper"""
16
+ class DetailsByUrl(ToolRequest):
17
+ """Google Maps Details Scraper by URL."""
18
18
 
19
19
  SPIDER_ID = "google_map-details_by-url"
20
20
  SPIDER_NAME = "google.com"
21
21
 
22
- url: str # Google Maps URL
22
+ url: str
23
+
24
+ @dataclass
25
+ class DetailsByCid(ToolRequest):
26
+ """Google Maps Details Scraper by CID."""
27
+
28
+ SPIDER_ID = "google_map-details_by-cid"
29
+ SPIDER_NAME = "google.com"
30
+
31
+ CID: str
32
+
33
+ @dataclass
34
+ class DetailsByLocation(ToolRequest):
35
+ """Google Maps Details Scraper by Location keyword + country (+ optional lat/long/zoom).""" # noqa: E501
36
+
37
+ SPIDER_ID = "google_map-details_by-location"
38
+ SPIDER_NAME = "google.com"
39
+
40
+ country: str
41
+ keyword: str
42
+ lat: str | None = None
43
+ long: str | None = None
44
+ zoom_level: str | None = None
45
+
46
+ @dataclass
47
+ class DetailsByPlaceId(ToolRequest):
48
+ """Google Maps Details Scraper by Place ID."""
49
+
50
+ SPIDER_ID = "google_map-details_by-placeid"
51
+ SPIDER_NAME = "google.com"
52
+
53
+ place_id: str
54
+
55
+ # Backward compatible alias: keep old name working
56
+ Details = DetailsByUrl
23
57
 
24
58
  @dataclass
25
59
  class Reviews(ToolRequest):
@@ -37,14 +71,22 @@ class GoogleShopping:
37
71
 
38
72
  @dataclass
39
73
  class Product(ToolRequest):
40
- """Google Shopping Information Scraper"""
74
+ """Google Shopping Information Scraper by URL"""
41
75
 
42
76
  SPIDER_ID = "google_shopping_by-url"
43
77
  SPIDER_NAME = "google.com"
44
-
45
78
  url: str
46
79
  country: str | None = None # e.g. "US"
47
80
 
81
+ @dataclass
82
+ class ProductByKeywords(ToolRequest):
83
+ """Google Shopping Information Scraper by Keywords"""
84
+
85
+ SPIDER_ID = "google_shopping_by-keywords"
86
+ SPIDER_NAME = "google.com"
87
+ keyword: str
88
+ country: str | None = None # e.g. "US"
89
+
48
90
 
49
91
  class GooglePlay:
50
92
  """Namespace for Google Play Store tools."""
thordata/tools/social.py CHANGED
@@ -12,12 +12,47 @@ from .base import ToolRequest
12
12
  class TikTok:
13
13
  @dataclass
14
14
  class Post(ToolRequest):
15
- """TikTok Post Information Scraper"""
15
+ """TikTok Post Information Scraper by URL"""
16
16
 
17
17
  SPIDER_ID = "tiktok_posts_by-url"
18
18
  SPIDER_NAME = "tiktok.com"
19
19
  url: str
20
- page_turning: int | None = None
20
+ country: str | None = None
21
+
22
+ @dataclass
23
+ class PostsByKeywords(ToolRequest):
24
+ """TikTok Post Information Scraper by Keywords"""
25
+
26
+ SPIDER_ID = "tiktok_posts_by-keywords"
27
+ SPIDER_NAME = "tiktok.com"
28
+ search_keyword: str
29
+ num_of_posts: int | None = None
30
+ posts_to_not_include: str | None = None
31
+ country: str | None = None
32
+
33
+ @dataclass
34
+ class PostsByProfileUrl(ToolRequest):
35
+ """TikTok Post Information Scraper by Profile URL"""
36
+
37
+ SPIDER_ID = "tiktok_posts_by-profileurl"
38
+ SPIDER_NAME = "tiktok.com"
39
+ url: str
40
+ start_date: str | None = None
41
+ end_date: str | None = None
42
+ num_of_posts: int | None = None
43
+ what_to_collect: str | None = None
44
+ post_type: str | None = None
45
+ posts_to_not_include: str | None = None
46
+ country: str | None = None
47
+
48
+ @dataclass
49
+ class PostsByListUrl(ToolRequest):
50
+ """TikTok Post Information Scraper by List URL"""
51
+
52
+ SPIDER_ID = "tiktok_posts_by-listurl"
53
+ SPIDER_NAME = "tiktok.com"
54
+ url: str
55
+ num_of_posts: int | None = None
21
56
 
22
57
  @dataclass
23
58
  class Comment(ToolRequest):
@@ -30,33 +65,62 @@ class TikTok:
30
65
 
31
66
  @dataclass
32
67
  class Profile(ToolRequest):
33
- """TikTok Profile Information Scraper"""
68
+ """TikTok Profile Information Scraper by URL"""
34
69
 
35
70
  SPIDER_ID = "tiktok_profiles_by-url"
36
71
  SPIDER_NAME = "tiktok.com"
37
-
38
72
  url: str # Profile URL (e.g. https://www.tiktok.com/@user)
39
- search_url: str | None = None
73
+ country: str | None = None
74
+
75
+ @dataclass
76
+ class ProfilesByListUrl(ToolRequest):
77
+ """TikTok Profile Information Scraper by List URL"""
40
78
 
79
+ SPIDER_ID = "tiktok_profiles_by-listurl"
80
+ SPIDER_NAME = "tiktok.com"
81
+ search_url: str
41
82
  country: str | None = None
42
83
  page_turning: int | None = None
43
84
 
44
85
  @dataclass
45
86
  class Shop(ToolRequest):
46
- """TikTok Shop Information Scraper"""
87
+ """TikTok Shop Information Scraper by URL"""
47
88
 
48
89
  SPIDER_ID = "tiktok_shop_by-url"
49
90
  SPIDER_NAME = "tiktok.com"
50
91
  url: str
51
- category_url: str | None = None
52
- keyword: str | None = None
92
+
93
+ @dataclass
94
+ class ShopByCategoryUrl(ToolRequest):
95
+ """TikTok Shop Information Scraper by Category URL"""
96
+
97
+ SPIDER_ID = "tiktok_shop_by-category-url"
98
+ SPIDER_NAME = "tiktok.com"
99
+ category_url: str
100
+
101
+ @dataclass
102
+ class ShopByKeywords(ToolRequest):
103
+ """TikTok Shop Information Scraper by Keywords"""
104
+
105
+ SPIDER_ID = "tiktok_shop_by-keywords"
106
+ SPIDER_NAME = "tiktok.com"
107
+ keyword: str
108
+ domain: str = "https://www.tiktok.com/shop"
53
109
  page_turning: int | None = None
54
110
 
55
111
 
56
112
  class Facebook:
113
+ @dataclass
114
+ class PostDetails(ToolRequest):
115
+ """Facebook Post Details Scraper"""
116
+
117
+ SPIDER_ID = "facebook_post_by-posts-url"
118
+ SPIDER_NAME = "facebook.com"
119
+ url: str
120
+
57
121
  @dataclass
58
122
  class Posts(ToolRequest):
59
- """Facebook Posts Scraper"""
123
+ """Facebook Posts Scraper by Keywords"""
60
124
 
61
125
  SPIDER_ID = "facebook_post_by-keywords"
62
126
  SPIDER_NAME = "facebook.com"
@@ -66,27 +130,70 @@ class Facebook:
66
130
  number: int = 10
67
131
 
68
132
  @dataclass
69
- class PostDetails(ToolRequest):
70
- """Facebook Post Details Scraper"""
133
+ class EventByEventListUrl(ToolRequest):
134
+ """Facebook Events Scraper by Event List URL"""
71
135
 
72
- SPIDER_ID = "facebook_post_by-posts-url"
136
+ SPIDER_ID = "facebook_event_by-eventlist-url"
137
+ SPIDER_NAME = "facebook.com"
138
+ url: str
139
+ upcoming_events_only: str | None = None
140
+
141
+ @dataclass
142
+ class EventBySearchUrl(ToolRequest):
143
+ """Facebook Events Scraper by Search URL"""
144
+
145
+ SPIDER_ID = "facebook_event_by-search-url"
146
+ SPIDER_NAME = "facebook.com"
147
+ url: str
148
+
149
+ @dataclass
150
+ class EventByEventsUrl(ToolRequest):
151
+ """Facebook Events Scraper by Events URL"""
152
+
153
+ SPIDER_ID = "facebook_event_by-events-url"
154
+ SPIDER_NAME = "facebook.com"
155
+ url: str
156
+
157
+ @dataclass
158
+ class Profile(ToolRequest):
159
+ """Facebook Profile Scraper"""
160
+
161
+ SPIDER_ID = "facebook_profile_by-profiles-url"
162
+ SPIDER_NAME = "facebook.com"
163
+ url: str
164
+
165
+ @dataclass
166
+ class Comment(ToolRequest):
167
+ """Facebook Post Comments Scraper"""
168
+
169
+ SPIDER_ID = "facebook_comment_by-comments-url"
73
170
  SPIDER_NAME = "facebook.com"
74
171
  url: str
172
+ get_all_replies: str | None = None
173
+ limit_records: str | None = None
174
+ comments_sort: str | None = None # All comments
75
175
 
76
176
 
77
177
  class Instagram:
78
178
  @dataclass
79
179
  class Profile(ToolRequest):
80
- """Instagram Profile Scraper"""
180
+ """Instagram Profile Scraper by Username"""
81
181
 
82
182
  SPIDER_ID = "ins_profiles_by-username"
83
183
  SPIDER_NAME = "instagram.com"
84
184
  username: str
85
- profileurl: str | None = None
185
+
186
+ @dataclass
187
+ class ProfileByUrl(ToolRequest):
188
+ """Instagram Profile Scraper by Profile URL"""
189
+
190
+ SPIDER_ID = "ins_profiles_by-profileurl"
191
+ SPIDER_NAME = "instagram.com"
192
+ profileurl: str
86
193
 
87
194
  @dataclass
88
195
  class Post(ToolRequest):
89
- """Instagram Post Information Scraper"""
196
+ """Instagram Post Information Scraper by Profile URL"""
90
197
 
91
198
  SPIDER_ID = "ins_posts_by-profileurl"
92
199
  SPIDER_NAME = "instagram.com"
@@ -96,14 +203,45 @@ class Instagram:
96
203
  end_date: str | None = None
97
204
  post_type: str | None = None # Post or Reel
98
205
 
206
+ @dataclass
207
+ class PostByUrl(ToolRequest):
208
+ """Instagram Post Information Scraper by Post URL"""
209
+
210
+ SPIDER_ID = "ins_posts_by-posturl"
211
+ SPIDER_NAME = "instagram.com"
212
+ posturl: str
213
+
99
214
  @dataclass
100
215
  class Reel(ToolRequest):
101
- """Instagram Reel Information Scraper"""
216
+ """Instagram Reel Information Scraper by URL"""
102
217
 
103
218
  SPIDER_ID = "ins_reel_by-url"
104
219
  SPIDER_NAME = "instagram.com"
105
220
  url: str
221
+
222
+ @dataclass
223
+ class AllReel(ToolRequest):
224
+ """Instagram All Reel Information Scraper by URL"""
225
+
226
+ SPIDER_ID = "ins_allreel_by-url"
227
+ SPIDER_NAME = "instagram.com"
228
+ url: str
229
+ num_of_posts: int | None = None
230
+ posts_to_not_include: str | None = None
231
+ start_date: str | None = None
232
+ end_date: str | None = None
233
+
234
+ @dataclass
235
+ class ReelByListUrl(ToolRequest):
236
+ """Instagram Reel Information Scraper by List URL"""
237
+
238
+ SPIDER_ID = "ins_reel_by-listurl"
239
+ SPIDER_NAME = "instagram.com"
240
+ url: str
106
241
  num_of_posts: int | None = None
242
+ posts_to_not_include: str | None = None
243
+ start_date: str | None = None
244
+ end_date: str | None = None
107
245
 
108
246
  @dataclass
109
247
  class Comment(ToolRequest):
@@ -117,30 +255,35 @@ class Instagram:
117
255
  class Twitter:
118
256
  @dataclass
119
257
  class Profile(ToolRequest):
120
- """Twitter(X) Profile Scraper"""
258
+ """Twitter(X) Profile Scraper by Profile URL"""
121
259
 
122
- SPIDER_ID = "twitter_profiles_by-url"
123
- SPIDER_NAME = "twitter.com"
260
+ SPIDER_ID = "twitter_profile_by-profileurl"
261
+ SPIDER_NAME = "x.com"
124
262
  url: str
125
- max_number_of_posts: int | None = None
126
- user_name: str | None = None
127
263
 
128
264
  @dataclass
129
- class Post(ToolRequest):
130
- """
131
- Twitter(X) Post Information Scraper
132
- Updates based on integration snippet:
133
- - SPIDER_NAME is 'x.com'
134
- - Only 'url' is required.
135
- """
265
+ class ProfileByUsername(ToolRequest):
266
+ """Twitter(X) Profile Scraper by Username"""
267
+
268
+ SPIDER_ID = "twitter_profile_by-username"
269
+ SPIDER_NAME = "x.com"
270
+ user_name: str
136
271
 
137
- SPIDER_ID = "twitter_by-posturl_by-url"
138
- SPIDER_NAME = "x.com" # Updated from snippet
272
+ @dataclass
273
+ class Post(ToolRequest):
274
+ """Twitter(X) Post Information Scraper by Post URL"""
139
275
 
276
+ SPIDER_ID = "twitter_post_by-posturl"
277
+ SPIDER_NAME = "x.com"
140
278
  url: str # Post URL (e.g. https://x.com/user/status/123)
141
279
 
142
- start_date: str | None = None
143
- end_date: str | None = None
280
+ @dataclass
281
+ class PostByProfileUrl(ToolRequest):
282
+ """Twitter(X) Post Information Scraper by Profile URL"""
283
+
284
+ SPIDER_ID = "twitter_post_by-profileurl"
285
+ SPIDER_NAME = "x.com"
286
+ url: str # Profile URL
144
287
 
145
288
 
146
289
  class LinkedIn:
@@ -154,30 +297,70 @@ class LinkedIn:
154
297
 
155
298
  @dataclass
156
299
  class Jobs(ToolRequest):
157
- """LinkedIn Job Listing Scraper"""
300
+ """LinkedIn Job Listing Scraper by Job Listing URL"""
158
301
 
159
302
  SPIDER_ID = "linkedin_job_listings_information_by-job-listing-url"
160
303
  SPIDER_NAME = "linkedin.com"
161
304
  job_listing_url: str
305
+ page_turning: int | None = None
306
+
307
+ @dataclass
308
+ class JobByUrl(ToolRequest):
309
+ """LinkedIn Job Listing Scraper by Job URL"""
310
+
311
+ SPIDER_ID = "linkedin_job_listings_information_by-job-url"
312
+ SPIDER_NAME = "linkedin.com"
313
+ job_url: str
314
+
315
+ @dataclass
316
+ class JobByKeyword(ToolRequest):
317
+ """LinkedIn Job Listing Scraper by Keyword"""
318
+
319
+ SPIDER_ID = "linkedin_job_listings_information_by-keyword"
320
+ SPIDER_NAME = "linkedin.com"
162
321
  location: str
163
- job_url: str | None = None
322
+ keyword: str
323
+ time_range: str | None = None
324
+ experience_level: str | None = None
325
+ job_type: str | None = None
326
+ remote: str | None = None
327
+ company: str | None = None
328
+ selective_search: str | None = None
329
+ jobs_to_not_include: str | None = None
330
+ location_radius: str | None = None
164
331
  page_turning: int | None = None
165
- keyword: str | None = None
166
- remote: str | None = None # On_site, Remote, Hybrid
167
332
 
168
333
 
169
334
  class Reddit:
170
335
  @dataclass
171
336
  class Posts(ToolRequest):
172
- """Reddit Post Information Scraper"""
337
+ """Reddit Post Information Scraper by URL"""
173
338
 
174
339
  SPIDER_ID = "reddit_posts_by-url"
175
340
  SPIDER_NAME = "reddit.com"
176
341
  url: str
177
- keyword: str | None = None
178
- subreddit_url: str | None = None
342
+
343
+ @dataclass
344
+ class PostsByKeywords(ToolRequest):
345
+ """Reddit Post Information Scraper by Keywords"""
346
+
347
+ SPIDER_ID = "reddit_posts_by-keywords"
348
+ SPIDER_NAME = "reddit.com"
349
+ keyword: str
350
+ date: str | None = None # All time
351
+ num_of_posts: int | None = None
352
+ sort_by: str | None = None
353
+
354
+ @dataclass
355
+ class PostsBySubredditUrl(ToolRequest):
356
+ """Reddit Post Information Scraper by Subreddit URL"""
357
+
358
+ SPIDER_ID = "reddit_posts_by-subredditurl"
359
+ SPIDER_NAME = "reddit.com"
360
+ url: str
361
+ sort_by: str | None = None
179
362
  num_of_posts: int | None = None
180
- sort_by: str | None = None # Relevance, Hot, Top, New
363
+ sort_by_time: str | None = None # All Time
181
364
 
182
365
  @dataclass
183
366
  class Comment(ToolRequest):
@@ -187,4 +370,5 @@ class Reddit:
187
370
  SPIDER_NAME = "reddit.com"
188
371
  url: str
189
372
  days_back: int | None = None
190
- load_all_replies: bool | None = None
373
+ load_all_replies: str | None = None
374
+ comment_limit: int | None = None