twitterscraper-ruby 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +2 -2
- data/lib/twitterscraper/cli.rb +5 -1
- data/lib/twitterscraper/client.rb +1 -1
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +37 -24
- data/lib/twitterscraper/tweet.rb +10 -3
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e98afb0444b724e0c9c29f6b888c017166859d1252337f34686526060ca8368d
|
4
|
+
data.tar.gz: 5ef3ff7f86d9a0c9dd1883d55498049d0f164aa7c71a7c9c2bbf0a89ae9bb32c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04ef61c57545cbbdbbe5da53d1f24cf064b7d1c61ad3da9bc57a361d24ed24480c4f68fa1fea67345ceff4d4d4685f046a4586f55ebe8f3dc0ca6332c7c2d928
|
7
|
+
data.tar.gz: f5fd19c8289c7caf574dc78f754ba9aaf9446f3819b394d14414909b1505e0f9b25181802448d28285de8db81a27e12e0e65d1e1a0b2b0e5df8e7e73d6263e14
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -137,8 +137,8 @@ $ cat tweets.json | jq . | less
|
|
137
137
|
| ------------- | ------------- | ------------- |
|
138
138
|
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
139
139
|
| `--query` | Specify a keyword used during the search. | |
|
140
|
-
| `--start_date` |
|
141
|
-
| `--end_date` |
|
140
|
+
| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
141
|
+
| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
142
142
|
| `--lang` | Retrieve tweets written in a specific language. | |
|
143
143
|
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
144
144
|
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -20,6 +20,7 @@ module Twitterscraper
|
|
20
20
|
end_date: options['end_date'],
|
21
21
|
lang: options['lang'],
|
22
22
|
limit: options['limit'],
|
23
|
+
daily_limit: options['daily_limit'],
|
23
24
|
threads: options['threads'],
|
24
25
|
proxy: options['proxy']
|
25
26
|
}
|
@@ -63,6 +64,7 @@ module Twitterscraper
|
|
63
64
|
'end_date:',
|
64
65
|
'lang:',
|
65
66
|
'limit:',
|
67
|
+
'daily_limit:',
|
66
68
|
'threads:',
|
67
69
|
'output:',
|
68
70
|
'format:',
|
@@ -72,8 +74,10 @@ module Twitterscraper
|
|
72
74
|
'verbose',
|
73
75
|
)
|
74
76
|
|
77
|
+
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
75
78
|
options['lang'] ||= ''
|
76
79
|
options['limit'] = (options['limit'] || 100).to_i
|
80
|
+
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
77
81
|
options['threads'] = (options['threads'] || 2).to_i
|
78
82
|
options['format'] ||= 'json'
|
79
83
|
options['output'] ||= "tweets.#{options['format']}"
|
@@ -101,7 +105,7 @@ module Twitterscraper
|
|
101
105
|
end
|
102
106
|
|
103
107
|
def print_version
|
104
|
-
puts "twitterscraper-#{
|
108
|
+
puts "twitterscraper-#{VERSION}"
|
105
109
|
end
|
106
110
|
end
|
107
111
|
end
|
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
|
|
17
17
|
reload
|
18
18
|
end
|
19
19
|
@cur_index += 1
|
20
|
-
|
21
|
-
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
-
item
|
20
|
+
@items[@cur_index - 1]
|
23
21
|
end
|
24
22
|
|
25
23
|
def size
|
26
24
|
@items.size
|
27
25
|
end
|
28
26
|
|
27
|
+
def empty?
|
28
|
+
@items.empty?
|
29
|
+
end
|
30
|
+
|
29
31
|
private
|
30
32
|
|
31
33
|
def reload
|
@@ -51,7 +53,6 @@ module Twitterscraper
|
|
51
53
|
proxies << ip + ':' + port
|
52
54
|
end
|
53
55
|
|
54
|
-
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
55
56
|
proxies.shuffle
|
56
57
|
rescue => e
|
57
58
|
if (retries -= 1) > 0
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
|
|
44
44
|
|
45
45
|
def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
|
46
46
|
return nil if stop_requested?
|
47
|
-
|
47
|
+
unless proxies.empty?
|
48
|
+
proxy = proxies.sample
|
49
|
+
logger.info("Using proxy #{proxy}")
|
50
|
+
end
|
51
|
+
Http.get(url, headers, proxy, timeout)
|
48
52
|
rescue => e
|
49
53
|
logger.debug "query_single_page: #{e.inspect}"
|
50
54
|
if (retries -= 1) > 0
|
51
|
-
logger.info
|
55
|
+
logger.info "Retrying... (Attempts left: #{retries - 1})"
|
52
56
|
retry
|
53
57
|
else
|
54
|
-
raise
|
58
|
+
raise Error.new("#{e.inspect} url=#{url}")
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
@@ -71,27 +75,27 @@ module Twitterscraper
|
|
71
75
|
end
|
72
76
|
|
73
77
|
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
74
|
-
logger.info
|
78
|
+
logger.info "Querying #{query}"
|
75
79
|
query = ERB::Util.url_encode(query)
|
76
80
|
|
77
81
|
url = build_query_url(query, lang, pos, from_user)
|
78
82
|
http_request = lambda do
|
79
|
-
logger.debug
|
83
|
+
logger.debug "Scraping tweets from #{url}"
|
80
84
|
get_single_page(url, headers, proxies)
|
81
85
|
end
|
82
86
|
|
83
87
|
if cache_enabled?
|
84
88
|
client = Cache.new
|
85
89
|
if (response = client.read(url))
|
86
|
-
logger.debug
|
90
|
+
logger.debug 'Fetching tweets from cache'
|
87
91
|
else
|
88
92
|
response = http_request.call
|
89
|
-
client.write(url, response)
|
93
|
+
client.write(url, response) unless stop_requested?
|
90
94
|
end
|
91
95
|
else
|
92
96
|
response = http_request.call
|
93
97
|
end
|
94
|
-
return [], nil if response.nil?
|
98
|
+
return [], nil if response.nil? || response.empty?
|
95
99
|
|
96
100
|
html, json_resp = parse_single_page(response, pos.nil?)
|
97
101
|
|
@@ -114,31 +118,31 @@ module Twitterscraper
|
|
114
118
|
|
115
119
|
def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
|
116
120
|
if query.nil? || query == ''
|
117
|
-
raise 'Please specify a search query.'
|
121
|
+
raise Error.new('Please specify a search query.')
|
118
122
|
end
|
119
123
|
|
120
124
|
if ERB::Util.url_encode(query).length >= 500
|
121
|
-
raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
|
125
|
+
raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
|
122
126
|
end
|
123
127
|
|
124
128
|
if start_date && end_date
|
125
129
|
if start_date == end_date
|
126
|
-
raise 'Please specify different values for :start_date and :end_date.'
|
130
|
+
raise Error.new('Please specify different values for :start_date and :end_date.')
|
127
131
|
elsif start_date > end_date
|
128
|
-
raise ':start_date must occur before :end_date.'
|
132
|
+
raise Error.new(':start_date must occur before :end_date.')
|
129
133
|
end
|
130
134
|
end
|
131
135
|
|
132
136
|
if start_date
|
133
137
|
if start_date < OLDEST_DATE
|
134
|
-
raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
|
138
|
+
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
135
139
|
end
|
136
140
|
end
|
137
141
|
|
138
142
|
if end_date
|
139
143
|
today = Date.today
|
140
144
|
if end_date > Date.today
|
141
|
-
raise ":end_date must be less than or equal to today(#{today})"
|
145
|
+
raise Error.new(":end_date must be less than or equal to today(#{today})")
|
142
146
|
end
|
143
147
|
end
|
144
148
|
end
|
@@ -156,27 +160,32 @@ module Twitterscraper
|
|
156
160
|
end
|
157
161
|
end
|
158
162
|
|
159
|
-
def main_loop(query, lang, limit, headers, proxies)
|
163
|
+
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
160
164
|
pos = nil
|
165
|
+
daily_tweets = []
|
161
166
|
|
162
167
|
while true
|
163
168
|
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
164
169
|
unless new_tweets.empty?
|
170
|
+
daily_tweets.concat(new_tweets)
|
171
|
+
daily_tweets.uniq! { |t| t.tweet_id }
|
172
|
+
|
165
173
|
@mutex.synchronize {
|
166
174
|
@all_tweets.concat(new_tweets)
|
167
175
|
@all_tweets.uniq! { |t| t.tweet_id }
|
168
176
|
}
|
169
177
|
end
|
170
|
-
logger.info
|
178
|
+
logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
|
171
179
|
|
172
180
|
break unless new_pos
|
181
|
+
break if daily_limit && daily_tweets.size >= daily_limit
|
173
182
|
break if @all_tweets.size >= limit
|
174
183
|
|
175
184
|
pos = new_pos
|
176
185
|
end
|
177
186
|
|
178
|
-
if @all_tweets.size >= limit
|
179
|
-
logger.
|
187
|
+
if !@stop_requested && @all_tweets.size >= limit
|
188
|
+
logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
|
180
189
|
@stop_requested = true
|
181
190
|
end
|
182
191
|
end
|
@@ -185,32 +194,36 @@ module Twitterscraper
|
|
185
194
|
@stop_requested
|
186
195
|
end
|
187
196
|
|
188
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
|
197
|
+
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
|
189
198
|
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
190
199
|
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
191
200
|
queries = build_queries(query, start_date, end_date)
|
192
201
|
threads = queries.size if threads > queries.size
|
193
|
-
proxies = proxy ?
|
202
|
+
proxies = proxy ? Proxy::Pool.new : []
|
194
203
|
|
195
204
|
validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
|
196
205
|
|
197
|
-
logger.
|
206
|
+
logger.debug "Fetch #{proxies.size} proxies" if proxy
|
207
|
+
logger.info "The number of threads #{threads}"
|
198
208
|
|
199
209
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
200
|
-
logger.info
|
210
|
+
logger.info "Headers #{headers}"
|
201
211
|
|
202
212
|
@all_tweets = []
|
203
213
|
@mutex = Mutex.new
|
204
214
|
@stop_requested = false
|
205
215
|
|
206
216
|
if threads > 1
|
217
|
+
Thread.abort_on_exception = true
|
218
|
+
logger.debug "Set 'Thread.abort_on_exception' to true"
|
219
|
+
|
207
220
|
Parallel.each(queries, in_threads: threads) do |query|
|
208
|
-
main_loop(query, lang, limit, headers, proxies)
|
221
|
+
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
209
222
|
raise Parallel::Break if stop_requested?
|
210
223
|
end
|
211
224
|
else
|
212
225
|
queries.each do |query|
|
213
|
-
main_loop(query, lang, limit, headers, proxies)
|
226
|
+
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
214
227
|
break if stop_requested?
|
215
228
|
end
|
216
229
|
end
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -59,12 +59,19 @@ module Twitterscraper
|
|
59
59
|
def from_tweets_html(html)
|
60
60
|
html.map do |tweet|
|
61
61
|
from_tweet_html(tweet)
|
62
|
-
end
|
62
|
+
end.compact
|
63
63
|
end
|
64
64
|
|
65
65
|
def from_tweet_html(html)
|
66
|
+
screen_name = html.attr('data-screen-name')
|
67
|
+
tweet_id = html.attr('data-tweet-id')&.to_i
|
68
|
+
|
69
|
+
unless html.to_s.include?('js-tweet-text-container')
|
70
|
+
Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
|
71
|
+
return nil
|
72
|
+
end
|
73
|
+
|
66
74
|
inner_html = Nokogiri::HTML(html.inner_html)
|
67
|
-
tweet_id = html.attr('data-tweet-id').to_i
|
68
75
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
69
76
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
70
77
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -89,7 +96,7 @@ module Twitterscraper
|
|
89
96
|
|
90
97
|
timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
|
91
98
|
new(
|
92
|
-
screen_name:
|
99
|
+
screen_name: screen_name,
|
93
100
|
name: html.attr('data-name'),
|
94
101
|
user_id: html.attr('data-user-id').to_i,
|
95
102
|
tweet_id: tweet_id,
|
data/lib/version.rb
CHANGED