twitterscraper-ruby 0.11.0 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +2 -2
- data/lib/twitterscraper/cli.rb +5 -1
- data/lib/twitterscraper/client.rb +1 -1
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +37 -24
- data/lib/twitterscraper/tweet.rb +10 -3
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e98afb0444b724e0c9c29f6b888c017166859d1252337f34686526060ca8368d
|
4
|
+
data.tar.gz: 5ef3ff7f86d9a0c9dd1883d55498049d0f164aa7c71a7c9c2bbf0a89ae9bb32c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04ef61c57545cbbdbbe5da53d1f24cf064b7d1c61ad3da9bc57a361d24ed24480c4f68fa1fea67345ceff4d4d4685f046a4586f55ebe8f3dc0ca6332c7c2d928
|
7
|
+
data.tar.gz: f5fd19c8289c7caf574dc78f754ba9aaf9446f3819b394d14414909b1505e0f9b25181802448d28285de8db81a27e12e0e65d1e1a0b2b0e5df8e7e73d6263e14
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -137,8 +137,8 @@ $ cat tweets.json | jq . | less
|
|
137
137
|
| ------------- | ------------- | ------------- |
|
138
138
|
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
139
139
|
| `--query` | Specify a keyword used during the search. | |
|
140
|
-
| `--start_date` |
|
141
|
-
| `--end_date` |
|
140
|
+
| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
141
|
+
| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
142
142
|
| `--lang` | Retrieve tweets written in a specific language. | |
|
143
143
|
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
144
144
|
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -20,6 +20,7 @@ module Twitterscraper
|
|
20
20
|
end_date: options['end_date'],
|
21
21
|
lang: options['lang'],
|
22
22
|
limit: options['limit'],
|
23
|
+
daily_limit: options['daily_limit'],
|
23
24
|
threads: options['threads'],
|
24
25
|
proxy: options['proxy']
|
25
26
|
}
|
@@ -63,6 +64,7 @@ module Twitterscraper
|
|
63
64
|
'end_date:',
|
64
65
|
'lang:',
|
65
66
|
'limit:',
|
67
|
+
'daily_limit:',
|
66
68
|
'threads:',
|
67
69
|
'output:',
|
68
70
|
'format:',
|
@@ -72,8 +74,10 @@ module Twitterscraper
|
|
72
74
|
'verbose',
|
73
75
|
)
|
74
76
|
|
77
|
+
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
75
78
|
options['lang'] ||= ''
|
76
79
|
options['limit'] = (options['limit'] || 100).to_i
|
80
|
+
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
77
81
|
options['threads'] = (options['threads'] || 2).to_i
|
78
82
|
options['format'] ||= 'json'
|
79
83
|
options['output'] ||= "tweets.#{options['format']}"
|
@@ -101,7 +105,7 @@ module Twitterscraper
|
|
101
105
|
end
|
102
106
|
|
103
107
|
def print_version
|
104
|
-
puts "twitterscraper-#{
|
108
|
+
puts "twitterscraper-#{VERSION}"
|
105
109
|
end
|
106
110
|
end
|
107
111
|
end
|
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
|
|
17
17
|
reload
|
18
18
|
end
|
19
19
|
@cur_index += 1
|
20
|
-
|
21
|
-
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
-
item
|
20
|
+
@items[@cur_index - 1]
|
23
21
|
end
|
24
22
|
|
25
23
|
def size
|
26
24
|
@items.size
|
27
25
|
end
|
28
26
|
|
27
|
+
def empty?
|
28
|
+
@items.empty?
|
29
|
+
end
|
30
|
+
|
29
31
|
private
|
30
32
|
|
31
33
|
def reload
|
@@ -51,7 +53,6 @@ module Twitterscraper
|
|
51
53
|
proxies << ip + ':' + port
|
52
54
|
end
|
53
55
|
|
54
|
-
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
55
56
|
proxies.shuffle
|
56
57
|
rescue => e
|
57
58
|
if (retries -= 1) > 0
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
|
|
44
44
|
|
45
45
|
def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
|
46
46
|
return nil if stop_requested?
|
47
|
-
|
47
|
+
unless proxies.empty?
|
48
|
+
proxy = proxies.sample
|
49
|
+
logger.info("Using proxy #{proxy}")
|
50
|
+
end
|
51
|
+
Http.get(url, headers, proxy, timeout)
|
48
52
|
rescue => e
|
49
53
|
logger.debug "query_single_page: #{e.inspect}"
|
50
54
|
if (retries -= 1) > 0
|
51
|
-
logger.info
|
55
|
+
logger.info "Retrying... (Attempts left: #{retries - 1})"
|
52
56
|
retry
|
53
57
|
else
|
54
|
-
raise
|
58
|
+
raise Error.new("#{e.inspect} url=#{url}")
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
@@ -71,27 +75,27 @@ module Twitterscraper
|
|
71
75
|
end
|
72
76
|
|
73
77
|
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
74
|
-
logger.info
|
78
|
+
logger.info "Querying #{query}"
|
75
79
|
query = ERB::Util.url_encode(query)
|
76
80
|
|
77
81
|
url = build_query_url(query, lang, pos, from_user)
|
78
82
|
http_request = lambda do
|
79
|
-
logger.debug
|
83
|
+
logger.debug "Scraping tweets from #{url}"
|
80
84
|
get_single_page(url, headers, proxies)
|
81
85
|
end
|
82
86
|
|
83
87
|
if cache_enabled?
|
84
88
|
client = Cache.new
|
85
89
|
if (response = client.read(url))
|
86
|
-
logger.debug
|
90
|
+
logger.debug 'Fetching tweets from cache'
|
87
91
|
else
|
88
92
|
response = http_request.call
|
89
|
-
client.write(url, response)
|
93
|
+
client.write(url, response) unless stop_requested?
|
90
94
|
end
|
91
95
|
else
|
92
96
|
response = http_request.call
|
93
97
|
end
|
94
|
-
return [], nil if response.nil?
|
98
|
+
return [], nil if response.nil? || response.empty?
|
95
99
|
|
96
100
|
html, json_resp = parse_single_page(response, pos.nil?)
|
97
101
|
|
@@ -114,31 +118,31 @@ module Twitterscraper
|
|
114
118
|
|
115
119
|
def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
|
116
120
|
if query.nil? || query == ''
|
117
|
-
raise 'Please specify a search query.'
|
121
|
+
raise Error.new('Please specify a search query.')
|
118
122
|
end
|
119
123
|
|
120
124
|
if ERB::Util.url_encode(query).length >= 500
|
121
|
-
raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
|
125
|
+
raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
|
122
126
|
end
|
123
127
|
|
124
128
|
if start_date && end_date
|
125
129
|
if start_date == end_date
|
126
|
-
raise 'Please specify different values for :start_date and :end_date.'
|
130
|
+
raise Error.new('Please specify different values for :start_date and :end_date.')
|
127
131
|
elsif start_date > end_date
|
128
|
-
raise ':start_date must occur before :end_date.'
|
132
|
+
raise Error.new(':start_date must occur before :end_date.')
|
129
133
|
end
|
130
134
|
end
|
131
135
|
|
132
136
|
if start_date
|
133
137
|
if start_date < OLDEST_DATE
|
134
|
-
raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
|
138
|
+
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
135
139
|
end
|
136
140
|
end
|
137
141
|
|
138
142
|
if end_date
|
139
143
|
today = Date.today
|
140
144
|
if end_date > Date.today
|
141
|
-
raise ":end_date must be less than or equal to today(#{today})"
|
145
|
+
raise Error.new(":end_date must be less than or equal to today(#{today})")
|
142
146
|
end
|
143
147
|
end
|
144
148
|
end
|
@@ -156,27 +160,32 @@ module Twitterscraper
|
|
156
160
|
end
|
157
161
|
end
|
158
162
|
|
159
|
-
def main_loop(query, lang, limit, headers, proxies)
|
163
|
+
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
160
164
|
pos = nil
|
165
|
+
daily_tweets = []
|
161
166
|
|
162
167
|
while true
|
163
168
|
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
164
169
|
unless new_tweets.empty?
|
170
|
+
daily_tweets.concat(new_tweets)
|
171
|
+
daily_tweets.uniq! { |t| t.tweet_id }
|
172
|
+
|
165
173
|
@mutex.synchronize {
|
166
174
|
@all_tweets.concat(new_tweets)
|
167
175
|
@all_tweets.uniq! { |t| t.tweet_id }
|
168
176
|
}
|
169
177
|
end
|
170
|
-
logger.info
|
178
|
+
logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
|
171
179
|
|
172
180
|
break unless new_pos
|
181
|
+
break if daily_limit && daily_tweets.size >= daily_limit
|
173
182
|
break if @all_tweets.size >= limit
|
174
183
|
|
175
184
|
pos = new_pos
|
176
185
|
end
|
177
186
|
|
178
|
-
if @all_tweets.size >= limit
|
179
|
-
logger.
|
187
|
+
if !@stop_requested && @all_tweets.size >= limit
|
188
|
+
logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
|
180
189
|
@stop_requested = true
|
181
190
|
end
|
182
191
|
end
|
@@ -185,32 +194,36 @@ module Twitterscraper
|
|
185
194
|
@stop_requested
|
186
195
|
end
|
187
196
|
|
188
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
|
197
|
+
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
|
189
198
|
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
190
199
|
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
191
200
|
queries = build_queries(query, start_date, end_date)
|
192
201
|
threads = queries.size if threads > queries.size
|
193
|
-
proxies = proxy ?
|
202
|
+
proxies = proxy ? Proxy::Pool.new : []
|
194
203
|
|
195
204
|
validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
|
196
205
|
|
197
|
-
logger.
|
206
|
+
logger.debug "Fetch #{proxies.size} proxies" if proxy
|
207
|
+
logger.info "The number of threads #{threads}"
|
198
208
|
|
199
209
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
200
|
-
logger.info
|
210
|
+
logger.info "Headers #{headers}"
|
201
211
|
|
202
212
|
@all_tweets = []
|
203
213
|
@mutex = Mutex.new
|
204
214
|
@stop_requested = false
|
205
215
|
|
206
216
|
if threads > 1
|
217
|
+
Thread.abort_on_exception = true
|
218
|
+
logger.debug "Set 'Thread.abort_on_exception' to true"
|
219
|
+
|
207
220
|
Parallel.each(queries, in_threads: threads) do |query|
|
208
|
-
main_loop(query, lang, limit, headers, proxies)
|
221
|
+
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
209
222
|
raise Parallel::Break if stop_requested?
|
210
223
|
end
|
211
224
|
else
|
212
225
|
queries.each do |query|
|
213
|
-
main_loop(query, lang, limit, headers, proxies)
|
226
|
+
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
214
227
|
break if stop_requested?
|
215
228
|
end
|
216
229
|
end
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -59,12 +59,19 @@ module Twitterscraper
|
|
59
59
|
def from_tweets_html(html)
|
60
60
|
html.map do |tweet|
|
61
61
|
from_tweet_html(tweet)
|
62
|
-
end
|
62
|
+
end.compact
|
63
63
|
end
|
64
64
|
|
65
65
|
def from_tweet_html(html)
|
66
|
+
screen_name = html.attr('data-screen-name')
|
67
|
+
tweet_id = html.attr('data-tweet-id')&.to_i
|
68
|
+
|
69
|
+
unless html.to_s.include?('js-tweet-text-container')
|
70
|
+
Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
|
71
|
+
return nil
|
72
|
+
end
|
73
|
+
|
66
74
|
inner_html = Nokogiri::HTML(html.inner_html)
|
67
|
-
tweet_id = html.attr('data-tweet-id').to_i
|
68
75
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
69
76
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
70
77
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -89,7 +96,7 @@ module Twitterscraper
|
|
89
96
|
|
90
97
|
timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
|
91
98
|
new(
|
92
|
-
screen_name:
|
99
|
+
screen_name: screen_name,
|
93
100
|
name: html.attr('data-name'),
|
94
101
|
user_id: html.attr('data-user-id').to_i,
|
95
102
|
tweet_id: tweet_id,
|
data/lib/version.rb
CHANGED