twitterscraper-ruby 0.18.0 → 0.19.0
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/twitterscraper/cache.rb +7 -1
- data/lib/twitterscraper/cli.rb +14 -13
- data/lib/twitterscraper/client.rb +27 -1
- data/lib/twitterscraper/query.rb +36 -42
- data/lib/twitterscraper/template.rb +24 -26
- data/lib/twitterscraper/template/tweets.html.erb +5 -2
- data/lib/twitterscraper/type.rb +4 -0
- data/lib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+  data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+  data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
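These checksums can be re-derived from the 0.19.0 package itself: a .gem file is a plain tar archive containing metadata.gz and data.tar.gz. A minimal verification sketch in Ruby, assuming the archive has already been fetched and untarred into the working directory:

require 'digest'

# Hypothetical local paths: the two members extracted from the
# twitterscraper-ruby-0.19.0.gem archive.
puts Digest::SHA256.file('metadata.gz').hexdigest  # should match the SHA256 metadata.gz entry above
puts Digest::SHA512.file('data.tar.gz').hexdigest  # should match the SHA512 data.tar.gz entry above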
data/Gemfile.lock
CHANGED
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl = 86400 #
+      @ttl = 86400 * 3 # 3 day
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,12 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end

+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
     def delete(key)
       key = cache_key(key)
       file = File.join(@dir, key)
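The new Cache#exist? lets callers check for a cached entry without reading it back. A minimal usage sketch (keying the cache by request URL is an assumption about how the scraper uses it; the URL below is illustrative):

# Assumes the twitterscraper-ruby gem is already loaded.
cache = Twitterscraper::Cache.new
url = 'https://twitter.com/i/search/timeline?q=ruby'

if cache.exist?(url)
  puts 'cache hit: an entry for this key is already on disk under ./cache'
else
  puts 'cache miss: the scraper would fetch the page and write it'
end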
data/lib/twitterscraper/cli.rb
CHANGED
@@ -32,16 +32,17 @@ module Twitterscraper
     end

     def export(name, tweets)
-
-
-
-
-
-
-
-
-
-
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end

@@ -90,7 +91,6 @@ module Twitterscraper
       options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= build_output_name(options)

       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -98,10 +98,11 @@ module Twitterscraper
       options
     end

-    def build_output_name(options)
+    def build_output_name(format, options)
       query = options['query'].gsub(/[ :?#&]/, '_')
       date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
-
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
     end

     def initialize_logger
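build_output_name now takes the format and returns a path under out/, so a single run can emit one file per requested format. A rough illustration of the naming it produces, with made-up option values:

# Hypothetical CLI options, only to show the naming scheme.
options = {
  'query'      => 'ruby lang',
  'type'       => 'search',
  'start_date' => '2020-07-01',
  'end_date'   => '2020-07-10',
  'format'     => 'json,html',
}

options['format'].split(',').map(&:strip).each do |format|
  query = options['query'].gsub(/[ :?#&]/, '_')
  date  = [options['start_date'], options['end_date']].select { |v| v && !v.empty? }.join('_')
  file  = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
  puts File.join('out', file)
end
# => out/search_tweets_2020-07-01_2020-07-10_ruby_lang.json
# => out/search_tweets_2020-07-01_2020-07-10_ruby_lang.html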
data/lib/twitterscraper/client.rb
CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query

+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end

     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
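The User-Agent rotation and the proxy pool setup moved from the Query module into the Client constructor, so they are configured once per client. A short usage sketch of the reworked constructor and its new readers (output values are indicative, not exact):

# Assumes the twitterscraper-ruby gem is already loaded.
client = Twitterscraper::Client.new(cache: true, proxy: false)

client.request_headers  # e.g. {:"User-Agent"=>"Opera/9.80 (X11; ...)", :"X-Requested-With"=>"XMLHttpRequest"}
client.proxies          # => [] because proxy: false skips Proxy::Pool.new
client.proxy_enabled?   # => false
client.cache_enabled?   # => true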
data/lib/twitterscraper/query.rb
CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger

-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
       end
     end

-    def get_single_page(url,
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url,
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -74,14 +66,14 @@ module Twitterscraper
       [items_html, json_resp]
     end

-    def query_single_page(query, lang, type, pos
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
       encoded_query = ERB::Util.url_encode(query)

       url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
         logger.debug "Scraping tweets from url=#{url}"
-        get_single_page(url
+        get_single_page(url)
       end

       if cache_enabled?
@@ -160,7 +152,7 @@ module Twitterscraper
       if threads_granularity == 'day'
         date_range = start_date.upto(end_date - 1)
         queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
-
+      elsif threads_granularity == 'hour'
         time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
         end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
         queries = []
@@ -185,34 +177,35 @@ module Twitterscraper
       end
     end

-    def main_loop(query, lang, type, limit, daily_limit
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-
+      tmp_tweets = []

       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-
-
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end

-
-
-
-
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

         break unless new_pos
-        break if
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit

         pos = new_pos
       end

-
-      logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-      @stop_requested = true
-      end
+      tmp_tweets
     end

     def stop_requested?
@@ -233,13 +226,6 @@ module Twitterscraper
       if threads > queries.size
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
@@ -247,24 +233,32 @@ module Twitterscraper
       logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"

-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}

       if threads > 1
+        @mutex = Mutex.new
         Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"

         Parallel.each(queries, in_threads: threads) do |query|
-
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }
+
           break if stop_requested?
         end
      end
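main_loop now returns its per-thread tmp_tweets and the caller merges them into @all_tweets, deduplicating by tweet_id. A tiny sketch of that merge pattern in isolation (a Struct stands in for the gem's Tweet objects; the ids are made up):

Tweet = Struct.new(:tweet_id, :text)

all_tweets = [Tweet.new(1, 'kept from an earlier query')]
tmp_tweets = [Tweet.new(1, 'duplicate'), Tweet.new(2, 'new')]

all_tweets.concat(tmp_tweets)
all_tweets.uniq! { |t| t.tweet_id }   # first occurrence wins

puts all_tweets.map(&:tweet_id).inspect  # => [1, 2]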
data/lib/twitterscraper/template.rb
CHANGED
@@ -16,39 +16,37 @@ module Twitterscraper
       )
     end

-    def chart_data(tweets,
-
-
-
-
-
-
-
-
-
-
-
-        break if data.size - 1 == i
-        if data[i] == 0 && data[i + 1] == 0
-          data.delete(timestamp)
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
           end
         end
       end

-
-      time = data.keys.min
-      max_time = data.keys.max
-      sec_interval = 60 * min_interval
+      Twitterscraper.logger.info "Chart grouping #{grouping}"

-
-
-      break if next_time + sec_interval > max_time
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at

-
-
-
-      time =
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
         end
+        memo[time.to_i] += 1
       end

       data.sort_by { |k, _| k }.map do |timestamp, count|
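chart_data now picks a bucket size from the time span of the result set: roughly, spans longer than three 28-day "months" group by day, spans over one month (or more than 10,000 tweets) group by hour, and anything shorter groups by minute. A quick sketch of just that selection logic with made-up numbers:

month = 28 * 24 * 60 * 60          # the 28-day block chart_data uses
duration = 40 * 24 * 60 * 60       # hypothetical 40-day span between first and last tweet
tweets_count = 500                 # hypothetical tweet count

grouping =
  if duration > 3 * month
    'day'
  elsif duration > month || tweets_count > 10000
    'hour'
  else
    'minute'
  end

puts grouping  # => "hour" (more than one month, less than three)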
data/lib/twitterscraper/template/tweets.html.erb
CHANGED
@@ -1,5 +1,8 @@
-
+<!DOCTYPE html>
+<html lang="ja">
 <head>
+  <meta charset="UTF-8">
+
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
   <script src="https://code.highcharts.com/stock/highstock.js"></script>
@@ -80,7 +83,7 @@
   <div id="chart-container"><div style="color: gray;">Loading...</div></div>

   <div class="tweets-container">
-    <% tweets.sort_by { |t| -t.created_at.to_i }.each.with_index do |tweet, i| %>
+    <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
     <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
     <% if i < convert_limit %>
       <blockquote class="twitter-tweet">
data/lib/twitterscraper/type.rb
CHANGED
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.18.0
+  version: 0.19.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri