twitterscraper-ruby 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/twitterscraper/cache.rb +7 -1
- data/lib/twitterscraper/cli.rb +14 -13
- data/lib/twitterscraper/client.rb +27 -1
- data/lib/twitterscraper/query.rb +36 -42
- data/lib/twitterscraper/template.rb +24 -26
- data/lib/twitterscraper/template/tweets.html.erb +5 -2
- data/lib/twitterscraper/type.rb +4 -0
- data/lib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: […]
-  data.tar.gz: […]
+  metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+  data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
 SHA512:
-  metadata.gz: […]
-  data.tar.gz: […]
+  metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+  data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
data/Gemfile.lock
CHANGED
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl = 86400 # […]
+      @ttl = 86400 * 3 # 3 days
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,12 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end

+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
     def delete(key)
       key = cache_key(key)
       file = File.join(@dir, key)
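A minimal usage sketch for the new exist? predicate; the require path and the key value are assumptions, and 86400 seconds is 1 day, so the TTL grows from 1 day to 3:

    require 'twitterscraper' # entry-point name assumed

    cache = Twitterscraper::Cache.new       # creates ./cache; entries now kept 3 days
    key = 'https://twitter.com/i/search/timeline?q=ruby'
    cache.delete(key) if cache.exist?(key)  # evict a stale page before re-fetching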
data/lib/twitterscraper/cli.rb
CHANGED
@@ -32,16 +32,17 @@ module Twitterscraper
     end

     def export(name, tweets)
-      [… 9 lines of the previous implementation truncated in this extract …]
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end

@@ -90,7 +91,6 @@ module Twitterscraper
       options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= build_output_name(options)

       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'

@@ -98,10 +98,11 @@ module Twitterscraper
       options
     end

-    def build_output_name(options)
+    def build_output_name(format, options)
       query = options['query'].gsub(/[ :?#&]/, '_')
       date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
-      […]
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
     end

     def initialize_logger
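With the per-format loop, the default options['output'] assignment is gone and each requested format derives its own path under out/. A sketch of the naming; the option values are illustrative and build_output_name is an internal CLI method:

    options = {
      'type' => 'search', 'query' => 'ruby #rails',
      'start_date' => '2020-07-01', 'end_date' => '2020-07-23'
    }
    build_output_name('json', options)
    # => "out/search_tweets_2020-07-01_2020-07-23_ruby__rails.json"
    # spaces, :, ?, # and & in the query become underscores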
data/lib/twitterscraper/client.rb
CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query

+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-      […]
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end

     def cache_enabled?

@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
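Header sampling and the proxy pool now happen once, at construction. A short sketch; the require path is assumed:

    require 'twitterscraper' # entry-point name assumed

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    client.request_headers # a User-Agent sampled once from USER_AGENT_LIST, plus X-Requested-With
    client.proxies         # => [] here; a populated Proxy::Pool when proxy: true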
data/lib/twitterscraper/query.rb
CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger

-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
      end
    end

-    def get_single_page(url, …
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-      […]
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url, …
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
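The rescue path counts attempts down in place; a plausible reading of the idiom in isolation (the body under the if is not shown in this extract, and the failing call is a stand-in for Http.get):

    retries = 3
    begin
      raise 'network error'        # stand-in for a failed Http.get
    rescue => e
      retry if (retries -= 1) > 0  # re-run the begin block while attempts remain
    end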
@@ -74,14 +66,14 @@ module Twitterscraper
       [items_html, json_resp]
     end

-    def query_single_page(query, lang, type, pos, …
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
       encoded_query = ERB::Util.url_encode(query)

       url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
         logger.debug "Scraping tweets from url=#{url}"
-        get_single_page(url, …
+        get_single_page(url)
       end

       if cache_enabled?
@@ -160,7 +152,7 @@ module Twitterscraper
       if threads_granularity == 'day'
         date_range = start_date.upto(end_date - 1)
         queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
-      […]
+      elsif threads_granularity == 'hour'
         time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
         end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
         queries = []
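How the 'day' granularity splits one query into per-day sub-queries, taken straight from the context lines above (the dates are examples):

    require 'date'

    query = 'ruby'
    start_date, end_date = Date.new(2020, 7, 1), Date.new(2020, 7, 4)
    start_date.upto(end_date - 1).map { |date| query + " since:#{date} until:#{date + 1}" }
    # => ["ruby since:2020-07-01 until:2020-07-02",
    #     "ruby since:2020-07-02 until:2020-07-03",
    #     "ruby since:2020-07-03 until:2020-07-04"]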
@@ -185,34 +177,35 @@ module Twitterscraper
       end
     end

-    def main_loop(query, lang, type, limit, daily_limit, …
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-      […]
+      tmp_tweets = []

       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos, …
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-          […]
-          […]
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end

-        […]
-        […]
-        […]
-        […]
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

         break unless new_pos
-        break if …
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit

         pos = new_pos
       end

-      […]
-      logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-      @stop_requested = true
-      end
+      tmp_tweets
     end

     def stop_requested?
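The new limit check sums finished results plus every worker's in-progress count, so the stop fires before results are merged. With illustrative numbers:

    all_tweets_size = 120                          # merged into @all_tweets so far
    results_counter = { 0 => 40, 1 => 25, 2 => 0 } # per-worker tmp_tweets sizes
    limit = 150
    total_size = all_tweets_size + results_counter.values.sum
    # => 185, so the loop logs the warning and sets @stop_requested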
@@ -233,13 +226,6 @@ module Twitterscraper
       if threads > queries.size
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
@@ -247,24 +233,32 @@ module Twitterscraper
       logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"

-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}

       if threads > 1
+        @mutex = Mutex.new
         Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"

         Parallel.each(queries, in_threads: threads) do |query|
-          […]
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit, …
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }

           break if stop_requested?
         end
       end
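A reduced sketch of the threaded merge pattern above, using the parallel gem; the data is illustrative and plain integers stand in for tweets:

    require 'parallel'

    mutex = Mutex.new
    all = []
    Parallel.each([[1, 2], [2, 3]], in_threads: 2) do |chunk|
      mutex.synchronize do
        all.concat(chunk) # chunk stands in for main_loop's tmp_tweets
        all.uniq!         # stands in for uniq! { |t| t.tweet_id }
      end
    end
    # all contains 1, 2, 3 exactly once, in completion order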
data/lib/twitterscraper/template.rb
CHANGED
@@ -16,39 +16,37 @@ module Twitterscraper
       )
     end

-    def chart_data(tweets, …
-      [… 11 lines truncated in this extract …]
-          break if data.size - 1 == i
-          if data[i] == 0 && data[i + 1] == 0
-            data.delete(timestamp)
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
           end
         end
       end

-      […]
-      time = data.keys.min
-      max_time = data.keys.max
-      sec_interval = 60 * min_interval
+      Twitterscraper.logger.info "Chart grouping #{grouping}"

-      […]
-      […]
-        break if next_time + sec_interval > max_time
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at

-        […]
-        […]
-        […]
-        time = …
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
        end
+        memo[time.to_i] += 1
       end

       data.sort_by { |k, _| k }.map do |timestamp, count|
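The grouping thresholds and the hour bucketing from the new chart_data, in isolation (the timestamps are examples):

    month = 28 * 24 * 60 * 60     # the 28-day "month" used above
    duration = 40 * 24 * 60 * 60  # first-to-last tweet spread: 40 days
    # duration > 3 * month => 'day'; > month or > 10,000 tweets => 'hour'; else 'minute'

    require 'time'
    t = Time.parse('2020-07-23 14:37:05 +00:00')
    Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
    # => 2020-07-23 14:00:00 +0000, the 'hour' bucket this tweet is counted in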
data/lib/twitterscraper/template/tweets.html.erb
CHANGED
@@ -1,5 +1,8 @@
-[…]
+<!DOCTYPE html>
+<html lang="ja">
 <head>
+  <meta charset="UTF-8">
+
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
   <script src="https://code.highcharts.com/stock/highstock.js"></script>

@@ -80,7 +83,7 @@
     <div id="chart-container"><div style="color: gray;">Loading...</div></div>

     <div class="tweets-container">
-      <% tweets.sort_by { |t| -t.created_at.to_i }.each.with_index do |tweet, i| %>
+      <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
       <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
       <% if i < convert_limit %>
         <blockquote class="twitter-tweet">
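The embedded list is now capped at 1,000 tweets after the newest-first sort; the same ordering plus cap in plain Ruby, with integers standing in for created_at values:

    [3, 1, 2].sort_by { |t| -t }.take(1000) # => [3, 2, 1]; at most 1,000 entries render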
data/lib/twitterscraper/type.rb
CHANGED
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.18.0
+  version: 0.19.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-…
+date: 2020-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri