twitterscraper-ruby 0.18.0 → 0.19.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8e9bdefe1c4d10e6d9f1d12aeb279b2a3751c570e96e05daaf849dd423bb03bf
-  data.tar.gz: 7de97de19daeecce2837fe8e5999b6c9490ab49a18a2ab9e603bf4d039abc4b9
+  metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+  data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
 SHA512:
-  metadata.gz: 55b7e0b52b2ce44418305798ed27a677405244a48f5ad0a797e3abf7958b0581a313ebd33f3f69b891ba7454f8f5c9c0db845c9ca8be321cd27212932821776e
-  data.tar.gz: 8fe97a0dc164fc0108b8e6a35843fba19ade5fbaf4f1ee2b4a400afbd3bdbb220a49dfbef4fceb1d8ecc43df3b4f4b7bad0ee5ea94c0aac464c0477e42efb866
+  metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+  data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.18.0)
+    twitterscraper-ruby (0.19.0)
       nokogiri
       parallel
 
lib/twitterscraper/cache.rb CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl = 86400 # 1 day
+      @ttl = 86400 * 3 # 3 days
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,12 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end
 
+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
     def delete(key)
       key = cache_key(key)
       file = File.join(@dir, key)
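The cache now keeps entries for three days instead of one, and the new exist? lets callers probe a key without reading the entry. A minimal usage sketch, assuming the read/write pair this class already exposes elsewhere (keys are hashed via cache_key internally; behavior inferred from the diff, not authoritative):

    require 'twitterscraper'

    cache = Twitterscraper::Cache.new                     # creates ./cache on first use
    url = 'https://twitter.com/i/search/timeline?q=ruby'  # any string can serve as a key

    cache.write(url, 'items_html' => '...') unless cache.exist?(url)
    cache.read(url)    # the stored entry, or nil once the 3-day TTL has lapsed
    cache.delete(url)  # removes the backing file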
lib/twitterscraper/cli.rb CHANGED
@@ -32,16 +32,17 @@ module Twitterscraper
     end
 
     def export(name, tweets)
-      filepath = options['output']
-      Dir.mkdir(File.dirname(filepath)) unless File.exist?(File.dirname(filepath))
-      write_json = lambda { File.write(filepath, generate_json(tweets)) }
-
-      if options['format'] == 'json'
-        write_json.call
-      elsif options['format'] == 'html'
-        File.write(filepath, Template.new.tweets_embedded_html(name, tweets, options))
-      else
-        write_json.call
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end
 
@@ -90,7 +91,6 @@ module Twitterscraper
       options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= build_output_name(options)
 
       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -98,10 +98,11 @@ module Twitterscraper
       options
     end
 
-    def build_output_name(options)
+    def build_output_name(format, options)
       query = options['query'].gsub(/[ :?#&]/, '_')
       date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
-      File.join('out', [options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format'])
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
     end
 
     def initialize_logger
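The export step now accepts a comma-separated format list and derives one output file per format, instead of relying on a single precomputed options['output']. A sketch of the resulting file naming, traced through build_output_name above with a hypothetical options hash:

    options = {
      'type' => 'search',
      'query' => 'ruby lang:ja',
      'start_date' => '2020-07-01',
      'end_date' => '2020-07-10',
      'format' => 'json, html',   # whitespace around each format is stripped
    }

    # export would write one file per requested format:
    #   out/search_tweets_2020-07-01_2020-07-10_ruby_lang_ja.json
    #   out/search_tweets_2020-07-01_2020-07-10_ruby_lang_ja.html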
lib/twitterscraper/client.rb CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query
 
+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-      @proxy = proxy
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end
 
     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
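Request headers and the proxy pool now live on the client: the User-Agent is sampled once at construction, and proxies are fetched (or skipped) up front. A hedged usage sketch using the accessors added above (the sampled User-Agent varies per instance):

    client = Twitterscraper::Client.new(cache: true, proxy: false)

    client.request_headers  # {'User-Agent': <one of USER_AGENT_LIST>, 'X-Requested-With': 'XMLHttpRequest'}
    client.proxies          # [] because proxy: false
    client.proxy_enabled?   # false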
lib/twitterscraper/query.rb CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger
 
-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
       'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
       end
     end
 
-    def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-      unless proxies.empty?
+      if proxy_enabled?
        proxy = proxies.sample
        logger.info("Using proxy #{proxy}")
       end
-      Http.get(url, headers, proxy, timeout)
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -74,14 +66,14 @@ module Twitterscraper
       [items_html, json_resp]
     end
 
-    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
       encoded_query = ERB::Util.url_encode(query)
 
       url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
         logger.debug "Scraping tweets from url=#{url}"
-        get_single_page(url, headers, proxies)
+        get_single_page(url)
       end
 
       if cache_enabled?
@@ -160,7 +152,7 @@ module Twitterscraper
       if threads_granularity == 'day'
         date_range = start_date.upto(end_date - 1)
         queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
-      else
+      elsif threads_granularity == 'hour'
         time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
         end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
         queries = []
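Previously any granularity other than 'day' fell into the hour-splitting branch; matching 'hour' explicitly means unrecognized values now leave the query unsplit. For reference, the 'day' branch expands the date range into one query per day, as in this standalone restatement of the line above (dates are illustrative):

    require 'date'

    query = 'ruby'
    start_date = Date.parse('2020-07-01')
    end_date = Date.parse('2020-07-03')

    queries = start_date.upto(end_date - 1).map { |date| query + " since:#{date} until:#{date + 1}" }
    # => ["ruby since:2020-07-01 until:2020-07-02",
    #     "ruby since:2020-07-02 until:2020-07-03"]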
@@ -185,34 +177,35 @@ module Twitterscraper
       end
     end
 
-    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-      daily_tweets = []
+      tmp_tweets = []
 
       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-          daily_tweets.concat(new_tweets)
-          daily_tweets.uniq! { |t| t.tweet_id }
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end
 
-          @mutex.synchronize {
-            @all_tweets.concat(new_tweets)
-            @all_tweets.uniq! { |t| t.tweet_id }
-          }
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
-        break if daily_limit && daily_tweets.size >= daily_limit
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if !@stop_requested && @all_tweets.size >= limit
-        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-        @stop_requested = true
-      end
+      tmp_tweets
     end
 
     def stop_requested?
@@ -233,13 +226,6 @@ module Twitterscraper
       if threads > queries.size
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
@@ -247,24 +233,32 @@ module Twitterscraper
       logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"
 
-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}
 
       if threads > 1
+        @mutex = Mutex.new
         Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"
 
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }
+
           break if stop_requested?
         end
       end
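main_loop now returns its per-query tmp_tweets instead of merging into @all_tweets itself; merging happens once per query, under the mutex in the threaded branch, and @results_counter tracks in-flight counts per Parallel worker so the limit check can estimate the global total without locking on every page. A simplified standalone sketch of that accounting, outside the gem (the batches stand in for per-query scrape results):

    require 'parallel'

    all_tweets = []
    results_counter = {}  # worker_number => tweets held by that worker, not yet merged
    mutex = Mutex.new

    batches = [%w[t1 t2], %w[t3], %w[t4 t5 t6]]

    Parallel.each(batches, in_threads: 2) do |batch|
      results_counter[Parallel.worker_number] = batch.size
      total = all_tweets.size + results_counter.values.sum  # lock-free estimate, as in the diff
      mutex.synchronize { all_tweets.concat(batch) }        # merge once per batch
      results_counter[Parallel.worker_number] = 0
    end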
lib/twitterscraper/template.rb CHANGED
@@ -16,39 +16,37 @@ module Twitterscraper
       )
     end
 
-    def chart_data(tweets, trimming: true, smoothing: true)
-      min_interval = 5
-
-      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
-        t = tweet.created_at
-        min = (t.min.to_f / min_interval).floor * min_interval
-        time = Time.new(t.year, t.month, t.day, t.hour, min, 0, '+00:00')
-        memo[time.to_i] += 1
-      end
-
-      if false && trimming
-        data.keys.sort.each.with_index do |timestamp, i|
-          break if data.size - 1 == i
-          if data[i] == 0 && data[i + 1] == 0
-            data.delete(timestamp)
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
           end
         end
       end
 
-      if false && smoothing
-        time = data.keys.min
-        max_time = data.keys.max
-        sec_interval = 60 * min_interval
+      Twitterscraper.logger.info "Chart grouping #{grouping}"
 
-        while true
-          next_time = time + sec_interval
-          break if next_time + sec_interval > max_time
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at
 
-          unless data.has_key?(next_time)
-            data[next_time] = (data[time] + data[next_time + sec_interval]) / 2
-          end
-          time = next_time
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
        end
+        memo[time.to_i] += 1
       end
 
       data.sort_by { |k, _| k }.map do |timestamp, count|
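chart_data now buckets tweet timestamps by day, hour, or minute, replacing the dead trimming/smoothing branches, and 'auto' picks a granularity from the covered time span and tweet count (grouping only kicks in above 100 tweets). The 'auto' decision restated as a standalone sketch:

    # Mirrors the branch above; duration is in seconds.
    def pick_grouping(duration, tweet_count)
      month = 28 * 24 * 60 * 60 # 28 days
      if duration > 3 * month
        'day'
      elsif duration > month || tweet_count > 10000
        'hour'
      else
        'minute'
      end
    end

    pick_grouping(90 * 86400, 5_000)  # => "day"
    pick_grouping(10 * 86400, 20_000) # => "hour"
    pick_grouping(2 * 86400, 500)     # => "minute"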
lib/twitterscraper/template/tweets.html.erb CHANGED
@@ -1,5 +1,8 @@
-<html>
+<!DOCTYPE html>
+<html lang="ja">
 <head>
+  <meta charset="UTF-8">
+
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
   <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
   <script src="https://code.highcharts.com/stock/highstock.js"></script>
@@ -80,7 +83,7 @@
   <div id="chart-container"><div style="color: gray;">Loading...</div></div>
 
   <div class="tweets-container">
-    <% tweets.sort_by { |t| -t.created_at.to_i }.each.with_index do |tweet, i| %>
+    <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
       <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
      <% if i < convert_limit %>
        <blockquote class="twitter-tweet">
lib/twitterscraper/type.rb CHANGED
@@ -11,5 +11,9 @@ module Twitterscraper
     def user?
       @value == 'user'
     end
+
+    def to_s
+      @value
+    end
   end
 end
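The added to_s means a Type can be dropped straight into string interpolation wherever the raw value is needed. A brief illustration, assuming the constructor takes the raw value (as @value suggests; the construction here is hypothetical):

    type = Twitterscraper::Type.new('user')
    type.user?    # => true
    "q=#{type}"   # => "q=user", via the new to_s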
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.18.0'
+  VERSION = '0.19.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.18.0
+  version: 0.19.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-19 00:00:00.000000000 Z
+date: 2020-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri