twitterscraper-ruby 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 8e9bdefe1c4d10e6d9f1d12aeb279b2a3751c570e96e05daaf849dd423bb03bf
- data.tar.gz: 7de97de19daeecce2837fe8e5999b6c9490ab49a18a2ab9e603bf4d039abc4b9
+ metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+ data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
  SHA512:
- metadata.gz: 55b7e0b52b2ce44418305798ed27a677405244a48f5ad0a797e3abf7958b0581a313ebd33f3f69b891ba7454f8f5c9c0db845c9ca8be321cd27212932821776e
- data.tar.gz: 8fe97a0dc164fc0108b8e6a35843fba19ade5fbaf4f1ee2b4a400afbd3bdbb220a49dfbef4fceb1d8ecc43df3b4f4b7bad0ee5ea94c0aac464c0477e42efb866
+ metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+ data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- twitterscraper-ruby (0.18.0)
+ twitterscraper-ruby (0.19.0)
  nokogiri
  parallel

@@ -4,7 +4,7 @@ require 'digest/md5'
  module Twitterscraper
  class Cache
  def initialize()
- @ttl = 86400 # 1 day
+ @ttl = 86400 * 3 # 3 day
  @dir = 'cache'
  Dir.mkdir(@dir) unless File.exist?(@dir)
  end
@@ -25,6 +25,12 @@ module Twitterscraper
  File.write(file, entry.to_json)
  end

+ def exist?(key)
+ key = cache_key(key)
+ file = File.join(@dir, key)
+ File.exist?(file)
+ end
+
  def delete(key)
  key = cache_key(key)
  file = File.join(@dir, key)
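The hunk above adds Cache#exist?, which maps a key to its cache file (the class hashes keys, per the require 'digest/md5' in the hunk header) and checks for its presence on disk. A minimal usage sketch, assuming the gem is already loaded and using a made-up key:

  cache = Twitterscraper::Cache.new          # creates ./cache unless it already exists
  key   = 'https://example.com/some-request' # hypothetical key; any string works

  cache.exist?(key) # => false until an entry has been written under this key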
@@ -32,16 +32,17 @@ module Twitterscraper
  end

  def export(name, tweets)
- filepath = options['output']
- Dir.mkdir(File.dirname(filepath)) unless File.exist?(File.dirname(filepath))
- write_json = lambda { File.write(filepath, generate_json(tweets)) }
-
- if options['format'] == 'json'
- write_json.call
- elsif options['format'] == 'html'
- File.write(filepath, Template.new.tweets_embedded_html(name, tweets, options))
- else
- write_json.call
+ options['format'].split(',').map(&:strip).each do |format|
+ file = build_output_name(format, options)
+ Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+ if format == 'json'
+ File.write(file, generate_json(tweets))
+ elsif format == 'html'
+ File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+ else
+ puts "Invalid format #{format}"
+ end
  end
  end

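With this change, export iterates over a comma-separated format list instead of writing a single output path, producing one file per format via build_output_name. A rough sketch of the resulting filenames, using hypothetical option values (the real options come from the CLI parser, which is not shown in this hunk):

  options = {
    'type'       => 'search',
    'query'      => 'ruby lang',
    'start_date' => '2020-07-01',
    'end_date'   => '2020-07-10',
    'format'     => 'json,html',
  }

  options['format'].split(',').map(&:strip).each do |format|
    query = options['query'].gsub(/[ :?#&]/, '_')
    date  = [options['start_date'], options['end_date']].reject(&:empty?).join('_')
    file  = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
    puts File.join('out', file)
  end
  # out/search_tweets_2020-07-01_2020-07-10_ruby_lang.json
  # out/search_tweets_2020-07-01_2020-07-10_ruby_lang.html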
@@ -90,7 +91,6 @@ module Twitterscraper
  options['threads_granularity'] ||= 'auto'
  options['format'] ||= 'json'
  options['order'] ||= 'desc'
- options['output'] ||= build_output_name(options)

  options['cache'] = options['cache'] != 'false'
  options['proxy'] = options['proxy'] != 'false'
@@ -98,10 +98,11 @@ module Twitterscraper
  options
  end

- def build_output_name(options)
+ def build_output_name(format, options)
  query = options['query'].gsub(/[ :?#&]/, '_')
  date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
- File.join('out', [options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format'])
+ file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+ File.join('out', file)
  end

  def initialize_logger
@@ -2,9 +2,31 @@ module Twitterscraper
  class Client
  include Query

+ USER_AGENT_LIST = [
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+ 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+ 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+ 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+ ]
+
  def initialize(cache: true, proxy: true)
+ @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+ Twitterscraper.logger.info "Headers #{@request_headers}"
+
  @cache = cache
- @proxy = proxy
+
+ if (@proxy = proxy)
+ @proxies = Proxy::Pool.new
+ Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+ else
+ @proxies = []
+ Twitterscraper.logger.debug 'Proxy disabled'
+ end
+ end
+
+ def request_headers
+ @request_headers
  end

  def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
  def proxy_enabled?
  @proxy
  end
+
+ def proxies
+ @proxies
+ end
  end
  end
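Taken together, the two Client hunks move proxy-pool setup and the sampled User-Agent header into the constructor and expose them through readers. A minimal sketch, assuming the gem is already loaded:

  client = Twitterscraper::Client.new(cache: true, proxy: false)

  client.proxy_enabled?   # => false
  client.proxies          # => [] (a Proxy::Pool is fetched when proxy: true)
  client.request_headers  # => {'User-Agent': <one of USER_AGENT_LIST>, 'X-Requested-With': 'XMLHttpRequest'}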
@@ -10,14 +10,6 @@ module Twitterscraper
  module Query
  include Logger

- USER_AGENT_LIST = [
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
- 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
- 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
- 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
- ]
-
  INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
  RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
  'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
  end
  end

- def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+ def get_single_page(url, timeout = 6, retries = 30)
  return nil if stop_requested?
- unless proxies.empty?
+ if proxy_enabled?
  proxy = proxies.sample
  logger.info("Using proxy #{proxy}")
  end
- Http.get(url, headers, proxy, timeout)
+ Http.get(url, request_headers, proxy, timeout)
  rescue => e
  logger.debug "get_single_page: #{e.inspect}"
  if (retries -= 1) > 0
@@ -74,14 +66,14 @@ module Twitterscraper
  [items_html, json_resp]
  end

- def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+ def query_single_page(query, lang, type, pos)
  logger.info "Querying #{query}"
  encoded_query = ERB::Util.url_encode(query)

  url = build_query_url(encoded_query, lang, type, pos)
  http_request = lambda do
  logger.debug "Scraping tweets from url=#{url}"
- get_single_page(url, headers, proxies)
+ get_single_page(url)
  end

  if cache_enabled?
@@ -160,7 +152,7 @@ module Twitterscraper
  if threads_granularity == 'day'
  date_range = start_date.upto(end_date - 1)
  queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
- else
+ elsif threads_granularity == 'hour'
  time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
  end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
  queries = []
@@ -185,34 +177,35 @@ module Twitterscraper
  end
  end

- def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+ def main_loop(query, lang, type, limit, daily_limit)
  pos = nil
- daily_tweets = []
+ tmp_tweets = []

  while true
- new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+ new_tweets, new_pos = query_single_page(query, lang, type, pos)
  unless new_tweets.empty?
- daily_tweets.concat(new_tweets)
- daily_tweets.uniq! { |t| t.tweet_id }
+ tmp_tweets.concat(new_tweets)
+ tmp_tweets.uniq! { |t| t.tweet_id }
+ end

- @mutex.synchronize {
- @all_tweets.concat(new_tweets)
- @all_tweets.uniq! { |t| t.tweet_id }
- }
+ @results_counter[Parallel.worker_number] = tmp_tweets.size
+ total_size = @all_tweets.size + @results_counter.values.sum
+ logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+ if !@stop_requested && total_size >= limit
+ logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+ @stop_requested = true
  end
- logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

  break unless new_pos
- break if daily_limit && daily_tweets.size >= daily_limit
+ break if @stop_requested
+ break if daily_limit && tmp_tweets.size >= daily_limit
  break if @all_tweets.size >= limit

  pos = new_pos
  end

- if !@stop_requested && @all_tweets.size >= limit
- logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
- @stop_requested = true
- end
+ tmp_tweets
  end

  def stop_requested?
@@ -233,13 +226,6 @@ module Twitterscraper
  if threads > queries.size
  threads = queries.size
  end
- if proxy_enabled?
- proxies = Proxy::Pool.new
- logger.debug "Fetch #{proxies.size} proxies"
- else
- proxies = []
- logger.debug 'Proxy disabled'
- end
  logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

  validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
@@ -247,24 +233,32 @@ module Twitterscraper
  logger.info "The number of queries #{queries.size}"
  logger.info "The number of threads #{threads}"

- headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
- logger.info "Headers #{headers}"
-
  @all_tweets = []
- @mutex = Mutex.new
  @stop_requested = false
+ @results_counter = {}

  if threads > 1
+ @mutex = Mutex.new
  Thread.abort_on_exception = true
  logger.debug "Set 'Thread.abort_on_exception' to true"

  Parallel.each(queries, in_threads: threads) do |query|
- main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+ @results_counter[Parallel.worker_number] = 0
+ tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+ @mutex.synchronize {
+ @all_tweets.concat(tmp_tweets)
+ @all_tweets.uniq! { |t| t.tweet_id }
+ }
+ @results_counter[Parallel.worker_number] = 0
+
  raise Parallel::Break if stop_requested?
  end
  else
  queries.each do |query|
- main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+ tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+ @all_tweets.concat(tmp_tweets)
+ @all_tweets.uniq! { |t| t.tweet_id }
+
  break if stop_requested?
  end
  end
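The aggregation pattern above changes from locking @all_tweets on every page to collecting per-worker results and merging them once per query under the mutex, with @results_counter tracking each Parallel worker's in-flight count. A reduced, standalone sketch of that pattern (the names here are illustrative, not the gem's API):

  require 'parallel'

  all_items = []
  counters  = {}            # worker_number => in-flight result count
  mutex     = Mutex.new

  Parallel.each(1..4, in_threads: 2) do |n|
    counters[Parallel.worker_number] = 0
    tmp = Array.new(n) { |i| "item-#{n}-#{i}" }   # stand-in for main_loop's tmp_tweets
    counters[Parallel.worker_number] = tmp.size
    mutex.synchronize { all_items.concat(tmp) }   # only the merge is locked
    counters[Parallel.worker_number] = 0
  end

  puts all_items.size # => 10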
@@ -16,39 +16,37 @@ module Twitterscraper
  )
  end

- def chart_data(tweets, trimming: true, smoothing: true)
- min_interval = 5
-
- data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
- t = tweet.created_at
- min = (t.min.to_f / min_interval).floor * min_interval
- time = Time.new(t.year, t.month, t.day, t.hour, min, 0, '+00:00')
- memo[time.to_i] += 1
- end
-
- if false && trimming
- data.keys.sort.each.with_index do |timestamp, i|
- break if data.size - 1 == i
- if data[i] == 0 && data[i + 1] == 0
- data.delete(timestamp)
+ def chart_data(tweets, grouping: 'auto')
+ if grouping && tweets.size > 100
+ if grouping == 'auto'
+ month = 28 * 24 * 60 * 60 # 28 days
+ duration = tweets[-1].created_at - tweets[0].created_at
+
+ if duration > 3 * month
+ grouping = 'day'
+ elsif duration > month || tweets.size > 10000
+ grouping = 'hour'
+ else
+ grouping = 'minute'
  end
  end
  end

- if false && smoothing
- time = data.keys.min
- max_time = data.keys.max
- sec_interval = 60 * min_interval
+ Twitterscraper.logger.info "Chart grouping #{grouping}"

- while true
- next_time = time + sec_interval
- break if next_time + sec_interval > max_time
+ data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+ t = tweet.created_at

- unless data.has_key?(next_time)
- data[next_time] = (data[time] + data[next_time + sec_interval]) / 2
- end
- time = next_time
+ if grouping == 'day'
+ time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+ elsif grouping == 'hour'
+ time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+ elsif grouping == 'minute'
+ time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+ else
+ time = t
  end
+ memo[time.to_i] += 1
  end

  data.sort_by { |k, _| k }.map do |timestamp, count|
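The rewritten chart_data drops the old 5-minute bucketing and the dead trimming/smoothing branches in favor of a grouping derived from the tweet span: roughly 'day' beyond about three months, 'hour' beyond about a month or above 10,000 tweets, otherwise 'minute' (and only when more than 100 tweets are present). A small sketch of those thresholds, using a helper name of my own rather than anything in the gem:

  MONTH = 28 * 24 * 60 * 60 # 28 days, as in the hunk above

  # Illustrative helper mirroring the 'auto' branch; not part of the gem.
  def auto_grouping(duration_seconds, tweet_count)
    if duration_seconds > 3 * MONTH
      'day'
    elsif duration_seconds > MONTH || tweet_count > 10_000
      'hour'
    else
      'minute'
    end
  end

  auto_grouping(120 * 24 * 60 * 60, 5_000)  # => "day"
  auto_grouping(10 * 24 * 60 * 60, 20_000)  # => "hour"
  auto_grouping(2 * 24 * 60 * 60, 500)      # => "minute"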
@@ -1,5 +1,8 @@
- <html>
+ <!DOCTYPE html>
+ <html lang="ja">
  <head>
+ <meta charset="UTF-8">
+
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
  <script src="https://code.highcharts.com/stock/highstock.js"></script>
@@ -80,7 +83,7 @@
  <div id="chart-container"><div style="color: gray;">Loading...</div></div>

  <div class="tweets-container">
- <% tweets.sort_by { |t| -t.created_at.to_i }.each.with_index do |tweet, i| %>
+ <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
  <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
  <% if i < convert_limit %>
  <blockquote class="twitter-tweet">
@@ -11,5 +11,9 @@ module Twitterscraper
  def user?
  @value == 'user'
  end
+
+ def to_s
+ @value
+ end
  end
  end
@@ -1,3 +1,3 @@
  module Twitterscraper
- VERSION = '0.18.0'
+ VERSION = '0.19.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: twitterscraper-ruby
  version: !ruby/object:Gem::Version
- version: 0.18.0
+ version: 0.19.0
  platform: ruby
  authors:
  - ts-3156
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-07-19 00:00:00.000000000 Z
+ date: 2020-07-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri