twitterscraper-ruby 0.15.2 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7f7d320841125d9a582ece6083f421f0abf301addbc5c5c2a3d2b2c09bedbc33
-  data.tar.gz: 6ea43165ffa4f37c4319566689a42f2f275d8a70402b0d6b4164df519fee90b5
+  metadata.gz: 73a9e9108284fc79cf5ec6b36b6f7ad3f83f2b4f03a2bc527dc18cb4b33e83c7
+  data.tar.gz: c7fcfdbdd1d808780c56610be9b8717352c812759b9344d9fa87cbd430a8d8e2
 SHA512:
-  metadata.gz: ee3756538ec28e9f0113e611e2731ec33107dabacf7cb730b257d6c94351407ef171a9bc91402a589fa73fdb6b705f73b11582766af1d04a3413b8bc79dc6619
-  data.tar.gz: 78200dc658a9c1cf43ed7367e499b0d1b243728aecb2ffd7366b5612f8905bb33d27ab7e1412327d05b7fff159196fe9e24d18c8cc4c24898af10533fbdf43df
+  metadata.gz: 1019547fe8c37a1bb5b4a9cd96a2737a14491087075ff448b48f72538758337c76ab513e153d4567454b192d30fafaa374913ae0c3548d7802e7bdd478fe4a2f
+  data.tar.gz: 48134e8b6858154850003da8684d3c8b7f124cab6d19e0ce76d05326dc8fef44694b32211e245509993e8b7b1afafa6d95914b05c66b9c95c54bb27d041983fe
data/.gitignore CHANGED
@@ -8,3 +8,4 @@
 /tmp/
 /cache
 /.idea
+.DS_Store
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.15.2)
+    twitterscraper-ruby (0.20.0)
       nokogiri
       parallel
 
data/README.md CHANGED
@@ -98,6 +98,7 @@ end
     "screen_name": "@name",
     "name": "Name",
     "user_id": 12340000,
+    "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
     "tweet_id": 1234000000000000,
     "text": "Thanks Twitter!",
     "links": [],
@@ -122,6 +123,7 @@ end
 - screen_name
 - name
 - user_id
+- profile_image_url
 - tweet_id
 - text
 - links
@@ -173,6 +175,8 @@ Search operators documentation is in [Standard search operators](https://develop
 | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--order` | string | Sort a order of the results. | desc(default) or asc |
 | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--threads_granularity` | string | day or hour | auto |
+| `--chart_grouping` | string | day, hour or minute | auto |
 | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
 | `--cache` | boolean | Enable caching. | true(default) or false |
 | `--format` | string | The format of the output. | json(default) or html |
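
For orientation, a minimal sketch of the new option from the Ruby API; the keyword follows the updated `search` signature in the query.rb hunk further below, and the query and dates are only illustrative:

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    tweets = client.search('#ruby',
                           start_date: '2020-07-01',
                           end_date: '2020-07-10',
                           threads_granularity: 'hour', # one sub-query per hour
                           limit: 500)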
data/lib/twitterscraper/cache.rb CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl = 3600 # 1 hour
+      @ttl = 86400 * 3 # 3 day
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,18 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end
 
+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
+    def delete(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.delete(file) if File.exist?(file)
+    end
+
     def fetch(key, &block)
       if (value = read(key))
         value
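
A quick sketch of how the new predicates round-trip with the existing read/write pair (keys become hashed file names via the `cache_key` helper both new methods call):

    cache = Twitterscraper::Cache.new
    cache.write('https://twitter.com/search?q=foo', '{"items_html":""}')
    cache.exist?('https://twitter.com/search?q=foo') # => true
    cache.delete('https://twitter.com/search?q=foo')
    cache.exist?('https://twitter.com/search?q=foo') # => false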
data/lib/twitterscraper/cli.rb CHANGED
@@ -24,21 +24,25 @@ module Twitterscraper
         daily_limit: options['daily_limit'],
         order: options['order'],
         threads: options['threads'],
+        threads_granularity: options['threads_granularity'],
       }
       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-      export(tweets) unless tweets.empty?
+      export(options['query'], tweets) unless tweets.empty?
     end
 
-    def export(tweets)
-      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
-
-      if options['format'] == 'json'
-        write_json.call
-      elsif options['format'] == 'html'
-        File.write('tweets.html', Template.tweets_embedded_html(tweets))
-      else
-        write_json.call
+    def export(name, tweets)
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end
 
@@ -69,6 +73,8 @@ module Twitterscraper
       'daily_limit:',
       'order:',
       'threads:',
+      'threads_granularity:',
+      'chart_grouping:',
       'output:',
       'format:',
       'cache:',
@@ -82,10 +88,10 @@ module Twitterscraper
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
-      options['threads'] = (options['threads'] || 2).to_i
+      options['threads'] = (options['threads'] || 10).to_i
+      options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= "tweets.#{options['format']}"
 
       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +99,13 @@ module Twitterscraper
       options
     end
 
+    def build_output_name(format, options)
+      query = options['query'].gsub(/[ :?#&]/, '_')
+      date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
+    end
+
     def initialize_logger
       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
     end
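
Worked through by hand, the naming scheme that replaces the dropped --output option (this assumes the CLI stores the subcommand under options['type'], which this hunk does not show):

    # inside Cli, with illustrative options:
    options = {'type' => 'search', 'query' => '#ruby',
               'start_date' => '2020-07-01', 'end_date' => '2020-07-10'}
    build_output_name('json', options)
    # => "out/search_tweets_2020-07-01_2020-07-10__ruby.json"
    # '#' is one of the characters replaced by '_', hence the double underscore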
data/lib/twitterscraper/client.rb CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query
 
+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-      @proxy = proxy
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end
 
     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
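
Net effect of the client changes: the User-Agent is sampled and the proxy pool is fetched once per client instead of once per query_tweets call. A sketch:

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    client.request_headers # one UA sampled at construction, plus 'X-Requested-With'
    client.proxies         # => [] when proxy: false, a Proxy::Pool otherwise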
data/lib/twitterscraper/query.rb CHANGED
@@ -10,14 +10,6 @@
   module Query
     include Logger
 
-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
       end
     end
 
-    def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-      unless proxies.empty?
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url, headers, proxy, timeout)
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -69,30 +61,33 @@ module Twitterscraper
       else
         json_resp = JSON.parse(text)
         items_html = json_resp['items_html'] || ''
-        logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
       end
 
       [items_html, json_resp]
     end
 
-    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
-      query = ERB::Util.url_encode(query)
+      encoded_query = ERB::Util.url_encode(query)
 
-      url = build_query_url(query, lang, type, pos)
+      url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
-        logger.debug "Scraping tweets from #{url}"
-        get_single_page(url, headers, proxies)
+        logger.debug "Scraping tweets from url=#{url}"
+        get_single_page(url)
       end
 
       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug 'Fetching tweets from cache'
+          logger.debug "Fetching tweets from cache url=#{url}"
         else
           response = http_request.call
           client.write(url, response) unless stop_requested?
         end
+        if @queries && query == @queries.last && pos.nil?
+          logger.debug "Delete a cache query=#{query}"
+          client.delete(url)
+        end
       else
         response = http_request.call
       end
@@ -100,6 +95,12 @@ module Twitterscraper
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
+      if json_resp && json_resp['message']
+        logger.warn json_resp['message'] # Sorry, you are rate limited.
+        @stop_requested = true
+        Cache.new.delete(url) if cache_enabled?
+      end
+
       tweets = Tweet.from_html(html)
 
       if tweets.empty?
@@ -130,126 +131,151 @@
       if start_date && end_date
         if start_date == end_date
           raise Error.new('Please specify different values for :start_date and :end_date.')
-        elsif start_date > end_date
+        elsif Date.parse(start_date) > Date.parse(end_date)
           raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
-        if start_date < OLDEST_DATE
+        if Date.parse(start_date) < OLDEST_DATE
           raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
+    end
 
-      if end_date
-        today = Date.today
-        if end_date > Date.today
-          raise Error.new(":end_date must be less than or equal to today(#{today})")
-        end
+    def build_queries(query, start_date, end_date, threads_granularity, type)
+      if type.search?
+        start_date = Date.parse(start_date) if start_date.is_a?(String)
+        end_date = Date.parse(end_date) if end_date.is_a?(String)
+      elsif type.user?
+        start_date = nil
+        end_date = nil
       end
-    end
 
-    def build_queries(query, start_date, end_date)
       if start_date && end_date
-        date_range = start_date.upto(end_date - 1)
-        date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+        if threads_granularity == 'auto'
+          threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+        end
+
+        if threads_granularity == 'day'
+          date_range = start_date.upto(end_date - 1)
+          queries = date_range.map { |date| query + " since:#{date}_00:00:00_UTC until:#{date + 1}_00:00:00_UTC" }
+        elsif threads_granularity == 'hour'
+          time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+          end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+          queries = []
+
+          while true
+            if time < Time.now.utc
+              queries << (query + " since:#{time.strftime('%Y-%m-%d_%H')}:00:00_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H')}:00:00_UTC")
+            end
+            time += 3600
+            break if time >= end_time
+          end
+        else
+          raise Error.new("Invalid :threads_granularity value=#{threads_granularity}")
+        end
+
+        @queries = queries
+
       elsif start_date
-        [query + " since:#{start_date}"]
+        [query + " since:#{start_date}_00:00:00_UTC"]
       elsif end_date
-        [query + " until:#{end_date}"]
+        [query + " until:#{end_date}_00:00:00_UTC"]
       else
         [query]
       end
     end
 
-    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-      daily_tweets = []
+      tmp_tweets = []
 
       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-          daily_tweets.concat(new_tweets)
-          daily_tweets.uniq! { |t| t.tweet_id }
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end
 
-          @mutex.synchronize {
-            @all_tweets.concat(new_tweets)
-            @all_tweets.uniq! { |t| t.tweet_id }
-          }
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
-        break if daily_limit && daily_tweets.size >= daily_limit
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if !@stop_requested && @all_tweets.size >= limit
-        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-        @stop_requested = true
-      end
+      tmp_tweets
     end
 
     def stop_requested?
       @stop_requested
     end
 
-    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
-      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
-      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
-      queries = build_queries(query, start_date, end_date)
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
       type = Type.new(type)
+      queries = build_queries(query, start_date, end_date, threads_granularity, type)
       if threads > queries.size
-        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-
       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
 
+      logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"
 
-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}
 
       if threads > 1
+        @mutex = Mutex.new
        Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"
 
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
          raise Parallel::Break if stop_requested?
        end
      else
        queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }
+
          break if stop_requested?
        end
      end
 
+      logger.info "Return #{@all_tweets.size} tweets"
+
       @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
     end
 
-    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
-      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
     end
 
     def user_timeline(screen_name, limit: 100, order: 'desc')
-      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
     end
   end
 end
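
To make the granularity switch concrete, a sketch of what build_queries now yields (dates illustrative; 'auto' resolves to 'day' for ranges of 28 days or more and 'hour' otherwise):

    build_queries('#ruby', '2020-07-01', '2020-07-03', 'day', Type.new('search'))
    # => ["#ruby since:2020-07-01_00:00:00_UTC until:2020-07-02_00:00:00_UTC",
    #     "#ruby since:2020-07-02_00:00:00_UTC until:2020-07-03_00:00:00_UTC"]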
data/lib/twitterscraper/template.rb CHANGED
@@ -1,48 +1,58 @@
 module Twitterscraper
-  module Template
-    module_function
+  class Template
+    def tweets_embedded_html(name, tweets, options)
+      path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
+      template = ERB.new(File.read(path))
 
-    def tweets_embedded_html(tweets)
-      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
-      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+      tweets = tweets.sort_by { |t| t.created_at.to_i }
+      grouping = options['chart_grouping'] || 'auto'
+
+      template.result_with_hash(
+        chart_name: name,
+        chart_data: chart_data(tweets, grouping: grouping).to_json,
+        first_tweet: tweets[0],
+        last_tweet: tweets[-1],
+        tweets: tweets,
+        convert_limit: 30,
+      )
     end
 
-    EMBED_TWEET_HTML = <<~'HTML'
-      <blockquote class="twitter-tweet">
-        <a href="__TWEET_URL__"></a>
-      </blockquote>
-    HTML
-
-    EMBED_TWEETS_HTML = <<~'HTML'
-      <html>
-        <head>
-          <style type=text/css>
-            .twitter-tweet {
-              margin: 30px auto 0 auto !important;
-            }
-          </style>
-          <script>
-            window.twttr = (function(d, s, id) {
-              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
-              if (d.getElementById(id)) return t;
-              js = d.createElement(s);
-              js.id = id;
-              js.src = "https://platform.twitter.com/widgets.js";
-              fjs.parentNode.insertBefore(js, fjs);
-
-              t._e = [];
-              t.ready = function(f) {
-                t._e.push(f);
-              };
-
-              return t;
-            }(document, "script", "twitter-wjs"));
-          </script>
-        </head>
-        <body>
-        __TWEETS__
-        </body>
-      </html>
-    HTML
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
+          end
+        end
+      end
+
+      Twitterscraper.logger.info "Chart grouping #{grouping}"
+
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at
+
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
+        end
+        memo[time.to_i] += 1
+      end
+
+      data.sort_by { |k, _| k }.map do |timestamp, count|
+        [timestamp * 1000, count]
+      end
+    end
   end
 end
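
chart_data returns Highcharts-ready [epoch-milliseconds, count] pairs. A sketch, assuming three tweets all created within the 08:00 UTC hour of 2020-07-24:

    Template.new.chart_data(tweets, grouping: 'hour')
    # => [[1595577600000, 3]]  # bucket start in ms, tweets in that hour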
data/lib/twitterscraper/template/tweets.html.erb ADDED
@@ -0,0 +1,112 @@
+<!DOCTYPE html>
+<html lang="ja">
+<head>
+  <meta charset="UTF-8">
+
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
+  <script src="https://code.highcharts.com/stock/highstock.js"></script>
+  <script>
+    function updateTweets() {
+      window.twttr = (function (d, s, id) {
+        var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+        if (d.getElementById(id)) return t;
+        js = d.createElement(s);
+        js.id = id;
+        js.src = "https://platform.twitter.com/widgets.js";
+        fjs.parentNode.insertBefore(js, fjs);
+
+        t._e = [];
+        t.ready = function (f) {
+          t._e.push(f);
+        };
+
+        return t;
+      }(document, "script", "twitter-wjs"));
+    }
+
+    function drawChart() {
+      Highcharts.setOptions({
+        time: {
+          timezone: moment.tz.guess()
+        }
+      });
+
+      var data = <%= chart_data %>;
+      var config = {
+        title: {
+          text: '<%= tweets.size %> tweets of <%= chart_name %>'
+        },
+        subtitle: {
+          text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
+        },
+        series: [{
+          data: data
+        }],
+        rangeSelector: {enabled: false},
+        scrollbar: {enabled: false},
+        navigator: {enabled: false},
+        exporting: {enabled: false},
+        credits: {enabled: false}
+      };
+
+      Highcharts.stockChart('chart-container', config);
+    }
+
+    document.addEventListener("DOMContentLoaded", function () {
+      drawChart();
+      updateTweets();
+    });
+  </script>
+
+  <style type=text/css>
+    #chart-container {
+      max-width: 1200px;
+      height: 675px;
+      margin: 0 auto;
+      border: 1px solid rgb(204, 214, 221);
+      display: flex;
+      justify-content: center;
+      align-items: center;
+    }
+    .tweets-container {
+      max-width: 550px;
+      margin: 0 auto 0 auto;
+    }
+
+    .twitter-tweet {
+      margin: 15px 0 15px 0 !important;
+    }
+  </style>
+</head>
+<body>
+<div id="chart-container"><div style="color: gray;">Loading...</div></div>
+
+<div class="tweets-container">
+  <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
+    <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+    <% if i < convert_limit %>
+      <blockquote class="twitter-tweet">
+    <% else %>
+      <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+    <% end %>
+
+    <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+      <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+      <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+      <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+    </div>
+
+    <div><%= tweet.text %></div>
+    <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
+
+    <% if i < convert_limit %>
+      </blockquote>
+    <% else %>
+      </div>
+    <% end %>
+  <% end %>
+</div>
+
+</body>
+</html>
data/lib/twitterscraper/tweet.rb CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
       :screen_name,
       :name,
       :user_id,
+      :profile_image_url,
       :tweet_id,
       :text,
       :links,
@@ -51,6 +52,11 @@ module Twitterscraper
         end
       end
 
+      # .js-stream-item
+      # .js-stream-tweet{data: {screen-name:, tweet-id:}}
+      # .stream-item-header
+      # .js-tweet-text-container
+      # .stream-item-footer
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
       end
 
       inner_html = Nokogiri::HTML(html.inner_html)
+
+      profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
       text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
       links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
       image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
         screen_name: screen_name,
         name: html.attr('data-name'),
         user_id: html.attr('data-user-id').to_i,
+        profile_image_url: profile_image_url,
         tweet_id: tweet_id,
         text: text,
         links: links,
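
The gsub on the avatar src strips the `_bigger` size suffix, which is why the README example above shows a bare image URL; the transformation in isolation:

    src = 'https://pbs.twimg.com/profile_images/1826000000/0000_bigger.png'
    src.gsub(/_bigger/, '') # => "https://pbs.twimg.com/profile_images/1826000000/0000.png"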
data/lib/twitterscraper/type.rb CHANGED
@@ -11,5 +11,9 @@ module Twitterscraper
     def user?
       @value == 'user'
     end
+
+    def to_s
+      @value
+    end
   end
 end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.15.2'
+  VERSION = '0.20.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.15.2
+  version: 0.20.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-17 00:00:00.000000000 Z
+date: 2020-07-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,6 +72,7 @@ files:
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
 - lib/twitterscraper/template.rb
+- lib/twitterscraper/template/tweets.html.erb
 - lib/twitterscraper/tweet.rb
 - lib/twitterscraper/type.rb
 - lib/version.rb