twitterscraper-ruby 0.16.0 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 66dda5275a9067d328f6637f127895ded954534d304e5e4b349f286a271a08d8
4
- data.tar.gz: 6c3ffb3fba82376fc2de49514245ea96c7cb4fa16c32dcd2fff1ab1ae327bd14
3
+ metadata.gz: 8cb289da12a175a02664132b076349edf457585141b4bd196f1e2fb78ea69587
4
+ data.tar.gz: e9b3c55ee7e096b26746473d0b0dd12b7e259331daa6cae304571f902de72dd9
5
5
  SHA512:
6
- metadata.gz: 24267284f4f29adc86d5bbe70a30bbe31d6d898546576065f1a9accafc3944a352117bbf6eb0de273743a00fb2d26c5cf37ed016cc0324187a25ca279230d812
7
- data.tar.gz: 0bc9f01659560c83b0289bf63119849135b7ec27520dd03c7abd645da99ef660ca4b5fd12301b359cd5cc45a82914d7ceae88ad93ad756fde166718b3d0fe6c2
6
+ metadata.gz: 49a0d32d438c6c202257b733a877624429771cd6a57d3981716df5e0d946fc4b1af87f18d9029d5b43fb9df65a2a4a06579851d979514302999b85612d01f3e5
7
+ data.tar.gz: a59d26670417db7c57d203486b04798510a5f1c85468dde63aa1e7875c4151bc56169a7090ad745b14a08d473e8882a24e75b9ab0d04ae4bcece5912c8c2a3a5
data/.gitignore CHANGED
@@ -8,3 +8,4 @@
8
8
  /tmp/
9
9
  /cache
10
10
  /.idea
11
+ .DS_Store
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.16.0)
4
+ twitterscraper-ruby (0.20.1)
5
5
  nokogiri
6
6
  parallel
7
7
 
data/README.md CHANGED
@@ -98,6 +98,7 @@ end
98
98
  "screen_name": "@name",
99
99
  "name": "Name",
100
100
  "user_id": 12340000,
101
+ "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
101
102
  "tweet_id": 1234000000000000,
102
103
  "text": "Thanks Twitter!",
103
104
  "links": [],
@@ -122,6 +123,7 @@ end
122
123
  - screen_name
123
124
  - name
124
125
  - user_id
126
+ - profile_image_url
125
127
  - tweet_id
126
128
  - text
127
129
  - links
@@ -173,6 +175,8 @@ Search operators documentation is in [Standard search operators](https://develop
173
175
  | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
174
176
  | `--order` | string | Sort a order of the results. | desc(default) or asc |
175
177
  | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
178
+ | `--threads_granularity` | string | day or hour | auto |
179
+ | `--chart_grouping` | string | day, hour or minute | auto |
176
180
  | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
177
181
  | `--cache` | boolean | Enable caching. | true(default) or false |
178
182
  | `--format` | string | The format of the output. | json(default) or html |
@@ -4,7 +4,7 @@ require 'digest/md5'
4
4
  module Twitterscraper
5
5
  class Cache
6
6
  def initialize()
7
- @ttl = 86400 # 1 day
7
+ @ttl = 86400 * 3 # 3 day
8
8
  @dir = 'cache'
9
9
  Dir.mkdir(@dir) unless File.exist?(@dir)
10
10
  end
@@ -25,6 +25,12 @@ module Twitterscraper
25
25
  File.write(file, entry.to_json)
26
26
  end
27
27
 
28
+ def exist?(key)
29
+ key = cache_key(key)
30
+ file = File.join(@dir, key)
31
+ File.exist?(file)
32
+ end
33
+
28
34
  def delete(key)
29
35
  key = cache_key(key)
30
36
  file = File.join(@dir, key)
@@ -24,6 +24,7 @@ module Twitterscraper
24
24
  daily_limit: options['daily_limit'],
25
25
  order: options['order'],
26
26
  threads: options['threads'],
27
+ threads_granularity: options['threads_granularity'],
27
28
  }
28
29
  client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
29
30
  tweets = client.query_tweets(options['query'], query_options)
@@ -31,14 +32,17 @@ module Twitterscraper
31
32
  end
32
33
 
33
34
  def export(name, tweets)
34
- write_json = lambda { File.write(options['output'], generate_json(tweets)) }
35
-
36
- if options['format'] == 'json'
37
- write_json.call
38
- elsif options['format'] == 'html'
39
- File.write(options['output'], Template.new.tweets_embedded_html(name, tweets, options))
40
- else
41
- write_json.call
35
+ options['format'].split(',').map(&:strip).each do |format|
36
+ file = build_output_name(format, options)
37
+ Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
38
+
39
+ if format == 'json'
40
+ File.write(file, generate_json(tweets))
41
+ elsif format == 'html'
42
+ File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
43
+ else
44
+ puts "Invalid format #{format}"
45
+ end
42
46
  end
43
47
  end
44
48
 
@@ -69,6 +73,8 @@ module Twitterscraper
69
73
  'daily_limit:',
70
74
  'order:',
71
75
  'threads:',
76
+ 'threads_granularity:',
77
+ 'chart_grouping:',
72
78
  'output:',
73
79
  'format:',
74
80
  'cache:',
@@ -82,10 +88,10 @@ module Twitterscraper
82
88
  options['lang'] ||= ''
83
89
  options['limit'] = (options['limit'] || 100).to_i
84
90
  options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
85
- options['threads'] = (options['threads'] || 2).to_i
91
+ options['threads'] = (options['threads'] || 10).to_i
92
+ options['threads_granularity'] ||= 'auto'
86
93
  options['format'] ||= 'json'
87
94
  options['order'] ||= 'desc'
88
- options['output'] ||= "tweets.#{options['format']}"
89
95
 
90
96
  options['cache'] = options['cache'] != 'false'
91
97
  options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +99,13 @@ module Twitterscraper
93
99
  options
94
100
  end
95
101
 
102
+ def build_output_name(format, options)
103
+ query = options['query'].gsub(/[ :?#&]/, '_')
104
+ date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
105
+ file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
106
+ File.join('out', file)
107
+ end
108
+
96
109
  def initialize_logger
97
110
  Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
98
111
  end
@@ -2,9 +2,31 @@ module Twitterscraper
2
2
  class Client
3
3
  include Query
4
4
 
5
+ USER_AGENT_LIST = [
6
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
7
+ 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
8
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
9
+ 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
10
+ 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
11
+ ]
12
+
5
13
  def initialize(cache: true, proxy: true)
14
+ @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
15
+ Twitterscraper.logger.info "Headers #{@request_headers}"
16
+
6
17
  @cache = cache
7
- @proxy = proxy
18
+
19
+ if (@proxy = proxy)
20
+ @proxies = Proxy::Pool.new
21
+ Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
22
+ else
23
+ @proxies = []
24
+ Twitterscraper.logger.debug 'Proxy disabled'
25
+ end
26
+ end
27
+
28
+ def request_headers
29
+ @request_headers
8
30
  end
9
31
 
10
32
  def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
14
36
  def proxy_enabled?
15
37
  @proxy
16
38
  end
39
+
40
+ def proxies
41
+ @proxies
42
+ end
17
43
  end
18
44
  end
@@ -10,14 +10,6 @@ module Twitterscraper
10
10
  module Query
11
11
  include Logger
12
12
 
13
- USER_AGENT_LIST = [
14
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
15
- 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
16
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
17
- 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
18
- 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
19
- ]
20
-
21
13
  INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
22
14
  RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
23
15
  'default&include_available_features=1&include_entities=1&' +
@@ -43,13 +35,13 @@ module Twitterscraper
43
35
  end
44
36
  end
45
37
 
46
- def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
38
+ def get_single_page(url, timeout = 6, retries = 30)
47
39
  return nil if stop_requested?
48
- unless proxies.empty?
40
+ if proxy_enabled?
49
41
  proxy = proxies.sample
50
42
  logger.info("Using proxy #{proxy}")
51
43
  end
52
- Http.get(url, headers, proxy, timeout)
44
+ Http.get(url, request_headers, proxy, timeout)
53
45
  rescue => e
54
46
  logger.debug "get_single_page: #{e.inspect}"
55
47
  if (retries -= 1) > 0
@@ -74,24 +66,28 @@ module Twitterscraper
74
66
  [items_html, json_resp]
75
67
  end
76
68
 
77
- def query_single_page(query, lang, type, pos, headers: [], proxies: [])
69
+ def query_single_page(query, lang, type, pos)
78
70
  logger.info "Querying #{query}"
79
- query = ERB::Util.url_encode(query)
71
+ encoded_query = ERB::Util.url_encode(query)
80
72
 
81
- url = build_query_url(query, lang, type, pos)
73
+ url = build_query_url(encoded_query, lang, type, pos)
82
74
  http_request = lambda do
83
- logger.debug "Scraping tweets from #{url}"
84
- get_single_page(url, headers, proxies)
75
+ logger.debug "Scraping tweets from url=#{url}"
76
+ get_single_page(url)
85
77
  end
86
78
 
87
79
  if cache_enabled?
88
80
  client = Cache.new
89
81
  if (response = client.read(url))
90
- logger.debug 'Fetching tweets from cache'
82
+ logger.debug "Fetching tweets from cache url=#{url}"
91
83
  else
92
84
  response = http_request.call
93
85
  client.write(url, response) unless stop_requested?
94
86
  end
87
+ if @queries && query == @queries.last && pos.nil?
88
+ logger.debug "Delete a cache query=#{query}"
89
+ client.delete(url)
90
+ end
95
91
  else
96
92
  response = http_request.call
97
93
  end
@@ -135,132 +131,151 @@ module Twitterscraper
135
131
  if start_date && end_date
136
132
  if start_date == end_date
137
133
  raise Error.new('Please specify different values for :start_date and :end_date.')
138
- elsif start_date > end_date
134
+ elsif Date.parse(start_date) > Date.parse(end_date)
139
135
  raise Error.new(':start_date must occur before :end_date.')
140
136
  end
141
137
  end
142
138
 
143
139
  if start_date
144
- if start_date < OLDEST_DATE
140
+ if Date.parse(start_date) < OLDEST_DATE
145
141
  raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
146
142
  end
147
143
  end
148
144
  end
149
145
 
150
- def build_queries(query, start_date, end_date)
151
- if start_date && end_date
152
- # date_range = start_date.upto(end_date - 1)
153
- # date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
146
+ def build_queries(query, start_date, end_date, threads_granularity, type)
147
+ if type.search?
148
+ start_date = Date.parse(start_date) if start_date.is_a?(String)
149
+ end_date = Date.parse(end_date) if end_date.is_a?(String)
150
+ elsif type.user?
151
+ start_date = nil
152
+ end_date = nil
153
+ end
154
154
 
155
- queries = []
156
- time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
157
- end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
155
+ if start_date && end_date
156
+ if threads_granularity == 'auto'
157
+ threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
158
+ end
158
159
 
159
- while true
160
- if time < Time.now.utc
161
- queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
160
+ if threads_granularity == 'day'
161
+ date_range = start_date.upto(end_date - 1)
162
+ queries = date_range.map { |date| query + " since:#{date}_00:00:00_UTC until:#{date + 1}_00:00:00_UTC" }
163
+ elsif threads_granularity == 'hour'
164
+ time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
165
+ end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
166
+ queries = []
167
+
168
+ while true
169
+ if time < Time.now.utc
170
+ queries << (query + " since:#{time.strftime('%Y-%m-%d_%H')}:00:00_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H')}:00:00_UTC")
171
+ end
172
+ time += 3600
173
+ break if time >= end_time
162
174
  end
163
- time += 3600
164
- break if time >= end_time
175
+ else
176
+ raise Error.new("Invalid :threads_granularity value=#{threads_granularity}")
165
177
  end
166
178
 
167
- queries
179
+ @queries = queries
168
180
 
169
181
  elsif start_date
170
- [query + " since:#{start_date}"]
182
+ [query + " since:#{start_date}_00:00:00_UTC"]
171
183
  elsif end_date
172
- [query + " until:#{end_date}"]
184
+ [query + " until:#{end_date}_00:00:00_UTC"]
173
185
  else
174
186
  [query]
175
187
  end
176
188
  end
177
189
 
178
- def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
190
+ def main_loop(query, lang, type, limit, daily_limit)
179
191
  pos = nil
180
- daily_tweets = []
192
+ tmp_tweets = []
181
193
 
182
194
  while true
183
- new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
195
+ new_tweets, new_pos = query_single_page(query, lang, type, pos)
184
196
  unless new_tweets.empty?
185
- daily_tweets.concat(new_tweets)
186
- daily_tweets.uniq! { |t| t.tweet_id }
197
+ tmp_tweets.concat(new_tweets)
198
+ tmp_tweets.uniq! { |t| t.tweet_id }
199
+ end
187
200
 
188
- @mutex.synchronize {
189
- @all_tweets.concat(new_tweets)
190
- @all_tweets.uniq! { |t| t.tweet_id }
191
- }
201
+ @results_counter[Parallel.worker_number] = tmp_tweets.size
202
+ total_size = @all_tweets.size + @results_counter.values.sum
203
+ logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
204
+
205
+ if !@stop_requested && total_size >= limit
206
+ logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
207
+ @stop_requested = true
192
208
  end
193
- logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
194
209
 
195
210
  break unless new_pos
196
- break if daily_limit && daily_tweets.size >= daily_limit
211
+ break if @stop_requested
212
+ break if daily_limit && tmp_tweets.size >= daily_limit
197
213
  break if @all_tweets.size >= limit
198
214
 
199
215
  pos = new_pos
200
216
  end
201
217
 
202
- if !@stop_requested && @all_tweets.size >= limit
203
- logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
204
- @stop_requested = true
205
- end
218
+ tmp_tweets
206
219
  end
207
220
 
208
221
  def stop_requested?
209
222
  @stop_requested
210
223
  end
211
224
 
212
- def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
213
- start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
214
- end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
215
- queries = build_queries(query, start_date, end_date)
225
+ def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
216
226
  type = Type.new(type)
227
+ queries = build_queries(query, start_date, end_date, threads_granularity, type)
217
228
  if threads > queries.size
218
229
  threads = queries.size
219
230
  end
220
- if proxy_enabled?
221
- proxies = Proxy::Pool.new
222
- logger.debug "Fetch #{proxies.size} proxies"
223
- else
224
- proxies = []
225
- logger.debug 'Proxy disabled'
226
- end
227
231
  logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
228
232
 
229
233
  validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
230
234
 
235
+ logger.info "The number of queries #{queries.size}"
231
236
  logger.info "The number of threads #{threads}"
232
237
 
233
- headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
234
- logger.info "Headers #{headers}"
235
-
236
238
  @all_tweets = []
237
- @mutex = Mutex.new
238
239
  @stop_requested = false
240
+ @results_counter = {}
239
241
 
240
242
  if threads > 1
243
+ @mutex = Mutex.new
241
244
  Thread.abort_on_exception = true
242
245
  logger.debug "Set 'Thread.abort_on_exception' to true"
243
246
 
244
247
  Parallel.each(queries, in_threads: threads) do |query|
245
- main_loop(query, lang, type, limit, daily_limit, headers, proxies)
248
+ @results_counter[Parallel.worker_number] = 0
249
+ tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
250
+ @mutex.synchronize {
251
+ @all_tweets.concat(tmp_tweets)
252
+ @all_tweets.uniq! { |t| t.tweet_id }
253
+ }
254
+ @results_counter[Parallel.worker_number] = 0
255
+
246
256
  raise Parallel::Break if stop_requested?
247
257
  end
248
258
  else
249
259
  queries.each do |query|
250
- main_loop(query, lang, type, limit, daily_limit, headers, proxies)
260
+ tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
261
+ @all_tweets.concat(tmp_tweets)
262
+ @all_tweets.uniq! { |t| t.tweet_id }
263
+
251
264
  break if stop_requested?
252
265
  end
253
266
  end
254
267
 
268
+ logger.info "Return #{@all_tweets.size} tweets"
269
+
255
270
  @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
256
271
  end
257
272
 
258
- def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
259
- query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
273
+ def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
274
+ query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
260
275
  end
261
276
 
262
277
  def user_timeline(screen_name, limit: 100, order: 'desc')
263
- query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
278
+ query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
264
279
  end
265
280
  end
266
281
  end
@@ -4,25 +4,53 @@ module Twitterscraper
4
4
  path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
5
5
  template = ERB.new(File.read(path))
6
6
 
7
+ tweets = tweets.sort_by { |t| t.created_at.to_i }
8
+ grouping = options['chart_grouping'] || 'auto'
9
+
7
10
  template.result_with_hash(
8
11
  chart_name: name,
9
- chart_data: chart_data(tweets).to_json,
10
- first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
11
- last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
12
- tweets_size: tweets.size,
13
- tweets: tweets.take(50)
12
+ chart_data: chart_data(tweets, grouping: grouping).to_json,
13
+ first_tweet: tweets[0],
14
+ last_tweet: tweets[-1],
15
+ tweets: tweets,
16
+ convert_limit: 30,
14
17
  )
15
18
  end
16
19
 
17
- def chart_data(tweets)
20
+ def chart_data(tweets, grouping: 'auto')
21
+ if grouping && tweets.size > 100
22
+ if grouping == 'auto'
23
+ month = 28 * 24 * 60 * 60 # 28 days
24
+ duration = tweets[-1].created_at - tweets[0].created_at
25
+
26
+ if duration > 3 * month
27
+ grouping = 'day'
28
+ elsif duration > month || tweets.size > 10000
29
+ grouping = 'hour'
30
+ else
31
+ grouping = 'minute'
32
+ end
33
+ end
34
+ end
35
+
36
+ Twitterscraper.logger.info "Chart grouping #{grouping}"
37
+
18
38
  data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
19
39
  t = tweet.created_at
20
- min = (t.min.to_f / 5).floor * 5
21
- time = Time.new(t.year, t.month, t.day, t.hour, min, 0, '+00:00')
40
+
41
+ if grouping == 'day'
42
+ time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
43
+ elsif grouping == 'hour'
44
+ time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
45
+ elsif grouping == 'minute'
46
+ time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
47
+ else
48
+ time = t
49
+ end
22
50
  memo[time.to_i] += 1
23
51
  end
24
52
 
25
- data.sort_by { |k, v| k }.map do |timestamp, count|
53
+ data.sort_by { |k, _| k }.map do |timestamp, count|
26
54
  [timestamp * 1000, count]
27
55
  end
28
56
  end
@@ -1,27 +1,30 @@
1
- <html>
1
+ <!DOCTYPE html>
2
+ <html lang="ja">
2
3
  <head>
3
- <script>
4
- window.twttr = (function (d, s, id) {
5
- var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
6
- if (d.getElementById(id)) return t;
7
- js = d.createElement(s);
8
- js.id = id;
9
- js.src = "https://platform.twitter.com/widgets.js";
10
- fjs.parentNode.insertBefore(js, fjs);
11
-
12
- t._e = [];
13
- t.ready = function (f) {
14
- t._e.push(f);
15
- };
16
-
17
- return t;
18
- }(document, "script", "twitter-wjs"));
19
- </script>
4
+ <meta charset="UTF-8">
20
5
 
21
6
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
22
7
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
23
8
  <script src="https://code.highcharts.com/stock/highstock.js"></script>
24
9
  <script>
10
+ function updateTweets() {
11
+ window.twttr = (function (d, s, id) {
12
+ var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
13
+ if (d.getElementById(id)) return t;
14
+ js = d.createElement(s);
15
+ js.id = id;
16
+ js.src = "https://platform.twitter.com/widgets.js";
17
+ fjs.parentNode.insertBefore(js, fjs);
18
+
19
+ t._e = [];
20
+ t.ready = function (f) {
21
+ t._e.push(f);
22
+ };
23
+
24
+ return t;
25
+ }(document, "script", "twitter-wjs"));
26
+ }
27
+
25
28
  function drawChart() {
26
29
  Highcharts.setOptions({
27
30
  time: {
@@ -29,30 +32,43 @@
29
32
  }
30
33
  });
31
34
 
32
- Highcharts.stockChart('chart', {
35
+ var data = <%= chart_data %>;
36
+ var config = {
33
37
  title: {
34
- text: '<%= tweets_size %> tweets of <%= chart_name %>'
38
+ text: '<%= tweets.size %> tweets of <%= chart_name %>'
35
39
  },
36
40
  subtitle: {
37
- text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
41
+ text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
38
42
  },
39
43
  series: [{
40
- data: <%= chart_data %>
44
+ data: data
41
45
  }],
42
46
  rangeSelector: {enabled: false},
43
47
  scrollbar: {enabled: false},
44
48
  navigator: {enabled: false},
45
49
  exporting: {enabled: false},
46
50
  credits: {enabled: false}
47
- });
51
+ };
52
+
53
+ Highcharts.stockChart('chart-container', config);
48
54
  }
49
55
 
50
56
  document.addEventListener("DOMContentLoaded", function () {
51
57
  drawChart();
58
+ updateTweets();
52
59
  });
53
60
  </script>
54
61
 
55
62
  <style type=text/css>
63
+ #chart-container {
64
+ max-width: 1200px;
65
+ height: 675px;
66
+ margin: 0 auto;
67
+ border: 1px solid rgb(204, 214, 221);
68
+ display: flex;
69
+ justify-content: center;
70
+ align-items: center;
71
+ }
56
72
  .tweets-container {
57
73
  max-width: 550px;
58
74
  margin: 0 auto 0 auto;
@@ -64,17 +80,31 @@
64
80
  </style>
65
81
  </head>
66
82
  <body>
67
- <div id="chart"></div>
83
+ <div id="chart-container"><div style="color: gray;">Loading...</div></div>
68
84
 
69
85
  <div class="tweets-container">
70
- <% tweets.each do |tweet| %>
71
- <blockquote class="twitter-tweet">
72
- <a href="<%= tweet.tweet_url %>"></a>
73
- </blockquote>
74
- <% end %>
86
+ <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
87
+ <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
88
+ <% if i < convert_limit %>
89
+ <blockquote class="twitter-tweet">
90
+ <% else %>
91
+ <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
92
+ <% end %>
93
+
94
+ <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
95
+ <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
96
+ <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
97
+ <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
98
+ </div>
99
+
100
+ <div><%= tweet.text %></div>
101
+ <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
75
102
 
76
- <% if tweets_size > tweets.size %>
77
- <div>and more!</div>
103
+ <% if i < convert_limit %>
104
+ </blockquote>
105
+ <% else %>
106
+ </div>
107
+ <% end %>
78
108
  <% end %>
79
109
  </div>
80
110
 
@@ -6,6 +6,7 @@ module Twitterscraper
6
6
  :screen_name,
7
7
  :name,
8
8
  :user_id,
9
+ :profile_image_url,
9
10
  :tweet_id,
10
11
  :text,
11
12
  :links,
@@ -51,6 +52,11 @@ module Twitterscraper
51
52
  end
52
53
  end
53
54
 
55
+ # .js-stream-item
56
+ # .js-stream-tweet{data: {screen-name:, tweet-id:}}
57
+ # .stream-item-header
58
+ # .js-tweet-text-container
59
+ # .stream-item-footer
54
60
  def from_html(text)
55
61
  html = Nokogiri::HTML(text)
56
62
  from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
72
78
  end
73
79
 
74
80
  inner_html = Nokogiri::HTML(html.inner_html)
81
+
82
+ profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
75
83
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
76
84
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
77
85
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
99
107
  screen_name: screen_name,
100
108
  name: html.attr('data-name'),
101
109
  user_id: html.attr('data-user-id').to_i,
110
+ profile_image_url: profile_image_url,
102
111
  tweet_id: tweet_id,
103
112
  text: text,
104
113
  links: links,
@@ -11,5 +11,9 @@ module Twitterscraper
11
11
  def user?
12
12
  @value == 'user'
13
13
  end
14
+
15
+ def to_s
16
+ @value
17
+ end
14
18
  end
15
19
  end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = '0.16.0'
2
+ VERSION = '0.20.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.0
4
+ version: 0.20.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-18 00:00:00.000000000 Z
11
+ date: 2021-04-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri