twitterscraper-ruby 0.15.1 → 0.19.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7f04cb0ba394884918271b5485b596c07203b7a6e9f4fec42d074ef4f02b6a0a
- data.tar.gz: a4f618df53d1e8b54954619e87d383e43dbe5a63bbf83b33ee38f975998f2678
+ metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+ data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
  SHA512:
- metadata.gz: fa9f02cf3ef0bf280f45b18ebacaec0b06dbd610477355602fcc59d382b5590c990695297e1e793457fdcff4cb7dd037f076c1f0fa4706eb69c67c3a165243e4
- data.tar.gz: 9c08d9e4d1ee56fa133675bc73a50f502040cc9a2844d9a46a39c38ccdffdf43c15b17c2e4a8b74561f523493ccbc4a055f0add239574d2f5129ee4abe1f5ed9
+ metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+ data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
data/.gitignore CHANGED
@@ -8,3 +8,4 @@
  /tmp/
  /cache
  /.idea
+ .DS_Store
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- twitterscraper-ruby (0.15.1)
+ twitterscraper-ruby (0.19.0)
  nokogiri
  parallel

data/README.md CHANGED
@@ -98,6 +98,7 @@ end
  "screen_name": "@name",
  "name": "Name",
  "user_id": 12340000,
+ "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
  "tweet_id": 1234000000000000,
  "text": "Thanks Twitter!",
  "links": [],
@@ -122,6 +123,7 @@ end
  - screen_name
  - name
  - user_id
+ - profile_image_url
  - tweet_id
  - text
  - links
@@ -173,6 +175,7 @@ Search operators documentation is in [Standard search operators](https://develop
  | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
  | `--order` | string | Sort order of the results. | desc(default) or asc |
  | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+ | `--threads_granularity` | string | Granularity used to split the date range into per-thread queries: day, hour, or auto. | auto |
  | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
  | `--cache` | boolean | Enable caching. | true(default) or false |
  | `--format` | string | The format of the output. | json(default) or html |
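
For reference, a minimal sketch of the same options through the Ruby API (the require name is an assumption; the option names come from the method signatures changed in this diff):

    require 'twitterscraper-ruby'

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    # threads_granularity: 'hour' splits the date range into one query per hour.
    tweets = client.search('ruby', start_date: '2020-07-01', end_date: '2020-07-10',
                           threads_granularity: 'hour', limit: 100)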
data/lib/twitterscraper.rb CHANGED
@@ -4,6 +4,7 @@ require 'twitterscraper/http'
  require 'twitterscraper/lang'
  require 'twitterscraper/cache'
  require 'twitterscraper/query'
+ require 'twitterscraper/type'
  require 'twitterscraper/client'
  require 'twitterscraper/tweet'
  require 'twitterscraper/template'
data/lib/twitterscraper/cache.rb CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
  module Twitterscraper
  class Cache
  def initialize()
- @ttl = 3600 # 1 hour
+ @ttl = 86400 * 3 # 3 days
  @dir = 'cache'
  Dir.mkdir(@dir) unless File.exist?(@dir)
  end
@@ -25,6 +25,18 @@ module Twitterscraper
  File.write(file, entry.to_json)
  end

+ def exist?(key)
+ key = cache_key(key)
+ file = File.join(@dir, key)
+ File.exist?(file)
+ end
+
+ def delete(key)
+ key = cache_key(key)
+ file = File.join(@dir, key)
+ File.delete(file) if File.exist?(file)
+ end
+
  def fetch(key, &block)
  if (value = read(key))
  value
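
A minimal usage sketch of the expanded cache API (read, write, and fetch already existed; exist? and delete are the additions — keys are hashed into file names internally):

    cache = Twitterscraper::Cache.new
    cache.write('https://example.com/search?q=ruby', '{"items_html": ""}')
    cache.exist?('https://example.com/search?q=ruby')  # => true
    cache.delete('https://example.com/search?q=ruby')  # removes the file if present
    cache.exist?('https://example.com/search?q=ruby')  # => false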
data/lib/twitterscraper/cli.rb CHANGED
@@ -24,21 +24,25 @@ module Twitterscraper
  daily_limit: options['daily_limit'],
  order: options['order'],
  threads: options['threads'],
+ threads_granularity: options['threads_granularity'],
  }
  client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
  tweets = client.query_tweets(options['query'], query_options)
- export(tweets) unless tweets.empty?
+ export(options['query'], tweets) unless tweets.empty?
  end

- def export(tweets)
- write_json = lambda { File.write(options['output'], generate_json(tweets)) }
-
- if options['format'] == 'json'
- write_json.call
- elsif options['format'] == 'html'
- File.write('tweets.html', Template.tweets_embedded_html(tweets))
- else
- write_json.call
+ def export(name, tweets)
+ options['format'].split(',').map(&:strip).each do |format|
+ file = build_output_name(format, options)
+ Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+ if format == 'json'
+ File.write(file, generate_json(tweets))
+ elsif format == 'html'
+ File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+ else
+ puts "Invalid format #{format}"
+ end
  end
  end

@@ -69,6 +73,7 @@ module Twitterscraper
  'daily_limit:',
  'order:',
  'threads:',
+ 'threads_granularity:',
  'output:',
  'format:',
  'cache:',
@@ -82,10 +87,10 @@ module Twitterscraper
  options['lang'] ||= ''
  options['limit'] = (options['limit'] || 100).to_i
  options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
- options['threads'] = (options['threads'] || 2).to_i
+ options['threads'] = (options['threads'] || 10).to_i
+ options['threads_granularity'] ||= 'auto'
  options['format'] ||= 'json'
  options['order'] ||= 'desc'
- options['output'] ||= "tweets.#{options['format']}"

  options['cache'] = options['cache'] != 'false'
  options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +98,13 @@ module Twitterscraper
  options
  end

+ def build_output_name(format, options)
+ query = options['query'].gsub(/[ :?#&]/, '_')
+ date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+ file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+ File.join('out', file)
+ end
+
  def initialize_logger
  Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
  end
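
A worked example of the new output naming, assuming options['type'] is 'search' (how options['type'] is set is not shown in this diff) and illustrative dates:

    # options = { 'type' => 'search', 'query' => 'ruby lang:ja',
    #             'start_date' => '2020-07-01', 'end_date' => '2020-07-10' }
    #
    # query = 'ruby_lang_ja'              # ' ' and ':' replaced with '_'
    # date  = '2020-07-01_2020-07-10'
    # => 'out/search_tweets_2020-07-01_2020-07-10_ruby_lang_ja.json'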
data/lib/twitterscraper/client.rb CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
  class Client
  include Query

+ USER_AGENT_LIST = [
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+ 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+ 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+ 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+ ]
+
  def initialize(cache: true, proxy: true)
+ @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+ Twitterscraper.logger.info "Headers #{@request_headers}"
+
  @cache = cache
- @proxy = proxy
+
+ if (@proxy = proxy)
+ @proxies = Proxy::Pool.new
+ Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+ else
+ @proxies = []
+ Twitterscraper.logger.debug 'Proxy disabled'
+ end
+ end
+
+ def request_headers
+ @request_headers
  end

  def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
  def proxy_enabled?
  @proxy
  end
+
+ def proxies
+ @proxies
+ end
  end
  end
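
In short, header sampling and proxy fetching now happen once per Client rather than on every query_tweets call; a small sketch of the resulting accessors:

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    client.proxy_enabled?   # => false
    client.proxies          # => []
    client.request_headers  # => {'User-Agent': <sampled UA>, 'X-Requested-With': 'XMLHttpRequest'}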
data/lib/twitterscraper/query.rb CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
  module Query
  include Logger

- USER_AGENT_LIST = [
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
- 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
- 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
- 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
- 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
- ]
-
  INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
  RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
  'default&include_available_features=1&include_entities=1&' +
@@ -28,7 +20,7 @@ module Twitterscraper
  'max_position=__POS__&reset_error_state=false'

  def build_query_url(query, lang, type, pos)
- if type == 'user'
+ if type.user?
  if pos
  RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
  else
@@ -43,13 +35,13 @@ module Twitterscraper
  end
  end

- def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+ def get_single_page(url, timeout = 6, retries = 30)
  return nil if stop_requested?
- unless proxies.empty?
+ if proxy_enabled?
  proxy = proxies.sample
  logger.info("Using proxy #{proxy}")
  end
- Http.get(url, headers, proxy, timeout)
+ Http.get(url, request_headers, proxy, timeout)
  rescue => e
  logger.debug "get_single_page: #{e.inspect}"
  if (retries -= 1) > 0
@@ -69,30 +61,33 @@ module Twitterscraper
  else
  json_resp = JSON.parse(text)
  items_html = json_resp['items_html'] || ''
- logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
  end

  [items_html, json_resp]
  end

- def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+ def query_single_page(query, lang, type, pos)
  logger.info "Querying #{query}"
- query = ERB::Util.url_encode(query)
+ encoded_query = ERB::Util.url_encode(query)

- url = build_query_url(query, lang, type, pos)
+ url = build_query_url(encoded_query, lang, type, pos)
  http_request = lambda do
- logger.debug "Scraping tweets from #{url}"
- get_single_page(url, headers, proxies)
+ logger.debug "Scraping tweets from url=#{url}"
+ get_single_page(url)
  end

  if cache_enabled?
  client = Cache.new
  if (response = client.read(url))
- logger.debug 'Fetching tweets from cache'
+ logger.debug "Fetching tweets from cache url=#{url}"
  else
  response = http_request.call
  client.write(url, response) unless stop_requested?
  end
+ if @queries && query == @queries.last && pos.nil?
+ logger.debug "Delete a cache query=#{query}"
+ client.delete(url)
+ end
  else
  response = http_request.call
  end
@@ -100,6 +95,12 @@ module Twitterscraper

  html, json_resp = parse_single_page(response, pos.nil?)

+ if json_resp && json_resp['message']
+ logger.warn json_resp['message'] # Sorry, you are rate limited.
+ @stop_requested = true
+ Cache.new.delete(url) if cache_enabled?
+ end
+
  tweets = Tweet.from_html(html)

  if tweets.empty?
@@ -108,7 +109,7 @@ module Twitterscraper

  if json_resp
  [tweets, json_resp['min_position']]
- elsif type
+ elsif type.user?
  [tweets, tweets[-1].tweet_id]
  else
  [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
@@ -140,19 +141,33 @@ module Twitterscraper
  raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
  end
  end
-
- if end_date
- today = Date.today
- if end_date > Date.today
- raise Error.new(":end_date must be less than or equal to today(#{today})")
- end
- end
  end

- def build_queries(query, start_date, end_date)
+ def build_queries(query, start_date, end_date, threads_granularity)
  if start_date && end_date
- date_range = start_date.upto(end_date - 1)
- date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+ if threads_granularity == 'auto'
+ threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+ end
+
+ if threads_granularity == 'day'
+ date_range = start_date.upto(end_date - 1)
+ queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+ elsif threads_granularity == 'hour'
+ time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+ end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+ queries = []
+
+ while true
+ if time < Time.now.utc
+ queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
+ end
+ time += 3600
+ break if time >= end_time
+ end
+ end
+
+ @queries = queries
+
  elsif start_date
  [query + " since:#{start_date}"]
  elsif end_date
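
To make the new splitting concrete, the query strings build_queries now produces for the query 'ruby' (dates illustrative):

    # 'day' granularity — one query per day in [start_date, end_date):
    #   "ruby since:2020-07-01 until:2020-07-02"
    #   "ruby since:2020-07-02 until:2020-07-03"
    #
    # 'hour' granularity — one query per hour, with UTC-suffixed timestamps:
    #   "ruby since:2020-07-01_00:00:00_UTC until:2020-07-01_01:00:00_UTC"
    #   "ruby since:2020-07-01_01:00:00_UTC until:2020-07-01_02:00:00_UTC"
    #
    # 'auto' picks 'day' when the range spans 28 days or more, otherwise 'hour'.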
@@ -162,93 +177,103 @@ module Twitterscraper
  end
  end

- def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+ def main_loop(query, lang, type, limit, daily_limit)
  pos = nil
- daily_tweets = []
+ tmp_tweets = []

  while true
- new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+ new_tweets, new_pos = query_single_page(query, lang, type, pos)
  unless new_tweets.empty?
- daily_tweets.concat(new_tweets)
- daily_tweets.uniq! { |t| t.tweet_id }
+ tmp_tweets.concat(new_tweets)
+ tmp_tweets.uniq! { |t| t.tweet_id }
+ end

- @mutex.synchronize {
- @all_tweets.concat(new_tweets)
- @all_tweets.uniq! { |t| t.tweet_id }
- }
+ @results_counter[Parallel.worker_number] = tmp_tweets.size
+ total_size = @all_tweets.size + @results_counter.values.sum
+ logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+ if !@stop_requested && total_size >= limit
+ logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+ @stop_requested = true
  end
- logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"

  break unless new_pos
- break if daily_limit && daily_tweets.size >= daily_limit
+ break if @stop_requested
+ break if daily_limit && tmp_tweets.size >= daily_limit
  break if @all_tweets.size >= limit

  pos = new_pos
  end

- if !@stop_requested && @all_tweets.size >= limit
- logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
- @stop_requested = true
- end
+ tmp_tweets
  end

  def stop_requested?
  @stop_requested
  end

- def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
- start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
- end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
- queries = build_queries(query, start_date, end_date)
+ def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+ type = Type.new(type)
+ if type.search?
+ start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+ end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+ elsif type.user?
+ start_date = nil
+ end_date = nil
+ end
+
+ queries = build_queries(query, start_date, end_date, threads_granularity)
  if threads > queries.size
- logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
  threads = queries.size
  end
- if proxy_enabled?
- proxies = Proxy::Pool.new
- logger.debug "Fetch #{proxies.size} proxies"
- else
- proxies = []
- logger.debug 'Proxy disabled'
- end
  logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"

-
  validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)

+ logger.info "The number of queries #{queries.size}"
  logger.info "The number of threads #{threads}"

- headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
- logger.info "Headers #{headers}"
-
  @all_tweets = []
- @mutex = Mutex.new
  @stop_requested = false
+ @results_counter = {}

  if threads > 1
+ @mutex = Mutex.new
  Thread.abort_on_exception = true
  logger.debug "Set 'Thread.abort_on_exception' to true"

  Parallel.each(queries, in_threads: threads) do |query|
- main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+ @results_counter[Parallel.worker_number] = 0
+ tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+ @mutex.synchronize {
+ @all_tweets.concat(tmp_tweets)
+ @all_tweets.uniq! { |t| t.tweet_id }
+ }
+ @results_counter[Parallel.worker_number] = 0
+
  raise Parallel::Break if stop_requested?
  end
  else
  queries.each do |query|
- main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+ tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+ @all_tweets.concat(tmp_tweets)
+ @all_tweets.uniq! { |t| t.tweet_id }
+
  break if stop_requested?
  end
  end

+ logger.info "Return #{@all_tweets.size} tweets"
+
  @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
  end

- def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
- query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+ def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+ query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
  end

  def user_timeline(screen_name, limit: 100, order: 'desc')
- query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+ query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
  end
  end
  end
data/lib/twitterscraper/template.rb CHANGED
@@ -1,48 +1,57 @@
  module Twitterscraper
- module Template
- module_function
+ class Template
+ def tweets_embedded_html(name, tweets, options)
+ path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
+ template = ERB.new(File.read(path))

- def tweets_embedded_html(tweets)
- tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
- EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+ tweets = tweets.sort_by { |t| t.created_at.to_i }
+
+ template.result_with_hash(
+ chart_name: name,
+ chart_data: chart_data(tweets).to_json,
+ first_tweet: tweets[0],
+ last_tweet: tweets[-1],
+ tweets: tweets,
+ convert_limit: 30,
+ )
  end

- EMBED_TWEET_HTML = <<~'HTML'
- <blockquote class="twitter-tweet">
- <a href="__TWEET_URL__"></a>
- </blockquote>
- HTML
-
- EMBED_TWEETS_HTML = <<~'HTML'
- <html>
- <head>
- <style type=text/css>
- .twitter-tweet {
- margin: 30px auto 0 auto !important;
- }
- </style>
- <script>
- window.twttr = (function(d, s, id) {
- var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
- if (d.getElementById(id)) return t;
- js = d.createElement(s);
- js.id = id;
- js.src = "https://platform.twitter.com/widgets.js";
- fjs.parentNode.insertBefore(js, fjs);
-
- t._e = [];
- t.ready = function(f) {
- t._e.push(f);
- };
-
- return t;
- }(document, "script", "twitter-wjs"));
- </script>
- </head>
- <body>
- __TWEETS__
- </body>
- </html>
- HTML
+ def chart_data(tweets, grouping: 'auto')
+ if grouping && tweets.size > 100
+ if grouping == 'auto'
+ month = 28 * 24 * 60 * 60 # 28 days
+ duration = tweets[-1].created_at - tweets[0].created_at
+
+ if duration > 3 * month
+ grouping = 'day'
+ elsif duration > month || tweets.size > 10000
+ grouping = 'hour'
+ else
+ grouping = 'minute'
+ end
+ end
+ end
+
+ Twitterscraper.logger.info "Chart grouping #{grouping}"
+
+ data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+ t = tweet.created_at
+
+ if grouping == 'day'
+ time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+ elsif grouping == 'hour'
+ time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+ elsif grouping == 'minute'
+ time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+ else
+ time = t
+ end
+ memo[time.to_i] += 1
+ end
+
+ data.sort_by { |k, _| k }.map do |timestamp, count|
+ [timestamp * 1000, count]
+ end
+ end
  end
  end
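
For orientation, chart_data returns millisecond-timestamp/count pairs ready for the Highcharts series in the template added below (values illustrative):

    # [[1593561600000, 12],   # [UNIX time in ms, tweets in that bucket]
    #  [1593565200000, 8]]
    #
    # With more than 100 tweets, 'auto' grouping picks 'day' for spans over
    # ~3 months, 'hour' for spans over a month (or more than 10,000 tweets),
    # and 'minute' otherwise.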
data/lib/twitterscraper/template/tweets.html.erb ADDED
@@ -0,0 +1,112 @@
+ <!DOCTYPE html>
+ <html lang="ja">
+ <head>
+ <meta charset="UTF-8">
+
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
+ <script src="https://code.highcharts.com/stock/highstock.js"></script>
+ <script>
+ function updateTweets() {
+ window.twttr = (function (d, s, id) {
+ var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+ if (d.getElementById(id)) return t;
+ js = d.createElement(s);
+ js.id = id;
+ js.src = "https://platform.twitter.com/widgets.js";
+ fjs.parentNode.insertBefore(js, fjs);
+
+ t._e = [];
+ t.ready = function (f) {
+ t._e.push(f);
+ };
+
+ return t;
+ }(document, "script", "twitter-wjs"));
+ }
+
+ function drawChart() {
+ Highcharts.setOptions({
+ time: {
+ timezone: moment.tz.guess()
+ }
+ });
+
+ var data = <%= chart_data %>;
+ var config = {
+ title: {
+ text: '<%= tweets.size %> tweets of <%= chart_name %>'
+ },
+ subtitle: {
+ text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
+ },
+ series: [{
+ data: data
+ }],
+ rangeSelector: {enabled: false},
+ scrollbar: {enabled: false},
+ navigator: {enabled: false},
+ exporting: {enabled: false},
+ credits: {enabled: false}
+ };
+
+ Highcharts.stockChart('chart-container', config);
+ }
+
+ document.addEventListener("DOMContentLoaded", function () {
+ drawChart();
+ updateTweets();
+ });
+ </script>
+
+ <style type=text/css>
+ #chart-container {
+ max-width: 1200px;
+ height: 675px;
+ margin: 0 auto;
+ border: 1px solid rgb(204, 214, 221);
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ }
+ .tweets-container {
+ max-width: 550px;
+ margin: 0 auto 0 auto;
+ }
+
+ .twitter-tweet {
+ margin: 15px 0 15px 0 !important;
+ }
+ </style>
+ </head>
+ <body>
+ <div id="chart-container"><div style="color: gray;">Loading...</div></div>
+
+ <div class="tweets-container">
+ <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
+ <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+ <% if i < convert_limit %>
+ <blockquote class="twitter-tweet">
+ <% else %>
+ <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+ <% end %>
+
+ <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+ <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+ <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+ <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+ </div>
+
+ <div><%= tweet.text %></div>
+ <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
+
+ <% if i < convert_limit %>
+ </blockquote>
+ <% else %>
+ </div>
+ <% end %>
+ <% end %>
+ </div>
+
+ </body>
+ </html>
data/lib/twitterscraper/tweet.rb CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
  :screen_name,
  :name,
  :user_id,
+ :profile_image_url,
  :tweet_id,
  :text,
  :links,
@@ -51,6 +52,11 @@ module Twitterscraper
  end
  end

+ # .js-stream-item
+ # .js-stream-tweet{data: {screen-name:, tweet-id:}}
+ # .stream-item-header
+ # .js-tweet-text-container
+ # .stream-item-footer
  def from_html(text)
  html = Nokogiri::HTML(text)
  from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
  end

  inner_html = Nokogiri::HTML(html.inner_html)
+
+ profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
  screen_name: screen_name,
  name: html.attr('data-name'),
  user_id: html.attr('data-user-id').to_i,
+ profile_image_url: profile_image_url,
  tweet_id: tweet_id,
  text: text,
  links: links,
data/lib/twitterscraper/type.rb ADDED
@@ -0,0 +1,19 @@
+ module Twitterscraper
+ class Type
+ def initialize(value)
+ @value = value
+ end
+
+ def search?
+ @value == 'search'
+ end
+
+ def user?
+ @value == 'user'
+ end
+
+ def to_s
+ @value
+ end
+ end
+ end
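
A quick sketch of the new value object, which replaces the string comparisons (type == 'user') previously scattered through query.rb:

    type = Twitterscraper::Type.new('user')
    type.user?    # => true
    type.search?  # => false
    type.to_s     # => "user"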
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Twitterscraper
- VERSION = '0.15.1'
+ VERSION = '0.19.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: twitterscraper-ruby
  version: !ruby/object:Gem::Version
- version: 0.15.1
+ version: 0.19.0
  platform: ruby
  authors:
  - ts-3156
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-07-17 00:00:00.000000000 Z
+ date: 2020-07-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -72,7 +72,9 @@ files:
  - lib/twitterscraper/proxy.rb
  - lib/twitterscraper/query.rb
  - lib/twitterscraper/template.rb
+ - lib/twitterscraper/template/tweets.html.erb
  - lib/twitterscraper/tweet.rb
+ - lib/twitterscraper/type.rb
  - lib/version.rb
  - twitterscraper-ruby.gemspec
  homepage: https://github.com/ts-3156/twitterscraper-ruby