twitterscraper-ruby 0.15.1 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7f04cb0ba394884918271b5485b596c07203b7a6e9f4fec42d074ef4f02b6a0a
-  data.tar.gz: a4f618df53d1e8b54954619e87d383e43dbe5a63bbf83b33ee38f975998f2678
+  metadata.gz: 2056b4a3d9fe7af49429e35b3a1688256fb31b74cabab841a4dd2376a79889d5
+  data.tar.gz: aaaf949da2ba2ae07a0d66e981aebc635c18120de06be705f96c19c92c309911
 SHA512:
-  metadata.gz: fa9f02cf3ef0bf280f45b18ebacaec0b06dbd610477355602fcc59d382b5590c990695297e1e793457fdcff4cb7dd037f076c1f0fa4706eb69c67c3a165243e4
-  data.tar.gz: 9c08d9e4d1ee56fa133675bc73a50f502040cc9a2844d9a46a39c38ccdffdf43c15b17c2e4a8b74561f523493ccbc4a055f0add239574d2f5129ee4abe1f5ed9
+  metadata.gz: c60824e4c1c0021a3e27451b1708a77bd2e15dd6258fce63ac1b95111d0230c8ab7317bcd76c2faf14d02ebe75ab8d7453924e01eee7d3fcb46eef374f16c575
+  data.tar.gz: 984204bd430b41b76a2d9108df4e778e2bb242010ebd18569bcb662473496826644ba5693db1d475d565bff49a3de7f0eb95fd4c9a3da9e5ed4d6a6219ebb62e
data/.gitignore CHANGED
@@ -8,3 +8,4 @@
 /tmp/
 /cache
 /.idea
+.DS_Store
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.15.1)
+    twitterscraper-ruby (0.19.0)
       nokogiri
       parallel
 
data/README.md CHANGED
@@ -98,6 +98,7 @@ end
     "screen_name": "@name",
     "name": "Name",
     "user_id": 12340000,
+    "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
     "tweet_id": 1234000000000000,
     "text": "Thanks Twitter!",
     "links": [],
@@ -122,6 +123,7 @@ end
 - screen_name
 - name
 - user_id
+- profile_image_url
 - tweet_id
 - text
 - links
@@ -173,6 +175,7 @@ Search operators documentation is in [Standard search operators](https://develop
 | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--order` | string | Sort a order of the results. | desc(default) or asc |
 | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+| `--threads_granularity` | string | | auto |
 | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
 | `--cache` | boolean | Enable caching. | true(default) or false |
 | `--format` | string | The format of the output. | json(default) or html |
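
Note: the `--threads_granularity` row above has an empty description. Judging from `build_queries` later in this diff, when both `--start_date` and `--end_date` are given the range is split into one sub-query per day or per hour, and `auto` picks `day` for ranges of 28 days or more, otherwise `hour`. A minimal sketch of the equivalent API call (values are illustrative):

    client = Twitterscraper::Client.new
    client.search('ruby', start_date: '2020-07-01', end_date: '2020-07-10', threads_granularity: 'hour')
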
data/lib/twitterscraper.rb CHANGED
@@ -4,6 +4,7 @@ require 'twitterscraper/http'
 require 'twitterscraper/lang'
 require 'twitterscraper/cache'
 require 'twitterscraper/query'
+require 'twitterscraper/type'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
 require 'twitterscraper/template'
data/lib/twitterscraper/cache.rb CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
 module Twitterscraper
   class Cache
     def initialize()
-      @ttl = 3600 # 1 hour
+      @ttl = 86400 * 3 # 3 days
       @dir = 'cache'
       Dir.mkdir(@dir) unless File.exist?(@dir)
     end
@@ -25,6 +25,18 @@ module Twitterscraper
       File.write(file, entry.to_json)
     end
 
+    def exist?(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.exist?(file)
+    end
+
+    def delete(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      File.delete(file) if File.exist?(file)
+    end
+
     def fetch(key, &block)
       if (value = read(key))
         value
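
The new `exist?` and `delete` round out this file-backed cache. A minimal usage sketch (`url` and `fetch_page` are illustrative stand-ins):

    cache = Twitterscraper::Cache.new            # entries live under ./cache with a 3-day TTL
    value = cache.fetch(url) { fetch_page(url) } # read-through: the block runs only on a miss
    cache.exist?(url)                            # => true once written
    cache.delete(url)                            # query.rb below uses this to evict rate-limited responses
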
data/lib/twitterscraper/cli.rb CHANGED
@@ -24,21 +24,25 @@
         daily_limit: options['daily_limit'],
         order: options['order'],
         threads: options['threads'],
+        threads_granularity: options['threads_granularity'],
       }
       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-      export(tweets) unless tweets.empty?
+      export(options['query'], tweets) unless tweets.empty?
     end
 
-    def export(tweets)
-      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
-
-      if options['format'] == 'json'
-        write_json.call
-      elsif options['format'] == 'html'
-        File.write('tweets.html', Template.tweets_embedded_html(tweets))
-      else
-        write_json.call
+    def export(name, tweets)
+      options['format'].split(',').map(&:strip).each do |format|
+        file = build_output_name(format, options)
+        Dir.mkdir(File.dirname(file)) unless File.exist?(File.dirname(file))
+
+        if format == 'json'
+          File.write(file, generate_json(tweets))
+        elsif format == 'html'
+          File.write(file, Template.new.tweets_embedded_html(name, tweets, options))
+        else
+          puts "Invalid format #{format}"
+        end
       end
     end
 
@@ -69,6 +73,7 @@ module Twitterscraper
       'daily_limit:',
       'order:',
       'threads:',
+      'threads_granularity:',
       'output:',
       'format:',
       'cache:',
@@ -82,10 +87,10 @@
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
-      options['threads'] = (options['threads'] || 2).to_i
+      options['threads'] = (options['threads'] || 10).to_i
+      options['threads_granularity'] ||= 'auto'
       options['format'] ||= 'json'
       options['order'] ||= 'desc'
-      options['output'] ||= "tweets.#{options['format']}"
 
       options['cache'] = options['cache'] != 'false'
       options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +98,13 @@
       options
     end
 
+    def build_output_name(format, options)
+      query = options['query'].gsub(/[ :?#&]/, '_')
+      date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+      file = [options['type'], 'tweets', date, query].compact.join('_') + '.' + format
+      File.join('out', file)
+    end
+
     def initialize_logger
       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
     end
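
For illustration, assuming `options['type']` is `"search"` and the run was invoked with `--query 'ruby lang:ja' --start_date 2020-07-01 --end_date 2020-07-10`, the new naming scheme yields:

    build_output_name('json', options)
    # => "out/search_tweets_2020-07-01_2020-07-10_ruby_lang_ja.json"

Since `--format` now accepts a comma-separated list (e.g. `json,html`), one run can write both files under `out/`; the old `tweets.<format>` default for `--output` is gone.
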
data/lib/twitterscraper/client.rb CHANGED
@@ -2,9 +2,31 @@ module Twitterscraper
   class Client
     include Query
 
+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
     def initialize(cache: true, proxy: true)
+      @request_headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      Twitterscraper.logger.info "Headers #{@request_headers}"
+
       @cache = cache
-      @proxy = proxy
+
+      if (@proxy = proxy)
+        @proxies = Proxy::Pool.new
+        Twitterscraper.logger.debug "Fetch #{@proxies.size} proxies"
+      else
+        @proxies = []
+        Twitterscraper.logger.debug 'Proxy disabled'
+      end
+    end
+
+    def request_headers
+      @request_headers
     end
 
     def cache_enabled?
@@ -14,5 +36,9 @@ module Twitterscraper
     def proxy_enabled?
       @proxy
     end
+
+    def proxies
+      @proxies
+    end
   end
 end
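
The User-Agent rotation and proxy pool move from Query onto the client, so each instance picks one random UA at construction and exposes it via `request_headers`. A quick sketch:

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    client.request_headers # => {'User-Agent': <one of USER_AGENT_LIST>, 'X-Requested-With': 'XMLHttpRequest'}
    client.proxies         # => [] here; a Proxy::Pool instance when proxy: true
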
data/lib/twitterscraper/query.rb CHANGED
@@ -10,14 +10,6 @@ module Twitterscraper
   module Query
     include Logger
 
-    USER_AGENT_LIST = [
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
-    ]
-
     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
@@ -28,7 +20,7 @@ module Twitterscraper
         'max_position=__POS__&reset_error_state=false'
 
     def build_query_url(query, lang, type, pos)
-      if type == 'user'
+      if type.user?
         if pos
           RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
         else
@@ -43,13 +35,13 @@
       end
     end
 
-    def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+    def get_single_page(url, timeout = 6, retries = 30)
       return nil if stop_requested?
-      unless proxies.empty?
+      if proxy_enabled?
         proxy = proxies.sample
         logger.info("Using proxy #{proxy}")
       end
-      Http.get(url, headers, proxy, timeout)
+      Http.get(url, request_headers, proxy, timeout)
     rescue => e
       logger.debug "get_single_page: #{e.inspect}"
       if (retries -= 1) > 0
@@ -69,30 +61,33 @@
       else
         json_resp = JSON.parse(text)
         items_html = json_resp['items_html'] || ''
-        logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
       end
 
       [items_html, json_resp]
     end
 
-    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+    def query_single_page(query, lang, type, pos)
       logger.info "Querying #{query}"
-      query = ERB::Util.url_encode(query)
+      encoded_query = ERB::Util.url_encode(query)
 
-      url = build_query_url(query, lang, type, pos)
+      url = build_query_url(encoded_query, lang, type, pos)
       http_request = lambda do
-        logger.debug "Scraping tweets from #{url}"
-        get_single_page(url, headers, proxies)
+        logger.debug "Scraping tweets from url=#{url}"
+        get_single_page(url)
       end
 
       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug 'Fetching tweets from cache'
+          logger.debug "Fetching tweets from cache url=#{url}"
         else
           response = http_request.call
           client.write(url, response) unless stop_requested?
         end
+        if @queries && query == @queries.last && pos.nil?
+          logger.debug "Delete a cache query=#{query}"
+          client.delete(url)
+        end
       else
         response = http_request.call
       end
@@ -100,6 +95,12 @@
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
+      if json_resp && json_resp['message']
+        logger.warn json_resp['message'] # Sorry, you are rate limited.
+        @stop_requested = true
+        Cache.new.delete(url) if cache_enabled?
+      end
+
       tweets = Tweet.from_html(html)
 
       if tweets.empty?
@@ -108,7 +109,7 @@
 
       if json_resp
         [tweets, json_resp['min_position']]
-      elsif type
+      elsif type.user?
         [tweets, tweets[-1].tweet_id]
       else
         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
@@ -140,19 +141,33 @@
           raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
-
-      if end_date
-        today = Date.today
-        if end_date > Date.today
-          raise Error.new(":end_date must be less than or equal to today(#{today})")
-        end
-      end
     end
 
-    def build_queries(query, start_date, end_date)
+    def build_queries(query, start_date, end_date, threads_granularity)
       if start_date && end_date
-        date_range = start_date.upto(end_date - 1)
-        date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+        if threads_granularity == 'auto'
+          threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+        end
+
+        if threads_granularity == 'day'
+          date_range = start_date.upto(end_date - 1)
+          queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+        elsif threads_granularity == 'hour'
+          time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+          end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+          queries = []
+
+          while true
+            if time < Time.now.utc
+              queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
+            end
+            time += 3600
+            break if time >= end_time
+          end
+        end
+
+        @queries = queries
+
       elsif start_date
         [query + " since:#{start_date}"]
       elsif end_date
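
For illustration, `build_queries('ruby', Date.parse('2020-07-01'), Date.parse('2020-07-03'), 'auto')` resolves `auto` to `hour` (2 days < 28) and yields 48 hourly sub-queries:

    # "ruby since:2020-07-01_00:00:00_UTC until:2020-07-01_01:00:00_UTC"
    # "ruby since:2020-07-01_01:00:00_UTC until:2020-07-01_02:00:00_UTC"
    # ...
    # "ruby since:2020-07-02_23:00:00_UTC until:2020-07-03_00:00:00_UTC"

With `day` granularity the same range yields two sub-queries, `since:2020-07-01 until:2020-07-02` and `since:2020-07-02 until:2020-07-03`. Each sub-query becomes a unit of work for one thread.
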
@@ -162,93 +177,103 @@
       end
     end
 
-    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit)
       pos = nil
-      daily_tweets = []
+      tmp_tweets = []
 
       while true
-        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos)
         unless new_tweets.empty?
-          daily_tweets.concat(new_tweets)
-          daily_tweets.uniq! { |t| t.tweet_id }
+          tmp_tweets.concat(new_tweets)
+          tmp_tweets.uniq! { |t| t.tweet_id }
+        end
 
-          @mutex.synchronize {
-            @all_tweets.concat(new_tweets)
-            @all_tweets.uniq! { |t| t.tweet_id }
-          }
+        @results_counter[Parallel.worker_number] = tmp_tweets.size
+        total_size = @all_tweets.size + @results_counter.values.sum
+        logger.info "Got tweets new=#{new_tweets.size} tmp=#{tmp_tweets.size} all=#{@all_tweets.size} total=#{total_size}"
+
+        if !@stop_requested && total_size >= limit
+          logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{total_size}"
+          @stop_requested = true
         end
-        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
-        break if daily_limit && daily_tweets.size >= daily_limit
+        break if @stop_requested
+        break if daily_limit && tmp_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if !@stop_requested && @all_tweets.size >= limit
-        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
-        @stop_requested = true
-      end
+      tmp_tweets
     end
 
     def stop_requested?
       @stop_requested
     end
 
-    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
-      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
-      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
-      queries = build_queries(query, start_date, end_date)
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      type = Type.new(type)
+      if type.search?
+        start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+        end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+      elsif type.user?
+        start_date = nil
+        end_date = nil
+      end
+
+      queries = build_queries(query, start_date, end_date, threads_granularity)
       if threads > queries.size
-        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
         threads = queries.size
       end
-      if proxy_enabled?
-        proxies = Proxy::Pool.new
-        logger.debug "Fetch #{proxies.size} proxies"
-      else
-        proxies = []
-        logger.debug 'Proxy disabled'
-      end
       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-
       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
 
+      logger.info "The number of queries #{queries.size}"
       logger.info "The number of threads #{threads}"
 
-      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info "Headers #{headers}"
-
       @all_tweets = []
-      @mutex = Mutex.new
       @stop_requested = false
+      @results_counter = {}
 
       if threads > 1
+        @mutex = Mutex.new
         Thread.abort_on_exception = true
         logger.debug "Set 'Thread.abort_on_exception' to true"
 
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          @results_counter[Parallel.worker_number] = 0
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @mutex.synchronize {
+            @all_tweets.concat(tmp_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+          @results_counter[Parallel.worker_number] = 0
+
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          tmp_tweets = main_loop(query, lang, type, limit, daily_limit)
+          @all_tweets.concat(tmp_tweets)
+          @all_tweets.uniq! { |t| t.tweet_id }
+
           break if stop_requested?
         end
       end
 
+      logger.info "Return #{@all_tweets.size} tweets"
+
       @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
     end
 
-    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
-      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
     end
 
     def user_timeline(screen_name, limit: 100, order: 'desc')
-      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
     end
   end
 end
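
Putting the pieces together, a minimal end-to-end sketch of the reworked pipeline (query and dates are illustrative, and assume the range is in the past):

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    tweets = client.query_tweets('ruby', start_date: '2020-07-01', end_date: '2020-07-03',
                                 limit: 100, threads: 10, threads_granularity: 'auto')
    # Type.new('search') -> build_queries (48 hourly sub-queries) -> Parallel.each over 10 threads;
    # each main_loop returns its tmp_tweets, merged under @mutex, deduped by tweet_id, then sorted.
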
data/lib/twitterscraper/template.rb CHANGED
@@ -1,48 +1,57 @@
 module Twitterscraper
-  module Template
-    module_function
+  class Template
+    def tweets_embedded_html(name, tweets, options)
+      path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
+      template = ERB.new(File.read(path))
 
-    def tweets_embedded_html(tweets)
-      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
-      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+      tweets = tweets.sort_by { |t| t.created_at.to_i }
+
+      template.result_with_hash(
+        chart_name: name,
+        chart_data: chart_data(tweets).to_json,
+        first_tweet: tweets[0],
+        last_tweet: tweets[-1],
+        tweets: tweets,
+        convert_limit: 30,
+      )
     end
 
-    EMBED_TWEET_HTML = <<~'HTML'
-      <blockquote class="twitter-tweet">
-        <a href="__TWEET_URL__"></a>
-      </blockquote>
-    HTML
-
-    EMBED_TWEETS_HTML = <<~'HTML'
-      <html>
-        <head>
-          <style type=text/css>
-            .twitter-tweet {
-              margin: 30px auto 0 auto !important;
-            }
-          </style>
-          <script>
-            window.twttr = (function(d, s, id) {
-              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
-              if (d.getElementById(id)) return t;
-              js = d.createElement(s);
-              js.id = id;
-              js.src = "https://platform.twitter.com/widgets.js";
-              fjs.parentNode.insertBefore(js, fjs);
-
-              t._e = [];
-              t.ready = function(f) {
-                t._e.push(f);
-              };
-
-              return t;
-            }(document, "script", "twitter-wjs"));
-          </script>
-        </head>
-        <body>
-          __TWEETS__
-        </body>
-      </html>
-    HTML
+    def chart_data(tweets, grouping: 'auto')
+      if grouping && tweets.size > 100
+        if grouping == 'auto'
+          month = 28 * 24 * 60 * 60 # 28 days
+          duration = tweets[-1].created_at - tweets[0].created_at
+
+          if duration > 3 * month
+            grouping = 'day'
+          elsif duration > month || tweets.size > 10000
+            grouping = 'hour'
+          else
+            grouping = 'minute'
+          end
+        end
+      end
+
+      Twitterscraper.logger.info "Chart grouping #{grouping}"
+
+      data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
+        t = tweet.created_at
+
+        if grouping == 'day'
+          time = Time.new(t.year, t.month, t.day, 0, 0, 0, '+00:00')
+        elsif grouping == 'hour'
+          time = Time.new(t.year, t.month, t.day, t.hour, 0, 0, '+00:00')
+        elsif grouping == 'minute'
+          time = Time.new(t.year, t.month, t.day, t.hour, t.min, 0, '+00:00')
+        else
+          time = t
+        end
+        memo[time.to_i] += 1
+      end
+
+      data.sort_by { |k, _| k }.map do |timestamp, count|
+        [timestamp * 1000, count]
+      end
+    end
   end
 end
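
`chart_data` buckets tweets into a Highcharts-ready series of `[epoch_milliseconds, count]` pairs, picking the bucket size from the overall time span when `grouping` is `auto`. A sketch of the output shape (values are illustrative):

    Twitterscraper::Template.new.chart_data(tweets).first(2)
    # => [[1593561600000, 12], [1593565200000, 31]]  # one pair per day/hour/minute bucket
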
data/lib/twitterscraper/template/tweets.html.erb ADDED
@@ -0,0 +1,112 @@
+<!DOCTYPE html>
+<html lang="ja">
+<head>
+  <meta charset="UTF-8">
+
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
+  <script src="https://code.highcharts.com/stock/highstock.js"></script>
+  <script>
+    function updateTweets() {
+      window.twttr = (function (d, s, id) {
+        var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+        if (d.getElementById(id)) return t;
+        js = d.createElement(s);
+        js.id = id;
+        js.src = "https://platform.twitter.com/widgets.js";
+        fjs.parentNode.insertBefore(js, fjs);
+
+        t._e = [];
+        t.ready = function (f) {
+          t._e.push(f);
+        };
+
+        return t;
+      }(document, "script", "twitter-wjs"));
+    }
+
+    function drawChart() {
+      Highcharts.setOptions({
+        time: {
+          timezone: moment.tz.guess()
+        }
+      });
+
+      var data = <%= chart_data %>;
+      var config = {
+        title: {
+          text: '<%= tweets.size %> tweets of <%= chart_name %>'
+        },
+        subtitle: {
+          text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
+        },
+        series: [{
+          data: data
+        }],
+        rangeSelector: {enabled: false},
+        scrollbar: {enabled: false},
+        navigator: {enabled: false},
+        exporting: {enabled: false},
+        credits: {enabled: false}
+      };
+
+      Highcharts.stockChart('chart-container', config);
+    }
+
+    document.addEventListener("DOMContentLoaded", function () {
+      drawChart();
+      updateTweets();
+    });
+  </script>
+
+  <style type=text/css>
+    #chart-container {
+      max-width: 1200px;
+      height: 675px;
+      margin: 0 auto;
+      border: 1px solid rgb(204, 214, 221);
+      display: flex;
+      justify-content: center;
+      align-items: center;
+    }
+    .tweets-container {
+      max-width: 550px;
+      margin: 0 auto 0 auto;
+    }
+
+    .twitter-tweet {
+      margin: 15px 0 15px 0 !important;
+    }
+  </style>
+</head>
+<body>
+<div id="chart-container"><div style="color: gray;">Loading...</div></div>
+
+<div class="tweets-container">
+  <% tweets.sort_by { |t| -t.created_at.to_i }.take(1000).each.with_index do |tweet, i| %>
+    <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+    <% if i < convert_limit %>
+      <blockquote class="twitter-tweet">
+    <% else %>
+      <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+    <% end %>
+
+    <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+      <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+      <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+      <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+    </div>
+
+    <div><%= tweet.text %></div>
+    <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
+
+    <% if i < convert_limit %>
+      </blockquote>
+    <% else %>
+      </div>
+    <% end %>
+  <% end %>
+</div>
+
+</body>
+</html>
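
Note the `convert_limit` of 30 passed in from Template above: only the newest 30 tweets are emitted as `<blockquote class="twitter-tweet">`, which widgets.js upgrades to fully embedded tweets, while the rest (up to 1000) are rendered as plain bordered `<div>`s built from the scraped fields, presumably to keep the page light.
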
data/lib/twitterscraper/tweet.rb CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
       :screen_name,
       :name,
       :user_id,
+      :profile_image_url,
       :tweet_id,
       :text,
       :links,
@@ -51,6 +52,11 @@ module Twitterscraper
       end
     end
 
+    # .js-stream-item
+    #   .js-stream-tweet{data: {screen-name:, tweet-id:}}
+    #     .stream-item-header
+    #     .js-tweet-text-container
+    #     .stream-item-footer
     def from_html(text)
       html = Nokogiri::HTML(text)
       from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
       end
 
       inner_html = Nokogiri::HTML(html.inner_html)
+
+      profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
       text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
       links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
       image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
         screen_name: screen_name,
         name: html.attr('data-name'),
         user_id: html.attr('data-user-id').to_i,
+        profile_image_url: profile_image_url,
         tweet_id: tweet_id,
         text: text,
         links: links,
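
The avatar extraction strips the `_bigger` size suffix from the scraped `src`. A quick sketch of the effect (URL is illustrative):

    src = 'https://pbs.twimg.com/profile_images/123/abc_bigger.png'
    src.gsub(/_bigger/, '') # => "https://pbs.twimg.com/profile_images/123/abc.png"
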
data/lib/twitterscraper/type.rb ADDED
@@ -0,0 +1,19 @@
+module Twitterscraper
+  class Type
+    def initialize(value)
+      @value = value
+    end
+
+    def search?
+      @value == 'search'
+    end
+
+    def user?
+      @value == 'user'
+    end
+
+    def to_s
+      @value
+    end
+  end
+end
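
`Type` replaces the raw string comparisons query.rb used before (`type == 'user'`); an unknown value now simply answers false to both predicates:

    type = Twitterscraper::Type.new('user')
    type.user?     # => true
    type.search?   # => false
    "type=#{type}" # => "type=user", via to_s
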
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.15.1'
+  VERSION = '0.19.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.15.1
+  version: 0.19.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-17 00:00:00.000000000 Z
+date: 2020-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,7 +72,9 @@ files:
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
 - lib/twitterscraper/template.rb
+- lib/twitterscraper/template/tweets.html.erb
 - lib/twitterscraper/tweet.rb
+- lib/twitterscraper/type.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec
 homepage: https://github.com/ts-3156/twitterscraper-ruby