twitterscraper-ruby 0.16.0 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 66dda5275a9067d328f6637f127895ded954534d304e5e4b349f286a271a08d8
4
- data.tar.gz: 6c3ffb3fba82376fc2de49514245ea96c7cb4fa16c32dcd2fff1ab1ae327bd14
3
+ metadata.gz: ac0c10b18d836983cc6b73e25b9ed333af2f620106a07c6bc6a40058fb127895
4
+ data.tar.gz: e6fc18219d9127fb30ba57e39dc4656c0f0a3c108428d959de5bac9e7d317088
5
5
  SHA512:
6
- metadata.gz: 24267284f4f29adc86d5bbe70a30bbe31d6d898546576065f1a9accafc3944a352117bbf6eb0de273743a00fb2d26c5cf37ed016cc0324187a25ca279230d812
7
- data.tar.gz: 0bc9f01659560c83b0289bf63119849135b7ec27520dd03c7abd645da99ef660ca4b5fd12301b359cd5cc45a82914d7ceae88ad93ad756fde166718b3d0fe6c2
6
+ metadata.gz: 90cbf06b606878dc36b4bba44669139c273bf03b08a777ad87036834841bcb4b052e0559813dc56e4be124442abfc5a7fc44c5c9524c74929ca02b1d287d346b
7
+ data.tar.gz: ada0b74ee42ff62964b73ad9b49358227cdaf4fc87420cf12cf65af95168ad9775615a504345ebc83d3b791e9c0d892691c55bc477eddd647b3e8934f752fb9c
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.16.0)
4
+ twitterscraper-ruby (0.17.0)
5
5
  nokogiri
6
6
  parallel
7
7
 
data/README.md CHANGED
@@ -98,6 +98,7 @@ end
98
98
  "screen_name": "@name",
99
99
  "name": "Name",
100
100
  "user_id": 12340000,
101
+ "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
101
102
  "tweet_id": 1234000000000000,
102
103
  "text": "Thanks Twitter!",
103
104
  "links": [],
@@ -122,6 +123,7 @@ end
122
123
  - screen_name
123
124
  - name
124
125
  - user_id
126
+ - profile_image_url
125
127
  - tweet_id
126
128
  - text
127
129
  - links
@@ -173,6 +175,7 @@ Search operators documentation is in [Standard search operators](https://develop
173
175
  | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
174
176
  | `--order` | string | Sort order of the results. | desc(default) or asc |
175
177
  | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
178
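+ | `--threads_granularity` | string | Granularity of the queries a date-range search is split into when both a start date and an end date are given: day, hour, or auto (day for ranges of 28 days or more, otherwise hour). | auto |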
176
179
  | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
177
180
  | `--cache` | boolean | Enable caching. | true(default) or false |
178
181
  | `--format` | string | The format of the output. | json(default) or html |
@@ -24,6 +24,7 @@ module Twitterscraper
24
24
  daily_limit: options['daily_limit'],
25
25
  order: options['order'],
26
26
  threads: options['threads'],
27
+ threads_granularity: options['threads_granularity'],
27
28
  }
28
29
  client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
29
30
  tweets = client.query_tweets(options['query'], query_options)
@@ -69,6 +70,7 @@ module Twitterscraper
69
70
  'daily_limit:',
70
71
  'order:',
71
72
  'threads:',
73
+ 'threads_granularity:',
72
74
  'output:',
73
75
  'format:',
74
76
  'cache:',
@@ -82,10 +84,11 @@ module Twitterscraper
82
84
  options['lang'] ||= ''
83
85
  options['limit'] = (options['limit'] || 100).to_i
84
86
  options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
85
- options['threads'] = (options['threads'] || 2).to_i
87
+ options['threads'] = (options['threads'] || 10).to_i
88
+ options['threads_granularity'] ||= 'auto'
86
89
  options['format'] ||= 'json'
87
90
  options['order'] ||= 'desc'
88
- options['output'] ||= "tweets.#{options['format']}"
91
+ options['output'] ||= build_output_name(options)
89
92
 
90
93
  options['cache'] = options['cache'] != 'false'
91
94
  options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +96,12 @@ module Twitterscraper
93
96
  options
94
97
  end
95
98
 
99
+ def build_output_name(options)
100
+ query = ERB::Util.url_encode(options['query'])
101
+ date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
102
+ [options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format']
103
+ end
104
+
96
105
  def initialize_logger
97
106
  Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
98
107
  end
@@ -80,14 +80,14 @@ module Twitterscraper
80
80
 
81
81
  url = build_query_url(query, lang, type, pos)
82
82
  http_request = lambda do
83
- logger.debug "Scraping tweets from #{url}"
83
+ logger.debug "Scraping tweets from url=#{url}"
84
84
  get_single_page(url, headers, proxies)
85
85
  end
86
86
 
87
87
  if cache_enabled?
88
88
  client = Cache.new
89
89
  if (response = client.read(url))
90
- logger.debug 'Fetching tweets from cache'
90
+ logger.debug "Fetching tweets from cache url=#{url}"
91
91
  else
92
92
  response = http_request.call
93
93
  client.write(url, response) unless stop_requested?
@@ -147,21 +147,27 @@ module Twitterscraper
147
147
  end
148
148
  end
149
149
 
150
- def build_queries(query, start_date, end_date)
150
+ def build_queries(query, start_date, end_date, threads_granularity)
151
151
  if start_date && end_date
152
- # date_range = start_date.upto(end_date - 1)
153
- # date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
154
-
155
- queries = []
156
- time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
157
- end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
152
+ if threads_granularity == 'auto'
153
+ threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
154
+ end
158
155
 
159
- while true
160
- if time < Time.now.utc
161
- queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
156
+ if threads_granularity == 'day'
157
+ date_range = start_date.upto(end_date - 1)
158
+ queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
159
+ else
160
+ time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
161
+ end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
162
+ queries = []
163
+
164
+ while true
165
+ if time < Time.now.utc
166
+ queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
167
+ end
168
+ time += 3600
169
+ break if time >= end_time
162
170
  end
163
- time += 3600
164
- break if time >= end_time
165
171
  end
166
172
 
167
173
  queries
@@ -209,11 +215,17 @@ module Twitterscraper
209
215
  @stop_requested
210
216
  end
211
217
 
212
- def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
213
- start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
214
- end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
215
- queries = build_queries(query, start_date, end_date)
218
+ def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
216
219
  type = Type.new(type)
220
+ if type.search?
221
+ start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
222
+ end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
223
+ elsif type.user?
224
+ start_date = nil
225
+ end_date = nil
226
+ end
227
+
228
+ queries = build_queries(query, start_date, end_date, threads_granularity)
217
229
  if threads > queries.size
218
230
  threads = queries.size
219
231
  end
@@ -228,6 +240,7 @@ module Twitterscraper
228
240
 
229
241
  validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
230
242
 
243
+ logger.info "The number of queries #{queries.size}"
231
244
  logger.info "The number of threads #{threads}"
232
245
 
233
246
  headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
@@ -252,15 +265,17 @@ module Twitterscraper
252
265
  end
253
266
  end
254
267
 
268
+ logger.info "Return #{@all_tweets.size} tweets"
269
+
255
270
  @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
256
271
  end
257
272
 
258
- def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
259
- query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
273
+ def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
274
+ query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
260
275
  end
261
276
 
262
277
  def user_timeline(screen_name, limit: 100, order: 'desc')
263
- query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
278
+ query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
264
279
  end
265
280
  end
266
281
  end
@@ -9,8 +9,8 @@ module Twitterscraper
9
9
  chart_data: chart_data(tweets).to_json,
10
10
  first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
11
11
  last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
12
- tweets_size: tweets.size,
13
- tweets: tweets.take(50)
12
+ tweets: tweets,
13
+ convert_limit: 30,
14
14
  )
15
15
  end
16
16
 
@@ -1,28 +1,29 @@
1
1
  <html>
2
2
  <head>
3
- <script>
4
- window.twttr = (function (d, s, id) {
5
- var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
6
- if (d.getElementById(id)) return t;
7
- js = d.createElement(s);
8
- js.id = id;
9
- js.src = "https://platform.twitter.com/widgets.js";
10
- fjs.parentNode.insertBefore(js, fjs);
11
-
12
- t._e = [];
13
- t.ready = function (f) {
14
- t._e.push(f);
15
- };
16
-
17
- return t;
18
- }(document, "script", "twitter-wjs"));
19
- </script>
20
-
21
3
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
22
4
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
23
5
  <script src="https://code.highcharts.com/stock/highstock.js"></script>
24
6
  <script>
7
+ function updateTweets() {
8
+ window.twttr = (function (d, s, id) {
9
+ var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
10
+ if (d.getElementById(id)) return t;
11
+ js = d.createElement(s);
12
+ js.id = id;
13
+ js.src = "https://platform.twitter.com/widgets.js";
14
+ fjs.parentNode.insertBefore(js, fjs);
15
+
16
+ t._e = [];
17
+ t.ready = function (f) {
18
+ t._e.push(f);
19
+ };
20
+
21
+ return t;
22
+ }(document, "script", "twitter-wjs"));
23
+ }
24
+
25
25
  function drawChart() {
26
+ var data = <%= chart_data %>;
26
27
  Highcharts.setOptions({
27
28
  time: {
28
29
  timezone: moment.tz.guess()
@@ -31,13 +32,13 @@
31
32
 
32
33
  Highcharts.stockChart('chart', {
33
34
  title: {
34
- text: '<%= tweets_size %> tweets of <%= chart_name %>'
35
+ text: '<%= tweets.size %> tweets of <%= chart_name %>'
35
36
  },
36
37
  subtitle: {
37
- text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
38
+ text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
38
39
  },
39
40
  series: [{
40
- data: <%= chart_data %>
41
+ data: data
41
42
  }],
42
43
  rangeSelector: {enabled: false},
43
44
  scrollbar: {enabled: false},
@@ -49,6 +50,7 @@
49
50
 
50
51
  document.addEventListener("DOMContentLoaded", function () {
51
52
  drawChart();
53
+ updateTweets();
52
54
  });
53
55
  </script>
54
56
 
@@ -64,17 +66,31 @@
64
66
  </style>
65
67
  </head>
66
68
  <body>
67
- <div id="chart"></div>
69
+ <div id="chart" style="width: 100vw; height: 400px;"></div>
68
70
 
69
71
  <div class="tweets-container">
70
- <% tweets.each do |tweet| %>
71
- <blockquote class="twitter-tweet">
72
- <a href="<%= tweet.tweet_url %>"></a>
73
- </blockquote>
74
- <% end %>
72
+ <% tweets.each.with_index do |tweet, i| %>
73
+ <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
74
+ <% if i < convert_limit %>
75
+ <blockquote class="twitter-tweet">
76
+ <% else %>
77
+ <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
78
+ <% end %>
79
+
80
+ <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
81
+ <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
82
+ <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
83
+ <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
84
+ </div>
85
+
86
+ <div><%= tweet.text %></div>
87
+ <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
75
88
 
76
- <% if tweets_size > tweets.size %>
77
- <div>and more!</div>
89
+ <% if i < convert_limit %>
90
+ </blockquote>
91
+ <% else %>
92
+ </div>
93
+ <% end %>
78
94
  <% end %>
79
95
  </div>
80
96
 
@@ -6,6 +6,7 @@ module Twitterscraper
6
6
  :screen_name,
7
7
  :name,
8
8
  :user_id,
9
+ :profile_image_url,
9
10
  :tweet_id,
10
11
  :text,
11
12
  :links,
@@ -51,6 +52,11 @@ module Twitterscraper
51
52
  end
52
53
  end
53
54
 
55
+ # .js-stream-item
56
+ # .js-stream-tweet{data: {screen-name:, tweet-id:}}
57
+ # .stream-item-header
58
+ # .js-tweet-text-container
59
+ # .stream-item-footer
54
60
  def from_html(text)
55
61
  html = Nokogiri::HTML(text)
56
62
  from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
72
78
  end
73
79
 
74
80
  inner_html = Nokogiri::HTML(html.inner_html)
81
+
82
+ profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
75
83
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
76
84
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
77
85
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
99
107
  screen_name: screen_name,
100
108
  name: html.attr('data-name'),
101
109
  user_id: html.attr('data-user-id').to_i,
110
+ profile_image_url: profile_image_url,
102
111
  tweet_id: tweet_id,
103
112
  text: text,
104
113
  links: links,
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = '0.16.0'
2
+ VERSION = '0.17.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.0
4
+ version: 0.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156