twitterscraper-ruby 0.16.0 → 0.17.0

This diff shows the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 66dda5275a9067d328f6637f127895ded954534d304e5e4b349f286a271a08d8
- data.tar.gz: 6c3ffb3fba82376fc2de49514245ea96c7cb4fa16c32dcd2fff1ab1ae327bd14
+ metadata.gz: ac0c10b18d836983cc6b73e25b9ed333af2f620106a07c6bc6a40058fb127895
+ data.tar.gz: e6fc18219d9127fb30ba57e39dc4656c0f0a3c108428d959de5bac9e7d317088
  SHA512:
- metadata.gz: 24267284f4f29adc86d5bbe70a30bbe31d6d898546576065f1a9accafc3944a352117bbf6eb0de273743a00fb2d26c5cf37ed016cc0324187a25ca279230d812
- data.tar.gz: 0bc9f01659560c83b0289bf63119849135b7ec27520dd03c7abd645da99ef660ca4b5fd12301b359cd5cc45a82914d7ceae88ad93ad756fde166718b3d0fe6c2
+ metadata.gz: 90cbf06b606878dc36b4bba44669139c273bf03b08a777ad87036834841bcb4b052e0559813dc56e4be124442abfc5a7fc44c5c9524c74929ca02b1d287d346b
+ data.tar.gz: ada0b74ee42ff62964b73ad9b49358227cdaf4fc87420cf12cf65af95168ad9775615a504345ebc83d3b791e9c0d892691c55bc477eddd647b3e8934f752fb9c
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- twitterscraper-ruby (0.16.0)
+ twitterscraper-ruby (0.17.0)
  nokogiri
  parallel
 
data/README.md CHANGED
@@ -98,6 +98,7 @@ end
  "screen_name": "@name",
  "name": "Name",
  "user_id": 12340000,
+ "profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
  "tweet_id": 1234000000000000,
  "text": "Thanks Twitter!",
  "links": [],
@@ -122,6 +123,7 @@ end
  - screen_name
  - name
  - user_id
+ - profile_image_url
  - tweet_id
  - text
  - links
@@ -173,6 +175,7 @@ Search operators documentation is in [Standard search operators](https://develop
  | `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
  | `--order` | string | Sort a order of the results. | desc(default) or asc |
  | `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
+ | `--threads_granularity` | string | | auto |
  | `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
  | `--cache` | boolean | Enable caching. | true(default) or false |
  | `--format` | string | The format of the output. | json(default) or html |
@@ -24,6 +24,7 @@ module Twitterscraper
  daily_limit: options['daily_limit'],
  order: options['order'],
  threads: options['threads'],
+ threads_granularity: options['threads_granularity'],
  }
  client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
  tweets = client.query_tweets(options['query'], query_options)
@@ -69,6 +70,7 @@ module Twitterscraper
  'daily_limit:',
  'order:',
  'threads:',
+ 'threads_granularity:',
  'output:',
  'format:',
  'cache:',
@@ -82,10 +84,11 @@ module Twitterscraper
  options['lang'] ||= ''
  options['limit'] = (options['limit'] || 100).to_i
  options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
- options['threads'] = (options['threads'] || 2).to_i
+ options['threads'] = (options['threads'] || 10).to_i
+ options['threads_granularity'] ||= 'auto'
  options['format'] ||= 'json'
  options['order'] ||= 'desc'
- options['output'] ||= "tweets.#{options['format']}"
+ options['output'] ||= build_output_name(options)
 
  options['cache'] = options['cache'] != 'false'
  options['proxy'] = options['proxy'] != 'false'
@@ -93,6 +96,12 @@ module Twitterscraper
  options
  end
 
+ def build_output_name(options)
+ query = ERB::Util.url_encode(options['query'])
+ date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
+ [options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format']
+ end
+
  def initialize_logger
  Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
  end
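For context, the hunk above means the default output file name is now derived from the query type, the date range, and the URL-encoded query instead of the fixed `tweets.FORMAT`. A minimal sketch of the expected result, using illustrative option values rather than anything from this diff:

```ruby
# Illustrative only: mirrors the build_output_name logic shown in the hunk above.
require 'erb'

options = {
  'type' => 'search', 'query' => 'ruby lang', 'format' => 'json',
  'start_date' => '2020-07-01', 'end_date' => '2020-07-10'
}

query = ERB::Util.url_encode(options['query'])
date  = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
puts [options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format']
# => search_tweets_2020-07-01_2020-07-10_ruby%20lang.json
```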
@@ -80,14 +80,14 @@ module Twitterscraper
 
  url = build_query_url(query, lang, type, pos)
  http_request = lambda do
- logger.debug "Scraping tweets from #{url}"
+ logger.debug "Scraping tweets from url=#{url}"
  get_single_page(url, headers, proxies)
  end
 
  if cache_enabled?
  client = Cache.new
  if (response = client.read(url))
- logger.debug 'Fetching tweets from cache'
+ logger.debug "Fetching tweets from cache url=#{url}"
  else
  response = http_request.call
  client.write(url, response) unless stop_requested?
@@ -147,21 +147,27 @@ module Twitterscraper
  end
  end
 
- def build_queries(query, start_date, end_date)
+ def build_queries(query, start_date, end_date, threads_granularity)
  if start_date && end_date
- # date_range = start_date.upto(end_date - 1)
- # date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
-
- queries = []
- time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
- end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+ if threads_granularity == 'auto'
+ threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
+ end
 
- while true
- if time < Time.now.utc
- queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
+ if threads_granularity == 'day'
+ date_range = start_date.upto(end_date - 1)
+ queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+ else
+ time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
+ end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
+ queries = []
+
+ while true
+ if time < Time.now.utc
+ queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
+ end
+ time += 3600
+ break if time >= end_time
  end
- time += 3600
- break if time >= end_time
  end
 
  queries
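A quick sketch of what the new branching produces: under the `'auto'` rule a range of 28 or more days is split into daily chunks, anything shorter into hourly chunks. The keyword and dates below are illustrative, not taken from this diff:

```ruby
# Illustrative only: reproduces the 'day' branch shown above for a hypothetical keyword.
require 'date'

query      = 'KEYWORD'
start_date = Date.parse('2020-07-01')
end_date   = Date.parse('2020-07-03')

queries = start_date.upto(end_date - 1).map { |date| query + " since:#{date} until:#{date + 1}" }
# => ["KEYWORD since:2020-07-01 until:2020-07-02",
#     "KEYWORD since:2020-07-02 until:2020-07-03"]
# The 'hour' branch instead emits 3600-second windows such as
# "KEYWORD since:2020-07-01_00:00:00_UTC until:2020-07-01_01:00:00_UTC".
```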
@@ -209,11 +215,17 @@ module Twitterscraper
  @stop_requested
  end
 
- def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
- start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
- end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
- queries = build_queries(query, start_date, end_date)
+ def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
  type = Type.new(type)
+ if type.search?
+ start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+ end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+ elsif type.user?
+ start_date = nil
+ end_date = nil
+ end
+
+ queries = build_queries(query, start_date, end_date, threads_granularity)
  if threads > queries.size
  threads = queries.size
  end
@@ -228,6 +240,7 @@ module Twitterscraper
 
  validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
 
+ logger.info "The number of queries #{queries.size}"
  logger.info "The number of threads #{threads}"
 
  headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
@@ -252,15 +265,17 @@ module Twitterscraper
  end
  end
 
+ logger.info "Return #{@all_tweets.size} tweets"
+
  @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
  end
 
- def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
- query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+ def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
+ query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
  end
 
  def user_timeline(screen_name, limit: 100, order: 'desc')
- query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+ query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
  end
  end
  end
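The new keyword is also exposed on the public `search` helper shown above. A minimal usage sketch, assuming the constructor arguments from the cli.rb hunk earlier in this diff; the require name, query, and dates are assumptions for illustration:

```ruby
# Minimal sketch; the require name is assumed from the gem name.
require 'twitterscraper-ruby'

client = Twitterscraper::Client.new(cache: true, proxy: true)
tweets = client.search(
  'KEYWORD',                    # illustrative query
  start_date: '2020-07-01',     # parsed with Date.parse when type is search
  end_date: '2020-07-10',
  limit: 200,
  threads: 10,
  threads_granularity: 'day'    # or 'hour'; defaults to 'auto'
)
puts tweets.size
```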
@@ -9,8 +9,8 @@ module Twitterscraper
  chart_data: chart_data(tweets).to_json,
  first_tweet: tweets.sort_by { |t| t.created_at.to_i }[0],
  last_tweet: tweets.sort_by { |t| t.created_at.to_i }[-1],
- tweets_size: tweets.size,
- tweets: tweets.take(50)
+ tweets: tweets,
+ convert_limit: 30,
  )
  end
 
@@ -1,28 +1,29 @@
  <html>
  <head>
- <script>
- window.twttr = (function (d, s, id) {
- var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
- if (d.getElementById(id)) return t;
- js = d.createElement(s);
- js.id = id;
- js.src = "https://platform.twitter.com/widgets.js";
- fjs.parentNode.insertBefore(js, fjs);
-
- t._e = [];
- t.ready = function (f) {
- t._e.push(f);
- };
-
- return t;
- }(document, "script", "twitter-wjs"));
- </script>
-
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
  <script src="https://code.highcharts.com/stock/highstock.js"></script>
  <script>
+ function updateTweets() {
+ window.twttr = (function (d, s, id) {
+ var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+ if (d.getElementById(id)) return t;
+ js = d.createElement(s);
+ js.id = id;
+ js.src = "https://platform.twitter.com/widgets.js";
+ fjs.parentNode.insertBefore(js, fjs);
+
+ t._e = [];
+ t.ready = function (f) {
+ t._e.push(f);
+ };
+
+ return t;
+ }(document, "script", "twitter-wjs"));
+ }
+
  function drawChart() {
+ var data = <%= chart_data %>;
  Highcharts.setOptions({
  time: {
  timezone: moment.tz.guess()
@@ -31,13 +32,13 @@
 
  Highcharts.stockChart('chart', {
  title: {
- text: '<%= tweets_size %> tweets of <%= chart_name %>'
+ text: '<%= tweets.size %> tweets of <%= chart_name %>'
  },
  subtitle: {
- text: 'since:<%= first_tweet.created_at.localtime %> until:<%= last_tweet.created_at.localtime %>'
+ text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
  },
  series: [{
- data: <%= chart_data %>
+ data: data
  }],
  rangeSelector: {enabled: false},
  scrollbar: {enabled: false},
@@ -49,6 +50,7 @@
 
  document.addEventListener("DOMContentLoaded", function () {
  drawChart();
+ updateTweets();
  });
  </script>
 
@@ -64,17 +66,31 @@
  </style>
  </head>
  <body>
- <div id="chart"></div>
+ <div id="chart" style="width: 100vw; height: 400px;"></div>
 
  <div class="tweets-container">
- <% tweets.each do |tweet| %>
- <blockquote class="twitter-tweet">
- <a href="<%= tweet.tweet_url %>"></a>
- </blockquote>
- <% end %>
+ <% tweets.each.with_index do |tweet, i| %>
+ <% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
+ <% if i < convert_limit %>
+ <blockquote class="twitter-tweet">
+ <% else %>
+ <div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
+ <% end %>
+
+ <div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
+ <div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
+ <div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
+ <div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
+ </div>
+
+ <div><%= tweet.text %></div>
+ <div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
 
- <% if tweets_size > tweets.size %>
- <div>and more!</div>
+ <% if i < convert_limit %>
+ </blockquote>
+ <% else %>
+ </div>
+ <% end %>
  <% end %>
  </div>
 
@@ -6,6 +6,7 @@ module Twitterscraper
  :screen_name,
  :name,
  :user_id,
+ :profile_image_url,
  :tweet_id,
  :text,
  :links,
@@ -51,6 +52,11 @@ module Twitterscraper
  end
  end
 
+ # .js-stream-item
+ # .js-stream-tweet{data: {screen-name:, tweet-id:}}
+ # .stream-item-header
+ # .js-tweet-text-container
+ # .stream-item-footer
  def from_html(text)
  html = Nokogiri::HTML(text)
  from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -72,6 +78,8 @@ module Twitterscraper
  end
 
  inner_html = Nokogiri::HTML(html.inner_html)
+
+ profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -99,6 +107,7 @@ module Twitterscraper
  screen_name: screen_name,
  name: html.attr('data-name'),
  user_id: html.attr('data-user-id').to_i,
+ profile_image_url: profile_image_url,
  tweet_id: tweet_id,
  text: text,
  links: links,
@@ -1,3 +1,3 @@
  module Twitterscraper
- VERSION = '0.16.0'
+ VERSION = '0.17.0'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: twitterscraper-ruby
  version: !ruby/object:Gem::Version
- version: 0.16.0
+ version: 0.17.0
  platform: ruby
  authors:
  - ts-3156