twitterscraper-ruby 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f4382801b03a5384095aad6a955caea438787fa2eed96e3e001237df368925a2
-  data.tar.gz: 6722b4edce7242b3006e5c097dd78847f36e2da7edea009e2d7b89b09f5b25ff
+  metadata.gz: e98afb0444b724e0c9c29f6b888c017166859d1252337f34686526060ca8368d
+  data.tar.gz: 5ef3ff7f86d9a0c9dd1883d55498049d0f164aa7c71a7c9c2bbf0a89ae9bb32c
 SHA512:
-  metadata.gz: 4ca72a0bbce553c38061e0362f755a5e82b47a5288108508410c19a7eef9a2514b58682e88ed1bf89654d5b89c84c41edd8a5fa34fd7d1e5fbf92b267402884a
-  data.tar.gz: 8853b015cb37180d6814710d971a757d08aa4ddd4579af4131e204e34bb10c80ef3139c082f17be92303d9efc2e3f8eb4ba0d15bdf4f264fb4fba0cf87ed42d7
+  metadata.gz: 04ef61c57545cbbdbbe5da53d1f24cf064b7d1c61ad3da9bc57a361d24ed24480c4f68fa1fea67345ceff4d4d4685f046a4586f55ebe8f3dc0ca6332c7c2d928
+  data.tar.gz: f5fd19c8289c7caf574dc78f754ba9aaf9446f3819b394d14414909b1505e0f9b25181802448d28285de8db81a27e12e0e65d1e1a0b2b0e5df8e7e73d6263e14
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.11.0)
+    twitterscraper-ruby (0.12.0)
       nokogiri
       parallel
 
data/README.md CHANGED
@@ -137,8 +137,8 @@ $ cat tweets.json | jq . | less
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
-| `--end_date` | Set the end date which twitterscraper-ruby should use to stop scraping for your query. | |
+| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
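
The reworded rows tie both flags directly to Twitter's search operators. A minimal usage sketch of the equivalent library call (the query and dates are placeholders; `query_tweets` and its keyword arguments are taken from the `query.rb` changes further down, and the require name is assumed from the gem name):

```ruby
require 'twitterscraper-ruby' # assumed entry point for this gem

client = Twitterscraper::Client.new
# Effectively searches "ruby since:2020-06-01 until:2020-06-30".
tweets = client.query_tweets('ruby', start_date: '2020-06-01', end_date: '2020-06-30')
```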
data/lib/twitterscraper/cli.rb CHANGED
@@ -20,6 +20,7 @@ module Twitterscraper
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
         threads: options['threads'],
         proxy: options['proxy']
       }
@@ -63,6 +64,7 @@ module Twitterscraper
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
         'threads:',
         'output:',
         'format:',
@@ -72,8 +74,10 @@ module Twitterscraper
         'verbose',
       )
 
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
      options['threads'] = (options['threads'] || 2).to_i
      options['format'] ||= 'json'
      options['output'] ||= "tweets.#{options['format']}"
@@ -101,7 +105,7 @@ module Twitterscraper
     end
 
     def print_version
-      puts "twitterscraper-#{Twitterscraper::VERSION}"
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
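
Two behaviors are added to option parsing: `--start_date oldest` expands to `Query::OLDEST_DATE`, and `--daily_limit` is coerced to an integer when present. A sketch of the resulting normalization (the option hash values are hypothetical):

```ruby
# As if invoked with: --query ruby --start_date oldest --daily_limit 10
options = { 'query' => 'ruby', 'start_date' => 'oldest', 'daily_limit' => '10' }

options['start_date'] = Twitterscraper::Query::OLDEST_DATE if options['start_date'] == 'oldest'
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']

options['daily_limit'] # => 10 (Integer, not the String "10")
```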
data/lib/twitterscraper/client.rb CHANGED
@@ -2,7 +2,7 @@ module Twitterscraper
   class Client
     include Query
 
-    def initialize(cache:)
+    def initialize(cache: false)
       @cache = cache
     end
 
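With the keyword default, `cache:` becomes optional at construction time:

```ruby
client = Twitterscraper::Client.new              # 0.11.0 raised ArgumentError (missing keyword: cache)
client = Twitterscraper::Client.new(cache: true) # still supported
```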
data/lib/twitterscraper/proxy.rb CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-      item = @items[@cur_index - 1]
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end
 
     def size
       @items.size
     end
 
+    def empty?
+      @items.empty?
+    end
+
     private
 
     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end
 
-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
       proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
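
`Pool#empty?` exists so that `Pool` keeps answering the same messages as a plain `Array`: in the `query.rb` changes below, `proxies` is either a `Pool` or `[]`, and callers probe it with `empty?`, `sample`, and `size`. The per-proxy log line also moves out of the pool and into `Query#get_single_page`. A sketch of the shared contract (`use_proxy` is a stand-in flag, not a name from the gem):

```ruby
proxies = use_proxy ? Twitterscraper::Proxy::Pool.new : []
proxies.empty?  # Array and Pool both respond
proxies.sample  # Pool rotates through @items; Array picks at random
proxies.size
```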
data/lib/twitterscraper/query.rb CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-      Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info("Retrying... (Attempts left: #{retries - 1})")
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
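Exhausted retries now surface as the library's own error class with the failing URL attached, instead of re-raising the bare exception. A rescue sketch, assuming `Twitterscraper::Error` is a `StandardError` subclass as the new raise sites imply:

```ruby
client = Twitterscraper::Client.new
begin
  client.query_tweets('ruby', start_date: '2020-06-01')
rescue Twitterscraper::Error => e
  warn e.message # request failures include "url=..." context
end
```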
@@ -71,27 +75,27 @@ module Twitterscraper
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info("Querying #{query}")
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
       http_request = lambda do
-        logger.debug("Scraping tweets from #{url}")
+        logger.debug "Scraping tweets from #{url}"
         get_single_page(url, headers, proxies)
       end
 
       if cache_enabled?
         client = Cache.new
         if (response = client.read(url))
-          logger.debug('Fetching tweets from cache')
+          logger.debug 'Fetching tweets from cache'
         else
           response = http_request.call
-          client.write(url, response)
+          client.write(url, response) unless stop_requested?
         end
       else
         response = http_request.call
       end
-      return [], nil if response.nil?
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -114,31 +118,31 @@ module Twitterscraper
 
     def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -156,27 +160,32 @@ module Twitterscraper
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.info("Limit reached #{@all_tweets.size}")
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
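
Since `build_queries` appears to split a date range into one query per day (hence the `daily_` naming) and `main_loop` runs once per query, `daily_tweets` resets for each day: `daily_limit` caps tweets per day, while `limit` still caps the overall total. An illustrative call with arbitrary numbers:

```ruby
client = Twitterscraper::Client.new
tweets = client.query_tweets('ruby',
                             start_date: '2020-06-01',
                             end_date: '2020-06-04',
                             daily_limit: 50, # stop paginating a given day at ~50 tweets
                             limit: 1000)     # global cap across all days
```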
@@ -185,32 +194,36 @@ module Twitterscraper
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
       threads = queries.size if threads > queries.size
-      proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
+      proxies = proxy ? Proxy::Pool.new : []
 
       validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-      logger.info("The number of threads #{threads}")
+      logger.debug "Fetch #{proxies.size} proxies" if proxy
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info("Headers #{headers}")
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end
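
Setting `Thread.abort_on_exception = true` makes the threaded path fail fast: an unhandled exception in any worker now takes down the process instead of silently killing only its thread. Note that the flag is process-global, so it also affects threads created outside the gem. A minimal illustration of the Ruby semantics:

```ruby
Thread.abort_on_exception = true
Thread.new { raise 'boom' }
sleep 1 # never completes: 'boom' is re-raised in the main thread
```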
data/lib/twitterscraper/tweet.rb CHANGED
@@ -59,12 +59,19 @@ module Twitterscraper
     def from_tweets_html(html)
       html.map do |tweet|
         from_tweet_html(tweet)
-      end
+      end.compact
     end
 
     def from_tweet_html(html)
+      screen_name = html.attr('data-screen-name')
+      tweet_id = html.attr('data-tweet-id')&.to_i
+
+      unless html.to_s.include?('js-tweet-text-container')
+        Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+        return nil
+      end
+
       inner_html = Nokogiri::HTML(html.inner_html)
-      tweet_id = html.attr('data-tweet-id').to_i
       text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
       links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
       image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
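
`from_tweet_html` now returns `nil` for nodes that lack a tweet-text container (previously `xpath(...).first.text` would have raised `NoMethodError` on such nodes), and `from_tweets_html` compacts the `nil`s away. A usage sketch, assuming these are class-level helpers as the `new(...)` call suggests (`nodes` stands for the Nokogiri elements the scraper extracts):

```ruby
tweets = Twitterscraper::Tweet.from_tweets_html(nodes)
# Unparseable entries are logged with a warning and dropped, so `tweets` contains no nils.
```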
@@ -89,7 +96,7 @@ module Twitterscraper
 
       timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
       new(
-        screen_name: html.attr('data-screen-name'),
+        screen_name: screen_name,
         name: html.attr('data-name'),
         user_id: html.attr('data-user-id').to_i,
         tweet_id: tweet_id,
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Twitterscraper
-  VERSION = '0.11.0'
+  VERSION = '0.12.0'
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.12.0
 platform: ruby
 authors:
 - ts-3156