twitterscraper-ruby 0.11.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f4382801b03a5384095aad6a955caea438787fa2eed96e3e001237df368925a2
4
- data.tar.gz: 6722b4edce7242b3006e5c097dd78847f36e2da7edea009e2d7b89b09f5b25ff
3
+ metadata.gz: e98afb0444b724e0c9c29f6b888c017166859d1252337f34686526060ca8368d
4
+ data.tar.gz: 5ef3ff7f86d9a0c9dd1883d55498049d0f164aa7c71a7c9c2bbf0a89ae9bb32c
5
5
  SHA512:
6
- metadata.gz: 4ca72a0bbce553c38061e0362f755a5e82b47a5288108508410c19a7eef9a2514b58682e88ed1bf89654d5b89c84c41edd8a5fa34fd7d1e5fbf92b267402884a
7
- data.tar.gz: 8853b015cb37180d6814710d971a757d08aa4ddd4579af4131e204e34bb10c80ef3139c082f17be92303d9efc2e3f8eb4ba0d15bdf4f264fb4fba0cf87ed42d7
6
+ metadata.gz: 04ef61c57545cbbdbbe5da53d1f24cf064b7d1c61ad3da9bc57a361d24ed24480c4f68fa1fea67345ceff4d4d4685f046a4586f55ebe8f3dc0ca6332c7c2d928
7
+ data.tar.gz: f5fd19c8289c7caf574dc78f754ba9aaf9446f3819b394d14414909b1505e0f9b25181802448d28285de8db81a27e12e0e65d1e1a0b2b0e5df8e7e73d6263e14
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.11.0)
4
+ twitterscraper-ruby (0.12.0)
5
5
  nokogiri
6
6
  parallel
7
7
 
data/README.md CHANGED
@@ -137,8 +137,8 @@ $ cat tweets.json | jq . | less
137
137
  | ------------- | ------------- | ------------- |
138
138
  | `-h`, `--help` | This option displays a summary of twitterscraper. | |
139
139
  | `--query` | Specify a keyword used during the search. | |
140
- | `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
141
- | `--end_date` | Set the enddate which twitterscraper-ruby should use to stop scraping for your query. | |
140
+ | `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
141
+ | `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
142
142
  | `--lang` | Retrieve tweets written in a specific language. | |
143
143
  | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
144
144
  | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
@@ -20,6 +20,7 @@ module Twitterscraper
20
20
  end_date: options['end_date'],
21
21
  lang: options['lang'],
22
22
  limit: options['limit'],
23
+ daily_limit: options['daily_limit'],
23
24
  threads: options['threads'],
24
25
  proxy: options['proxy']
25
26
  }
@@ -63,6 +64,7 @@ module Twitterscraper
63
64
  'end_date:',
64
65
  'lang:',
65
66
  'limit:',
67
+ 'daily_limit:',
66
68
  'threads:',
67
69
  'output:',
68
70
  'format:',
@@ -72,8 +74,10 @@ module Twitterscraper
72
74
  'verbose',
73
75
  )
74
76
 
77
+ options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
75
78
  options['lang'] ||= ''
76
79
  options['limit'] = (options['limit'] || 100).to_i
80
+ options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
77
81
  options['threads'] = (options['threads'] || 2).to_i
78
82
  options['format'] ||= 'json'
79
83
  options['output'] ||= "tweets.#{options['format']}"
@@ -101,7 +105,7 @@ module Twitterscraper
101
105
  end
102
106
 
103
107
  def print_version
104
- puts "twitterscraper-#{Twitterscraper::VERSION}"
108
+ puts "twitterscraper-#{VERSION}"
105
109
  end
106
110
  end
107
111
  end
@@ -2,7 +2,7 @@ module Twitterscraper
2
2
  class Client
3
3
  include Query
4
4
 
5
- def initialize(cache:)
5
+ def initialize(cache: false)
6
6
  @cache = cache
7
7
  end
8
8
 
@@ -17,15 +17,17 @@ module Twitterscraper
17
17
  reload
18
18
  end
19
19
  @cur_index += 1
20
- item = @items[@cur_index - 1]
21
- Twitterscraper.logger.info("Using proxy #{item}")
22
- item
20
+ @items[@cur_index - 1]
23
21
  end
24
22
 
25
23
  def size
26
24
  @items.size
27
25
  end
28
26
 
27
+ def empty?
28
+ @items.empty?
29
+ end
30
+
29
31
  private
30
32
 
31
33
  def reload
@@ -51,7 +53,6 @@ module Twitterscraper
51
53
  proxies << ip + ':' + port
52
54
  end
53
55
 
54
- Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
55
56
  proxies.shuffle
56
57
  rescue => e
57
58
  if (retries -= 1) > 0
@@ -44,14 +44,18 @@ module Twitterscraper
44
44
 
45
45
  def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
46
46
  return nil if stop_requested?
47
- Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
47
+ unless proxies.empty?
48
+ proxy = proxies.sample
49
+ logger.info("Using proxy #{proxy}")
50
+ end
51
+ Http.get(url, headers, proxy, timeout)
48
52
  rescue => e
49
53
  logger.debug "query_single_page: #{e.inspect}"
50
54
  if (retries -= 1) > 0
51
- logger.info("Retrying... (Attempts left: #{retries - 1})")
55
+ logger.info "Retrying... (Attempts left: #{retries - 1})"
52
56
  retry
53
57
  else
54
- raise
58
+ raise Error.new("#{e.inspect} url=#{url}")
55
59
  end
56
60
  end
57
61
 
@@ -71,27 +75,27 @@ module Twitterscraper
71
75
  end
72
76
 
73
77
  def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
74
- logger.info("Querying #{query}")
78
+ logger.info "Querying #{query}"
75
79
  query = ERB::Util.url_encode(query)
76
80
 
77
81
  url = build_query_url(query, lang, pos, from_user)
78
82
  http_request = lambda do
79
- logger.debug("Scraping tweets from #{url}")
83
+ logger.debug "Scraping tweets from #{url}"
80
84
  get_single_page(url, headers, proxies)
81
85
  end
82
86
 
83
87
  if cache_enabled?
84
88
  client = Cache.new
85
89
  if (response = client.read(url))
86
- logger.debug('Fetching tweets from cache')
90
+ logger.debug 'Fetching tweets from cache'
87
91
  else
88
92
  response = http_request.call
89
- client.write(url, response)
93
+ client.write(url, response) unless stop_requested?
90
94
  end
91
95
  else
92
96
  response = http_request.call
93
97
  end
94
- return [], nil if response.nil?
98
+ return [], nil if response.nil? || response.empty?
95
99
 
96
100
  html, json_resp = parse_single_page(response, pos.nil?)
97
101
 
@@ -114,31 +118,31 @@ module Twitterscraper
114
118
 
115
119
  def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
116
120
  if query.nil? || query == ''
117
- raise 'Please specify a search query.'
121
+ raise Error.new('Please specify a search query.')
118
122
  end
119
123
 
120
124
  if ERB::Util.url_encode(query).length >= 500
121
- raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
125
+ raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
122
126
  end
123
127
 
124
128
  if start_date && end_date
125
129
  if start_date == end_date
126
- raise 'Please specify different values for :start_date and :end_date.'
130
+ raise Error.new('Please specify different values for :start_date and :end_date.')
127
131
  elsif start_date > end_date
128
- raise ':start_date must occur before :end_date.'
132
+ raise Error.new(':start_date must occur before :end_date.')
129
133
  end
130
134
  end
131
135
 
132
136
  if start_date
133
137
  if start_date < OLDEST_DATE
134
- raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
138
+ raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
135
139
  end
136
140
  end
137
141
 
138
142
  if end_date
139
143
  today = Date.today
140
144
  if end_date > Date.today
141
- raise ":end_date must be less than or equal to today(#{today})"
145
+ raise Error.new(":end_date must be less than or equal to today(#{today})")
142
146
  end
143
147
  end
144
148
  end
@@ -156,27 +160,32 @@ module Twitterscraper
156
160
  end
157
161
  end
158
162
 
159
- def main_loop(query, lang, limit, headers, proxies)
163
+ def main_loop(query, lang, limit, daily_limit, headers, proxies)
160
164
  pos = nil
165
+ daily_tweets = []
161
166
 
162
167
  while true
163
168
  new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
164
169
  unless new_tweets.empty?
170
+ daily_tweets.concat(new_tweets)
171
+ daily_tweets.uniq! { |t| t.tweet_id }
172
+
165
173
  @mutex.synchronize {
166
174
  @all_tweets.concat(new_tweets)
167
175
  @all_tweets.uniq! { |t| t.tweet_id }
168
176
  }
169
177
  end
170
- logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
178
+ logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
171
179
 
172
180
  break unless new_pos
181
+ break if daily_limit && daily_tweets.size >= daily_limit
173
182
  break if @all_tweets.size >= limit
174
183
 
175
184
  pos = new_pos
176
185
  end
177
186
 
178
- if @all_tweets.size >= limit
179
- logger.info("Limit reached #{@all_tweets.size}")
187
+ if !@stop_requested && @all_tweets.size >= limit
188
+ logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
180
189
  @stop_requested = true
181
190
  end
182
191
  end
@@ -185,32 +194,36 @@ module Twitterscraper
185
194
  @stop_requested
186
195
  end
187
196
 
188
- def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
197
+ def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
189
198
  start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
190
199
  end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
191
200
  queries = build_queries(query, start_date, end_date)
192
201
  threads = queries.size if threads > queries.size
193
- proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
202
+ proxies = proxy ? Proxy::Pool.new : []
194
203
 
195
204
  validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
196
205
 
197
- logger.info("The number of threads #{threads}")
206
+ logger.debug "Fetch #{proxies.size} proxies" if proxy
207
+ logger.info "The number of threads #{threads}"
198
208
 
199
209
  headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
200
- logger.info("Headers #{headers}")
210
+ logger.info "Headers #{headers}"
201
211
 
202
212
  @all_tweets = []
203
213
  @mutex = Mutex.new
204
214
  @stop_requested = false
205
215
 
206
216
  if threads > 1
217
+ Thread.abort_on_exception = true
218
+ logger.debug "Set 'Thread.abort_on_exception' to true"
219
+
207
220
  Parallel.each(queries, in_threads: threads) do |query|
208
- main_loop(query, lang, limit, headers, proxies)
221
+ main_loop(query, lang, limit, daily_limit, headers, proxies)
209
222
  raise Parallel::Break if stop_requested?
210
223
  end
211
224
  else
212
225
  queries.each do |query|
213
- main_loop(query, lang, limit, headers, proxies)
226
+ main_loop(query, lang, limit, daily_limit, headers, proxies)
214
227
  break if stop_requested?
215
228
  end
216
229
  end
@@ -59,12 +59,19 @@ module Twitterscraper
59
59
  def from_tweets_html(html)
60
60
  html.map do |tweet|
61
61
  from_tweet_html(tweet)
62
- end
62
+ end.compact
63
63
  end
64
64
 
65
65
  def from_tweet_html(html)
66
+ screen_name = html.attr('data-screen-name')
67
+ tweet_id = html.attr('data-tweet-id')&.to_i
68
+
69
+ unless html.to_s.include?('js-tweet-text-container')
70
+ Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
71
+ return nil
72
+ end
73
+
66
74
  inner_html = Nokogiri::HTML(html.inner_html)
67
- tweet_id = html.attr('data-tweet-id').to_i
68
75
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
69
76
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
70
77
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -89,7 +96,7 @@ module Twitterscraper
89
96
 
90
97
  timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
91
98
  new(
92
- screen_name: html.attr('data-screen-name'),
99
+ screen_name: screen_name,
93
100
  name: html.attr('data-name'),
94
101
  user_id: html.attr('data-user-id').to_i,
95
102
  tweet_id: tweet_id,
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = '0.11.0'
2
+ VERSION = '0.12.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156