twitterscraper-ruby 0.15.0

@@ -0,0 +1 @@
+ require_relative "./twitterscraper"
@@ -0,0 +1,27 @@
+ require 'twitterscraper/logger'
+ require 'twitterscraper/proxy'
+ require 'twitterscraper/http'
+ require 'twitterscraper/lang'
+ require 'twitterscraper/cache'
+ require 'twitterscraper/query'
+ require 'twitterscraper/client'
+ require 'twitterscraper/tweet'
+ require 'twitterscraper/template'
+ require 'version'
+
+ module Twitterscraper
+   class Error < StandardError; end
+
+   def self.logger
+     @logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
+   end
+
+   def self.logger=(logger)
+     if logger.nil?
+       self.logger.level = ::Logger::FATAL
+       return self.logger
+     end
+
+     @logger = logger
+   end
+ end
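Note on the logger hooks above: the module-level logger defaults to STDOUT at INFO, and assigning nil silences it rather than removing it. A minimal usage sketch (the log file name is illustrative):

    require 'twitterscraper-ruby'

    # Send the gem's logging to a file at DEBUG level.
    Twitterscraper.logger = ::Logger.new('twitterscraper.log', level: ::Logger::DEBUG)

    # Assigning nil does not drop the logger; per the setter above it only
    # raises the current logger's level to FATAL, effectively muting it.
    Twitterscraper.logger = nil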
@@ -0,0 +1,71 @@
+ require 'base64'
+ require 'digest/md5'
+ require 'json' # JSON.parse / #to_json below
+ require 'time' # Time.parse in Entry.from_json below
+
+ module Twitterscraper
+   class Cache
+     def initialize()
+       @ttl = 3600 # 1 hour
+       @dir = 'cache'
+       Dir.mkdir(@dir) unless File.exist?(@dir)
+     end
+
+     def read(key)
+       key = cache_key(key)
+       file = File.join(@dir, key)
+       entry = Entry.from_json(File.read(file))
+       entry.value if entry.time > Time.now - @ttl
+     rescue Errno::ENOENT => e
+       nil
+     end
+
+     def write(key, value)
+       key = cache_key(key)
+       entry = Entry.new(key, value, Time.now)
+       file = File.join(@dir, key)
+       File.write(file, entry.to_json)
+     end
+
+     def fetch(key, &block)
+       if (value = read(key))
+         value
+       else
+         yield.tap { |v| write(key, v) }
+       end
+     end
+
+     def cache_key(key)
+       value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+       value = Digest::MD5.hexdigest(value) if value.length >= 100
+       value
+     end
+
+     class Entry < Hash
+       attr_reader :key, :value, :time
+
+       def initialize(key, value, time)
+         @key = key
+         @value = value
+         @time = time
+       end
+
+       def attrs
+         {key: @key, value: @value, time: @time}
+       end
+
+       def to_json
+         hash = attrs
+         hash[:value] = Base64.encode64(hash[:value])
+         hash.to_json
+       end
+
+       class << self
+         def from_json(text)
+           json = JSON.parse(text)
+           new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+         end
+       end
+     end
+   end
+ end
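The cache above is a simple file store under ./cache with a fixed 1-hour TTL; keys are URL-escaped (or MD5-hashed when long) and values are Base64-encoded strings. A usage sketch with a hypothetical URL as the key:

    cache = Twitterscraper::Cache.new
    url = 'https://example.com/search?q=ruby' # hypothetical key

    # First call misses, runs the block, and writes cache/<escaped key>.
    body = cache.fetch(url) { Twitterscraper::Http.get(url) }

    # Within the TTL the block is skipped and the cached body is returned.
    body = cache.fetch(url) { raise 'not reached' }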
@@ -0,0 +1,119 @@
+ $stdout.sync = true
+
+ require 'json'
+ require 'optparse'
+ require 'twitterscraper'
+
+ module Twitterscraper
+   class Cli
+     def parse
+       @options = parse_options(ARGV)
+       initialize_logger
+     end
+
+     def run
+       print_help || return if print_help?
+       print_version || return if print_version?
+
+       query_options = {
+         type: options['type'],
+         start_date: options['start_date'],
+         end_date: options['end_date'],
+         lang: options['lang'],
+         limit: options['limit'],
+         daily_limit: options['daily_limit'],
+         order: options['order'],
+         threads: options['threads'],
+       }
+       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
+       tweets = client.query_tweets(options['query'], query_options)
+       export(tweets) unless tweets.empty?
+     end
+
+     def export(tweets)
+       write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+       if options['format'] == 'json'
+         write_json.call
+       elsif options['format'] == 'html'
+         File.write('tweets.html', Template.tweets_embedded_html(tweets))
+       else
+         write_json.call
+       end
+     end
+
+     def generate_json(tweets)
+       if options['pretty']
+         ::JSON.pretty_generate(tweets)
+       else
+         ::JSON.generate(tweets)
+       end
+     end
+
+     def options
+       @options
+     end
+
+     def parse_options(argv)
+       options = argv.getopts(
+         'h',
+         'help',
+         'v',
+         'version',
+         'type:',
+         'query:',
+         'start_date:',
+         'end_date:',
+         'lang:',
+         'limit:',
+         'daily_limit:',
+         'order:',
+         'threads:',
+         'output:',
+         'format:',
+         'cache:',
+         'proxy:',
+         'pretty',
+         'verbose',
+       )
+
+       options['type'] ||= 'search'
+       options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
+       options['lang'] ||= ''
+       options['limit'] = (options['limit'] || 100).to_i
+       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
+       options['threads'] = (options['threads'] || 2).to_i
+       options['format'] ||= 'json'
+       options['order'] ||= 'desc'
+       options['output'] ||= "tweets.#{options['format']}"
+
+       options['cache'] = options['cache'] != 'false'
+       options['proxy'] = options['proxy'] != 'false'
+
+       options
+     end
+
+     def initialize_logger
+       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
+     end
+
+     def print_help?
+       options['h'] || options['help']
+     end
+
+     def print_help
+       puts <<~'SHELL'
+         Usage:
+           twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
+       SHELL
+     end
+
+     def print_version?
+       options['v'] || options['version']
+     end
+
+     def print_version
+       puts "twitterscraper-#{VERSION}"
+     end
+   end
+ end
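For reference, the Cli class can be exercised directly from Ruby; this sketch mirrors what the gem's executable does (the query and dates are placeholders, and unspecified options fall back to the defaults set in parse_options):

    require 'twitterscraper-ruby'

    ARGV.replace(%w[--query #ruby --start_date 2020-07-01 --end_date 2020-07-10 --limit 500 --threads 4 --pretty])
    cli = Twitterscraper::Cli.new
    cli.parse   # reads ARGV and sets up the logger
    cli.run     # scrapes and writes tweets.json (the default --output) to the current directory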
@@ -0,0 +1,18 @@
+ module Twitterscraper
+   class Client
+     include Query
+
+     def initialize(cache: true, proxy: true)
+       @cache = cache
+       @proxy = proxy
+     end
+
+     def cache_enabled?
+       @cache
+     end
+
+     def proxy_enabled?
+       @proxy
+     end
+   end
+ end
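Client is a thin holder for the cache/proxy switches consumed by Query; both default to enabled:

    client = Twitterscraper::Client.new                              # cache and proxy both on
    client = Twitterscraper::Client.new(cache: false, proxy: false)  # hit Twitter directly, no response caching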
@@ -0,0 +1,31 @@
+ module Twitterscraper
+   module Http
+
+     module_function
+
+     def get(url, headers = {}, proxy = nil, timeout = nil)
+       timeout ||= 3
+
+       if proxy
+         ip, port = proxy.split(':')
+         http_class = Net::HTTP::Proxy(ip, port.to_i)
+       else
+         http_class = Net::HTTP
+       end
+
+       uri = URI.parse(url)
+       http = http_class.new(uri.host, uri.port)
+       http.use_ssl = true if url.match?(/^https/)
+       http.open_timeout = timeout
+       http.read_timeout = timeout
+       req = Net::HTTP::Get.new(uri)
+
+       headers.each do |key, value|
+         req[key] = value
+       end
+
+       res = http.start { http.request(req) }
+       res.body
+     end
+   end
+ end
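Http.get is a thin Net::HTTP wrapper: an optional "ip:port" proxy string, a shared open/read timeout (3 seconds by default), and the response body as the return value. A small sketch with a placeholder URL and proxy:

    body = Twitterscraper::Http.get('https://example.com/', { 'User-Agent' => 'twitterscraper-ruby' }, nil, 5)
    body = Twitterscraper::Http.get('https://example.com/', {}, '127.0.0.1:8080')  # routed through an HTTP proxy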
@@ -0,0 +1,40 @@
+ module Twitterscraper
+   class Lang
+     LIST = [
+       'en', # English
+       'ar', # Arabic
+       'bn', # Bengali
+       'cs', # Czech
+       'da', # Danish
+       'de', # German
+       'el', # Greek
+       'es', # Spanish
+       'fa', # Persian
+       'fi', # Finnish
+       'fil', # Filipino
+       'fr', # French
+       'he', # Hebrew
+       'hi', # Hindi
+       'hu', # Hungarian
+       'id', # Indonesian
+       'it', # Italian
+       'ja', # Japanese
+       'ko', # Korean
+       'msa', # Malay
+       'nl', # Dutch
+       'no', # Norwegian
+       'pl', # Polish
+       'pt', # Portuguese
+       'ro', # Romanian
+       'ru', # Russian
+       'sv', # Swedish
+       'th', # Thai
+       'tr', # Turkish
+       'uk', # Ukrainian
+       'ur', # Urdu
+       'vi', # Vietnamese
+       'zh-cn', # Chinese Simplified
+       'zh-tw', # Chinese Traditional
+     ]
+   end
+ end
@@ -0,0 +1,9 @@
+ require 'logger'
+
+ module Twitterscraper
+   module Logger
+     def logger
+       Twitterscraper.logger
+     end
+   end
+ end
@@ -0,0 +1,65 @@
+ module Twitterscraper
+   module Proxy
+
+     PROXY_URL = 'https://free-proxy-list.net/'
+
+     class RetryExhausted < StandardError
+     end
+
+     class Pool
+       def initialize
+         @items = Proxy.get_proxies
+         @cur_index = 0
+       end
+
+       def sample
+         if @cur_index >= @items.size
+           reload
+         end
+         @cur_index += 1
+         @items[@cur_index - 1]
+       end
+
+       def size
+         @items.size
+       end
+
+       def empty?
+         @items.empty?
+       end
+
+       private
+
+       def reload
+         @items = Proxy.get_proxies
+         @cur_index = 0
+       end
+     end
+
+     module_function
+
+     def get_proxies(retries = 3)
+       response = Twitterscraper::Http.get(PROXY_URL)
+       html = Nokogiri::HTML(response)
+       table = html.xpath('//table[@id="proxylisttable"]').first
+
+       proxies = []
+
+       table.xpath('tbody/tr').each do |tr|
+         cells = tr.xpath('td')
+         ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
+         next unless ['elite proxy', 'anonymous'].include?(anonymity)
+         next if https == 'no'
+         proxies << ip + ':' + port
+       end
+
+       proxies.shuffle
+     rescue => e
+       if (retries -= 1) > 0
+         retry
+       else
+         raise RetryExhausted.new(e.inspect)
+       end
+     end
+   end
+ end
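Proxy::Pool scrapes free-proxy-list.net once at construction, hands proxies out in rotation, and re-scrapes when the list is exhausted; get_proxies retries up to three times before raising RetryExhausted. Intended use:

    pool = Twitterscraper::Proxy::Pool.new   # fetches and shuffles the current list
    puts "#{pool.size} proxies loaded"
    proxy = pool.sample                      # an "ip:port" string; rotates on every call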
@@ -0,0 +1,254 @@
+ require 'resolv-replace'
+ require 'net/http'
+ require 'nokogiri'
+ require 'date'
+ require 'json'
+ require 'erb'
+ require 'parallel'
+
+ module Twitterscraper
+   module Query
+     include Logger
+
+     USER_AGENT_LIST = [
+       'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+       'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+       'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+       'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+       'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+     ]
+
+     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
+     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
+         'default&include_available_features=1&include_entities=1&' +
+         'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
+     INIT_URL_USER = 'https://twitter.com/__USER__'
+     RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
+         'include_available_features=1&include_entities=1&' +
+         'max_position=__POS__&reset_error_state=false'
+
+     def build_query_url(query, lang, from_user, pos)
+       if from_user
+         if pos
+           RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+         else
+           INIT_URL_USER.sub('__USER__', query)
+         end
+       else
+         if pos
+           RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+         else
+           INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+         end
+       end
+     end
+
+     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+       return nil if stop_requested?
+       unless proxies.empty?
+         proxy = proxies.sample
+         logger.info("Using proxy #{proxy}")
+       end
+       Http.get(url, headers, proxy, timeout)
+     rescue => e
+       logger.debug "query_single_page: #{e.inspect}"
+       if (retries -= 1) > 0
+         logger.info "Retrying... (Attempts left: #{retries})"
+         retry
+       else
+         raise Error.new("#{e.inspect} url=#{url}")
+       end
+     end
+
+     def parse_single_page(text, html = true)
+       return [nil, nil] if text.nil? || text == ''
+
+       if html
+         json_resp = nil
+         items_html = text
+       else
+         json_resp = JSON.parse(text)
+         items_html = json_resp['items_html'] || ''
+         logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
+       end
+
+       [items_html, json_resp]
+     end
+
+     def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+       logger.info "Querying #{query}"
+       query = ERB::Util.url_encode(query)
+
+       url = build_query_url(query, lang, type == 'user', pos)
+       http_request = lambda do
+         logger.debug "Scraping tweets from #{url}"
+         get_single_page(url, headers, proxies)
+       end
+
+       if cache_enabled?
+         client = Cache.new
+         if (response = client.read(url))
+           logger.debug 'Fetching tweets from cache'
+         else
+           response = http_request.call
+           client.write(url, response) unless stop_requested?
+         end
+       else
+         response = http_request.call
+       end
+       return [], nil if response.nil? || response.empty?
+
+       html, json_resp = parse_single_page(response, pos.nil?)
+
+       tweets = Tweet.from_html(html)
+
+       if tweets.empty?
+         return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
+       end
+
+       if json_resp
+         [tweets, json_resp['min_position']]
+       elsif type
+         [tweets, tweets[-1].tweet_id]
+       else
+         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
+       end
+     end
+
+     OLDEST_DATE = Date.parse('2006-03-21')
+
+     def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+       query = queries[0]
+       if query.nil? || query == ''
+         raise Error.new('Please specify a search query.')
+       end
+
+       if ERB::Util.url_encode(query).length >= 500
+         raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
+       end
+
+       if start_date && end_date
+         if start_date == end_date
+           raise Error.new('Please specify different values for :start_date and :end_date.')
+         elsif start_date > end_date
+           raise Error.new(':start_date must occur before :end_date.')
+         end
+       end
+
+       if start_date
+         if start_date < OLDEST_DATE
+           raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
+         end
+       end
+
+       if end_date
+         today = Date.today
+         if end_date > Date.today
+           raise Error.new(":end_date must be less than or equal to today (#{today})")
+         end
+       end
+     end
+
+     def build_queries(query, start_date, end_date)
+       if start_date && end_date
+         date_range = start_date.upto(end_date - 1)
+         date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+       elsif start_date
+         [query + " since:#{start_date}"]
+       elsif end_date
+         [query + " until:#{end_date}"]
+       else
+         [query]
+       end
+     end
+
+     def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+       pos = nil
+       daily_tweets = []
+
+       while true
+         new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+         unless new_tweets.empty?
+           daily_tweets.concat(new_tweets)
+           daily_tweets.uniq! { |t| t.tweet_id }
+
+           @mutex.synchronize {
+             @all_tweets.concat(new_tweets)
+             @all_tweets.uniq! { |t| t.tweet_id }
+           }
+         end
+         logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
+
+         break unless new_pos
+         break if daily_limit && daily_tweets.size >= daily_limit
+         break if @all_tweets.size >= limit
+
+         pos = new_pos
+       end
+
+       if !@stop_requested && @all_tweets.size >= limit
+         logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
+         @stop_requested = true
+       end
+     end
+
+     def stop_requested?
+       @stop_requested
+     end
+
+     def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+       queries = build_queries(query, start_date, end_date)
+       if threads > queries.size
+         logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+         threads = queries.size
+       end
+       if proxy_enabled?
+         proxies = Proxy::Pool.new
+         logger.debug "Fetch #{proxies.size} proxies"
+       else
+         proxies = []
+         logger.debug 'Proxy disabled'
+       end
+       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
+
+
+       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+       logger.info "The number of threads #{threads}"
+
+       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+       logger.info "Headers #{headers}"
+
+       @all_tweets = []
+       @mutex = Mutex.new
+       @stop_requested = false
+
+       if threads > 1
+         Thread.abort_on_exception = true
+         logger.debug "Set 'Thread.abort_on_exception' to true"
+
+         Parallel.each(queries, in_threads: threads) do |query|
+           main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+           raise Parallel::Break if stop_requested?
+         end
+       else
+         queries.each do |query|
+           main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+           break if stop_requested?
+         end
+       end
+
+       @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+     end
+
+     def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+       query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+     end
+
+     def user_timeline(screen_name, limit: 100, order: 'desc')
+       query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+     end
+   end
+ end
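Putting the pieces together, the public API added by this diff can be driven like this (query, dates, limits, and the screen name are illustrative; Tweet attributes other than tweet_id and created_at live in twitterscraper/tweet, which is unchanged here):

    require 'twitterscraper-ruby'

    client = Twitterscraper::Client.new(cache: true, proxy: false)

    # One sub-query per day between the dates, fetched on up to 4 threads,
    # sorted by created_at descending (the default order).
    tweets = client.search('#ruby', start_date: '2020-07-01', end_date: '2020-07-10',
                           lang: 'ja', limit: 200, threads: 4)
    tweets.each { |tweet| puts "#{tweet.tweet_id} #{tweet.created_at}" }

    # Single-threaded scrape of one user's timeline (hypothetical screen name).
    timeline = client.user_timeline('ruby_user', limit: 50)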