twitterscraper-ruby 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ require_relative "./twitterscraper"
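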
@@ -0,0 +1,27 @@
+ require 'twitterscraper/logger'
+ require 'twitterscraper/proxy'
+ require 'twitterscraper/http'
+ require 'twitterscraper/lang'
+ require 'twitterscraper/cache'
+ require 'twitterscraper/query'
+ require 'twitterscraper/client'
+ require 'twitterscraper/tweet'
+ require 'twitterscraper/template'
+ require 'version'
+
+ module Twitterscraper
+   class Error < StandardError; end
+
+   def self.logger
+     @logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
+   end
+
+   def self.logger=(logger)
+     if logger.nil?
+       self.logger.level = ::Logger::FATAL
+       return self.logger
+     end
+
+     @logger = logger
+   end
+ end
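
For reference, a minimal usage sketch of the module-level logger accessor defined above; the log file name is only an illustration:

    require 'twitterscraper'

    # Send the gem's log output somewhere other than STDOUT.
    Twitterscraper.logger = ::Logger.new('twitterscraper.log', level: ::Logger::DEBUG)

    # Per the writer above, assigning nil does not remove the logger;
    # it only raises the current logger's level to FATAL.
    Twitterscraper.logger = nil
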
@@ -0,0 +1,71 @@
+ require 'base64'
+ require 'digest/md5'
+ require 'json'
+ require 'time'
+
+ module Twitterscraper
+   class Cache
+     def initialize
+       @ttl = 3600 # 1 hour
+       @dir = 'cache'
+       Dir.mkdir(@dir) unless File.exist?(@dir)
+     end
+
+     def read(key)
+       key = cache_key(key)
+       file = File.join(@dir, key)
+       entry = Entry.from_json(File.read(file))
+       entry.value if entry.time > Time.now - @ttl
+     rescue Errno::ENOENT
+       nil
+     end
+
+     def write(key, value)
+       key = cache_key(key)
+       entry = Entry.new(key, value, Time.now)
+       file = File.join(@dir, key)
+       File.write(file, entry.to_json)
+     end
+
+     def fetch(key, &block)
+       if (value = read(key))
+         value
+       else
+         yield.tap { |v| write(key, v) }
+       end
+     end
+
+     def cache_key(key)
+       value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+       value = Digest::MD5.hexdigest(value) if value.length >= 100
+       value
+     end
+
+     class Entry < Hash
+       attr_reader :key, :value, :time
+
+       def initialize(key, value, time)
+         @key = key
+         @value = value
+         @time = time
+       end
+
+       def attrs
+         {key: @key, value: @value, time: @time}
+       end
+
+       def to_json
+         hash = attrs
+         hash[:value] = Base64.encode64(hash[:value])
+         hash.to_json
+       end
+
+       class << self
+         def from_json(text)
+           json = JSON.parse(text)
+           new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+         end
+       end
+     end
+   end
+ end
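
A minimal sketch of using the Cache class above on its own; the key and the cached string are placeholders:

    cache = Twitterscraper::Cache.new

    # fetch returns the stored value while it is younger than the 1-hour TTL,
    # otherwise it runs the block and writes the result under ./cache.
    body = cache.fetch('https://example.com/search?q=ruby') do
      'response body'  # placeholder payload; in the gem this is an HTTP response
    end
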
@@ -0,0 +1,119 @@
+ $stdout.sync = true
+
+ require 'json'
+ require 'optparse'
+ require 'twitterscraper'
+
+ module Twitterscraper
+   class Cli
+     def parse
+       @options = parse_options(ARGV)
+       initialize_logger
+     end
+
+     def run
+       print_help || return if print_help?
+       print_version || return if print_version?
+
+       query_options = {
+         type: options['type'],
+         start_date: options['start_date'],
+         end_date: options['end_date'],
+         lang: options['lang'],
+         limit: options['limit'],
+         daily_limit: options['daily_limit'],
+         order: options['order'],
+         threads: options['threads'],
+       }
+       client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
+       tweets = client.query_tweets(options['query'], query_options)
+       export(tweets) unless tweets.empty?
+     end
+
+     def export(tweets)
+       write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+       if options['format'] == 'json'
+         write_json.call
+       elsif options['format'] == 'html'
+         File.write('tweets.html', Template.tweets_embedded_html(tweets))
+       else
+         write_json.call
+       end
+     end
+
+     def generate_json(tweets)
+       if options['pretty']
+         ::JSON.pretty_generate(tweets)
+       else
+         ::JSON.generate(tweets)
+       end
+     end
+
+     def options
+       @options
+     end
+
+     def parse_options(argv)
+       options = argv.getopts(
+         'h',
+         'help',
+         'v',
+         'version',
+         'type:',
+         'query:',
+         'start_date:',
+         'end_date:',
+         'lang:',
+         'limit:',
+         'daily_limit:',
+         'order:',
+         'threads:',
+         'output:',
+         'format:',
+         'cache:',
+         'proxy:',
+         'pretty',
+         'verbose',
+       )
+
+       options['type'] ||= 'search'
+       options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
+       options['lang'] ||= ''
+       options['limit'] = (options['limit'] || 100).to_i
+       options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
+       options['threads'] = (options['threads'] || 2).to_i
+       options['format'] ||= 'json'
+       options['order'] ||= 'desc'
+       options['output'] ||= "tweets.#{options['format']}"
+
+       options['cache'] = options['cache'] != 'false'
+       options['proxy'] = options['proxy'] != 'false'
+
+       options
+     end
+
+     def initialize_logger
+       Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
+     end
+
+     def print_help?
+       options['h'] || options['help']
+     end
+
+     def print_help
+       puts <<~'SHELL'
+         Usage:
+           twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
+       SHELL
+     end
+
+     def print_version?
+       options['v'] || options['version']
+     end
+
+     def print_version
+       puts "twitterscraper-#{VERSION}"
+     end
+   end
+ end
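
The gem's executable is not included in this diff; a hypothetical driver script would use the class above roughly like this (require path assumed):

    #!/usr/bin/env ruby
    # Hypothetical driver; the real executable is not part of this diff,
    # and the require path for the Cli file above is assumed.
    require 'twitterscraper/cli'

    cli = Twitterscraper::Cli.new
    cli.parse   # parses ARGV into options and sets the log level
    cli.run     # runs the query and writes the output file
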
@@ -0,0 +1,18 @@
1
+ module Twitterscraper
2
+ class Client
3
+ include Query
4
+
5
+ def initialize(cache: true, proxy: true)
6
+ @cache = cache
7
+ @proxy = proxy
8
+ end
9
+
10
+ def cache_enabled?
11
+ @cache
12
+ end
13
+
14
+ def proxy_enabled?
15
+ @proxy
16
+ end
17
+ end
18
+ end
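
Client is the public entry point; a short sketch combining it with the Query module it includes (Query is defined later in this diff, and the search term is a placeholder):

    client = Twitterscraper::Client.new(cache: true, proxy: false)
    tweets = client.query_tweets('ruby', limit: 50, threads: 1)
    puts tweets.size
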
@@ -0,0 +1,31 @@
1
+ module Twitterscraper
2
+ module Http
3
+
4
+ module_function
5
+
6
+ def get(url, headers = {}, proxy = nil, timeout = nil)
7
+ timeout ||= 3
8
+
9
+ if proxy
10
+ ip, port = proxy.split(':')
11
+ http_class = Net::HTTP::Proxy(ip, port.to_i)
12
+ else
13
+ http_class = Net::HTTP
14
+ end
15
+
16
+ uri = URI.parse(url)
17
+ http = http_class.new(uri.host, uri.port)
18
+ http.use_ssl = true if url.match?(/^https/)
19
+ http.open_timeout = timeout
20
+ http.read_timeout = timeout
21
+ req = Net::HTTP::Get.new(uri)
22
+
23
+ headers.each do |key, value|
24
+ req[key] = value
25
+ end
26
+
27
+ res = http.start { http.request(req) }
28
+ res.body
29
+ end
30
+ end
31
+ end
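
A quick sketch of calling Http.get directly; the URL and header value are placeholders, and net/http must already be loaded (the Query file later in this diff requires it):

    require 'net/http'

    body = Twitterscraper::Http.get(
      'https://example.com/',
      {'User-Agent' => 'my-agent'},  # request headers
      nil,                           # or 'ip:port' to route through a proxy
      5                              # open/read timeout in seconds
    )
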
@@ -0,0 +1,40 @@
1
+ module Twitterscraper
2
+ class Lang
3
+ LIST = [
4
+ 'en', # English
5
+ 'ar', # Arabic
6
+ 'bn', # Bengali
7
+ 'cs', # Czech
8
+ 'da', # Danish
9
+ 'de', # German
10
+ 'el', # Greek
11
+ 'es', # Spanish
12
+ 'fa', # Persian
13
+ 'fi', # Finnish
14
+ 'fil', # Filipino
15
+ 'fr', # French
16
+ 'he', # Hebrew
17
+ 'hi', # Hindi
18
+ 'hu', # Hungarian
19
+ 'id', # Indonesian
20
+ 'it', # Italian
21
+ 'ja', # Japanese
22
+ 'ko', # Korean
23
+ 'msa', # Malay
24
+ 'nl', # Dutch
25
+ 'no', # Norwegian
26
+ 'pl', # Polish
27
+ 'pt', # Portuguese
28
+ 'ro', # Romanian
29
+ 'ru', # Russian
30
+ 'sv', # Swedish
31
+ 'th', # Thai
32
+ 'tr', # Turkish
33
+ 'uk', # Ukranian
34
+ 'ur', # Urdu
35
+ 'vi', # Vietnamese
36
+ 'zh-cn', # Chinese Simplified
37
+ 'zh-tw', # Chinese Traditional
38
+ ]
39
+ end
40
+ end
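
Lang::LIST is a plain constant; the code shown in this diff does not validate --lang against it, but a one-off check could look like this (sketch):

    lang = 'ja'
    unless Twitterscraper::Lang::LIST.include?(lang)
      raise Twitterscraper::Error, "unsupported lang: #{lang}"
    end
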
@@ -0,0 +1,9 @@
1
+ require 'logger'
2
+
3
+ module Twitterscraper
4
+ module Logger
5
+ def logger
6
+ Twitterscraper.logger
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,65 @@
1
+ module Twitterscraper
2
+ module Proxy
3
+
4
+ PROXY_URL = 'https://free-proxy-list.net/'
5
+
6
+ class RetryExhausted < StandardError
7
+ end
8
+
9
+ class Pool
10
+ def initialize
11
+ @items = Proxy.get_proxies
12
+ @cur_index = 0
13
+ end
14
+
15
+ def sample
16
+ if @cur_index >= @items.size
17
+ reload
18
+ end
19
+ @cur_index += 1
20
+ @items[@cur_index - 1]
21
+ end
22
+
23
+ def size
24
+ @items.size
25
+ end
26
+
27
+ def empty?
28
+ @items.empty?
29
+ end
30
+
31
+ private
32
+
33
+ def reload
34
+ @items = Proxy.get_proxies
35
+ @cur_index = 0
36
+ end
37
+ end
38
+
39
+ module_function
40
+
41
+ def get_proxies(retries = 3)
42
+ response = Twitterscraper::Http.get(PROXY_URL)
43
+ html = Nokogiri::HTML(response)
44
+ table = html.xpath('//table[@id="proxylisttable"]').first
45
+
46
+ proxies = []
47
+
48
+ table.xpath('tbody/tr').each do |tr|
49
+ cells = tr.xpath('td')
50
+ ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
51
+ next unless ['elite proxy', 'anonymous'].include?(anonymity)
52
+ next if https == 'no'
53
+ proxies << ip + ':' + port
54
+ end
55
+
56
+ proxies.shuffle
57
+ rescue => e
58
+ if (retries -= 1) > 0
59
+ retry
60
+ else
61
+ raise RetryExhausted.new(e.inspect)
62
+ end
63
+ end
64
+ end
65
+ end
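
A sketch of using the proxy pool directly; note this performs a live request to free-proxy-list.net and needs nokogiri plus the Http module above to be loaded:

    pool = Twitterscraper::Proxy::Pool.new
    puts "#{pool.size} proxies fetched"

    proxy = pool.sample   # returns the next 'ip:port'; refetches the list when exhausted
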
@@ -0,0 +1,254 @@
+ require 'resolv-replace'
+ require 'net/http'
+ require 'nokogiri'
+ require 'date'
+ require 'json'
+ require 'erb'
+ require 'parallel'
+
+ module Twitterscraper
+   module Query
+     include Logger
+
+     USER_AGENT_LIST = [
+       'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+       'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+       'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+       'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+       'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+     ]
+
+     INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
+     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
+         'default&include_available_features=1&include_entities=1&' +
+         'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
+     INIT_URL_USER = 'https://twitter.com/__USER__'
+     RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
+         'include_available_features=1&include_entities=1&' +
+         'max_position=__POS__&reset_error_state=false'
+
+     def build_query_url(query, lang, from_user, pos)
+       if from_user
+         if pos
+           RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+         else
+           INIT_URL_USER.sub('__USER__', query)
+         end
+       else
+         if pos
+           RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+         else
+           INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+         end
+       end
+     end
+
+     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+       return nil if stop_requested?
+       unless proxies.empty?
+         proxy = proxies.sample
+         logger.info("Using proxy #{proxy}")
+       end
+       Http.get(url, headers, proxy, timeout)
+     rescue => e
+       logger.debug "query_single_page: #{e.inspect}"
+       if (retries -= 1) > 0
+         logger.info "Retrying... (Attempts left: #{retries})"
+         retry
+       else
+         raise Error.new("#{e.inspect} url=#{url}")
+       end
+     end
+
+     def parse_single_page(text, html = true)
+       return [nil, nil] if text.nil? || text == ''
+
+       if html
+         json_resp = nil
+         items_html = text
+       else
+         json_resp = JSON.parse(text)
+         items_html = json_resp['items_html'] || ''
+         logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
+       end
+
+       [items_html, json_resp]
+     end
+
+     def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+       logger.info "Querying #{query}"
+       query = ERB::Util.url_encode(query)
+
+       url = build_query_url(query, lang, type == 'user', pos)
+       http_request = lambda do
+         logger.debug "Scraping tweets from #{url}"
+         get_single_page(url, headers, proxies)
+       end
+
+       if cache_enabled?
+         client = Cache.new
+         if (response = client.read(url))
+           logger.debug 'Fetching tweets from cache'
+         else
+           response = http_request.call
+           client.write(url, response) unless stop_requested?
+         end
+       else
+         response = http_request.call
+       end
+       return [], nil if response.nil? || response.empty?
+
+       html, json_resp = parse_single_page(response, pos.nil?)
+
+       tweets = Tweet.from_html(html)
+
+       if tweets.empty?
+         return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
+       end
+
+       if json_resp
+         [tweets, json_resp['min_position']]
+       elsif type
+         [tweets, tweets[-1].tweet_id]
+       else
+         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
+       end
+     end
+
+     OLDEST_DATE = Date.parse('2006-03-21')
+
+     def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+       query = queries[0]
+       if query.nil? || query == ''
+         raise Error.new('Please specify a search query.')
+       end
+
+       if ERB::Util.url_encode(query).length >= 500
+         raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
+       end
+
+       if start_date && end_date
+         if start_date == end_date
+           raise Error.new('Please specify different values for :start_date and :end_date.')
+         elsif start_date > end_date
+           raise Error.new(':start_date must occur before :end_date.')
+         end
+       end
+
+       if start_date
+         if start_date < OLDEST_DATE
+           raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
+         end
+       end
+
+       if end_date
+         today = Date.today
+         if end_date > Date.today
+           raise Error.new(":end_date must be less than or equal to today(#{today})")
+         end
+       end
+     end
+
+     def build_queries(query, start_date, end_date)
+       if start_date && end_date
+         date_range = start_date.upto(end_date - 1)
+         date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+       elsif start_date
+         [query + " since:#{start_date}"]
+       elsif end_date
+         [query + " until:#{end_date}"]
+       else
+         [query]
+       end
+     end
+
+     def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+       pos = nil
+       daily_tweets = []
+
+       while true
+         new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+         unless new_tweets.empty?
+           daily_tweets.concat(new_tweets)
+           daily_tweets.uniq! { |t| t.tweet_id }
+
+           @mutex.synchronize {
+             @all_tweets.concat(new_tweets)
+             @all_tweets.uniq! { |t| t.tweet_id }
+           }
+         end
+         logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
+
+         break unless new_pos
+         break if daily_limit && daily_tweets.size >= daily_limit
+         break if @all_tweets.size >= limit
+
+         pos = new_pos
+       end
+
+       if !@stop_requested && @all_tweets.size >= limit
+         logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
+         @stop_requested = true
+       end
+     end
+
+     def stop_requested?
+       @stop_requested
+     end
+
+     def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+       queries = build_queries(query, start_date, end_date)
+       if threads > queries.size
+         logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+         threads = queries.size
+       end
+       if proxy_enabled?
+         proxies = Proxy::Pool.new
+         logger.debug "Fetch #{proxies.size} proxies"
+       else
+         proxies = []
+         logger.debug 'Proxy disabled'
+       end
+       logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
+
+       validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+       logger.info "The number of threads #{threads}"
+
+       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+       logger.info "Headers #{headers}"
+
+       @all_tweets = []
+       @mutex = Mutex.new
+       @stop_requested = false
+
+       if threads > 1
+         Thread.abort_on_exception = true
+         logger.debug "Set 'Thread.abort_on_exception' to true"
+
+         Parallel.each(queries, in_threads: threads) do |query|
+           main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+           raise Parallel::Break if stop_requested?
+         end
+       else
+         queries.each do |query|
+           main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+           break if stop_requested?
+         end
+       end
+
+       @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+     end
+
+     def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+       query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+     end
+
+     def user_timeline(screen_name, limit: 100, order: 'desc')
+       query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+     end
+   end
+ end
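
Putting it together, a short end-to-end sketch of the search and user_timeline helpers defined above; the query, screen name, and dates are placeholders:

    client = Twitterscraper::Client.new

    tweets = client.search('#ruby', start_date: '2020-07-01', end_date: '2020-07-10',
                           lang: 'ja', limit: 100, threads: 5)
    tweets.each { |tweet| puts "#{tweet.created_at} #{tweet.tweet_id}" }

    # Scrape a single user's timeline instead of a keyword search.
    timeline = client.user_timeline('screen_name', limit: 100, order: 'asc')
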