twitterscraper-ruby 0.15.0
- checksums.yaml +7 -0
- data/.circleci/config.yml +31 -0
- data/.gitignore +10 -0
- data/.irbrc +7 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +42 -0
- data/LICENSE.txt +21 -0
- data/README.md +174 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bin/twitterscraper +13 -0
- data/lib/twitterscraper-ruby.rb +1 -0
- data/lib/twitterscraper.rb +27 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +119 -0
- data/lib/twitterscraper/client.rb +18 -0
- data/lib/twitterscraper/http.rb +31 -0
- data/lib/twitterscraper/lang.rb +40 -0
- data/lib/twitterscraper/logger.rb +9 -0
- data/lib/twitterscraper/proxy.rb +65 -0
- data/lib/twitterscraper/query.rb +254 -0
- data/lib/twitterscraper/template.rb +48 -0
- data/lib/twitterscraper/tweet.rb +123 -0
- data/lib/version.rb +3 -0
- data/twitterscraper-ruby.gemspec +31 -0
- metadata +104 -0

data/lib/twitterscraper-ruby.rb
@@ -0,0 +1 @@
+require_relative "./twitterscraper"

data/lib/twitterscraper.rb
@@ -0,0 +1,27 @@
+require 'twitterscraper/logger'
+require 'twitterscraper/proxy'
+require 'twitterscraper/http'
+require 'twitterscraper/lang'
+require 'twitterscraper/cache'
+require 'twitterscraper/query'
+require 'twitterscraper/client'
+require 'twitterscraper/tweet'
+require 'twitterscraper/template'
+require 'version'
+
+module Twitterscraper
+  class Error < StandardError; end
+
+  def self.logger
+    @logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
+  end
+
+  def self.logger=(logger)
+    if logger.nil?
+      self.logger.level = ::Logger::FATAL
+      return self.logger
+    end
+
+    @logger = logger
+  end
+end
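
Both accessors above are module-level; a short, hedged sketch of how a caller might swap the default INFO-level STDOUT logger or mute it (the log file name is illustrative):

require 'logger'
require 'twitterscraper'

# Replace the default STDOUT logger (file path is illustrative).
Twitterscraper.logger = ::Logger.new('scraper.log', level: ::Logger::DEBUG)

# Assigning nil does not remove the logger; per the writer above it lowers
# the current logger's level to FATAL, silencing INFO/DEBUG output.
Twitterscraper.logger = nil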

data/lib/twitterscraper/cache.rb
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
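
Cache writes one file per key under ./cache, percent-escaping the key (or MD5-hashing it once it reaches 100 characters) and enforcing a fixed one-hour TTL on reads. A minimal usage sketch, assuming the gem is loaded; the URL is illustrative:

require 'twitterscraper'

cache = Twitterscraper::Cache.new

# fetch returns the stored body while the entry is younger than the 1-hour
# TTL; otherwise it yields, writes the block's result to disk, and returns it.
body = cache.fetch('https://example.com/search?q=ruby') do
  Twitterscraper::Http.get('https://example.com/search?q=ruby')
end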

data/lib/twitterscraper/cli.rb
@@ -0,0 +1,119 @@
+$stdout.sync = true
+
+require 'json'
+require 'optparse'
+require 'twitterscraper'
+
+module Twitterscraper
+  class Cli
+    def parse
+      @options = parse_options(ARGV)
+      initialize_logger
+    end
+
+    def run
+      print_help || return if print_help?
+      print_version || return if print_version?
+
+      query_options = {
+        type: options['type'],
+        start_date: options['start_date'],
+        end_date: options['end_date'],
+        lang: options['lang'],
+        limit: options['limit'],
+        daily_limit: options['daily_limit'],
+        order: options['order'],
+        threads: options['threads'],
+      }
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
+      tweets = client.query_tweets(options['query'], query_options)
+      export(tweets) unless tweets.empty?
+    end
+
+    def export(tweets)
+      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+      if options['format'] == 'json'
+        write_json.call
+      elsif options['format'] == 'html'
+        File.write('tweets.html', Template.tweets_embedded_html(tweets))
+      else
+        write_json.call
+      end
+    end
+
+    def generate_json(tweets)
+      if options['pretty']
+        ::JSON.pretty_generate(tweets)
+      else
+        ::JSON.generate(tweets)
+      end
+    end
+
+    def options
+      @options
+    end
+
+    def parse_options(argv)
+      options = argv.getopts(
+        'h',
+        'help',
+        'v',
+        'version',
+        'type:',
+        'query:',
+        'start_date:',
+        'end_date:',
+        'lang:',
+        'limit:',
+        'daily_limit:',
+        'order:',
+        'threads:',
+        'output:',
+        'format:',
+        'cache:',
+        'proxy:',
+        'pretty',
+        'verbose',
+      )
+
+      options['type'] ||= 'search'
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
+      options['lang'] ||= ''
+      options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
+      options['threads'] = (options['threads'] || 2).to_i
+      options['format'] ||= 'json'
+      options['order'] ||= 'desc'
+      options['output'] ||= "tweets.#{options['format']}"
+
+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
+
+      options
+    end
+
+    def initialize_logger
+      Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
+    end
+
+    def print_help?
+      options['h'] || options['help']
+    end
+
+    def print_help
+      puts <<~'SHELL'
+        Usage:
+          twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
+      SHELL
+    end
+
+    def print_version?
+      options['v'] || options['version']
+    end
+
+    def print_version
+      puts "twitterscraper-#{VERSION}"
+    end
+  end
+end
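
The gem also ships a data/bin/twitterscraper executable (listed above but not shown in this excerpt) that presumably drives this class. A hedged sketch of the same wiring from Ruby, with illustrative option values:

require 'twitterscraper/cli'

# Cli#parse reads ARGV via OptionParser#getopts, so stage the flags there.
# Unspecified options fall back to the defaults set in parse_options
# (type=search, limit=100, threads=2, format=json, output=tweets.json).
ARGV.replace(%w[--query ruby --limit 200 --threads 5 --pretty --verbose])

cli = Twitterscraper::Cli.new
cli.parse # fills @options and switches the logger to DEBUG (--verbose)
cli.run   # scrapes and writes the result to tweets.json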

data/lib/twitterscraper/http.rb
@@ -0,0 +1,31 @@
+module Twitterscraper
+  module Http
+
+    module_function
+
+    def get(url, headers = {}, proxy = nil, timeout = nil)
+      timeout ||= 3
+
+      if proxy
+        ip, port = proxy.split(':')
+        http_class = Net::HTTP::Proxy(ip, port.to_i)
+      else
+        http_class = Net::HTTP
+      end
+
+      uri = URI.parse(url)
+      http = http_class.new(uri.host, uri.port)
+      http.use_ssl = true if url.match?(/^https/)
+      http.open_timeout = timeout
+      http.read_timeout = timeout
+      req = Net::HTTP::Get.new(uri)
+
+      headers.each do |key, value|
+        req[key] = value
+      end
+
+      res = http.start { http.request(req) }
+      res.body
+    end
+  end
+end
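
Http.get applies the same value to open_timeout and read_timeout (3 seconds unless overridden) and expects the proxy as an "ip:port" string. A hedged sketch; the URL and proxy address are illustrative:

require 'twitterscraper'

# Plain GET with an extra header and a 10-second open/read timeout.
body = Twitterscraper::Http.get('https://example.com/', { 'Accept' => 'text/html' }, nil, 10)

# The same request routed through an HTTP proxy given as "ip:port".
body = Twitterscraper::Http.get('https://example.com/', {}, '127.0.0.1:8080', 10)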

data/lib/twitterscraper/lang.rb
@@ -0,0 +1,40 @@
+module Twitterscraper
+  class Lang
+    LIST = [
+      'en', # English
+      'ar', # Arabic
+      'bn', # Bengali
+      'cs', # Czech
+      'da', # Danish
+      'de', # German
+      'el', # Greek
+      'es', # Spanish
+      'fa', # Persian
+      'fi', # Finnish
+      'fil', # Filipino
+      'fr', # French
+      'he', # Hebrew
+      'hi', # Hindi
+      'hu', # Hungarian
+      'id', # Indonesian
+      'it', # Italian
+      'ja', # Japanese
+      'ko', # Korean
+      'msa', # Malay
+      'nl', # Dutch
+      'no', # Norwegian
+      'pl', # Polish
+      'pt', # Portuguese
+      'ro', # Romanian
+      'ru', # Russian
+      'sv', # Swedish
+      'th', # Thai
+      'tr', # Turkish
+      'uk', # Ukrainian
+      'ur', # Urdu
+      'vi', # Vietnamese
+      'zh-cn', # Chinese Simplified
+      'zh-tw', # Chinese Traditional
+    ]
+  end
+end

data/lib/twitterscraper/proxy.rb
@@ -0,0 +1,65 @@
+module Twitterscraper
+  module Proxy
+
+    PROXY_URL = 'https://free-proxy-list.net/'
+
+    class RetryExhausted < StandardError
+    end
+
+    class Pool
+      def initialize
+        @items = Proxy.get_proxies
+        @cur_index = 0
+      end
+
+      def sample
+        if @cur_index >= @items.size
+          reload
+        end
+        @cur_index += 1
+        @items[@cur_index - 1]
+      end
+
+      def size
+        @items.size
+      end
+
+      def empty?
+        @items.empty?
+      end
+
+      private
+
+      def reload
+        @items = Proxy.get_proxies
+        @cur_index = 0
+      end
+    end
+
+    module_function
+
+    def get_proxies(retries = 3)
+      response = Twitterscraper::Http.get(PROXY_URL)
+      html = Nokogiri::HTML(response)
+      table = html.xpath('//table[@id="proxylisttable"]').first
+
+      proxies = []
+
+      table.xpath('tbody/tr').each do |tr|
+        cells = tr.xpath('td')
+        ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
+        next unless ['elite proxy', 'anonymous'].include?(anonymity)
+        next if https == 'no'
+        proxies << ip + ':' + port
+      end
+
+      proxies.shuffle
+    rescue => e
+      if (retries -= 1) > 0
+        retry
+      else
+        raise RetryExhausted.new(e.inspect)
+      end
+    end
+  end
+end
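
get_proxies scrapes free-proxy-list.net and keeps only anonymous or elite HTTPS-capable entries; Pool#sample hands them out in order and transparently refetches the list once it runs out. A hedged sketch of combining the pool with Http.get; example.com is illustrative:

require 'twitterscraper'

pool = Twitterscraper::Proxy::Pool.new   # scrapes free-proxy-list.net once
Twitterscraper.logger.info "fetched #{pool.size} proxies"

unless pool.empty?
  proxy = pool.sample                    # next "ip:port" entry
  body  = Twitterscraper::Http.get('https://example.com/', {}, proxy, 5)
end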

data/lib/twitterscraper/query.rb
@@ -0,0 +1,254 @@
+require 'resolv-replace'
+require 'net/http'
+require 'nokogiri'
+require 'date'
+require 'json'
+require 'erb'
+require 'parallel'
+
+module Twitterscraper
+  module Query
+    include Logger
+
+    USER_AGENT_LIST = [
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
+      'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+      'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
+      'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
+      'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
+    ]
+
+    INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
+    RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
+        'default&include_available_features=1&include_entities=1&' +
+        'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
+    INIT_URL_USER = 'https://twitter.com/__USER__'
+    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
+        'include_available_features=1&include_entities=1&' +
+        'max_position=__POS__&reset_error_state=false'
+
+    def build_query_url(query, lang, from_user, pos)
+      if from_user
+        if pos
+          RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+        else
+          INIT_URL_USER.sub('__USER__', query)
+        end
+      else
+        if pos
+          RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        else
+          INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+        end
+      end
+    end
+
+    def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
+      return nil if stop_requested?
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
+    rescue => e
+      logger.debug "query_single_page: #{e.inspect}"
+      if (retries -= 1) > 0
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
+        retry
+      else
+        raise Error.new("#{e.inspect} url=#{url}")
+      end
+    end
+
+    def parse_single_page(text, html = true)
+      return [nil, nil] if text.nil? || text == ''
+
+      if html
+        json_resp = nil
+        items_html = text
+      else
+        json_resp = JSON.parse(text)
+        items_html = json_resp['items_html'] || ''
+        logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
+      end
+
+      [items_html, json_resp]
+    end
+
+    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+      logger.info "Querying #{query}"
+      query = ERB::Util.url_encode(query)
+
+      url = build_query_url(query, lang, type == 'user', pos)
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
+
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
+
+      html, json_resp = parse_single_page(response, pos.nil?)
+
+      tweets = Tweet.from_html(html)
+
+      if tweets.empty?
+        return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
+      end
+
+      if json_resp
+        [tweets, json_resp['min_position']]
+      elsif type
+        [tweets, tweets[-1].tweet_id]
+      else
+        [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
+      end
+    end
+
+    OLDEST_DATE = Date.parse('2006-03-21')
+
+    def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
+      if query.nil? || query == ''
+        raise Error.new('Please specify a search query.')
+      end
+
+      if ERB::Util.url_encode(query).length >= 500
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
+      end
+
+      if start_date && end_date
+        if start_date == end_date
+          raise Error.new('Please specify different values for :start_date and :end_date.')
+        elsif start_date > end_date
+          raise Error.new(':start_date must occur before :end_date.')
+        end
+      end
+
+      if start_date
+        if start_date < OLDEST_DATE
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
+        end
+      end
+
+      if end_date
+        today = Date.today
+        if end_date > Date.today
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
+        end
+      end
+    end
+
+    def build_queries(query, start_date, end_date)
+      if start_date && end_date
+        date_range = start_date.upto(end_date - 1)
+        date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
+      elsif start_date
+        [query + " since:#{start_date}"]
+      elsif end_date
+        [query + " until:#{end_date}"]
+      else
+        [query]
+      end
+    end
+
+    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+      pos = nil
+      daily_tweets = []
+
+      while true
+        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
+        unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
+          @mutex.synchronize {
+            @all_tweets.concat(new_tweets)
+            @all_tweets.uniq! { |t| t.tweet_id }
+          }
+        end
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
+
+        break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
+        break if @all_tweets.size >= limit
+
+        pos = new_pos
+      end
+
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
+        @stop_requested = true
+      end
+    end
+
+    def stop_requested?
+      @stop_requested
+    end
+
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+      start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
+      end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
+      queries = build_queries(query, start_date, end_date)
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
+
+
+      validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"
+
+      headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
+      logger.info "Headers #{headers}"
+
+      @all_tweets = []
+      @mutex = Mutex.new
+      @stop_requested = false
+
+      if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
+        Parallel.each(queries, in_threads: threads) do |query|
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          raise Parallel::Break if stop_requested?
+        end
+      else
+        queries.each do |query|
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
+          break if stop_requested?
+        end
+      end
+
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+    end
+
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    end
+
+    def user_timeline(screen_name, limit: 100, order: 'desc')
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
+    end
+  end
+end
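
Query is mixed into the gem's Client (data/lib/twitterscraper/client.rb, +18 lines in the file list above but not shown in this excerpt), which the CLI constructs as Twitterscraper::Client.new(cache:, proxy:). Assuming that wiring, a hedged end-to-end sketch:

require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: false)

# build_queries turns the date range into one "since:/until:" query per day,
# and main_loop fans those out across up to :threads workers.
tweets = client.query_tweets('ruby',
                             type: 'search',
                             start_date: '2020-07-01',
                             end_date: '2020-07-10',
                             lang: 'ja',
                             limit: 500,
                             threads: 5)

tweets.each { |t| puts "#{t.created_at} #{t.tweet_id}" }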