scraper_clients 9.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +26 -0
- data/bin/pry +17 -0
- data/bin/rspec +17 -0
- data/data/user_agents.txt +204 -0
- data/lib/clients.rb +24 -0
- data/lib/clients/errors.rb +16 -0
- data/lib/clients/ftp_client.rb +17 -0
- data/lib/clients/http_client.rb +152 -0
- data/lib/clients/http_client/response.rb +57 -0
- data/lib/clients/proxy6_client.rb +70 -0
- data/lib/clients/proxy_client.rb +14 -0
- data/lib/clients/proxy_list_client.rb +38 -0
- data/lib/clients/recaptcha/client.rb +48 -0
- data/lib/clients/recaptcha/response.rb +15 -0
- data/lib/clients/recaptcha/solver.rb +115 -0
- data/lib/clients/tor_client.rb +146 -0
- data/lib/clients/url_decoder.rb +8 -0
- data/lib/clients/version.rb +3 -0
- data/spec/lib/clients/http_client/response_spec.rb +197 -0
- data/spec/lib/clients/http_client_spec.rb +221 -0
- data/spec/lib/clients/tor_client_spec.rb +34 -0
- data/spec/spec_helper.rb +66 -0
- metadata +168 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
require "http"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
module Clients
  class HttpClient
    # Decorates an HTTP response object with status predicates plus body
    # conversion helpers (UTF-8 string, HTML, XML, JSON, IO, chunked stream).
    #
    # Any method not defined here is forwarded to the wrapped object by
    # SimpleDelegator.
    class Response < SimpleDelegator
      alias_method :object, :__getobj__

      DEFAULT_ENCODING = Encoding::UTF_8

      # Substituted for every byte sequence that cannot be represented in
      # DEFAULT_ENCODING.
      REPLACEMENT = "_".freeze

      # @return [Boolean] result of `status.success?` on the wrapped response
      def success?
        object.status.success?
      end

      # @return [Boolean] the negation of #success?
      def fail?
        !success?
      end

      # Returns the response body as a string.
      #
      # @param force_utf8 [Boolean] when true the result is always valid
      #   UTF-8; bytes that cannot be converted are replaced with "_"
      # @return [String]
      def to_s(force_utf8: false)
        body = object.to_s
        return body unless force_utf8

        if object.charset
          # Transcode from the body's own encoding. The replacement options
          # keep #encode from raising on invalid or unmappable bytes, which
          # would otherwise happen before #scrub gets a chance to run.
          # #scrub stays because #encode may skip validation when source and
          # target encodings are already the same.
          body
            .encode(DEFAULT_ENCODING, invalid: :replace, undef: :replace, replace: REPLACEMENT)
            .scrub(REPLACEMENT)
        else
          # No declared charset: treat the raw bytes as UTF-8 and patch up
          # whatever is not valid.
          body
            .force_encoding(DEFAULT_ENCODING)
            .scrub(REPLACEMENT)
        end
      end

      # Parses the body as HTML with Nokogiri.
      #
      # @param kargs [Hash] forwarded to #to_s (e.g. `force_utf8: true`)
      def to_html(**kargs)
        Nokogiri::HTML.parse to_s(**kargs)
      end

      # Parses the body as XML with Nokogiri.
      #
      # @param kargs [Hash] forwarded to #to_s (e.g. `force_utf8: true`)
      def to_xml(**kargs)
        Nokogiri::XML.parse to_s(**kargs)
      end

      # Parses the body as JSON, returning symbolized keys.
      #
      # Note that this *parses* the body; it does not serialize the response.
      #
      # @param kargs [Hash] forwarded to #to_s (e.g. `force_utf8: true`)
      def to_json(**kargs)
        JSON.parse to_s(**kargs), symbolize_names: true
      end

      # @return [StringIO] an in-memory IO over the body string
      def to_io
        StringIO.new(to_s)
      end

      # Yields the body in chunks obtained from `body.readpartial(size)` until
      # that call returns a falsy value.
      #
      # @param size [Integer] chunk size passed to `readpartial`
      # @yieldparam chunk [String]
      # @return [Enumerator] when no block is given
      def stream(size = HTTP::Connection::BUFFER_SIZE)
        return enum_for(:stream, size) unless block_given?

        while (chunk = object.body.readpartial(size))
          yield chunk
        end
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require "http"
|
2
|
+
require "clients/errors"
|
3
|
+
|
4
|
+
module Clients
  # Fetches the list of active proxies from the proxy6 HTTP API and exposes
  # one randomly chosen proxy of the requested IP version.
  class Proxy6Client
    API_URL = "https://proxy6.net/api".freeze

    attr_reader :ip_version

    # Performs an API request immediately in order to select a proxy.
    #
    # @param api_url [String] base URL of the API
    # @param api_key [String] API key, defaults to ENV["PROXY6_KEY"]
    # @param ip_version [#to_s] compared against each proxy's "version" field
    # @raise [ProxyClientError] when no usable proxy can be obtained
    def initialize(
      api_url: API_URL,
      api_key: ENV["PROXY6_KEY"],
      ip_version: "4"
    )
      @api_url = api_url
      @api_key = api_key
      @ip_version = ip_version.to_s
      @proxy = fetch_proxy
    end

    def host
      @proxy["host"]
    end

    # @return [Integer]
    def port
      @proxy["port"].to_i
    end

    def user
      @proxy["user"]
    end

    def password
      @proxy["pass"]
    end

    # @return [String] "host:port:user:password", omitting nil parts
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Requests the list again and picks a new random proxy.
    #
    # @raise [ProxyClientError] when no usable proxy can be obtained
    def reset!
      @proxy = fetch_proxy
    end

    private

    # Downloads the proxy list and returns one random entry whose "version"
    # equals the configured IP version.
    #
    # Every failure mode (HTTP error, unparsable body, unexpected payload
    # shape, no matching proxy) is reported as ProxyClientError.
    def fetch_proxy
      response = HTTP.accept(:json).get(api_url, params: { state: "active" })
      fail_on_invalid_list(response) unless response.status.success?

      proxies = parse_list(response)
      # The list must be a JSON object. An Array was always rejected here;
      # rejecting every other non-Hash value as well (e.g. null) avoids a
      # NoMethodError on `values` below.
      fail_on_invalid_list(response) unless proxies.is_a?(Hash)

      candidates = proxies.values.select { |proxy| proxy["version"] == @ip_version }
      fail_on_invalid_list(response) if candidates.empty?

      candidates.sample
    end

    # Returns the value under "list" in the response body, or nil when the
    # body is not valid JSON or is not a JSON object.
    def parse_list(response)
      payload = JSON.parse(response.to_s)
      payload["list"] if payload.is_a?(Hash)
    rescue JSON::ParserError
      nil
    end

    # Full request URL: "<base>/<api key>/getproxy/".
    def api_url
      [@api_url, @api_key, "getproxy/"].join("/")
    end

    def fail_on_invalid_list(response)
      fail ProxyClientError, "Invalid proxy list: #{response.status} #{response}"
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require "clients/errors"
|
2
|
+
require "clients/proxy6_client"
|
3
|
+
require "clients/proxy_list_client"
|
4
|
+
|
5
|
+
module Clients
  # Factory that selects a proxy client implementation from the environment.
  class ProxyClient
    # Builds the client named by ENV["CLIENTS_PROXY_CLIENT"].
    #
    # @return [Proxy6Client] when the variable is "proxy6"
    # @return [ProxyListClient] when the variable is "list"
    # @return [nil] for any other value, including an unset variable
    def self.from_env
      kind = ENV["CLIENTS_PROXY_CLIENT"]

      return Proxy6Client.new if kind == "proxy6"
      return ProxyListClient.new if kind == "list"

      nil
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Clients
  # Picks a random proxy from a local text file containing one
  # "host:port[:user[:password]]" entry per line.
  class ProxyListClient
    DEFAULT_LIST_PATH = "/tmp/clients_proxy_list.txt".freeze

    attr_reader :host, :user, :password

    # Downloads a proxy list and stores the response body at `list_path`,
    # overwriting any existing file.
    #
    # @param list_url [String] URL to fetch with an HTTP GET
    # @param list_path [String] destination file
    # @raise [RuntimeError] when the response status is not successful
    def self.cache_list(list_url, list_path = DEFAULT_LIST_PATH)
      response = HTTP.get(list_url)
      fail "Invalid list response: #{response.status}" unless response.status.success?
      File.open(list_path, "w") { |f| f << response.to_s }
    end

    # Reads the list and selects a proxy immediately.
    #
    # @param path [String] proxy list file
    # @raise [RuntimeError] when the file contains no proxy entries
    def initialize(path = DEFAULT_LIST_PATH)
      @path = path
      select_proxy_from_list
    end

    # @return [Integer]
    def port
      @port.to_i
    end

    # @return [String] "host:port:user:password", omitting nil parts
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Re-reads the list file and picks a new random proxy.
    #
    # @raise [RuntimeError] when the file contains no proxy entries
    def reset!
      select_proxy_from_list
    end

    private

    # Loads the list, ignoring blank lines, and assigns the parts of one
    # random entry. An empty list is reported explicitly instead of failing
    # later with NoMethodError on nil.
    def select_proxy_from_list
      entries = File.readlines(@path).map(&:strip).reject(&:empty?)
      fail "Empty proxy list: #{@path}" if entries.empty?

      @host, @port, @user, @password = entries.sample.split(":").map(&:strip)
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require "clients/http_client"
|
2
|
+
require "clients/recaptcha/solver"
|
3
|
+
|
4
|
+
module Clients
  module Recaptcha
    # HttpClient variant whose #get detects a reCAPTCHA block page and asks a
    # Solver for cookies before repeating the request.
    class Client < Clients::HttpClient
      attr_writer :solver

      # Performs the inherited GET; when the response looks like a captcha
      # page, returns the response obtained after the bypass attempt instead.
      def get(url, **options, &block)
        response = super
        return response unless captcha_protected?(response)

        bypass_captcha(url, response)
      end

      # Plain GET through `request`, without captcha detection.
      def get_without_bypass(url, **options, &block)
        request :get, url, **options, &block
      end

      # @return [Solver] the assigned solver, or a lazily built default one
      def solver
        @solver ||= Solver.new(self)
      end

      private

      # A response counts as captcha-protected when its status is 403 and its
      # body mentions the "g-recaptcha-response" field.
      def captcha_protected?(response)
        return false unless response.status == 403

        response.to_s.include?("g-recaptcha-response")
      end

      # Obtains solved cookies for `url`, repeats the request with them
      # (redirects disabled) and clears the cookies afterwards.
      #
      # @raise [RuntimeError] when the challenge response carries no cookies
      def bypass_captcha(url, response)
        challenge = response

        # Start from a clean cookie state and fetch a fresh challenge page.
        if has_cookies?
          reset_cookies
          challenge = get_without_bypass(url)
        end

        fail "captcha with empty cookie" if challenge.cookies.empty?

        store_cookies solver.solve(url, challenge)

        get_without_bypass(url, follow_redirects: false).tap { reset_cookies }
      end
    end
  end
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require "clients/errors"
|
2
|
+
require "clients/recaptcha/response"
|
3
|
+
|
4
|
+
module Clients
  module Recaptcha
    # Solves a reCAPTCHA challenge page through the 2captcha.com HTTP API:
    # submits the site key, polls for the solution token, then submits the
    # token to the challenge form and returns the resulting cookies.
    class Solver
      INIT_URL = Addressable::URI.parse("http://2captcha.com/in.php").freeze
      SOLVE_STATUS_URL = Addressable::URI.parse("http://2captcha.com/res.php").freeze

      # Default number of status polls before giving up.
      DEFAULT_MAX_TRIES = 10

      attr_reader :client, :captcha_key, :sleep_duration, :max_tries

      # @param client [#get_without_bypass] HTTP client used for all requests
      # @param captcha_key [String] solver API key, defaults to
      #   ENV["CAPTCHA_SOLVER_KEY"]
      # @param sleep_duration [Numeric] seconds to sleep before each poll
      # @param max_tries [Integer] maximum number of status polls
      def initialize(
        client,
        captcha_key: ENV["CAPTCHA_SOLVER_KEY"],
        sleep_duration: 5,
        max_tries: DEFAULT_MAX_TRIES
      )
        @client = client
        @captcha_key = captcha_key
        @sleep_duration = sleep_duration
        @max_tries = max_tries
      end

      # @param banned_url [String] URL that returned the captcha page
      # @param response [#to_s] the captcha page response
      # @return the `cookies` of the response to the solved-form request
      # @raise [RecaptchaError] when the page cannot be parsed, the solver
      #   reports an error, polling times out, or the form submission
      #   redirects
      def solve(banned_url, response)
        site_key = find_site_key(response)
        solved_path = find_solve_url(response)

        id = init_solver banned_url, site_key
        token = get_solution id

        solved_response = solve_captcha banned_url, solved_path, token
        solved_response.cookies
      end

      private

      # URL that registers a new captcha task.
      def init_url(banned_url, site_key)
        url = INIT_URL.dup
        url.query_values = {
          key: captcha_key,
          method: "userrecaptcha",
          googlekey: site_key,
          url: banned_url
        }
        url
      end

      # URL that asks for the state of task `id`.
      def status_url(id, tries = 0)
        url = SOLVE_STATUS_URL.dup
        url.query_values = {
          key: captcha_key,
          action: "get",
          id: id,
          try: tries
        }
        url
      end

      # The banned URL with its path replaced by the form action and the
      # token as its only query parameter.
      def solved_url(banned_url, solved_path, token)
        url = Addressable::URI.parse(banned_url)
        url.path = solved_path
        url.query_values = { "g-recaptcha-response": token }
        url
      end

      # Extracts the first `data-sitekey="..."` attribute value from the page.
      def find_site_key(response)
        key = response.to_s.match(/data-sitekey=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty sitekey in recaptcha form" unless key
        key
      end

      # Extracts the first `action="..."` attribute value from the page.
      def find_solve_url(response)
        url = response.to_s.match(/action=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty action in recaptcha form" unless url
        url
      end

      # Registers the task and returns the wrapped response's `data`
      # (used by the caller as the task id).
      def init_solver(banned_url, site_key)
        url = init_url banned_url, site_key
        response = wrap_response client.get_without_bypass(url)
        fail RecaptchaError, response.to_s unless response.success?
        response.data
      end

      # Polls the task status up to `max_tries` times, sleeping
      # `sleep_duration` seconds before each poll, and returns the `data` of
      # the first successful status response.
      #
      # The limit is exact: the previous loop performed one more poll than
      # the number quoted in its timeout message.
      def get_solution(id)
        max_tries.times do |attempt|
          sleep sleep_duration
          response = check_status id, attempt
          return response.data if response
        end

        fail RecaptchaError, "Solve timeout after #{max_tries} tries"
      end

      # @return the wrapped status response when it reports success, else nil
      def check_status(id, tries)
        url = status_url id, tries
        response = wrap_response client.get_without_bypass(url)
        response if response.success?
      end

      # Submits the token to the challenge form, sending the banned URL as
      # referer and not following redirects; a redirect status is treated as
      # failure.
      def solve_captcha(banned_url, solved_path, token)
        url = solved_url banned_url, solved_path, token
        response = client.get_without_bypass(url, follow_redirects: false) do |request|
          request.headers(referer: banned_url)
        end

        fail RecaptchaError, "Unable to solve recaptcha" if response.status.redirect?
        response
      end

      def wrap_response(response)
        Recaptcha::Response.new response
      end
    end
  end
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require "net/telnet"
|
2
|
+
require "ostruct"
|
3
|
+
|
4
|
+
# Proxy to Tor
|
5
|
+
#
|
6
|
+
# From http://martincik.com/?p=402
|
7
|
+
module Clients
  # Describes a local HTTP proxy in front of Tor and can ask Tor, over its
  # control port, to switch to a new route.
  class TorClient
    DEFAULT_PORT = 9050
    DEFAULT_CONTROL_PORT = 9051
    DEFAULT_HTTP_PORT = 8080
    OK_STATUS = "250 OK\n".freeze

    extend Forwardable

    attr_reader :config, :threshold, :pool_num

    def_delegators :@config,
                   :tor_host, :tor_port

    # @param options [Hash] overrides for the defaults below. Ports default
    #   to ENV["TOR_PORT"], ENV["TOR_CONTROL_PORT"] and ENV["HTTP_TOR_PORT"]
    #   when set. When :pool_num is given, every port is shifted by
    #   `2 * pool_num`.
    # rubocop:disable Metrics/MethodLength
    def initialize(options = {})
      options = {
        tor_host: "localhost",
        tor_port: (ENV["TOR_PORT"] || DEFAULT_PORT).to_i,
        control_port: (ENV["TOR_CONTROL_PORT"] || DEFAULT_CONTROL_PORT).to_i,
        host: "localhost",
        port: (ENV["HTTP_TOR_PORT"] || DEFAULT_HTTP_PORT).to_i,
        circuit_timeout: 10,
        throttle_by: 10, # .seconds implied
        pool_num: nil
      }.merge(options)

      @pool_num = options.delete(:pool_num)
      @config = OpenStruct.new options

      setup_pool
    end
    # rubocop:enable Metrics/MethodLength

    def host
      @config[:host]
    end

    def port
      @config[:port]
    end

    # Always nil; present so this class offers the same readers as the other
    # proxy clients.
    def user
      nil
    end

    # Always nil; see #user.
    def password
      nil
    end

    # @return [String] "host:port" of the HTTP proxy
    def to_s
      "#{host}:#{port}"
    end

    # Connects to the Tor control port, authenticates and sends
    # SIGNAL NEWNYM, subject to throttling (see #throttle).
    #
    # The control connection is closed even when one of the commands fails,
    # so a rejected command no longer leaks the connection.
    #
    # @raise [RuntimeError] when Tor does not answer a command with OK_STATUS
    def switch_identity
      throttle do
        client = Net::Telnet.new(
          "Host" => config.tor_host,
          "Port" => config.control_port,
          "Timeout" => config.circuit_timeout,
          "Prompt" => Regexp.new(OK_STATUS)
        )

        begin
          authenticate client
          new_route client
        ensure
          client.close
        end
      end
    end
    alias_method :reset!, :switch_identity

    private

    # Runs the block at most once per `throttle_by` seconds.
    #
    # * No previous switch, or the window has elapsed: run immediately.
    # * Otherwise sleep until the window ends. The call that marked the
    #   switch as scheduled then runs the block; a call that found a switch
    #   already scheduled only sleeps and returns without yielding.
    def throttle
      scheduled = if check_threshold
                    update_threshold
                    :now
                  else
                    schedule_switch
                  end

      if scheduled == :now
        yield
      else
        sleep until_next_switch_time

        if scheduled
          update_threshold
          yield
        end
      end
    end

    def authenticate(client)
      client.cmd("AUTHENTICATE") do |c|
        fail "cannot authenticate to Tor!" unless c == OK_STATUS
      end
    end

    def new_route(client)
      client.cmd("SIGNAL NEWNYM") do |c|
        fail "cannot switch Tor to new route!" unless c == OK_STATUS
      end
    end

    # Shifts all three ports by `2 * pool_num` when a pool number is set.
    def setup_pool
      return unless pool_num

      config.tor_port += 2 * pool_num
      config.control_port += 2 * pool_num
      config.port += 2 * pool_num
    end

    # True when a switch may happen right now.
    def check_threshold
      !threshold || (next_switch_time < Time.now)
    end

    def next_switch_time
      threshold + config.throttle_by
    end

    # Seconds left until the next allowed switch, never negative.
    def until_next_switch_time
      diff = next_switch_time - Time.now
      diff < 0 ? 0 : diff
    end

    # @return [true, nil] true for the call that sets the flag, nil when a
    #   switch was already scheduled
    def schedule_switch
      return if @scheduled
      @scheduled = true
    end

    # Records the time of the switch and clears the scheduled flag.
    def update_threshold
      @threshold = Time.now
      @scheduled = false
    end

    def reset_threshold
      @threshold = nil
    end
  end
end
|