scraper_clients 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +26 -0
- data/bin/pry +17 -0
- data/bin/rspec +17 -0
- data/data/user_agents.txt +204 -0
- data/lib/clients.rb +24 -0
- data/lib/clients/errors.rb +16 -0
- data/lib/clients/ftp_client.rb +17 -0
- data/lib/clients/http_client.rb +152 -0
- data/lib/clients/http_client/response.rb +57 -0
- data/lib/clients/proxy6_client.rb +70 -0
- data/lib/clients/proxy_client.rb +14 -0
- data/lib/clients/proxy_list_client.rb +38 -0
- data/lib/clients/recaptcha/client.rb +48 -0
- data/lib/clients/recaptcha/response.rb +15 -0
- data/lib/clients/recaptcha/solver.rb +115 -0
- data/lib/clients/tor_client.rb +146 -0
- data/lib/clients/url_decoder.rb +8 -0
- data/lib/clients/version.rb +3 -0
- data/spec/lib/clients/http_client/response_spec.rb +197 -0
- data/spec/lib/clients/http_client_spec.rb +221 -0
- data/spec/lib/clients/tor_client_spec.rb +34 -0
- data/spec/spec_helper.rb +66 -0
- metadata +168 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
require "http"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
module Clients
  class HttpClient
    # Decorator around a response object. Every method not defined here is
    # delegated to the wrapped object (available as +object+), which must
    # respond to +status+, +charset+, +to_s+ and +body+ for the helpers below.
    class Response < SimpleDelegator
      alias_method :object, :__getobj__

      DEFAULT_ENCODING = Encoding::UTF_8

      # True when the wrapped response's status reports success.
      def success?
        object.status.success?
      end

      # Negation of #success?.
      def fail?
        !success?
      end

      # Returns the response body as a String.
      #
      # With +force_utf8: false+ (default) the wrapped object's string is
      # returned untouched. With +force_utf8: true+ the result is always a
      # valid UTF-8 string:
      #
      # * when the response declares a charset, the body is transcoded and
      #   bytes that are invalid or have no UTF-8 equivalent become "_"
      #   instead of raising an Encoding error;
      # * otherwise the bytes are re-labelled as UTF-8 on a copy, so the
      #   string owned by the wrapped response is never mutated.
      def to_s(force_utf8: false)
        body = object.to_s
        return body unless force_utf8

        utf8 =
          if object.charset
            body.encode(DEFAULT_ENCODING, invalid: :replace, undef: :replace, replace: "_")
          else
            body.dup.force_encoding(DEFAULT_ENCODING)
          end

        # Covers the cases transcoding leaves alone (e.g. a body already
        # labelled UTF-8 that still contains broken byte sequences).
        utf8.scrub("_")
      end

      # Parses the body as an HTML document; keyword arguments go to #to_s.
      def to_html(**kargs)
        Nokogiri::HTML.parse to_s(**kargs)
      end

      # Parses the body as an XML document; keyword arguments go to #to_s.
      def to_xml(**kargs)
        Nokogiri::XML.parse to_s(**kargs)
      end

      # Parses the body as JSON with symbolized keys; keyword arguments go to
      # #to_s. Note that this *parses* the body, it does not serialize the
      # response object.
      def to_json(**kargs)
        JSON.parse to_s(**kargs), symbolize_names: true
      end

      # Wraps the (non-forced) body in a StringIO.
      def to_io
        StringIO.new(to_s)
      end

      # Yields the body chunk by chunk, reading at most +size+ bytes at a
      # time until +readpartial+ returns a falsy value. Returns an Enumerator
      # over the chunks when no block is given.
      def stream(size = HTTP::Connection::BUFFER_SIZE)
        return enum_for(:stream, size) unless block_given?

        while (chunk = object.body.readpartial(size))
          yield chunk
        end
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require "http"
|
2
|
+
require "clients/errors"
|
3
|
+
|
4
|
+
module Clients
  # Picks one random proxy of the requested IP version from the proxy list
  # returned by the proxy6 "getproxy" endpoint and exposes its connection
  # details (host, port, user, password).
  class Proxy6Client
    API_URL = "https://proxy6.net/api".freeze

    attr_reader :ip_version

    # api_url    - base URL of the API.
    # api_key    - API key, inserted into the request path.
    # ip_version - proxy version to select; compared as a String against the
    #              "version" field of each list entry.
    #
    # Fetches a proxy immediately; raises ProxyClientError when none can be
    # obtained.
    def initialize(
      api_url: API_URL,
      api_key: ENV["PROXY6_KEY"],
      ip_version: "4"
    )
      @api_url = api_url
      @api_key = api_key
      @ip_version = ip_version.to_s
      @proxy = fetch_proxy
    end

    def host
      @proxy["host"]
    end

    def port
      @proxy["port"].to_i
    end

    def user
      @proxy["user"]
    end

    def password
      @proxy["pass"]
    end

    # "host:port:user:password", skipping nil parts.
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Replaces the current proxy with a freshly fetched random one.
    def reset!
      @proxy = fetch_proxy
    end

    private

    # Requests the active proxies and returns one random entry matching
    # @ip_version. Any unusable answer (non-success status, unparsable body,
    # missing or non-Hash "list", no entry of the requested version) raises
    # ProxyClientError.
    def fetch_proxy
      response = HTTP.accept(:json).get(api_url, params: { state: "active" })
      fail_on_invalid_list(response) unless response.status.success?

      proxies = parse_list(response)
      # Entries are read through Hash#values, so anything that is not a Hash
      # (the original code explicitly rejected an Array here; presumably that
      # is how the API reports "no proxies" - TODO confirm) is invalid.
      fail_on_invalid_list(response) unless proxies.is_a?(Hash)

      candidates = proxies.values.select { |h| h["version"] == @ip_version }
      fail_on_invalid_list(response) if candidates.empty?

      candidates.sample
    end

    # Returns the "list" member of the JSON body, or nil when the body is not
    # valid JSON or its root is not an object.
    def parse_list(response)
      json = JSON.parse(response.to_s)
      json["list"] if json.is_a?(Hash)
    rescue JSON::ParserError
      nil
    end

    # Full request URL: <base>/<key>/getproxy/
    def api_url
      [@api_url, @api_key, "getproxy/"].join("/")
    end

    def fail_on_invalid_list(response)
      fail ProxyClientError, "Invalid proxy list: #{response.status} #{response.to_s}"
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require "clients/errors"
|
2
|
+
require "clients/proxy6_client"
|
3
|
+
require "clients/proxy_list_client"
|
4
|
+
|
5
|
+
module Clients
  # Factory for the proxy client selected through the environment.
  class ProxyClient
    # Builds a proxy client according to ENV["CLIENTS_PROXY_CLIENT"]:
    # "proxy6" gives a Proxy6Client, "list" gives a ProxyListClient.
    # Returns nil for any other value, including an unset variable.
    def self.from_env
      kind = ENV["CLIENTS_PROXY_CLIENT"]

      if kind == "proxy6"
        Proxy6Client.new
      elsif kind == "list"
        ProxyListClient.new
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Clients
  # Picks a random proxy from a local text file holding one proxy per line in
  # the form "host:port[:user[:password]]".
  class ProxyListClient
    DEFAULT_LIST_PATH = "/tmp/clients_proxy_list.txt".freeze

    attr_reader :host, :user, :password

    # Downloads +list_url+ and writes the response body to +list_path+.
    # Raises RuntimeError when the response status is not a success.
    def self.cache_list(list_url, list_path = DEFAULT_LIST_PATH)
      response = HTTP.get(list_url)
      fail "Invalid list response: #{response.status}" unless response.status.success?
      File.open(list_path, "w") { |f| f << response.to_s }
    end

    # path - file to read proxies from. A proxy is selected immediately;
    # raises RuntimeError when the file contains no proxy entries.
    def initialize(path = DEFAULT_LIST_PATH)
      @path = path
      select_proxy_from_list
    end

    # Port of the selected proxy as an Integer.
    def port
      @port.to_i
    end

    # "host:port:user:password", skipping nil parts.
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Re-reads the list and selects a (possibly different) random proxy.
    def reset!
      select_proxy_from_list
    end

    private

    # Reads the list file and stores the parts of one random entry.
    # Blank lines (e.g. a trailing newline or spacing between entries) are
    # skipped so they can never be selected as a "proxy".
    def select_proxy_from_list
      entries = File.readlines(@path).map(&:strip).reject(&:empty?)
      fail "Empty proxy list: #{@path}" if entries.empty?

      @host, @port, @user, @password = entries.sample.split(":").map(&:strip)
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require "clients/http_client"
|
2
|
+
require "clients/recaptcha/solver"
|
3
|
+
|
4
|
+
module Clients
  module Recaptcha
    # HttpClient variant whose #get detects a reCAPTCHA challenge page and
    # tries to get past it with the help of a Solver before returning.
    #
    # Relies on has_cookies?, reset_cookies, store_cookies and request, which
    # are not defined in this class (presumably inherited from HttpClient).
    class Client < Clients::HttpClient
      attr_writer :solver

      # Performs the GET through the superclass and, when the answer looks
      # like a captcha challenge, returns the response obtained after solving
      # it instead.
      def get(url, **options, &block)
        response = super
        return response unless captcha_protected?(response)

        bypass_captcha(url, response)
      end

      # Plain GET that never triggers captcha handling.
      def get_without_bypass(url, **options, &block)
        request :get, url, **options, &block
      end

      # Lazily built solver bound to this client; replaceable via #solver=.
      def solver
        @solver ||= Solver.new(self)
      end

      private

      # A challenge is a 403 whose body mentions the reCAPTCHA response field.
      def captcha_protected?(response)
        return false unless response.status == 403

        response.to_s.include?("g-recaptcha-response")
      end

      # Solves the challenge found in +response+ and returns the response of
      # a follow-up request for +url+ made with the solved cookies.
      def bypass_captcha(url, response)
        challenge = response

        # Start from an empty cookie store: drop existing cookies and fetch
        # the challenge page again.
        if has_cookies?
          reset_cookies
          challenge = get_without_bypass(url)
        end

        fail "captcha with empty cookie" if challenge.cookies.empty?

        store_cookies solver.solve(url, challenge)

        # The solved cookies are used for this single request only.
        get_without_bypass(url, follow_redirects: false).tap { reset_cookies }
      end
    end
  end
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require "clients/errors"
|
2
|
+
require "clients/recaptcha/response"
|
3
|
+
|
4
|
+
module Clients
  module Recaptcha
    # Drives an external captcha solving service for a reCAPTCHA challenge
    # page: submits the site key, polls for the token, then replays the token
    # against the challenge form's action and returns the resulting cookies.
    class Solver
      # NOTE(review): both endpoints are plain http and the captcha key is
      # sent in the query string - consider https if the service supports it.
      INIT_URL = Addressable::URI.parse("http://2captcha.com/in.php").freeze
      SOLVE_STATUS_URL = Addressable::URI.parse("http://2captcha.com/res.php").freeze

      # Maximum number of status polls before giving up.
      MAX_SOLVE_TRIES = 10

      attr_reader :client, :captcha_key, :sleep_duration

      # client         - object responding to get_without_bypass(url, **opts, &block).
      # captcha_key    - key sent to the solving service.
      # sleep_duration - seconds to sleep before each status poll.
      def initialize(client, captcha_key: ENV["CAPTCHA_SOLVER_KEY"], sleep_duration: 5)
        @client = client
        @captcha_key = captcha_key
        @sleep_duration = sleep_duration
      end

      # Solves the challenge contained in +response+ (the page served for
      # +banned_url+) and returns the cookies of the solved response.
      # Raises RecaptchaError when any step fails.
      def solve(banned_url, response)
        site_key = find_site_key(response)
        solved_path = find_solve_url(response)

        id = init_solver banned_url, site_key
        token = get_solution id

        solved_response = solve_captcha banned_url, solved_path, token
        solved_response.cookies
      end

      private

      # URL that registers a new solving task.
      def init_url(banned_url, site_key)
        url = INIT_URL.dup
        url.query_values = {
          key: captcha_key,
          method: "userrecaptcha",
          googlekey: site_key,
          url: banned_url
        }
        url
      end

      # URL that asks for the state of task +id+.
      def status_url(id, tries = 0)
        url = SOLVE_STATUS_URL.dup
        url.query_values = {
          key: captcha_key,
          action: "get",
          id: id,
          try: tries
        }
        url
      end

      # +banned_url+ with its path replaced by the form action and the token
      # as the only query parameter.
      def solved_url(banned_url, solved_path, token)
        url = Addressable::URI.parse(banned_url)
        url.path = solved_path
        url.query_values = { "g-recaptcha-response": token }
        url
      end

      # Extracts the first data-sitekey attribute value from the page.
      def find_site_key(response)
        key = response.to_s.match(/data-sitekey=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty sitekey in recaptcha form" unless key
        key
      end

      # Extracts the first action attribute value from the page.
      def find_solve_url(response)
        url = response.to_s.match(/action=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty action in recaptcha form" unless url
        url
      end

      # Registers the task and returns the data of the service's answer
      # (used by #solve as the task id).
      def init_solver(banned_url, site_key)
        url = init_url banned_url, site_key
        response = wrap_response client.get_without_bypass(url)
        fail RecaptchaError, response.to_s unless response.success?
        response.data
      end

      # Polls the service, sleeping +sleep_duration+ before every poll, until
      # it reports success, and returns that answer's data. Gives up with
      # RecaptchaError after exactly MAX_SOLVE_TRIES polls, so the number of
      # attempts matches the number named in the error message.
      def get_solution(id)
        MAX_SOLVE_TRIES.times do |attempt|
          sleep sleep_duration
          solution = check_status(id, attempt)
          return solution.data if solution
        end

        fail RecaptchaError, "Solve timeout after #{MAX_SOLVE_TRIES} tries"
      end

      # Returns the wrapped status response when it is successful, else nil.
      def check_status(id, tries)
        url = status_url id, tries
        response = wrap_response client.get_without_bypass(url)
        response if response.success?
      end

      # Submits the token to the challenge form without following redirects;
      # a redirect answer is treated as failure.
      def solve_captcha(banned_url, solved_path, token)
        url = solved_url banned_url, solved_path, token
        response = client.get_without_bypass(url, follow_redirects: false) do |request|
          request.headers(referer: banned_url)
        end

        fail RecaptchaError, "Unable to solve recaptcha" if response.status.redirect?
        response
      end

      def wrap_response(response)
        Recaptcha::Response.new response
      end
    end
  end
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require "forwardable"
require "net/telnet"
require "ostruct"
|
3
|
+
|
4
|
+
# Proxy to Tor
|
5
|
+
#
|
6
|
+
# From http://martincik.com/?p=402
|
7
|
+
module Clients
  # Describes a Tor-backed proxy endpoint (host/port) and can ask Tor for a
  # new route through its control port, at most once per throttle window.
  class TorClient
    DEFAULT_PORT = 9050
    DEFAULT_CONTROL_PORT = 9051
    DEFAULT_HTTP_PORT = 8080
    OK_STATUS = "250 OK\n".freeze

    extend Forwardable

    attr_reader :config, :threshold, :pool_num

    def_delegators :@config,
                   :tor_host, :tor_port

    # options - Hash overriding any of the defaults below. :pool_num (nil by
    # default) is removed from the config and, when set, shifts tor_port,
    # control_port and port by 2 * pool_num.
    #
    # rubocop:disable Metrics/MethodLength
    def initialize(options = {})
      options = {
        tor_host: "localhost",
        tor_port: (ENV["TOR_PORT"] || DEFAULT_PORT).to_i,
        control_port: (ENV["TOR_CONTROL_PORT"] || DEFAULT_CONTROL_PORT).to_i,
        host: "localhost",
        port: (ENV["HTTP_TOR_PORT"] || DEFAULT_HTTP_PORT).to_i,
        circuit_timeout: 10,
        throttle_by: 10, # .seconds implied
        pool_num: nil
      }.merge(options)

      @pool_num = options.delete(:pool_num)
      @config = OpenStruct.new options

      setup_pool
    end
    # rubocop:enable Metrics/MethodLength

    def host
      @config[:host]
    end

    def port
      @config[:port]
    end

    # This endpoint carries no credentials.
    def user
      nil
    end

    def password
      nil
    end

    def to_s
      "#{host}:#{port}"
    end

    # Connects to the control port, authenticates and signals NEWNYM, subject
    # to #throttle. The control connection is closed even when one of the
    # commands fails, so a failed switch does not leak a socket.
    # Raises RuntimeError when Tor does not answer "250 OK".
    def switch_identity
      throttle do
        client = Net::Telnet.new(
          "Host" => config.tor_host,
          "Port" => config.control_port,
          "Timeout" => config.circuit_timeout,
          "Prompt" => Regexp.new(OK_STATUS)
        )

        begin
          authenticate client
          new_route client
        ensure
          client.close
        end

        nil
      end
    end
    alias_method :reset!, :switch_identity

    private

    # Yields at once when no switch happened yet or the throttle window has
    # passed. Otherwise sleeps until the window ends and yields only if this
    # call was the one that set the scheduled flag; a call arriving while a
    # switch is already scheduled just sleeps and returns without yielding.
    def throttle
      scheduled = if check_threshold
                    update_threshold
                    :now
                  else
                    schedule_switch
                  end

      if scheduled == :now
        yield
      else
        sleep until_next_switch_time

        if scheduled
          update_threshold
          yield
        end
      end
    end

    def authenticate(client)
      client.cmd("AUTHENTICATE") do |c|
        fail "cannot authenticate to Tor!" unless c == OK_STATUS
      end
    end

    def new_route(client)
      client.cmd("SIGNAL NEWNYM") do |c|
        fail "cannot switch Tor to new route!" unless c == OK_STATUS
      end
    end

    # Offsets all three ports by 2 * pool_num when a pool number was given.
    def setup_pool
      return unless pool_num

      config.tor_port += 2 * pool_num
      config.control_port += 2 * pool_num
      config.port += 2 * pool_num
    end

    # True when a switch may run right now.
    def check_threshold
      !threshold || (next_switch_time < Time.now)
    end

    def next_switch_time
      threshold + config.throttle_by
    end

    # Seconds left in the current throttle window, never negative.
    def until_next_switch_time
      diff = next_switch_time - Time.now
      diff < 0 ? 0 : diff
    end

    # Returns true for the call that sets the scheduled flag, nil when a
    # switch was already scheduled.
    def schedule_switch
      return if @scheduled
      @scheduled = true
    end

    # Records "now" as the time of the last switch and clears the flag.
    def update_threshold
      @threshold = Time.now
      @scheduled = false
    end

    def reset_threshold
      @threshold = nil
    end
  end
end
|