scraper_clients 9.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,57 @@
1
+ require "http"
2
+ require "nokogiri"
3
+
4
module Clients
  class HttpClient
    # Decorates a response object with helpers for status checks, body
    # decoding, parsing and streaming. Every method not defined here is
    # delegated to the wrapped object (available as #object).
    class Response < SimpleDelegator
      alias_method :object, :__getobj__

      DEFAULT_ENCODING = Encoding::UTF_8

      # Placeholder substituted for bytes that cannot be represented in
      # DEFAULT_ENCODING.
      REPLACEMENT = "_".freeze

      # @return [Boolean] whether the wrapped response's status reports success
      def success?
        object.status.success?
      end

      # @return [Boolean] the negation of #success?
      def fail?
        !success?
      end

      # Returns the response body as a String.
      #
      # @param force_utf8 [Boolean] when true, the result is always valid
      #   UTF-8; offending bytes are replaced with REPLACEMENT
      # @return [String]
      def to_s(force_utf8: false)
        body = object.to_s
        return body unless force_utf8

        if object.charset
          # Replace while transcoding: a plain `encode` raises on the first
          # invalid or unmappable byte, before `scrub` ever gets to run.
          body
            .encode(DEFAULT_ENCODING, invalid: :replace, undef: :replace, replace: REPLACEMENT)
            .scrub(REPLACEMENT)
        else
          # Work on a copy: `force_encoding` mutates its receiver, which would
          # re-tag the wrapped body string (and raise if it is frozen).
          body
            .dup
            .force_encoding(DEFAULT_ENCODING)
            .scrub(REPLACEMENT)
        end
      end

      # Parses the body as HTML with Nokogiri. Accepts the same keywords as #to_s.
      def to_html(**kargs)
        Nokogiri::HTML.parse to_s(**kargs)
      end

      # Parses the body as XML with Nokogiri. Accepts the same keywords as #to_s.
      def to_xml(**kargs)
        Nokogiri::XML.parse to_s(**kargs)
      end

      # Parses the body as JSON with symbolized keys. Accepts the same keywords
      # as #to_s. Note that this *parses* the body; it does not serialize.
      def to_json(**kargs)
        JSON.parse to_s(**kargs), symbolize_names: true
      end

      # @return [StringIO] the raw body wrapped in an IO-like object
      def to_io
        StringIO.new(to_s)
      end

      # Yields the body chunk by chunk until `readpartial` returns nil/false.
      #
      # @param size [Integer] maximum chunk size handed to `readpartial`
      # @yieldparam chunk [String]
      # @return [Enumerator, nil] an Enumerator when no block is given
      def stream(size = HTTP::Connection::BUFFER_SIZE)
        return enum_for(:stream, size) unless block_given?

        while (chunk = object.body.readpartial(size))
          yield chunk
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require "http"
2
+ require "clients/errors"
3
+
4
module Clients
  # Proxy picked at random from the "list" returned by the `getproxy/`
  # endpoint under API_URL. The proxy is fetched on construction and can be
  # replaced with #reset!.
  class Proxy6Client
    API_URL = "https://proxy6.net/api".freeze

    attr_reader :ip_version

    # @param api_url [String] base URL of the API
    # @param api_key [String] account key, placed in the request path
    # @param ip_version [#to_s] only proxies whose "version" equals this are used
    # @raise [ProxyClientError] when no usable proxy can be fetched
    def initialize(
      api_url: API_URL,
      api_key: ENV["PROXY6_KEY"],
      ip_version: "4"
    )
      @api_url = api_url
      @api_key = api_key
      @ip_version = ip_version.to_s
      @proxy = fetch_proxy
    end

    def host
      @proxy["host"]
    end

    # @return [Integer]
    def port
      @proxy["port"].to_i
    end

    def user
      @proxy["user"]
    end

    def password
      @proxy["pass"]
    end

    # @return [String] "host:port:user:password", skipping nil parts
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Replaces the current proxy with a freshly fetched one.
    #
    # @raise [ProxyClientError] when no usable proxy can be fetched
    def reset!
      @proxy = fetch_proxy
    end

    private

    # Requests the active proxies and samples one matching #ip_version.
    # Every unusable answer (HTTP failure, unparsable body, missing or
    # non-object "list", no matching proxy) raises ProxyClientError.
    def fetch_proxy
      response = HTTP.accept(:json).get(api_url, params: { state: "active" })
      fail_on_invalid_list(response) unless response.status.success?

      proxies = parse_list(response)
      # An Array was already rejected before; nil and other non-objects used
      # to crash with NoMethodError on `.values` instead.
      fail_on_invalid_list(response) unless proxies.is_a?(Hash)

      candidates = proxies.values.select { |h| h["version"] == @ip_version }
      fail_on_invalid_list(response) if candidates.empty?

      candidates.sample
    end

    # Extracts the "list" entry from the response body; nil when the body is
    # not a JSON object or cannot be parsed at all.
    def parse_list(response)
      json = JSON.parse(response.to_s)
      json["list"] if json.is_a?(Hash)
    rescue JSON::ParserError
      nil
    end

    def api_url
      [@api_url, @api_key, "getproxy/"].join("/")
    end

    def fail_on_invalid_list(response)
      fail ProxyClientError, "Invalid proxy list: #{response.status} #{response}"
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require "clients/errors"
2
+ require "clients/proxy6_client"
3
+ require "clients/proxy_list_client"
4
+
5
module Clients
  # Factory choosing a proxy implementation from the environment.
  class ProxyClient
    # Builds the proxy client named by ENV["CLIENTS_PROXY_CLIENT"].
    #
    # @return [Proxy6Client, ProxyListClient, nil] nil when the variable is
    #   unset or holds any value other than "proxy6" or "list"
    def self.from_env
      kind = ENV["CLIENTS_PROXY_CLIENT"]

      return Proxy6Client.new if kind == "proxy6"
      return ProxyListClient.new if kind == "list"

      nil
    end
  end
end
@@ -0,0 +1,38 @@
1
module Clients
  # Proxy picked at random from a local text file holding one
  # "host:port[:user[:password]]" entry per line.
  class ProxyListClient
    DEFAULT_LIST_PATH = "/tmp/clients_proxy_list.txt".freeze

    attr_reader :host, :user, :password

    # Downloads +list_url+ and stores the body at +list_path+.
    #
    # @raise [RuntimeError] when the response status is not successful
    def self.cache_list(list_url, list_path = DEFAULT_LIST_PATH)
      response = HTTP.get(list_url)
      fail "Invalid list response: #{response.status}" unless response.status.success?
      File.open(list_path, "w") { |f| f << response.to_s }
    end

    # @param path [String] file to read proxies from
    # @raise [RuntimeError] when the file holds no proxy entry
    def initialize(path = DEFAULT_LIST_PATH)
      @path = path
      select_proxy_from_list
    end

    # @return [Integer]
    def port
      @port.to_i
    end

    # @return [String] "host:port:user:password", skipping nil parts
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Re-reads the list and picks another random entry.
    #
    # @raise [RuntimeError] when the file holds no proxy entry
    def reset!
      select_proxy_from_list
    end

    private

    def select_proxy_from_list
      # Drop blank lines (e.g. a trailing newline) so they are never sampled
      # as a proxy with an empty host and port 0.
      candidates = File.readlines(@path).map(&:strip).reject(&:empty?)
      fail "Empty proxy list: #{@path}" if candidates.empty?

      @host, @port, @user, @password = candidates.sample.split(":").map(&:strip)
    end
  end
end
@@ -0,0 +1,48 @@
1
+ require "clients/http_client"
2
+ require "clients/recaptcha/solver"
3
+
4
module Clients
  module Recaptcha
    # HttpClient whose GET requests detect a reCAPTCHA block page and retry
    # with cookies obtained from the solver.
    class Client < Clients::HttpClient
      attr_writer :solver

      # Performs the GET through the superclass; a captcha-protected answer is
      # replaced by the result of the bypass attempt.
      def get(url, **options, &block)
        response = super
        return response unless captcha_protected?(response)

        bypass_captcha(url, response)
      end

      # Plain GET that never attempts to solve a captcha.
      def get_without_bypass(url, **options, &block)
        request(:get, url, **options, &block)
      end

      # Solver in use; a default one bound to this client is built on first use.
      def solver
        @solver ||= Solver.new(self)
      end

      private

      # True for a 403 whose body mentions the "g-recaptcha-response" field.
      def captcha_protected?(response)
        return false unless response.status == 403

        response.to_s.include?("g-recaptcha-response")
      end

      # Fetches a challenge page that carries cookies, lets the solver turn it
      # into solved cookies, then repeats the request with them.
      def bypass_captcha(url, response)
        challenge = response

        # With cookies already stored, start over from a clean cookie state.
        if has_cookies?
          reset_cookies
          challenge = get_without_bypass(url)
        end

        raise "captcha with empty cookie" if challenge.cookies.empty?

        store_cookies solver.solve(url, challenge)

        # Cookies are cleared again once the solved request has been made.
        get_without_bypass(url, follow_redirects: false).tap { reset_cookies }
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Clients
  module Recaptcha
    # Wraps a solver reply whose text looks like "OK<separator><payload>".
    class Response < SimpleDelegator
      alias_method :object, :__getobj__

      # @return [Boolean] whether the reply text begins with "OK"
      def success?
        to_s.start_with?("OK")
      end

      # @return [String, nil] everything from the fourth character on; nil
      #   when the text is shorter than three characters
      def data
        text = to_s
        text.slice(3..-1)
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
+ require "clients/errors"
2
+ require "clients/recaptcha/response"
3
+
4
module Clients
  module Recaptcha
    # Solves a reCAPTCHA challenge through the 2captcha HTTP endpoints: submits
    # the site key, polls for the token, then submits the token to the form
    # action found on the blocked page.
    class Solver
      INIT_URL = Addressable::URI.parse("http://2captcha.com/in.php").freeze
      SOLVE_STATUS_URL = Addressable::URI.parse("http://2captcha.com/res.php").freeze

      # Upper bound on status polls before giving up.
      MAX_TRIES = 10

      attr_reader :client, :captcha_key, :sleep_duration

      # @param client [#get_without_bypass] client used for every request
      # @param captcha_key [String] key sent to the solving service
      # @param sleep_duration [Numeric] seconds to wait before each status poll
      def initialize(client, captcha_key: ENV["CAPTCHA_SOLVER_KEY"], sleep_duration: 5)
        @client = client
        @captcha_key = captcha_key
        @sleep_duration = sleep_duration
      end

      # Solves the captcha shown in +response+ for +banned_url+.
      #
      # @return the cookies of the response to the solved form submission
      # @raise [RecaptchaError] when the page lacks a site key or form action,
      #   the service reports an error, polling times out, or the submission
      #   is answered with a redirect
      def solve(banned_url, response)
        site_key = find_site_key(response)
        solved_path = find_solve_url(response)

        id = init_solver banned_url, site_key
        token = get_solution id

        solved_response = solve_captcha banned_url, solved_path, token
        solved_response.cookies
      end

      private

      def init_url(banned_url, site_key)
        url = INIT_URL.dup
        url.query_values = {
          key: captcha_key,
          method: "userrecaptcha",
          googlekey: site_key,
          url: banned_url
        }
        url
      end

      def status_url(id, tries = 0)
        url = SOLVE_STATUS_URL.dup
        url.query_values = {
          key: captcha_key,
          action: "get",
          id: id,
          try: tries
        }
        url
      end

      # The blocked URL with its path swapped for the form action and the
      # token as the only query parameter.
      def solved_url(banned_url, solved_path, token)
        url = Addressable::URI.parse(banned_url)
        url.path = solved_path
        url.query_values = { "g-recaptcha-response": token }
        url
      end

      def find_site_key(response)
        key = response.to_s.match(/data-sitekey=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty sitekey in recaptcha form" unless key
        key
      end

      def find_solve_url(response)
        url = response.to_s.match(/action=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty action in recaptcha form" unless url
        url
      end

      # Registers the captcha with the service and returns the id it assigned.
      def init_solver(banned_url, site_key)
        url = init_url banned_url, site_key
        response = wrap_response client.get_without_bypass(url)
        fail RecaptchaError, response.to_s unless response.success?
        response.data
      end

      # Polls the service, sleeping before each attempt, until it returns a
      # token. Makes exactly MAX_TRIES attempts (the previous `tries > 10`
      # guard allowed an eleventh).
      def get_solution(id)
        MAX_TRIES.times do |tries|
          sleep sleep_duration
          response = check_status id, tries
          return response.data if response
        end

        fail RecaptchaError, "Solve timeout after #{MAX_TRIES} tries"
      end

      # Returns the wrapped reply when the solution is ready, nil when it
      # should be polled again.
      def check_status(id, tries)
        url = status_url id, tries
        response = wrap_response client.get_without_bypass(url)
        return response if response.success?

        # An explicit error reply will not turn into a token by waiting, so
        # surface it now instead of polling until the timeout.
        fail RecaptchaError, response.to_s if response.to_s.start_with?("ERROR")
        nil
      end

      # Submits the token to the form action with the blocked URL as referer.
      def solve_captcha(banned_url, solved_path, token)
        url = solved_url banned_url, solved_path, token
        response = client.get_without_bypass(url, follow_redirects: false) do |request|
          request.headers(referer: banned_url)
        end

        fail RecaptchaError, "Unable to solve recaptcha" if response.status.redirect?
        response
      end

      def wrap_response(response)
        Recaptcha::Response.new response
      end
    end
  end
end
@@ -0,0 +1,146 @@
1
+ require "net/telnet"
2
+ require "ostruct"
3
+
4
+ # Proxy to Tor
5
+ #
6
+ # From http://martincik.com/?p=402
7
module Clients
  # Describes a local HTTP proxy in front of Tor and can ask Tor's control
  # port for a new route, at most once per `throttle_by` seconds.
  class TorClient
    DEFAULT_PORT = 9050
    DEFAULT_CONTROL_PORT = 9051
    DEFAULT_HTTP_PORT = 8080
    OK_STATUS = "250 OK\n".freeze

    extend Forwardable

    attr_reader :config, :threshold, :pool_num

    def_delegators :@config,
                   :tor_host, :tor_port

    # rubocop:disable Metrics/MethodLength

    # @param options [Hash] overrides for the defaults below. Ports default to
    #   TOR_PORT / TOR_CONTROL_PORT / HTTP_TOR_PORT from the environment.
    #   With :pool_num set, every port is shifted by 2 * pool_num.
    def initialize(options = {})
      options = {
        tor_host: "localhost",
        tor_port: (ENV["TOR_PORT"] || DEFAULT_PORT).to_i,
        control_port: (ENV["TOR_CONTROL_PORT"] || DEFAULT_CONTROL_PORT).to_i,
        host: "localhost",
        port: (ENV["HTTP_TOR_PORT"] || DEFAULT_HTTP_PORT).to_i,
        circuit_timeout: 10,
        throttle_by: 10, # .seconds implied
        pool_num: nil
      }.merge(options)

      @pool_num = options.delete(:pool_num)
      @config = OpenStruct.new options

      setup_pool
    end

    def host
      @config[:host]
    end

    def port
      @config[:port]
    end

    # Always nil; present so this class matches the other proxy clients.
    def user
      nil
    end

    # Always nil; present so this class matches the other proxy clients.
    def password
      nil
    end

    # @return [String] "host:port"
    def to_s
      "#{host}:#{port}"
    end

    # Connects to the control port, authenticates and sends SIGNAL NEWNYM,
    # subject to throttling.
    #
    # @return [nil]
    # @raise [RuntimeError] when Tor does not answer a command with OK_STATUS
    def switch_identity
      throttle do
        client = Net::Telnet.new(
          "Host" => config.tor_host,
          "Port" => config.control_port,
          "Timeout" => config.circuit_timeout,
          "Prompt" => Regexp.new(OK_STATUS)
        )

        begin
          authenticate client
          new_route client
        ensure
          # Close even when a command fails; otherwise the control connection
          # is leaked on every rejected request.
          client.close
        end

        nil
      end
    end
    alias_method :reset!, :switch_identity

    private

    # Runs the block right away when the throttle window has passed.
    # Otherwise waits for the window to end and runs the block only if this
    # call was the one that scheduled the pending switch.
    def throttle
      scheduled = if check_threshold
                    update_threshold
                    :now
                  else
                    schedule_switch
                  end

      if scheduled == :now
        yield
      else
        sleep until_next_switch_time

        if scheduled
          update_threshold
          yield
        end
      end
    end

    def authenticate(client)
      client.cmd("AUTHENTICATE") do |c|
        fail "cannot authenticate to Tor!" unless c == OK_STATUS
      end
    end

    def new_route(client)
      client.cmd("SIGNAL NEWNYM") do |c|
        fail "cannot switch Tor to new route!" unless c == OK_STATUS
      end
    end

    # Shifts all three ports by 2 * pool_num when a pool number was given.
    def setup_pool
      return unless pool_num

      config.tor_port += 2 * pool_num
      config.control_port += 2 * pool_num
      config.port += 2 * pool_num
    end

    # True when no switch happened yet or the throttle window has elapsed.
    def check_threshold
      !threshold || (next_switch_time < Time.now)
    end

    def next_switch_time
      threshold + config.throttle_by
    end

    # Seconds left until the next switch is allowed, never negative.
    def until_next_switch_time
      diff = next_switch_time - Time.now
      diff < 0 ? 0 : diff
    end

    # Marks a switch as pending; returns true only for the call that set the mark.
    def schedule_switch
      return if @scheduled
      @scheduled = true
    end

    def update_threshold
      @threshold = Time.now
      @scheduled = false
    end

    def reset_threshold
      @threshold = nil
    end
  end
end