scraper_clients 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ require "http"
2
+ require "nokogiri"
3
+
4
module Clients
  class HttpClient
    # Decorates a response object with status predicates and body
    # converters. Any method not defined here is forwarded to the wrapped
    # object by SimpleDelegator.
    class Response < SimpleDelegator
      alias_method :object, :__getobj__

      # Target encoding used by #to_s when +force_utf8+ is requested.
      DEFAULT_ENCODING = Encoding::UTF_8

      # @return [Boolean] whatever `status.success?` of the wrapped response returns
      def success?
        object.status.success?
      end

      # @return [Boolean] negation of #success?
      def fail?
        !success?
      end

      # Returns the response body as a String.
      #
      # @param force_utf8 [Boolean] when true the result is valid UTF-8:
      #   a body with a declared charset is transcoded, a body without one
      #   is re-tagged as UTF-8; every byte that cannot be represented
      #   becomes "_".
      # @return [String]
      def to_s(force_utf8: false)
        body = object.to_s
        return body unless force_utf8

        if object.charset
          # Replace while transcoding: a plain `encode` raises on invalid or
          # undefined bytes, so a later `scrub` would never get to run.
          body
            .encode(DEFAULT_ENCODING, invalid: :replace, undef: :replace, replace: "_")
            .scrub("_")
        else
          # `force_encoding` mutates its receiver; work on a copy so the
          # string handed out by the wrapped response keeps its encoding.
          body
            .dup
            .force_encoding(DEFAULT_ENCODING)
            .scrub("_")
        end
      end

      # Parses the body with Nokogiri::HTML. Keyword arguments go to #to_s.
      def to_html(**kargs)
        Nokogiri::HTML.parse to_s(**kargs)
      end

      # Parses the body with Nokogiri::XML. Keyword arguments go to #to_s.
      def to_xml(**kargs)
        Nokogiri::XML.parse to_s(**kargs)
      end

      # Parses the body as JSON with symbolized keys. Keyword arguments go
      # to #to_s.
      #
      # @raise [JSON::ParserError] when the body is not valid JSON
      def to_json(**kargs)
        JSON.parse to_s(**kargs), symbolize_names: true
      end

      # @return [StringIO] the (non-forced) body wrapped in an IO-like object
      def to_io
        StringIO.new(to_s)
      end

      # Yields the body chunk by chunk, reading at most +size+ bytes at a
      # time, until `readpartial` returns a falsy value.
      #
      # @param size [Integer] maximum chunk size passed to `readpartial`
      # @yieldparam chunk [String]
      def stream(size = HTTP::Connection::BUFFER_SIZE)
        while (chunk = object.body.readpartial(size))
          yield chunk
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require "http"
2
+ require "clients/errors"
3
+
4
module Clients
  # Picks a random active proxy of the requested IP version from the
  # proxy6 "getproxy" endpoint and exposes its connection details.
  #
  # The proxy list is requested once in the constructor and again on
  # every #reset!.
  class Proxy6Client
    API_URL = "https://proxy6.net/api".freeze

    attr_reader :ip_version

    # @param api_url [String] base URL of the API
    # @param api_key [String] API key, read from ENV["PROXY6_KEY"] by default
    # @param ip_version [#to_s] value compared with each proxy's "version" field
    # @raise [ProxyClientError] when no usable proxy list is returned
    def initialize(
      api_url: API_URL,
      api_key: ENV["PROXY6_KEY"],
      ip_version: "4"
    )
      @api_url = api_url
      @api_key = api_key
      @ip_version = ip_version.to_s
      @proxy = fetch_proxy
    end

    def host
      @proxy["host"]
    end

    # @return [Integer]
    def port
      @proxy["port"].to_i
    end

    def user
      @proxy["user"]
    end

    def password
      @proxy["pass"]
    end

    # @return [String] "host:port:user:password", skipping nil parts
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Fetches the list again and selects a (possibly different) proxy.
    def reset!
      @proxy = fetch_proxy
    end

    private

    # Requests the active proxies and samples one matching @ip_version.
    # Every failure mode (non-2xx status, unparsable body, unexpected JSON
    # shape, no matching proxy) is reported as ProxyClientError.
    def fetch_proxy
      response = HTTP.accept(:json).get(api_url, params: { state: "active" })
      fail_on_invalid_list(response) unless response.status.success?

      proxies = parse_list(response)
      # Only a Hash can be queried with `.values`; anything else (missing
      # key, Array, scalar) is an invalid list.
      fail_on_invalid_list(response) unless proxies.is_a?(Hash)

      candidates = proxies.values.select { |proxy| proxy["version"].to_s == @ip_version }
      fail_on_invalid_list(response) if candidates.empty?

      candidates.sample
    end

    # Returns the value under "list" of the JSON body, or nil when the body
    # is not JSON or is not a JSON object.
    def parse_list(response)
      json = JSON.parse(response.to_s)
      json["list"] if json.is_a?(Hash)
    rescue JSON::ParserError
      nil
    end

    def api_url
      [@api_url, @api_key, "getproxy/"].join("/")
    end

    def fail_on_invalid_list(response)
      fail ProxyClientError, "Invalid proxy list: #{response.status} #{response}"
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require "clients/errors"
2
+ require "clients/proxy6_client"
3
+ require "clients/proxy_list_client"
4
+
5
module Clients
  # Factory choosing a proxy client implementation from the environment.
  class ProxyClient
    # Builds the client named by ENV["CLIENTS_PROXY_CLIENT"].
    #
    # @return [Proxy6Client, ProxyListClient, nil] nil when the variable is
    #   unset or holds any value other than "proxy6" or "list"
    def self.from_env
      kind = ENV["CLIENTS_PROXY_CLIENT"]

      if kind == "proxy6"
        Proxy6Client.new
      elsif kind == "list"
        ProxyListClient.new
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module Clients
  # Picks a random proxy from a text file holding one
  # "host:port[:user[:password]]" entry per line.
  #
  # NOTE(review): this file references HTTP without requiring "http"; it
  # presumably relies on the gem being loaded elsewhere — confirm.
  class ProxyListClient
    DEFAULT_LIST_PATH = "/tmp/clients_proxy_list.txt".freeze

    attr_reader :host, :user, :password

    # Downloads +list_url+ and stores the body at +list_path+, overwriting
    # any existing file.
    #
    # @raise [RuntimeError] when the response status is not successful
    def self.cache_list(list_url, list_path = DEFAULT_LIST_PATH)
      response = HTTP.get(list_url)
      fail "Invalid list response: #{response.status}" unless response.status.success?
      File.open(list_path, "w") { |f| f << response.to_s }
    end

    # @param path [String] location of the proxy list file
    # @raise [RuntimeError] when the file holds no proxy entries
    def initialize(path = DEFAULT_LIST_PATH)
      @path = path
      select_proxy_from_list
    end

    # @return [Integer]
    def port
      @port.to_i
    end

    # @return [String] "host:port:user:password", skipping nil parts
    def to_s
      [host, port, user, password].compact.join(":")
    end

    # Re-reads the file and selects a (possibly different) proxy.
    def reset!
      select_proxy_from_list
    end

    private

    # Samples one non-blank line and splits it into its fields. Blank lines
    # (e.g. a trailing newline at the end of the file) are ignored so they
    # can never be selected as a proxy.
    def select_proxy_from_list
      entries = File.readlines(@path).map(&:strip).reject(&:empty?)
      fail "Empty proxy list: #{@path}" if entries.empty?

      @host, @port, @user, @password = entries.sample.split(":").map(&:strip)
    end
  end
end
@@ -0,0 +1,48 @@
1
+ require "clients/http_client"
2
+ require "clients/recaptcha/solver"
3
+
4
module Clients
  module Recaptcha
    # HttpClient whose #get detects a reCAPTCHA block page (status 403 with
    # a "g-recaptcha-response" marker in the body) and retries the request
    # with cookies obtained from the solver.
    class Client < Clients::HttpClient
      attr_writer :solver

      # Performs the parent's GET; when the answer is a captcha page the
      # bypass flow runs and its response is returned instead.
      def get(url, **options, &block)
        first_response = super
        return first_response unless captcha_protected?(first_response)

        bypass_captcha(url, first_response)
      end

      # Plain GET through #request, with no captcha detection. The solver
      # uses it so its own requests cannot re-enter the bypass flow.
      def get_without_bypass(url, **options, &block)
        request(:get, url, **options, &block)
      end

      # @return [Solver] the injected solver, or a lazily built default
      def solver
        @solver ||= Solver.new(self)
      end

      private

      def captcha_protected?(response)
        return false unless response.status == 403

        response.to_s.include?("g-recaptcha-response")
      end

      # Flow: when cookies are already stored, drop them and request the
      # page again; hand the captcha page to the solver; store the cookies
      # it returns; repeat the request without following redirects; finally
      # clear the cookies again.
      def bypass_captcha(url, response)
        if has_cookies?
          reset_cookies
          response = get_without_bypass(url)
        end

        fail "captcha with empty cookie" if response.cookies.empty?

        store_cookies solver.solve(url, response)

        get_without_bypass(url, follow_redirects: false).tap { reset_cookies }
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Clients
  module Recaptcha
    # Wraps a solver-service reply whose text is expected to look like
    # "OK<separator><payload>".
    class Response < SimpleDelegator
      alias_method :object, :__getobj__

      # @return [Boolean] true when the text begins with "OK"
      def success?
        to_s.start_with?("OK")
      end

      # @return [String, nil] everything from the fourth character on; nil
      #   when the text is shorter than three characters
      def data
        to_s.slice(3..-1)
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
+ require "clients/errors"
2
+ require "clients/recaptcha/response"
3
+
4
module Clients
  module Recaptcha
    # Solves a reCAPTCHA challenge through the 2captcha HTTP API:
    # submits the site key, polls for the token, then sends the token to
    # the form action found in the blocked page.
    #
    # NOTE(review): both endpoints use plain http, so the API key placed in
    # the query string travels unencrypted — consider https if the service
    # supports it.
    class Solver
      INIT_URL = Addressable::URI.parse("http://2captcha.com/in.php").freeze
      SOLVE_STATUS_URL = Addressable::URI.parse("http://2captcha.com/res.php").freeze

      # Maximum number of status polls before giving up.
      MAX_TRIES = 10
      # Prefix of replies that report a permanent failure rather than
      # "not ready yet" (see the 2captcha API error list).
      ERROR_PREFIX = "ERROR".freeze

      attr_reader :client, :captcha_key, :sleep_duration

      # @param client [#get_without_bypass] HTTP client used for all requests
      # @param captcha_key [String] API key, ENV["CAPTCHA_SOLVER_KEY"] by default
      # @param sleep_duration [Numeric] seconds to wait before each status poll
      def initialize(client, captcha_key: ENV["CAPTCHA_SOLVER_KEY"], sleep_duration: 5)
        @client = client
        @captcha_key = captcha_key
        @sleep_duration = sleep_duration
      end

      # @param banned_url [String] URL that answered with the captcha page
      # @param response [#to_s] the captcha page itself
      # @return the cookies of the response to the solved-captcha request
      # @raise [RecaptchaError] when the page lacks a site key or form
      #   action, the service reports an error, polling times out, or the
      #   final request is answered with a redirect
      def solve(banned_url, response)
        site_key = find_site_key(response)
        solved_path = find_solve_url(response)

        id = init_solver banned_url, site_key
        token = get_solution id

        solved_response = solve_captcha banned_url, solved_path, token
        solved_response.cookies
      end

      private

      def init_url(banned_url, site_key)
        url = INIT_URL.dup
        url.query_values = {
          key: captcha_key,
          method: "userrecaptcha",
          googlekey: site_key,
          url: banned_url
        }
        url
      end

      def status_url(id, tries = 0)
        url = SOLVE_STATUS_URL.dup
        url.query_values = {
          key: captcha_key,
          action: "get",
          id: id,
          try: tries
        }
        url
      end

      def solved_url(banned_url, solved_path, token)
        url = Addressable::URI.parse(banned_url)
        url.path = solved_path
        url.query_values = { "g-recaptcha-response": token }
        url
      end

      def find_site_key(response)
        key = response.to_s.match(/data-sitekey=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty sitekey in recaptcha form" unless key
        key
      end

      def find_solve_url(response)
        url = response.to_s.match(/action=\"(.+?)\"/) { |m| m[1] }
        fail RecaptchaError, "Empty action in recaptcha form" unless url
        url
      end

      # Registers the captcha with the service and returns the task id.
      def init_solver(banned_url, site_key)
        url = init_url banned_url, site_key
        response = wrap_response client.get_without_bypass(url)
        fail RecaptchaError, response.to_s unless response.success?
        response.data
      end

      # Polls the status endpoint up to MAX_TRIES times, sleeping
      # +sleep_duration+ seconds before each poll, and returns the token.
      def get_solution(id)
        MAX_TRIES.times do |attempt|
          sleep sleep_duration
          response = check_status id, attempt
          return response.data if response
        end

        fail RecaptchaError, "Solve timeout after #{MAX_TRIES} tries"
      end

      # Returns the wrapped reply when it is a success, nil when the
      # solution is still pending.
      #
      # @raise [RecaptchaError] when the service reports an error; polling
      #   again could not succeed, so the failure is surfaced at once
      def check_status(id, tries)
        url = status_url id, tries
        response = wrap_response client.get_without_bypass(url)
        return response if response.success?

        fail RecaptchaError, response.to_s if response.to_s.start_with?(ERROR_PREFIX)
        nil
      end

      # Submits the token to the form action with the blocked URL as
      # referer; a redirect answer is treated as failure.
      def solve_captcha(banned_url, solved_path, token)
        url = solved_url banned_url, solved_path, token
        response = client.get_without_bypass(url, follow_redirects: false) do |request|
          request.headers(referer: banned_url)
        end

        fail RecaptchaError, "Unable to solve recaptcha" if response.status.redirect?
        response
      end

      def wrap_response(response)
        Recaptcha::Response.new response
      end
    end
  end
end
@@ -0,0 +1,146 @@
1
+ require "net/telnet"
2
+ require "ostruct"
3
+
4
+ # Proxy to Tor
5
+ #
6
+ # From http://martincik.com/?p=402
7
module Clients
  # Describes a local HTTP proxy in front of Tor and can ask Tor, through
  # its control port, to switch to a new route. Switches are throttled to
  # at most one per `throttle_by` seconds.
  class TorClient
    DEFAULT_PORT = 9050
    DEFAULT_CONTROL_PORT = 9051
    DEFAULT_HTTP_PORT = 8080
    OK_STATUS = "250 OK\n".freeze

    # NOTE(review): this file requires only "net/telnet" and "ostruct";
    # Forwardable presumably gets loaded transitively — confirm, or add
    # `require "forwardable"` at the top of the file.
    extend Forwardable

    attr_reader :config, :threshold, :pool_num

    def_delegators :@config,
                   :tor_host, :tor_port

    # Recognised options (defaults in parentheses): :tor_host ("localhost"),
    # :tor_port (ENV["TOR_PORT"] or 9050), :control_port
    # (ENV["TOR_CONTROL_PORT"] or 9051), :host ("localhost"), :port
    # (ENV["HTTP_TOR_PORT"] or 8080), :circuit_timeout (10), :throttle_by
    # (10, seconds) and :pool_num (nil). When :pool_num is given, all three
    # ports are shifted by 2 * pool_num.
    #
    # rubocop:disable Metrics/MethodLength
    def initialize(options = {})
      options = {
        tor_host: "localhost",
        tor_port: (ENV["TOR_PORT"] || DEFAULT_PORT).to_i,
        control_port: (ENV["TOR_CONTROL_PORT"] || DEFAULT_CONTROL_PORT).to_i,
        host: "localhost",
        port: (ENV["HTTP_TOR_PORT"] || DEFAULT_HTTP_PORT).to_i,
        circuit_timeout: 10,
        throttle_by: 10, # .seconds implied
        pool_num: nil
      }.merge(options)

      @pool_num = options.delete(:pool_num)
      @config = OpenStruct.new options

      setup_pool
    end
    # rubocop:enable Metrics/MethodLength

    def host
      @config[:host]
    end

    def port
      @config[:port]
    end

    # Always nil; present so the class offers the same readers as the
    # other proxy clients.
    def user
      nil
    end

    def password
      nil
    end

    # @return [String] "host:port" of the HTTP proxy
    def to_s
      "#{host}:#{port}"
    end

    # Connects to the control port, authenticates and sends
    # "SIGNAL NEWNYM", subject to throttling.
    #
    # @raise [RuntimeError] when Tor answers either command with anything
    #   other than OK_STATUS
    def switch_identity
      throttle do
        client = Net::Telnet.new(
          "Host" => config.tor_host,
          "Port" => config.control_port,
          "Timeout" => config.circuit_timeout,
          "Prompt" => Regexp.new(OK_STATUS)
        )

        begin
          authenticate client
          new_route client
        ensure
          # Close even when a command fails, otherwise every rejected
          # switch leaks an open control connection.
          client.close
        end
      end
    end
    alias_method :reset!, :switch_identity

    private

    # Runs the block at once when no switch happened yet or the throttle
    # window has elapsed. Otherwise sleeps until the window ends and runs
    # the block only if this call was the one that set the "scheduled"
    # flag; a call that finds the flag already set just sleeps.
    def throttle
      scheduled = if check_threshold
        update_threshold
        :now
      else
        schedule_switch
      end

      if scheduled == :now
        yield
      else
        sleep until_next_switch_time

        if scheduled
          update_threshold
          yield
        end
      end
    end

    def authenticate(client)
      client.cmd("AUTHENTICATE") do |c|
        fail "cannot authenticate to Tor!" unless c == OK_STATUS
      end
    end

    def new_route(client)
      client.cmd("SIGNAL NEWNYM") do |c|
        fail "cannot switch Tor to new route!" unless c == OK_STATUS
      end
    end

    def setup_pool
      return unless pool_num

      config.tor_port += 2 * pool_num
      config.control_port += 2 * pool_num
      config.port += 2 * pool_num
    end

    # True when a switch may happen right now.
    def check_threshold
      !threshold || (next_switch_time < Time.now)
    end

    def next_switch_time
      threshold + config.throttle_by
    end

    # Seconds left in the throttle window, never negative.
    def until_next_switch_time
      diff = next_switch_time - Time.now
      diff < 0 ? 0 : diff
    end

    # Returns true for the first caller that schedules a switch, nil for
    # callers that find one already scheduled.
    def schedule_switch
      return if @scheduled
      @scheduled = true
    end

    def update_threshold
      @threshold = Time.now
      @scheduled = false
    end

    def reset_threshold
      @threshold = nil
    end
  end
end
+ end