pincers 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pincers/chenso/backend.rb +162 -0
- data/lib/pincers/chenso/browsing_context.rb +76 -0
- data/lib/pincers/chenso/browsing_manager.rb +67 -0
- data/lib/pincers/chenso/factory.rb +27 -0
- data/lib/pincers/chenso/html_form_request.rb +7 -0
- data/lib/pincers/chenso/html_page_request.rb +27 -0
- data/lib/pincers/{backend/base.rb → core/base_backend.rb} +18 -8
- data/lib/pincers/{factories/base.rb → core/base_factory.rb} +2 -2
- data/lib/pincers/core/helpers/form.rb +106 -0
- data/lib/pincers/core/{query.rb → helpers/query.rb} +3 -4
- data/lib/pincers/core/replicas/form.rb +43 -0
- data/lib/pincers/core/replicas/link.rb +18 -0
- data/lib/pincers/core/root_context.rb +25 -6
- data/lib/pincers/core/search_context.rb +24 -3
- data/lib/pincers/errors.rb +4 -1
- data/lib/pincers/extension/labs.rb +3 -0
- data/lib/pincers/factory.rb +9 -4
- data/lib/pincers/http/base_document.rb +20 -0
- data/lib/pincers/http/client.rb +134 -0
- data/lib/pincers/{support → http}/cookie.rb +1 -1
- data/lib/pincers/{support → http}/cookie_jar.rb +4 -3
- data/lib/pincers/http/errors.rb +26 -0
- data/lib/pincers/http/request.rb +62 -0
- data/lib/pincers/http/response_document.rb +24 -0
- data/lib/pincers/http/session.rb +99 -0
- data/lib/pincers/http/utils.rb +43 -0
- data/lib/pincers/nokogiri/backend.rb +151 -0
- data/lib/pincers/{factories/nokogiri.rb → nokogiri/factory.rb} +5 -5
- data/lib/pincers/version.rb +1 -1
- data/lib/pincers/{backend/webdriver.rb → webdriver/backend.rb} +22 -31
- data/lib/pincers/{factories/webdriver.rb → webdriver/factory.rb} +5 -5
- data/lib/pincers/webdriver/http_document.rb +23 -0
- metadata +42 -13
- data/lib/pincers/backend/nokogiri.rb +0 -66
- data/lib/pincers/core/download.rb +0 -14
- data/lib/pincers/support/http_client.rb +0 -123
@@ -1,6 +1,7 @@
|
|
1
|
-
require
|
1
|
+
require 'pincers/http/cookie'
|
2
|
+
require 'pincers/http/utils'
|
2
3
|
|
3
|
-
module Pincers::
|
4
|
+
module Pincers::Http
|
4
5
|
class CookieJar
|
5
6
|
|
6
7
|
BAD_VALUE_CHARS = /([\x00-\x20\x7F",;\\])/ # RFC 6265 - 4.1.1
|
@@ -16,7 +17,7 @@ module Pincers::Support
|
|
16
17
|
end
|
17
18
|
|
18
19
|
def get(_url, _name)
|
19
|
-
for_origin(
|
20
|
+
for_origin(Utils.parse_uri(_url)).find { |c| c.name == _name }
|
20
21
|
end
|
21
22
|
|
22
23
|
def set(_parts)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Pincers::Http
|
2
|
+
# Error raised when an HTTP response is reported as a failure.
#
# The failed response is kept so callers can inspect it; its status code and
# body are also exposed directly through #code and #body.
class RequestError < StandardError
  # The response object this error was built from.
  attr_reader :response

  # _response - the failed response. It must respond to `message` (used as
  #             the exception message) and to `code` / `body` if those
  #             readers are going to be called.
  def initialize(_response)
    @response = _response
    super _response.message
  end

  # Status code of the wrapped response (explicit delegation, so this file
  # does not depend on `forwardable` having been required elsewhere).
  def code
    @response.code
  end

  # Body of the wrapped response.
  def body
    @response.body
  end
end
|
14
|
+
|
15
|
+
# Signals that the given encoding cannot be used for the attempted operation.
class EncodingNotSupported < StandardError
  # _encoding - the rejected encoding; it is interpolated into the message.
  def initialize(_encoding)
    description = "#{_encoding} is not supported by this operation"
    super(description)
  end
end
|
20
|
+
|
21
|
+
# Error with a fixed message, raised without arguments when too many
# redirections have been followed.
class MaximumRedirectsError < StandardError
  def initialize
    super('Redirection loop detected!')
  end
end
|
26
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Pincers::Http
|
2
|
+
# Describes one HTTP request: verb, target URI, headers and payload.
class Request

  attr_reader :method, :uri, :headers
  attr_accessor :data

  # _method - request verb as a symbol (see #native_type for the known ones).
  # _uri    - target of the request; it must respond to `to_s` and `query=`
  #           (presumably a URI object -- confirm against callers).
  def initialize(_method, _uri)
    @method = _method
    @uri = _uri
    @headers = {}
    @data = nil
  end

  # The target URI rendered as a string.
  def url
    @uri.to_s
  end

  # Net::HTTP request class matching the verb, or nil when the verb is not
  # one of :get, :post, :put or :delete.
  def native_type
    case @method
    when :get then Net::HTTP::Get
    when :post then Net::HTTP::Post
    when :put then Net::HTTP::Put
    when :delete then Net::HTTP::Delete
    end
  end

  # Replaces the query string of the target URI.
  #
  # _pairs - a Hash (flattened through Utils.hash_to_pairs) or a list of
  #          [name, value] pairs.
  def set_query(_pairs)
    _pairs = Utils.hash_to_pairs(_pairs) if _pairs.is_a? Hash
    @uri.query = Utils.encode_urlencoded(_pairs)
  end

  # Attaches form fields to the request.
  #
  # For :get requests the fields become the query string; for every other
  # verb they are encoded into the body and the Content-Type header is set.
  #
  # _pairs    - a Hash or a list of [name, value] pairs.
  # _encoding - optional encoding; it is only honoured when the fields do not
  #             force multipart (i.e. none of the values is an IO).
  #
  # Raises EncodingNotSupported for a :get request with a non url-encoded
  # encoding, and Pincers::MissingFeatureError for an unknown encoding.
  def set_form_data(_pairs, _encoding = nil)
    _pairs = Utils.hash_to_pairs(_pairs) if _pairs.is_a? Hash
    encoding = default_encoding_for(_pairs)
    encoding = _encoding if !_encoding.nil? && encoding == Utils::FORM_URLENCODED

    if method == :get
      raise EncodingNotSupported, encoding if encoding != Utils::FORM_URLENCODED
      set_query _pairs
    else
      # Encode first: if the encoding is rejected the request must be left
      # untouched instead of carrying a Content-Type without a matching body.
      payload = case encoding
      when Utils::FORM_URLENCODED
        Utils.encode_urlencoded _pairs
      when Utils::FORM_MULTIPART
        Utils.encode_multipart _pairs
      else
        raise Pincers::MissingFeatureError.new "form encoding: #{encoding}"
      end

      headers['Content-Type'] = encoding
      self.data = payload
    end
  end

  private

  # Multipart when any value is an IO, url-encoded otherwise.
  def default_encoding_for(_pairs)
    has_files = _pairs.any? { |p| p[1].is_a? IO }
    has_files ? Utils::FORM_MULTIPART : Utils::FORM_URLENCODED
  end

end
|
62
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'pincers/http/base_document'
|
2
|
+
|
3
|
+
module Pincers::Http
|
4
|
+
# Document view over an HTTP response object.
class ResponseDocument < BaseDocument

  # The wrapped response.
  attr_reader :response

  # _response - the response to expose; it must respond to `uri`, `body`
  #             and `[]` (header lookup).
  def initialize(_response)
    @response = _response
  end

  # URI reported by the wrapped response.
  def uri
    @response.uri
  end

  # Value of the Content-Type header, or 'text/plain' when it is absent.
  def content_type
    declared = @response['Content-Type']
    return 'text/plain' unless declared
    declared
  end

  # Body of the wrapped response.
  def content
    @response.body
  end
end
|
24
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'pincers/http/errors'
|
2
|
+
require 'pincers/http/cookie_jar'
|
3
|
+
|
4
|
+
module Pincers::Http
|
5
|
+
# Holds the state shared by a series of HTTP requests -- default headers,
# cookies, proxy settings and the redirect limit -- and performs requests
# through Net::HTTP, following redirections.
class Session

  # Headers every new session starts with. Each session works on its own
  # copy, so changing a session's headers never alters this constant.
  DEFAULT_HEADERS = {
    'Accept' => '*/*',
    'Cache-Control' => 'no-cache'
  }

  attr_reader :cookie_jar, :headers
  attr_accessor :proxy_addr, :proxy_port, :redirect_limit

  # _other - optional session to copy headers, cookies, proxy settings and
  #          redirect limit from. Without it the session starts with a copy
  #          of DEFAULT_HEADERS, an empty cookie jar and a limit of 10.
  def initialize(_other = nil)
    if _other
      @headers = _other.headers.clone
      @cookie_jar = _other.cookie_jar.copy
      @proxy_addr = _other.proxy_addr
      @proxy_port = _other.proxy_port
      @redirect_limit = _other.redirect_limit
    else
      # dup: handing out the constant itself would let one session's header
      # changes leak into every other session.
      @headers = DEFAULT_HEADERS.dup
      @cookie_jar = CookieJar.new
      @redirect_limit = 10
    end
  end

  # Sets proxy address and port from an "address:port" string; a falsy
  # value clears both.
  def proxy=(_value)
    if _value
      @proxy_addr, @proxy_port = _value.split ':'
    else
      @proxy_addr, @proxy_port = [nil, nil]
    end
  end

  # Returns a new session initialised from this one.
  def clone
    self.class.new self
  end

  # Performs the request, following redirections up to `redirect_limit`.
  #
  # _request - must respond to `uri`, `native_type`, `data` and `headers`.
  #
  # Returns the successful response. Raises MaximumRedirectsError when the
  # redirect limit is exhausted and RequestError for any other response.
  def perform(_request)
    perform_recursive _request, @redirect_limit, nil
  end

  private

  # Sends the request to `_redirect` (or to the request's own uri when nil)
  # and follows redirections while `_limit` allows it.
  # NOTE(review): redirected requests keep the original verb and body.
  def perform_recursive(_request, _limit, _redirect)
    raise MaximumRedirectsError.new if _limit == 0

    uri = _redirect || _request.uri
    path = uri.request_uri.empty? ? '/' : uri.request_uri

    http_request = _request.native_type.new path
    http_request.body = _request.data

    copy_headers http_request, @headers
    copy_headers http_request, _request.headers
    set_cookies http_request, uri

    http_response = connect(uri).request http_request

    case http_response
    when Net::HTTPSuccess
      update_cookies(uri, http_response)
      http_response.uri = uri # uri is not always set by net/http
      http_response
    when Net::HTTPRedirection
      location = http_response['location']
      # A redirection status without a Location cannot be followed.
      return handle_error_response(http_response) if location.nil?

      # Cookies sent along with the redirection must reach the next hop.
      update_cookies(uri, http_response)
      # merge resolves a relative Location against the current uri.
      perform_recursive(_request, _limit - 1, uri.merge(Utils.parse_uri(location)))
    else
      handle_error_response http_response
    end
  end

  # Builds a Net::HTTP connection to the uri's host, through the configured
  # proxy if there is one.
  def connect(_uri)
    conn = Net::HTTP.new _uri.host, _uri.port || 80, @proxy_addr, @proxy_port
    conn.use_ssl = true if _uri.scheme == 'https'
    # NOTE(review): VERIFY_NONE disables certificate verification for every
    # https connection. Kept to preserve behaviour; consider making it
    # configurable.
    conn.verify_mode = OpenSSL::SSL::VERIFY_NONE
    conn
  end

  def handle_error_response(_http_response)
    raise RequestError.new _http_response
  end

  def copy_headers(_request, _headers)
    _headers.each { |name, value| _request[name] = value }
  end

  def set_cookies(_request, _uri)
    _request['Cookie'] = @cookie_jar.for_origin_as_header _uri
  end

  # Stores every Set-Cookie header of the response in the cookie jar.
  def update_cookies(_uri, _response)
    cookies = _response.get_fields('set-cookie')
    cookies.each { |raw| @cookie_jar.set_raw _uri, raw } if cookies
  end
end
|
99
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Pincers::Http
|
2
|
+
# Stateless helpers for encoding form data and parsing urls.
module Utils
  extend self

  FORM_URLENCODED = 'application/x-www-form-urlencoded'
  FORM_MULTIPART = 'multipart/form-data'

  # Encodes fields as an application/x-www-form-urlencoded string.
  #
  # _pairs - a Hash (flattened with #hash_to_pairs) or a list of
  #          [name, value] pairs.
  #
  # Both names and values are converted to strings and escaped, so reserved
  # characters ('&', '=', spaces, ...) in a name cannot corrupt the result.
  def encode_urlencoded(_pairs)
    _pairs = hash_to_pairs _pairs if _pairs.is_a? Hash
    _pairs.map { |p| "#{CGI.escape(p[0].to_s)}=#{CGI.escape(p[1].to_s)}" }.join '&'
  end

  # Multipart encoding is not implemented: always raises
  # Pincers::MissingFeatureError.
  def encode_multipart(_pairs)
    raise Pincers::MissingFeatureError.new :encode_multipart
  end

  # Flattens a (possibly nested) hash into a list of [name, value] string
  # pairs. Nested hash keys are joined with '.', array values produce one
  # "name[]" pair per item.
  def hash_to_pairs(_hash)
    pair_recursive [], _hash
  end

  # Parses the url with URI.parse.
  def parse_uri(_url)
    URI.parse _url
  end

  private

  # Appends the flattened entries of _data to _pairs and returns _pairs.
  # _prefix is the dotted path of the enclosing keys, nil at the top level.
  def pair_recursive(_pairs, _data, _prefix = nil)
    _data.each do |key, value|
      name = _prefix ? "#{_prefix}.#{key}" : key
      case value
      when Hash
        pair_recursive _pairs, value, name
      when Array
        list_name = "#{name}[]"
        value.each { |item| _pairs << [list_name, item.to_s] }
      else
        _pairs << [name.to_s, value.to_s]
      end
    end
    _pairs
  end
end
|
43
|
+
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'pincers/core/base_backend'
|
2
|
+
|
3
|
+
module Pincers::Nokogiri
|
4
|
+
# Backend that answers queries against an already parsed Nokogiri document.
class Backend < Pincers::Core::BaseBackend

  # This is a small subset of the boolean properties, it should be enough for scraping.
  # For information on where to find the full list: http://stackoverflow.com/questions/706384/boolean-html-attributes
  #
  # Maps each property to the element kinds (as returned by #classify) it
  # applies to, or :all when there is no restriction.
  BOOL_PROPERTIES = {
    checked: [:input_checkbox, :input_radio],
    selected: [:option],
    disabled: :all, # no restrictions
    readonly: [:input_text, :input_password, :textarea],
    multiple: [:select]
  }

  # _document - the document object this backend operates on.
  def initialize(_document)
    @document = _document
  end

  def document
    @document
  end

  # The document wrapped in a one-element list.
  def document_root
    [document]
  end

  def document_title
    document.title
  end

  def close_document
    # no closing needed
  end

  # _limit is ignored.
  def search_by_css(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.css _selector
  end

  # _limit is ignored.
  def search_by_xpath(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.xpath _selector
  end

  def extract_element_tag(_element)
    _element.name
  end

  def extract_element_text(_element)
    _element.content
  end

  def extract_element_html(_element)
    _element.to_html
  end

  # Reads an attribute from the element.
  #
  # :value is resolved according to the element kind (checkbox/radio, select,
  # option, textarea); properties listed in BOOL_PROPERTIES are returned as
  # true/false depending on the attribute being present; anything else is
  # the raw attribute value.
  def extract_element_attribute(_element, _name)
    _name = _name.to_sym
    if _name == :value
      case classify _element
      when :input_checkbox, :input_radio
        extract_checkable_value _element
      when :select
        extract_select_value _element
      when :option
        extract_option_value _element
      when :textarea
        _element.content
      else
        _element[:value]
      end
    elsif is_boolean? _element, _name
      !_element[_name].nil?
    else
      _element[_name]
    end
  end

  # Writes an attribute on the element, mirroring #extract_element_attribute:
  # :value on a select marks the matching option as selected, :value on a
  # textarea replaces its content, boolean properties are added or removed,
  # everything else is set as a plain attribute.
  def set_element_attribute(_element, _name, _value)
    _name = _name.to_sym

    if _name == :value
      case classify _element
      when :select
        set_select_value _element, _value
      when :textarea
        _element.content = _value
      else
        _element.set_attribute(_name, _value)
      end
    elsif is_boolean? _element, _name
      set_boolean _element, _name, _value
    else
      _element.set_attribute(_name, _value)
    end
  end

  private

  # Element kind as a symbol: the tag name, refined with the type attribute
  # for inputs (default 'text') and buttons (default 'submit').
  def classify(_element)
    name = _element.name
    name = "input_#{(_element[:type] || 'text')}" if name == 'input'
    name = "button_#{(_element[:type] || 'submit')}" if name == 'button'
    name.to_sym
  end

  # Whether _name is a boolean property for this element.
  def is_boolean?(_element, _name)
    permitted = BOOL_PROPERTIES[_name]
    return false if permitted.nil?
    return true if permitted == :all
    permitted.include? classify(_element)
  end

  # Value attribute of a checkbox/radio, 'on' when it has none.
  def extract_checkable_value(_element)
    _element[:value] || 'on'
  end

  # Value(s) of the options carrying the selected attribute: a list for
  # selects with the multiple attribute, otherwise the first one (nil when
  # no option is marked as selected).
  def extract_select_value(_element)
    multiple = !_element[:multiple].nil?
    selected = _element.css('option[selected]')
    if multiple
      selected.map { |o| extract_option_value(o) }
    else
      extract_option_value(selected.first)
    end
  end

  # Value attribute of the option, falling back to its text content.
  def extract_option_value(_element)
    return nil if _element.nil?
    _element[:value] || _element.content
  end

  # Clears the current selection and selects the first option whose value
  # attribute (or, failing that, text) equals _value. Nothing is selected
  # when no option matches.
  def set_select_value(_element, _value)
    _element.xpath(".//option[@selected]").each { |o| set_boolean(o, :selected, false) }

    # The value is handed over as an XPath variable instead of being
    # interpolated, so quotes in it cannot break or alter the expression.
    binding = { value: _value.to_s }
    to_select = _element.at_xpath(".//option[@value=$value]", nil, binding)
    to_select = _element.at_xpath(".//option[text()=$value]", nil, binding) if to_select.nil?
    set_boolean(to_select, :selected, true) unless to_select.nil?
  end

  # Adds the attribute (using its own name as value) when _value is truthy,
  # removes it otherwise.
  def set_boolean(_element, _name, _value)
    if _value
      _element.set_attribute(_name, _name)
    else
      _element.remove_attribute(_name.to_s)
    end
  end
end
|
151
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require "pincers/core/base_factory"
|
2
|
+
require "pincers/nokogiri/backend"
|
3
3
|
|
4
|
-
module Pincers::
|
5
|
-
class
|
4
|
+
module Pincers::Nokogiri
|
5
|
+
class Factory < Pincers::Core::BaseFactory
|
6
6
|
|
7
7
|
def load_backend(_options)
|
8
8
|
document = _options.delete(:document)
|
@@ -11,7 +11,7 @@ module Pincers::Factories
|
|
11
11
|
document = ::Nokogiri::HTML document, _options[:url], _options[:encoding], _options[:flags]
|
12
12
|
end
|
13
13
|
|
14
|
-
|
14
|
+
Pincers::Nokogiri::Backend.new document
|
15
15
|
end
|
16
16
|
|
17
17
|
end
|