pincers 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pincers/chenso/backend.rb +162 -0
- data/lib/pincers/chenso/browsing_context.rb +76 -0
- data/lib/pincers/chenso/browsing_manager.rb +67 -0
- data/lib/pincers/chenso/factory.rb +27 -0
- data/lib/pincers/chenso/html_form_request.rb +7 -0
- data/lib/pincers/chenso/html_page_request.rb +27 -0
- data/lib/pincers/{backend/base.rb → core/base_backend.rb} +18 -8
- data/lib/pincers/{factories/base.rb → core/base_factory.rb} +2 -2
- data/lib/pincers/core/helpers/form.rb +106 -0
- data/lib/pincers/core/{query.rb → helpers/query.rb} +3 -4
- data/lib/pincers/core/replicas/form.rb +43 -0
- data/lib/pincers/core/replicas/link.rb +18 -0
- data/lib/pincers/core/root_context.rb +25 -6
- data/lib/pincers/core/search_context.rb +24 -3
- data/lib/pincers/errors.rb +4 -1
- data/lib/pincers/extension/labs.rb +3 -0
- data/lib/pincers/factory.rb +9 -4
- data/lib/pincers/http/base_document.rb +20 -0
- data/lib/pincers/http/client.rb +134 -0
- data/lib/pincers/{support → http}/cookie.rb +1 -1
- data/lib/pincers/{support → http}/cookie_jar.rb +4 -3
- data/lib/pincers/http/errors.rb +26 -0
- data/lib/pincers/http/request.rb +62 -0
- data/lib/pincers/http/response_document.rb +24 -0
- data/lib/pincers/http/session.rb +99 -0
- data/lib/pincers/http/utils.rb +43 -0
- data/lib/pincers/nokogiri/backend.rb +151 -0
- data/lib/pincers/{factories/nokogiri.rb → nokogiri/factory.rb} +5 -5
- data/lib/pincers/version.rb +1 -1
- data/lib/pincers/{backend/webdriver.rb → webdriver/backend.rb} +22 -31
- data/lib/pincers/{factories/webdriver.rb → webdriver/factory.rb} +5 -5
- data/lib/pincers/webdriver/http_document.rb +23 -0
- metadata +42 -13
- data/lib/pincers/backend/nokogiri.rb +0 -66
- data/lib/pincers/core/download.rb +0 -14
- data/lib/pincers/support/http_client.rb +0 -123
@@ -1,6 +1,7 @@
|
|
1
|
-
require
|
1
|
+
require 'pincers/http/cookie'
|
2
|
+
require 'pincers/http/utils'
|
2
3
|
|
3
|
-
module Pincers::
|
4
|
+
module Pincers::Http
|
4
5
|
class CookieJar
|
5
6
|
|
6
7
|
BAD_VALUE_CHARS = /([\x00-\x20\x7F",;\\])/ # RFC 6265 - 4.1.1
|
@@ -16,7 +17,7 @@ module Pincers::Support
|
|
16
17
|
end
|
17
18
|
|
18
19
|
def get(_url, _name)
|
19
|
-
for_origin(
|
20
|
+
for_origin(Utils.parse_uri(_url)).find { |c| c.name == _name }
|
20
21
|
end
|
21
22
|
|
22
23
|
def set(_parts)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Pincers::Http
|
2
|
+
# Error that wraps a response object. The response's message becomes the
# exception message, and its code and body are exposed directly on the error.
class RequestError < StandardError
  extend Forwardable

  attr_reader :response

  def_delegator :@response, :code
  def_delegator :@response, :body

  # _response - object responding to message, code and body.
  def initialize(_response)
    super(_response.message)
    @response = _response
  end
end
|
14
|
+
|
15
|
+
# Raised when an operation cannot work with the given encoding.
class EncodingNotSupported < StandardError
  # _encoding - the rejected encoding; it is interpolated into the message.
  def initialize(_encoding)
    message = "#{_encoding} is not supported by this operation"
    super(message)
  end
end
|
20
|
+
|
21
|
+
# Raised without arguments; always carries the same fixed message.
class MaximumRedirectsError < StandardError
  def initialize
    message = 'Redirection loop detected!'
    super(message)
  end
end
|
26
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Pincers::Http
|
2
|
+
# Describes a request: method, target URI, headers and optional body data.
class Request

  attr_reader :method, :uri, :headers
  attr_accessor :data

  # _method - request method as a symbol (see #native_type for the mapped ones).
  # _uri    - target URI object; must respond to to_s and query=.
  def initialize(_method, _uri)
    @uri = _uri
    @method = _method
    @data = nil
    @headers = {}
  end

  # The target URI rendered as a string.
  def url
    uri.to_s
  end

  # Net::HTTP request class matching the request method, or nil when the
  # method is not one of :get, :post, :put or :delete.
  def native_type
    verb = @method
    return Net::HTTP::Get if :get === verb
    return Net::HTTP::Post if :post === verb
    return Net::HTTP::Put if :put === verb
    return Net::HTTP::Delete if :delete === verb
    nil
  end

  # Replaces the URI query with the urlencoded form of _pairs
  # (a Hash or an array of [name, value] pairs).
  def set_query(_pairs)
    pairs = _pairs.is_a?(Hash) ? Utils.hash_to_pairs(_pairs) : _pairs
    @uri.query = Utils.encode_urlencoded(pairs)
  end

  # Attaches form data to the request.
  #
  # For :get requests the data goes into the query string and only the
  # urlencoded encoding is accepted (EncodingNotSupported otherwise). For any
  # other method the Content-Type header is set and the encoded data becomes
  # the request body.
  #
  # _encoding is only honored when the data would otherwise be urlencoded;
  # data containing IO values always selects multipart.
  def set_form_data(_pairs, _encoding = nil)
    _pairs = Utils.hash_to_pairs(_pairs) if _pairs.is_a? Hash

    chosen = default_encoding_for(_pairs)
    chosen = _encoding unless _encoding.nil? || chosen != Utils::FORM_URLENCODED

    if method == :get
      raise EncodingNotSupported, chosen if chosen != Utils::FORM_URLENCODED
      return set_query(_pairs)
    end

    # The header is set before encoding, so it stays set even if encoding raises.
    headers['Content-Type'] = chosen

    self.data =
      if Utils::FORM_URLENCODED === chosen
        Utils.encode_urlencoded _pairs
      elsif Utils::FORM_MULTIPART === chosen
        Utils.encode_multipart _pairs
      else
        raise Pincers::MissingFeatureError.new "form encoding: #{_encoding}"
      end
  end

  private

  # Multipart when any pair value is an IO, urlencoded otherwise.
  def default_encoding_for(_pairs)
    return Utils::FORM_MULTIPART if _pairs.any? { |pair| pair[1].is_a? IO }
    Utils::FORM_URLENCODED
  end

end
|
62
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'pincers/http/base_document'
|
2
|
+
|
3
|
+
module Pincers::Http
|
4
|
+
# Document backed by a response object.
class ResponseDocument < BaseDocument

  attr_reader :response

  # _response - response object; must respond to uri, body and [] (header lookup).
  def initialize(_response)
    @response = _response
  end

  # URI reported by the wrapped response.
  def uri
    response.uri
  end

  # Value of the Content-Type header, or 'text/plain' when the header is absent.
  def content_type
    declared = response['Content-Type']
    declared || 'text/plain'
  end

  # Body of the wrapped response.
  def content
    response.body
  end
end
|
24
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'pincers/http/errors'
|
2
|
+
require 'pincers/http/cookie_jar'
|
3
|
+
|
4
|
+
module Pincers::Http
|
5
|
+
# Holds the state shared by a series of requests: default headers, cookie jar,
# proxy settings and the redirect limit. Requests are executed with #perform.
class Session

  # Headers every new session starts with. Each session works on its own copy.
  DEFAULT_HEADERS = {
    'Accept' => '*/*',
    'Cache-Control' => 'no-cache'
  }

  attr_reader :cookie_jar, :headers
  attr_accessor :proxy_addr, :proxy_port, :redirect_limit

  # _other - optional session to copy headers, cookies, proxy and redirect
  #          limit from; when omitted the session starts with the defaults.
  def initialize(_other = nil)
    if _other
      @headers = _other.headers.clone
      @cookie_jar = _other.cookie_jar.copy
      @proxy_addr = _other.proxy_addr
      @proxy_port = _other.proxy_port
      @redirect_limit = _other.redirect_limit
    else
      # Copy the defaults: assigning the constant itself would make every
      # session (and the constant) share one mutable hash.
      @headers = DEFAULT_HEADERS.dup
      @cookie_jar = CookieJar.new
      @redirect_limit = 10
    end
  end

  # Sets proxy address and port from an "addr:port" string; nil clears both.
  def proxy=(_value)
    if _value
      @proxy_addr, @proxy_port = _value.split ':'
    else
      @proxy_addr, @proxy_port = [nil, nil]
    end
  end

  # Returns a new session built from this one (see #initialize).
  def clone
    self.class.new self
  end

  # Executes _request, following at most redirect_limit redirects.
  #
  # Returns the successful Net::HTTP response with its uri set to the final URI.
  # Raises MaximumRedirectsError when the redirect limit is exhausted and
  # RequestError for any response that is neither a success nor a followable
  # redirect.
  def perform(_request)
    perform_recursive _request, @redirect_limit, nil
  end

  private

  # Performs _request against _redirect (or the request's own uri when nil).
  # _limit is the number of hops still allowed.
  def perform_recursive(_request, _limit, _redirect)
    raise MaximumRedirectsError.new if _limit == 0

    uri = _redirect || _request.uri
    path = uri.request_uri.empty? ? '/' : uri.request_uri

    http_request = _request.native_type.new path
    http_request.body = _request.data

    # Request headers are copied last so they override the session headers.
    copy_headers http_request, @headers
    copy_headers http_request, _request.headers
    set_cookies http_request, uri

    http_response = connect(uri).request http_request

    case http_response
    when Net::HTTPSuccess
      update_cookies(uri, http_response)
      http_response.uri = uri # uri is not always set by net/http
      http_response
    when Net::HTTPRedirection
      location = http_response['location']
      # A 3xx response without a Location header (e.g. 304) cannot be followed.
      return handle_error_response(http_response) if location.nil?

      # Keep cookies set by the redirecting response; they are usually needed
      # by the redirect target.
      update_cookies(uri, http_response)

      # Location may be relative, so resolve it against the current uri.
      # NOTE(review): the redirected request keeps the original method and
      # body; confirm this is intended for 301/302/303 responses.
      perform_recursive(_request, _limit - 1, uri.merge(Utils.parse_uri(location)))
    else
      handle_error_response http_response
    end
  end

  # Builds a Net::HTTP connection for _uri through the configured proxy (if any).
  def connect(_uri)
    conn = Net::HTTP.new _uri.host, _uri.port || 80, @proxy_addr, @proxy_port
    conn.use_ssl = true if _uri.scheme == 'https'
    # SECURITY: certificate verification is disabled for every connection.
    # Kept as is because callers may rely on it, but it allows
    # man-in-the-middle attacks on https requests.
    conn.verify_mode = OpenSSL::SSL::VERIFY_NONE
    conn
  end

  def handle_error_response(_http_response)
    raise RequestError.new _http_response
  end

  # Copies every header in _headers into _request, overriding existing values.
  def copy_headers(_request, _headers)
    _headers.each { |name, value| _request[name] = value }
  end

  # Sets the Cookie header from the cookies stored for _uri.
  def set_cookies(_request, _uri)
    _request['Cookie'] = @cookie_jar.for_origin_as_header _uri
  end

  # Stores every Set-Cookie header of _response in the cookie jar.
  def update_cookies(_uri, _response)
    cookies = _response.get_fields('set-cookie')
    cookies.each { |raw| @cookie_jar.set_raw _uri, raw } if cookies
  end
end
|
99
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Pincers::Http
|
2
|
+
# Stateless helpers for encoding form data and parsing URIs.
module Utils
  extend self

  FORM_URLENCODED = 'application/x-www-form-urlencoded'
  FORM_MULTIPART = 'multipart/form-data'

  # Encodes _pairs (a Hash or an array of [name, value] pairs) as a
  # urlencoded string. Both names and values are escaped, so characters such
  # as '&', '=' or spaces in either cannot corrupt the payload. Values are
  # converted with to_s, so non-string values are accepted.
  def encode_urlencoded(_pairs)
    _pairs = hash_to_pairs _pairs if _pairs.is_a? Hash
    _pairs.map { |pair| "#{CGI.escape(pair[0].to_s)}=#{CGI.escape(pair[1].to_s)}" }.join '&'
  end

  # Multipart encoding is not implemented; always raises
  # Pincers::MissingFeatureError.
  def encode_multipart(_pairs)
    raise Pincers::MissingFeatureError.new :encode_multipart
  end

  # Flattens a (possibly nested) hash into an array of [name, value] pairs.
  # Nested hash keys are joined with '.', array values yield one "name[]" pair
  # per item, and values are converted with to_s.
  def hash_to_pairs(_hash)
    pair_recursive [], _hash
  end

  # Parses _url into a URI. An already-parsed URI object is returned as is.
  def parse_uri(_url)
    return _url if _url.is_a? URI::Generic
    URI.parse _url
  end

  private

  # Appends the pairs found in _data to _pairs, prefixing names with _prefix.
  def pair_recursive(_pairs, _data, _prefix = nil)
    _data.each do |key, value|
      key = "#{_prefix}.#{key}" if _prefix
      case value
      when Hash
        pair_recursive _pairs, value, key
      when Array
        key = "#{key}[]"
        value.each { |item| _pairs << [key, item.to_s] }
      else
        _pairs << [key.to_s, value.to_s]
      end
    end
    _pairs
  end
end
|
43
|
+
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'pincers/core/base_backend'
|
2
|
+
|
3
|
+
module Pincers::Nokogiri
|
4
|
+
# Backend that works on an already parsed Nokogiri document.
class Backend < Pincers::Core::BaseBackend

  # This is a small subset of the boolean properties; it should be enough for scraping.
  # For information on where to find the full list: http://stackoverflow.com/questions/706384/boolean-html-attributes
  #
  # Each property maps to the element classes (as returned by #classify) for
  # which it is treated as boolean, or to :all when it applies to any element.
  BOOL_PROPERTIES = {
    checked: [:input_checkbox, :input_radio],
    selected: [:option],
    disabled: :all, # no restrictions
    readonly: [:input_text, :input_password, :textarea],
    multiple: [:select]
  }

  def initialize(_document)
    @document = _document
  end

  def document
    @document
  end

  # The document wrapped in an array.
  def document_root
    [document]
  end

  def document_title
    document.title
  end

  def close_document
    # no closing needed
  end

  # _limit is ignored.
  def search_by_css(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.css _selector
  end

  # _limit is ignored.
  def search_by_xpath(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.xpath _selector
  end

  def extract_element_tag(_element)
    _element.name
  end

  def extract_element_text(_element)
    _element.content
  end

  def extract_element_html(_element)
    _element.to_html
  end

  # Reads an attribute.
  #
  # :value is computed per element class: checkboxes and radios fall back to
  # 'on', selects report their selected option(s), options fall back to their
  # text and textareas report their content. Boolean properties (see
  # BOOL_PROPERTIES) return true/false depending on attribute presence. Any
  # other attribute is returned as found in the element.
  def extract_element_attribute(_element, _name)
    _name = _name.to_sym
    if _name == :value
      case classify _element
      when :input_checkbox, :input_radio
        extract_checkable_value _element
      when :select
        extract_select_value _element
      when :option
        extract_option_value _element
      when :textarea
        _element.content
      else
        _element[:value]
      end
    elsif is_boolean? _element, _name
      !_element[_name].nil?
    else
      _element[_name]
    end
  end

  # Writes an attribute.
  #
  # Setting :value on a select marks the matching option as selected, on a
  # textarea it replaces the content. Boolean properties are added or removed
  # depending on the truthiness of _value. Anything else is set as a plain
  # attribute.
  def set_element_attribute(_element, _name, _value)
    _name = _name.to_sym

    if _name == :value
      case classify _element
      when :select
        set_select_value _element, _value
      when :textarea
        _element.content = _value
      else
        _element.set_attribute(_name, _value)
      end
    elsif is_boolean? _element, _name
      set_boolean _element, _name, _value
    else
      _element.set_attribute(_name, _value)
    end
  end

  private

  # Returns a symbol describing the element: its tag name, refined with the
  # type attribute for inputs (default 'text') and buttons (default 'submit').
  def classify(_element)
    name = _element.name
    # The type attribute is case-insensitive in HTML, so normalize it;
    # otherwise type="CHECKBOX" would not be recognized as :input_checkbox.
    name = "input_#{(_element[:type] || 'text').downcase}" if name == 'input'
    name = "button_#{(_element[:type] || 'submit').downcase}" if name == 'button'
    name.to_sym
  end

  # Whether _name is a boolean property for _element (see BOOL_PROPERTIES).
  def is_boolean?(_element, _name)
    permitted = BOOL_PROPERTIES[_name]
    return false if permitted.nil?
    return true if permitted == :all
    return permitted.include? classify(_element)
  end

  # Value attribute of a checkbox or radio, 'on' when the attribute is missing.
  def extract_checkable_value(_element)
    value = _element[:value]
    value || 'on'
  end

  # Array with the values of the selected options for a multiple select,
  # value of the first selected option (nil if none) otherwise.
  def extract_select_value(_element)
    multiple = !_element[:multiple].nil?
    selected = _element.css('option[selected]')
    if multiple
      selected.map { |o| extract_option_value(o) }
    else
      extract_option_value(selected.first)
    end
  end

  # Value attribute of an option, falling back to its text; nil for a nil element.
  def extract_option_value(_element)
    return nil if _element.nil?
    _element[:value] || _element.content
  end

  # Deselects every option, then selects the first option whose value
  # attribute (or, failing that, text) equals _value. When nothing matches
  # the select is left without a selected option.
  def set_select_value(_element, _value)
    _element.xpath(".//option[@selected]").each { |o| set_boolean(o, :selected, false) }

    # The value is passed as an XPath variable instead of being interpolated
    # into the expression: interpolation breaks (syntax error or wrong match)
    # as soon as the value contains a quote.
    bindings = { value: _value.to_s }
    to_select = _element.at_xpath(".//option[@value=$value]", nil, bindings)
    to_select = _element.at_xpath(".//option[text()=$value]", nil, bindings) if to_select.nil?
    set_boolean(to_select, :selected, true) unless to_select.nil?
  end

  # Adds the attribute (with its own name as value) when _value is truthy,
  # removes it otherwise.
  def set_boolean(_element, _name, _value)
    if _value
      _element.set_attribute(_name, _name)
    else
      _element.remove_attribute(_name.to_s)
    end
  end
end
|
151
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require "pincers/core/base_factory"
|
2
|
+
require "pincers/nokogiri/backend"
|
3
3
|
|
4
|
-
module Pincers::
|
5
|
-
class
|
4
|
+
module Pincers::Nokogiri
|
5
|
+
class Factory < Pincers::Core::BaseFactory
|
6
6
|
|
7
7
|
def load_backend(_options)
|
8
8
|
document = _options.delete(:document)
|
@@ -11,7 +11,7 @@ module Pincers::Factories
|
|
11
11
|
document = ::Nokogiri::HTML document, _options[:url], _options[:encoding], _options[:flags]
|
12
12
|
end
|
13
13
|
|
14
|
-
|
14
|
+
Pincers::Nokogiri::Backend.new document
|
15
15
|
end
|
16
16
|
|
17
17
|
end
|