pincers 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/lib/pincers/chenso/backend.rb +162 -0
  3. data/lib/pincers/chenso/browsing_context.rb +76 -0
  4. data/lib/pincers/chenso/browsing_manager.rb +67 -0
  5. data/lib/pincers/chenso/factory.rb +27 -0
  6. data/lib/pincers/chenso/html_form_request.rb +7 -0
  7. data/lib/pincers/chenso/html_page_request.rb +27 -0
  8. data/lib/pincers/{backend/base.rb → core/base_backend.rb} +18 -8
  9. data/lib/pincers/{factories/base.rb → core/base_factory.rb} +2 -2
  10. data/lib/pincers/core/helpers/form.rb +106 -0
  11. data/lib/pincers/core/{query.rb → helpers/query.rb} +3 -4
  12. data/lib/pincers/core/replicas/form.rb +43 -0
  13. data/lib/pincers/core/replicas/link.rb +18 -0
  14. data/lib/pincers/core/root_context.rb +25 -6
  15. data/lib/pincers/core/search_context.rb +24 -3
  16. data/lib/pincers/errors.rb +4 -1
  17. data/lib/pincers/extension/labs.rb +3 -0
  18. data/lib/pincers/factory.rb +9 -4
  19. data/lib/pincers/http/base_document.rb +20 -0
  20. data/lib/pincers/http/client.rb +134 -0
  21. data/lib/pincers/{support → http}/cookie.rb +1 -1
  22. data/lib/pincers/{support → http}/cookie_jar.rb +4 -3
  23. data/lib/pincers/http/errors.rb +26 -0
  24. data/lib/pincers/http/request.rb +62 -0
  25. data/lib/pincers/http/response_document.rb +24 -0
  26. data/lib/pincers/http/session.rb +99 -0
  27. data/lib/pincers/http/utils.rb +43 -0
  28. data/lib/pincers/nokogiri/backend.rb +151 -0
  29. data/lib/pincers/{factories/nokogiri.rb → nokogiri/factory.rb} +5 -5
  30. data/lib/pincers/version.rb +1 -1
  31. data/lib/pincers/{backend/webdriver.rb → webdriver/backend.rb} +22 -31
  32. data/lib/pincers/{factories/webdriver.rb → webdriver/factory.rb} +5 -5
  33. data/lib/pincers/webdriver/http_document.rb +23 -0
  34. metadata +42 -13
  35. data/lib/pincers/backend/nokogiri.rb +0 -66
  36. data/lib/pincers/core/download.rb +0 -14
  37. data/lib/pincers/support/http_client.rb +0 -123
@@ -1,3 +1,3 @@
1
- module Pincers::Support
1
+ module Pincers::Http
2
2
  class Cookie < Struct.new(:name, :value, :domain, :path, :expires, :secure); end
3
3
  end
@@ -1,6 +1,7 @@
1
- require "pincers/support/cookie"
1
+ require 'pincers/http/cookie'
2
+ require 'pincers/http/utils'
2
3
 
3
- module Pincers::Support
4
+ module Pincers::Http
4
5
  class CookieJar
5
6
 
6
7
  BAD_VALUE_CHARS = /([\x00-\x20\x7F",;\\])/ # RFC 6265 - 4.1.1
@@ -16,7 +17,7 @@ module Pincers::Support
16
17
  end
17
18
 
18
19
  def get(_url, _name)
19
- for_origin(URI.parse(_url)).find { |c| c.name == _name }
20
+ for_origin(Utils.parse_uri(_url)).find { |c| c.name == _name }
20
21
  end
21
22
 
22
23
  def set(_parts)
@@ -0,0 +1,26 @@
1
+ module Pincers::Http
2
# Error wrapping an HTTP response object. The error message is taken from
# the response's +message+, and the response's +code+ and +body+ are exposed
# directly on the error for convenience.
#
# The readers are written out by hand instead of using Forwardable so the
# class does not depend on 'forwardable' having been required elsewhere.
class RequestError < StandardError
  # @return [Object] the response this error was built from
  attr_reader :response

  # @param _response [#message, #code, #body] the response to wrap
  def initialize(_response)
    @response = _response
    super(_response.message)
  end

  # @return [Object] the wrapped response's code
  def code
    @response.code
  end

  # @return [Object] the wrapped response's body
  def body
    @response.body
  end
end
14
+
15
# Error raised when an operation is asked to use an encoding it cannot handle.
class EncodingNotSupported < StandardError
  # @param _encoding [#to_s] the rejected encoding, embedded in the message
  def initialize(_encoding)
    description = "#{_encoding} is not supported by this operation"
    super(description)
  end
end
20
+
21
# Error carrying a fixed message; takes no constructor arguments.
class MaximumRedirectsError < StandardError
  def initialize
    super('Redirection loop detected!')
  end
end
26
+ end
@@ -0,0 +1,62 @@
1
+ module Pincers::Http
2
# Describes an HTTP request: a method symbol, a target URI, a header hash and
# an optional body (+data+).
class Request

  attr_reader :method, :uri, :headers
  attr_accessor :data

  # @param _method [Symbol] request method (see #native_type for the mapped ones)
  # @param _uri [URI] target of the request
  def initialize(_method, _uri)
    @method = _method
    @uri = _uri
    @headers = {}
    @data = nil
  end

  # @return [String] the target URI rendered as a string
  def url
    @uri.to_s
  end

  # Maps the request method to its Net::HTTP request class.
  #
  # @return [Class, nil] nil when the method is not one of get/post/put/delete
  def native_type
    case @method
    when :get
      Net::HTTP::Get
    when :post
      Net::HTTP::Post
    when :put
      Net::HTTP::Put
    when :delete
      Net::HTTP::Delete
    end
  end

  # Replaces the URI query string with the urlencoded form of the given pairs.
  #
  # @param _pairs [Hash, Array] a hash (flattened first) or a list of pairs
  def set_query(_pairs)
    pairs = _pairs.is_a?(Hash) ? Utils.hash_to_pairs(_pairs) : _pairs
    @uri.query = Utils.encode_urlencoded(pairs)
  end

  # Stores form data in the request. For :get requests the pairs go into the
  # query string; otherwise they are encoded into the body and the
  # Content-Type header is set to the encoding used.
  #
  # @param _pairs [Hash, Array] a hash (flattened first) or a list of pairs
  # @param _encoding [String, nil] requested encoding; only honoured when the
  #   default picked for the pairs is urlencoded
  # @raise [EncodingNotSupported] for :get requests with a non-urlencoded encoding
  # @raise [Pincers::MissingFeatureError] for encodings other than urlencoded/multipart
  def set_form_data(_pairs, _encoding = nil)
    pairs = _pairs.is_a?(Hash) ? Utils.hash_to_pairs(_pairs) : _pairs

    encoding = default_encoding_for(pairs)
    # an explicit encoding never overrides the multipart default
    encoding = _encoding if !_encoding.nil? && encoding == Utils::FORM_URLENCODED

    if method == :get
      raise EncodingNotSupported, encoding if encoding != Utils::FORM_URLENCODED
      return set_query(pairs)
    end

    headers['Content-Type'] = encoding

    self.data =
      case encoding
      when Utils::FORM_URLENCODED
        Utils.encode_urlencoded pairs
      when Utils::FORM_MULTIPART
        Utils.encode_multipart pairs
      else
        raise Pincers::MissingFeatureError.new "form encoding: #{_encoding}"
      end
  end

  private

  # Picks multipart when any pair value is an IO, urlencoded otherwise.
  def default_encoding_for(_pairs)
    if _pairs.any? { |pair| pair[1].is_a?(IO) }
      Utils::FORM_MULTIPART
    else
      Utils::FORM_URLENCODED
    end
  end

end
62
+ end
@@ -0,0 +1,24 @@
1
+ require 'pincers/http/base_document'
2
+
3
+ module Pincers::Http
4
# Document backed by an HTTP response object: the URI, content type and
# content are all read from the wrapped response.
class ResponseDocument < BaseDocument

  attr_reader :response

  # @param _response [Object] response exposing +uri+, +body+ and header lookup via +[]+
  def initialize(_response)
    @response = _response
  end

  # @return [Object] the URI reported by the wrapped response
  def uri
    @response.uri
  end

  # @return [String] the Content-Type header, or 'text/plain' when it is absent
  def content_type
    declared = @response['Content-Type']
    declared || 'text/plain'
  end

  # @return [Object] the body of the wrapped response
  def content
    @response.body
  end
end
24
+ end
@@ -0,0 +1,99 @@
1
+ require 'pincers/http/errors'
2
+ require 'pincers/http/cookie_jar'
3
+
4
+ module Pincers::Http
5
# Performs requests through Net::HTTP while carrying a set of headers, a
# cookie jar and proxy settings, following redirections up to
# +redirect_limit+ requests.
class Session

  # Headers every fresh session starts with.
  DEFAULT_HEADERS = {
    'Accept' => '*/*',
    'Cache-Control' => 'no-cache'
  }

  attr_reader :cookie_jar, :headers
  attr_accessor :proxy_addr, :proxy_port, :redirect_limit

  # @param _other [Session, nil] when given, headers, cookies, proxy settings
  #   and redirect limit are copied from it
  def initialize(_other = nil)
    if _other
      @headers = _other.headers.clone
      @cookie_jar = _other.cookie_jar.copy
      @proxy_addr = _other.proxy_addr
      @proxy_port = _other.proxy_port
      @redirect_limit = _other.redirect_limit
    else
      # Copy the defaults: handing out the constant itself would make header
      # changes on one session visible to every other session.
      @headers = DEFAULT_HEADERS.dup
      @cookie_jar = CookieJar.new
      @redirect_limit = 10
    end
  end

  # Sets both proxy attributes from a single "address:port" string, or clears
  # them when given nil/false.
  def proxy=(_value)
    if _value
      @proxy_addr, @proxy_port = _value.split ':'
    else
      @proxy_addr, @proxy_port = [nil, nil]
    end
  end

  # @return [Session] a new session initialized from this one
  def clone
    self.class.new self
  end

  # Sends the request, following redirections.
  #
  # @param _request [#uri, #native_type, #data, #headers] request to send
  # @return [Net::HTTPResponse] the final successful response
  # @raise [MaximumRedirectsError] when the redirect limit is exhausted
  # @raise [RequestError] when the response is neither a success nor a
  #   followable redirection
  def perform(_request)
    perform_recursive _request, @redirect_limit, nil
  end

  private

  # Sends the request to +_redirect+ (or the request's own uri when nil) and
  # dispatches on the class of the response.
  def perform_recursive(_request, _limit, _redirect)
    raise MaximumRedirectsError.new if _limit == 0

    uri = _redirect || _request.uri
    path = uri.request_uri.empty? ? '/' : uri.request_uri

    http_request = _request.native_type.new path
    http_request.body = _request.data

    # request headers are copied last so they win over session headers
    copy_headers http_request, @headers
    copy_headers http_request, _request.headers
    set_cookies http_request, uri

    http_response = connect(uri).request http_request

    case http_response
    when Net::HTTPSuccess
      update_cookies(uri, http_response)
      http_response.uri = uri # uri is not always set by net/http
      http_response
    when Net::HTTPRedirection
      follow_redirect _request, _limit, uri, http_response
    else
      handle_error_response http_response
    end
  end

  # Continues a redirected request at the address given by the Location
  # header, resolved against the uri that produced the redirection.
  def follow_redirect(_request, _limit, _uri, _http_response)
    location = _http_response['location']
    # a redirection class response without Location cannot be followed
    return handle_error_response(_http_response) if location.nil?

    # cookies set by the redirecting response must reach the next request
    update_cookies(_uri, _http_response)

    # merge keeps absolute locations as they are and resolves relative ones
    target = _uri.merge(Utils.parse_uri(location))
    perform_recursive(_request, _limit - 1, target)
  end

  # Builds the Net::HTTP connection for the given uri using the proxy settings.
  def connect(_uri)
    conn = Net::HTTP.new _uri.host, _uri.port || 80, @proxy_addr, @proxy_port
    conn.use_ssl = true if _uri.scheme == 'https'
    # SECURITY(review): certificate verification is disabled for every
    # connection, so https peers are not authenticated. Kept as is because
    # callers may rely on it; consider making it configurable.
    conn.verify_mode = OpenSSL::SSL::VERIFY_NONE
    conn
  end

  def handle_error_response(_http_response)
    raise RequestError.new _http_response
  end

  def copy_headers(_request, _headers)
    _headers.each { |name, value| _request[name] = value }
  end

  def set_cookies(_request, _uri)
    _request['Cookie'] = @cookie_jar.for_origin_as_header _uri
  end

  # Feeds every Set-Cookie header of the response into the cookie jar.
  def update_cookies(_uri, _response)
    cookies = _response.get_fields('set-cookie')
    cookies.each { |raw| @cookie_jar.set_raw _uri, raw } if cookies
  end
end
99
+ end
@@ -0,0 +1,43 @@
1
+ module Pincers::Http
2
# Stateless helpers for encoding form data and parsing urls. Every method is
# callable directly on the module.
module Utils
  extend self

  FORM_URLENCODED = 'application/x-www-form-urlencoded'
  FORM_MULTIPART = 'multipart/form-data'

  # Encodes key/value pairs as an application/x-www-form-urlencoded string.
  # Both keys and values are converted with to_s and escaped, so reserved
  # characters in either side (&, =, spaces, brackets) cannot corrupt the
  # result and non-String values are accepted.
  #
  # @param _pairs [Hash, Array] a hash (flattened with #hash_to_pairs) or a
  #   list of two element [key, value] arrays
  # @return [String] the pairs joined with '&'
  def encode_urlencoded(_pairs)
    _pairs = hash_to_pairs _pairs if _pairs.is_a? Hash
    _pairs.map { |pair| "#{CGI.escape(pair[0].to_s)}=#{CGI.escape(pair[1].to_s)}" }.join '&'
  end

  # Not implemented.
  #
  # @raise [Pincers::MissingFeatureError] always
  def encode_multipart(_pairs)
    raise Pincers::MissingFeatureError.new :encode_multipart
  end

  # Flattens a (possibly nested) hash into a list of [key, value] string
  # pairs. Nested hash keys are joined with '.', array values produce one
  # pair per item under the key suffixed with '[]'.
  #
  # @param _hash [Hash]
  # @return [Array<Array(String, String)>]
  def hash_to_pairs(_hash)
    pair_recursive [], _hash
  end

  # @param _url [String]
  # @return [URI] the result of URI.parse
  def parse_uri(_url)
    URI.parse _url
  end

  private

  # Appends the pairs for +_data+ to +_pairs+ and returns +_pairs+.
  def pair_recursive(_pairs, _data, _prefix = nil)
    _data.each do |key, value|
      key = "#{_prefix}.#{key}" if _prefix
      case value
      when Hash
        pair_recursive _pairs, value, key
      when Array
        key = "#{key}[]"
        value.each { |item| _pairs << [key, item.to_s] }
      else
        _pairs << [key.to_s, value.to_s]
      end
    end
    _pairs
  end
end
43
+ end
@@ -0,0 +1,151 @@
1
+ require 'pincers/core/base_backend'
2
+
3
+ module Pincers::Nokogiri
4
# Backend that answers queries and attribute reads/writes against a document
# held in memory (searched through its css/xpath methods).
class Backend < Pincers::Core::BaseBackend

  # This is a small subset of the boolean properties, believed to be enough for scraping.
  # For information on where to find the full list: http://stackoverflow.com/questions/706384/boolean-html-attributes

  # Maps each boolean property to the element kinds (see #classify) it
  # applies to, or :all when it applies to every element.
  BOOL_PROPERTIES = {
    checked: [:input_checkbox, :input_radio],
    selected: [:option],
    disabled: :all, # no restrictions
    readonly: [:input_text, :input_password, :textarea],
    multiple: [:select]
  }

  # @param _document [Object] the parsed document to operate on
  def initialize(_document)
    @document = _document
  end

  def document
    @document
  end

  # @return [Array] the document wrapped in a single element list
  def document_root
    [document]
  end

  def document_title
    document.title
  end

  def close_document
    # no closing needed
  end

  # The limit is ignored, every match is returned.
  def search_by_css(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.css _selector
  end

  # The limit is ignored, every match is returned.
  def search_by_xpath(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.xpath _selector
  end

  def extract_element_tag(_element)
    _element.name
  end

  def extract_element_text(_element)
    _element.content
  end

  def extract_element_html(_element)
    _element.to_html
  end

  # Reads an attribute. :value is resolved according to the element kind
  # (checkable inputs, selects, options and textareas get special handling),
  # boolean properties return true/false depending on attribute presence and
  # anything else returns the raw attribute.
  def extract_element_attribute(_element, _name)
    _name = _name.to_sym
    if _name == :value
      case classify _element
      when :input_checkbox, :input_radio
        extract_checkable_value _element
      when :select
        extract_select_value _element
      when :option
        extract_option_value _element
      when :textarea
        _element.content
      else
        _element[:value]
      end
    elsif is_boolean? _element, _name
      !_element[_name].nil?
    else
      _element[_name]
    end
  end

  # Writes an attribute. :value on a select marks the matching option as
  # selected, :value on a textarea replaces its content, boolean properties
  # are added or removed depending on the truthiness of +_value+ and anything
  # else is set as a plain attribute.
  def set_element_attribute(_element, _name, _value)
    _name = _name.to_sym

    if _name == :value
      case classify _element
      when :select
        set_select_value _element, _value
      when :textarea
        _element.content = _value
      else
        _element.set_attribute(_name, _value)
      end
    elsif is_boolean? _element, _name
      set_boolean _element, _name, _value
    else
      _element.set_attribute(_name, _value)
    end
  end

  private

  # Returns the element kind as a symbol: the tag name, with inputs and
  # buttons suffixed by their type (defaulting to text and submit).
  def classify(_element)
    name = _element.name
    name = "input_#{(_element[:type] || 'text')}" if name == 'input'
    name = "button_#{(_element[:type] || 'submit')}" if name == 'button'
    name.to_sym
  end

  # Whether +_name+ is a boolean property for the given element.
  def is_boolean?(_element, _name)
    permitted = BOOL_PROPERTIES[_name]
    return false if permitted.nil?
    return true if permitted == :all
    return permitted.include? classify(_element)
  end

  # Checkable inputs without a value attribute report 'on'.
  def extract_checkable_value(_element)
    value = _element[:value]
    value || 'on'
  end

  # Returns the list of selected option values for multiple selects, or the
  # first selected option value (nil when none is selected) otherwise.
  def extract_select_value(_element)
    multiple = !_element[:multiple].nil?
    selected = _element.css('option[selected]')
    if multiple
      selected.map { |o| extract_option_value(o) }
    else
      extract_option_value(selected.first)
    end
  end

  # An option without a value attribute reports its content.
  def extract_option_value(_element)
    return nil if _element.nil?
    _element[:value] || _element.content
  end

  # Clears the current selection and selects the first option whose value
  # attribute (or, failing that, text) equals +_value+. Nothing is selected
  # when no option matches.
  def set_select_value(_element, _value)
    _element.xpath(".//option[@selected]").each { |o| set_boolean(o, :selected, false) }
    # the value is quoted as an XPath literal so quotes in it cannot break
    # or alter the expression
    literal = xpath_literal(_value)
    to_select = _element.at_xpath(".//option[@value=#{literal}]")
    to_select = _element.at_xpath(".//option[text()=#{literal}]") if to_select.nil?
    set_boolean(to_select, :selected, true) unless to_select.nil?
  end

  # Renders a value as an XPath 1.0 string literal. XPath has no escape
  # sequences, so a value holding both quote kinds is built with concat().
  def xpath_literal(_value)
    text = _value.to_s
    return "'#{text}'" unless text.include?("'")
    return "\"#{text}\"" unless text.include?('"')

    pieces = text.split("'", -1).map { |piece| "'#{piece}'" }
    "concat(#{pieces.join(%q(, "'", ))})"
  end

  # A truthy value sets the attribute to its own name, a falsy one removes it.
  def set_boolean(_element, _name, _value)
    if _value
      _element.set_attribute(_name, _name)
    else
      _element.remove_attribute(_name.to_s)
    end
  end
end
151
+ end
@@ -1,8 +1,8 @@
1
- require 'pincers/factories/base'
2
- require 'pincers/backend/nokogiri'
1
+ require "pincers/core/base_factory"
2
+ require "pincers/nokogiri/backend"
3
3
 
4
- module Pincers::Factories
5
- class Nokogiri < Base
4
+ module Pincers::Nokogiri
5
+ class Factory < Pincers::Core::BaseFactory
6
6
 
7
7
  def load_backend(_options)
8
8
  document = _options.delete(:document)
@@ -11,7 +11,7 @@ module Pincers::Factories
11
11
  document = ::Nokogiri::HTML document, _options[:url], _options[:encoding], _options[:flags]
12
12
  end
13
13
 
14
- ::Pincers::Backend::Nokogiri.new document
14
+ Pincers::Nokogiri::Backend.new document
15
15
  end
16
16
 
17
17
  end