pincers 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/lib/pincers/chenso/backend.rb +162 -0
  3. data/lib/pincers/chenso/browsing_context.rb +76 -0
  4. data/lib/pincers/chenso/browsing_manager.rb +67 -0
  5. data/lib/pincers/chenso/factory.rb +27 -0
  6. data/lib/pincers/chenso/html_form_request.rb +7 -0
  7. data/lib/pincers/chenso/html_page_request.rb +27 -0
  8. data/lib/pincers/{backend/base.rb → core/base_backend.rb} +18 -8
  9. data/lib/pincers/{factories/base.rb → core/base_factory.rb} +2 -2
  10. data/lib/pincers/core/helpers/form.rb +106 -0
  11. data/lib/pincers/core/{query.rb → helpers/query.rb} +3 -4
  12. data/lib/pincers/core/replicas/form.rb +43 -0
  13. data/lib/pincers/core/replicas/link.rb +18 -0
  14. data/lib/pincers/core/root_context.rb +25 -6
  15. data/lib/pincers/core/search_context.rb +24 -3
  16. data/lib/pincers/errors.rb +4 -1
  17. data/lib/pincers/extension/labs.rb +3 -0
  18. data/lib/pincers/factory.rb +9 -4
  19. data/lib/pincers/http/base_document.rb +20 -0
  20. data/lib/pincers/http/client.rb +134 -0
  21. data/lib/pincers/{support → http}/cookie.rb +1 -1
  22. data/lib/pincers/{support → http}/cookie_jar.rb +4 -3
  23. data/lib/pincers/http/errors.rb +26 -0
  24. data/lib/pincers/http/request.rb +62 -0
  25. data/lib/pincers/http/response_document.rb +24 -0
  26. data/lib/pincers/http/session.rb +99 -0
  27. data/lib/pincers/http/utils.rb +43 -0
  28. data/lib/pincers/nokogiri/backend.rb +151 -0
  29. data/lib/pincers/{factories/nokogiri.rb → nokogiri/factory.rb} +5 -5
  30. data/lib/pincers/version.rb +1 -1
  31. data/lib/pincers/{backend/webdriver.rb → webdriver/backend.rb} +22 -31
  32. data/lib/pincers/{factories/webdriver.rb → webdriver/factory.rb} +5 -5
  33. data/lib/pincers/webdriver/http_document.rb +23 -0
  34. metadata +42 -13
  35. data/lib/pincers/backend/nokogiri.rb +0 -66
  36. data/lib/pincers/core/download.rb +0 -14
  37. data/lib/pincers/support/http_client.rb +0 -123
@@ -1,3 +1,3 @@
1
- module Pincers::Support
1
+ module Pincers::Http
2
2
  class Cookie < Struct.new(:name, :value, :domain, :path, :expires, :secure); end
3
3
  end
@@ -1,6 +1,7 @@
1
- require "pincers/support/cookie"
1
+ require 'pincers/http/cookie'
2
+ require 'pincers/http/utils'
2
3
 
3
- module Pincers::Support
4
+ module Pincers::Http
4
5
  class CookieJar
5
6
 
6
7
  BAD_VALUE_CHARS = /([\x00-\x20\x7F",;\\])/ # RFC 6265 - 4.1.1
@@ -16,7 +17,7 @@ module Pincers::Support
16
17
  end
17
18
 
18
19
  def get(_url, _name)
19
- for_origin(URI.parse(_url)).find { |c| c.name == _name }
20
+ for_origin(Utils.parse_uri(_url)).find { |c| c.name == _name }
20
21
  end
21
22
 
22
23
  def set(_parts)
@@ -0,0 +1,26 @@
1
+ module Pincers::Http
2
# Error that wraps a failed HTTP response so callers can inspect it.
# The error message is taken from the response's own #message.
class RequestError < StandardError
  extend Forwardable

  # The response object this error was built from.
  attr_reader :response

  # Expose the wrapped response's status code and body on the error itself.
  def_delegator :@response, :code
  def_delegator :@response, :body

  # _response - the failed response; must respond to #message, #code and #body.
  def initialize(_response)
    super(_response.message)
    @response = _response
  end
end
14
+
15
# Raised when an operation cannot be carried out with the given encoding.
class EncodingNotSupported < StandardError
  # _encoding - the rejected encoding; it is interpolated into the message.
  def initialize(_encoding)
    message = "#{_encoding} is not supported by this operation"
    super(message)
  end
end
20
+
21
# Raised when the redirect budget of a request has been used up.
class MaximumRedirectsError < StandardError
  # Takes no arguments: the message is fixed.
  def initialize
    message = 'Redirection loop detected!'
    super(message)
  end
end
26
+ end
@@ -0,0 +1,62 @@
1
+ module Pincers::Http
2
# Describes a single HTTP request: verb, target uri, headers and payload.
class Request

  attr_reader :method, :uri, :headers
  attr_accessor :data

  # _method - request verb as a symbol (see #native_type for the mapped verbs).
  # _uri    - target uri object; it is mutated by #set_query.
  def initialize(_method, _uri)
    @method, @uri = _method, _uri
    @headers = {}
    @data = nil
  end

  # String form of the target uri.
  def url
    @uri.to_s
  end

  # Net::HTTP request class matching the verb, or nil for an unmapped verb.
  def native_type
    {
      get: Net::HTTP::Get,
      post: Net::HTTP::Post,
      put: Net::HTTP::Put,
      delete: Net::HTTP::Delete
    }[@method]
  end

  # Replaces the uri's query string with the urlencoded form of _pairs.
  #
  # _pairs - a Hash (flattened through Utils.hash_to_pairs) or a list of pairs.
  def set_query(_pairs)
    pairs = _pairs.is_a?(Hash) ? Utils.hash_to_pairs(_pairs) : _pairs
    @uri.query = Utils.encode_urlencoded(pairs)
  end

  # Attaches form data to the request.
  #
  # For :get requests the data goes into the query string and only the
  # urlencoded encoding is accepted (EncodingNotSupported is raised otherwise).
  # For every other verb the Content-Type header is set and the encoded
  # payload is stored in #data.
  #
  # _pairs    - a Hash or a list of [name, value] pairs.
  # _encoding - optional encoding override; only honoured when the pairs do
  #             not already force multipart (i.e. contain no IO value).
  def set_form_data(_pairs, _encoding = nil)
    pairs = _pairs.is_a?(Hash) ? Utils.hash_to_pairs(_pairs) : _pairs
    encoding = default_encoding_for(pairs)
    encoding = _encoding unless _encoding.nil? || encoding != Utils::FORM_URLENCODED

    if method == :get
      raise EncodingNotSupported, encoding unless encoding == Utils::FORM_URLENCODED
      return set_query(pairs)
    end

    # The header is written before encoding, so it is set even when the
    # encoding turns out to be unsupported below.
    headers['Content-Type'] = encoding

    self.data =
      if encoding == Utils::FORM_URLENCODED
        Utils.encode_urlencoded pairs
      elsif encoding == Utils::FORM_MULTIPART
        Utils.encode_multipart pairs
      else
        raise Pincers::MissingFeatureError.new "form encoding: #{_encoding}"
      end
  end

  private

  # Multipart when any pair carries an IO value, urlencoded otherwise.
  def default_encoding_for(_pairs)
    return Utils::FORM_MULTIPART if _pairs.any? { |pair| pair[1].is_a?(IO) }
    Utils::FORM_URLENCODED
  end

end
62
+ end
@@ -0,0 +1,24 @@
1
+ require 'pincers/http/base_document'
2
+
3
+ module Pincers::Http
4
# Document whose uri, content type and content are read from an HTTP response.
class ResponseDocument < BaseDocument

  # The wrapped response object.
  attr_reader :response

  # _response - response object; must answer #uri, #[] and #body.
  def initialize(_response)
    @response = _response
  end

  # Uri stored on the wrapped response.
  def uri
    @response.uri
  end

  # Value of the Content-Type header, or 'text/plain' when it is absent.
  def content_type
    declared = @response['Content-Type']
    declared ? declared : 'text/plain'
  end

  # Raw body of the wrapped response.
  def content
    @response.body
  end
end
24
+ end
@@ -0,0 +1,99 @@
1
+ require 'pincers/http/errors'
2
+ require 'pincers/http/cookie_jar'
3
+
4
+ module Pincers::Http
5
# Holds the state shared by consecutive HTTP requests (default headers,
# cookie jar, proxy settings and redirect limit) and performs requests
# through Net::HTTP, following redirections.
class Session

  DEFAULT_HEADERS = {
    'Accept' => '*/*',
    'Cache-Control' => 'no-cache'
  }

  attr_reader :cookie_jar, :headers
  attr_accessor :proxy_addr, :proxy_port, :redirect_limit

  # _other - optional session to copy headers, cookies, proxy and redirect
  #          limit from. Without it the session starts from the defaults.
  def initialize(_other = nil)
    if _other
      @headers = _other.headers.clone
      @cookie_jar = _other.cookie_jar.copy
      @proxy_addr = _other.proxy_addr
      @proxy_port = _other.proxy_port
      @redirect_limit = _other.redirect_limit
    else
      # Copy the defaults: handing out the constant itself would let one
      # session's header changes leak into every session created afterwards.
      @headers = DEFAULT_HEADERS.dup
      @cookie_jar = CookieJar.new
      @redirect_limit = 10
    end
  end

  # Sets the proxy from an "address:port" string, or clears it when given nil.
  def proxy=(_value)
    if _value
      @proxy_addr, @proxy_port = _value.split ':'
    else
      @proxy_addr, @proxy_port = [nil, nil]
    end
  end

  # Returns a new session initialized from this one.
  def clone
    self.class.new self
  end

  # Performs _request, following at most #redirect_limit - 1 redirections.
  #
  # Returns the successful response (with its uri set to the final location).
  # Raises MaximumRedirectsError when the redirect budget is exhausted and
  # RequestError for responses that are neither a success nor a redirection.
  def perform(_request)
    perform_recursive _request, @redirect_limit, nil
  end

  private

  # Sends _request to _redirect (or to the request's own uri when nil).
  # _limit is the number of requests that may still be issued.
  def perform_recursive(_request, _limit, _redirect)
    raise MaximumRedirectsError if _limit == 0

    uri = _redirect || _request.uri
    path = uri.request_uri.empty? ? '/' : uri.request_uri

    http_request = _request.native_type.new path
    http_request.body = _request.data

    # Request headers are copied last so they override the session defaults.
    copy_headers http_request, @headers
    copy_headers http_request, _request.headers
    set_cookies http_request, uri

    http_response = connect(uri).request http_request

    case http_response
    when Net::HTTPSuccess
      update_cookies(uri, http_response)
      http_response.uri = uri # uri is not always set by net/http
      http_response
    when Net::HTTPRedirection
      # Location may be relative; resolve it against the uri just requested
      # so the next hop always has a scheme and a host.
      location = uri.merge(Utils.parse_uri(http_response['location']))
      perform_recursive(_request, _limit - 1, location)
    else
      handle_error_response http_response
    end
  end

  # Builds the (not yet started) Net::HTTP connection for _uri, going through
  # the configured proxy when there is one.
  def connect(_uri)
    conn = Net::HTTP.new _uri.host, _uri.port || 80, @proxy_addr, @proxy_port
    conn.use_ssl = true if _uri.scheme == 'https'
    # NOTE(review): certificate verification is disabled for every connection,
    # which leaves https requests open to interception - confirm this is intended.
    conn.verify_mode = OpenSSL::SSL::VERIFY_NONE
    conn
  end

  def handle_error_response(_http_response)
    raise RequestError.new _http_response
  end

  # Copies every entry of _headers into _request.
  def copy_headers(_request, _headers)
    _headers.each { |name, value| _request[name] = value }
  end

  # Sets the Cookie header from the jar entries that apply to _uri.
  def set_cookies(_request, _uri)
    _request['Cookie'] = @cookie_jar.for_origin_as_header _uri
  end

  # Stores every Set-Cookie header of _response in the jar.
  def update_cookies(_uri, _response)
    cookies = _response.get_fields('set-cookie')
    cookies.each { |raw| @cookie_jar.set_raw _uri, raw } if cookies
  end
end
99
+ end
@@ -0,0 +1,43 @@
1
+ module Pincers::Http
2
# Stateless helpers for building request payloads and parsing urls.
module Utils
  extend self

  FORM_URLENCODED = 'application/x-www-form-urlencoded'
  FORM_MULTIPART = 'multipart/form-data'

  # Serializes _pairs as an application/x-www-form-urlencoded string.
  #
  # _pairs - a Hash (flattened through #hash_to_pairs) or a list of
  #          [name, value] pairs.
  #
  # Both names and values are stringified and escaped, so reserved characters
  # in a field name ('&', '=', spaces, ...) cannot corrupt the payload and
  # non-String values given as raw pairs do not raise.
  def encode_urlencoded(_pairs)
    _pairs = hash_to_pairs _pairs if _pairs.is_a? Hash
    _pairs.map { |pair| "#{CGI.escape(pair[0].to_s)}=#{CGI.escape(pair[1].to_s)}" }.join '&'
  end

  # Multipart encoding is not implemented: always raises
  # Pincers::MissingFeatureError.
  def encode_multipart(_pairs)
    raise Pincers::MissingFeatureError.new :encode_multipart
  end

  # Flattens a (possibly nested) Hash into a list of [name, value] string
  # pairs. Nested hash keys are joined with '.', array values produce one
  # pair per item under the name suffixed with '[]'.
  def hash_to_pairs(_hash)
    pair_recursive [], _hash
  end

  # Parses _url into a URI object.
  def parse_uri(_url)
    URI.parse _url
  end

  private

  # Appends the pairs for _data to _pairs and returns _pairs.
  # _prefix is the dotted name of the enclosing hash, if any.
  def pair_recursive(_pairs, _data, _prefix = nil)
    _data.each do |key, value|
      key = "#{_prefix}.#{key}" if _prefix
      case value
      when Hash
        pair_recursive _pairs, value, key
      when Array
        key = "#{key}[]"
        value.each { |item| _pairs << [key, item.to_s] }
      else
        _pairs << [key.to_s, value.to_s]
      end
    end
    _pairs
  end
end
43
+ end
@@ -0,0 +1,151 @@
1
+ require 'pincers/core/base_backend'
2
+
3
+ module Pincers::Nokogiri
4
# Backend that answers queries against an already parsed Nokogiri document.
class Backend < Pincers::Core::BaseBackend

  # This is a small subset of the boolean properties, which should be enough for scraping.
  # For information on where to find the full list: http://stackoverflow.com/questions/706384/boolean-html-attributes

  # Maps each boolean property to the element kinds (as returned by #classify)
  # it applies to, or to :all when it applies to every element.
  BOOL_PROPERTIES = {
    checked: [:input_checkbox, :input_radio],
    selected: [:option],
    disabled: :all, # no restrictions
    readonly: [:input_text, :input_password, :textarea],
    multiple: [:select]
  }

  # _document - the document every other method operates on.
  def initialize(_document)
    @document = _document
  end

  def document
    @document
  end

  # The document wrapped in a single-element list.
  def document_root
    [document]
  end

  def document_title
    document.title
  end

  def close_document
    # no closing needed
  end

  # _limit is accepted but ignored.
  def search_by_css(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.css _selector
  end

  # _limit is accepted but ignored.
  def search_by_xpath(_element, _selector, _limit)
    # nokogiri does not do any query level optimization when searching just one node
    _element.xpath _selector
  end

  def extract_element_tag(_element)
    _element.name
  end

  def extract_element_text(_element)
    _element.content
  end

  def extract_element_html(_element)
    _element.to_html
  end

  # Reads attribute _name of _element.
  #
  # :value is resolved according to the kind of element (checkable input,
  # select, option, textarea or anything else). Boolean properties
  # (see BOOL_PROPERTIES) return true/false depending on whether the
  # attribute is present. Every other attribute is returned as stored.
  def extract_element_attribute(_element, _name)
    _name = _name.to_sym
    if _name == :value
      case classify _element
      when :input_checkbox, :input_radio
        extract_checkable_value _element
      when :select
        extract_select_value _element
      when :option
        extract_option_value _element
      when :textarea
        _element.content
      else
        _element[:value]
      end
    elsif is_boolean? _element, _name
      !_element[_name].nil?
    else
      _element[_name]
    end
  end

  # Writes attribute _name of _element.
  #
  # Setting :value on a select marks the matching option as selected, on a
  # textarea it replaces the content. Boolean properties are added or removed
  # depending on the truthiness of _value.
  def set_element_attribute(_element, _name, _value)
    _name = _name.to_sym

    if _name == :value
      case classify _element
      when :select
        set_select_value _element, _value
      when :textarea
        _element.content = _value
      else
        _element.set_attribute(_name, _value)
      end
    elsif is_boolean? _element, _name
      set_boolean _element, _name, _value
    else
      _element.set_attribute(_name, _value)
    end
  end

  private

  # Kind of element as a symbol: the tag name, refined with the type
  # attribute for inputs (default 'text') and buttons (default 'submit').
  def classify(_element)
    name = _element.name
    name = "input_#{(_element[:type] || 'text')}" if name == 'input'
    name = "button_#{(_element[:type] || 'submit')}" if name == 'button'
    name.to_sym
  end

  # Whether _name is a boolean property for this kind of element.
  def is_boolean?(_element, _name)
    permitted = BOOL_PROPERTIES[_name]
    return false if permitted.nil?
    return true if permitted == :all
    permitted.include? classify(_element)
  end

  # Checkboxes and radios without a value attribute report 'on'.
  def extract_checkable_value(_element)
    _element[:value] || 'on'
  end

  # A list of selected values for multiple selects, otherwise the value of
  # the first selected option (nil when nothing is selected).
  def extract_select_value(_element)
    multiple = !_element[:multiple].nil?
    selected = _element.css('option[selected]')
    if multiple
      selected.map { |o| extract_option_value(o) }
    else
      extract_option_value(selected.first)
    end
  end

  # The option's value attribute, falling back to its text content.
  def extract_option_value(_element)
    return nil if _element.nil?
    _element[:value] || _element.content
  end

  # Deselects every option, then selects the first one whose value attribute
  # (or, failing that, whose text) equals _value. Nothing ends up selected
  # when no option matches.
  def set_select_value(_element, _value)
    _element.xpath('.//option[@selected]').each { |o| set_boolean(o, :selected, false) }

    # Options are compared in Ruby instead of interpolating _value into an
    # XPath string literal, which produced an invalid expression for values
    # containing an apostrophe.
    wanted = _value.to_s
    options = _element.xpath('.//option')
    to_select = options.find { |o| o[:value] == wanted }
    to_select ||= options.find { |o| o.content == wanted }
    set_boolean(to_select, :selected, true) unless to_select.nil?
  end

  # Adds the attribute (using its own name as value) when _value is truthy,
  # removes it otherwise.
  def set_boolean(_element, _name, _value)
    if _value
      _element.set_attribute(_name, _name)
    else
      _element.remove_attribute(_name.to_s)
    end
  end
end
151
+ end
@@ -1,8 +1,8 @@
1
- require 'pincers/factories/base'
2
- require 'pincers/backend/nokogiri'
1
+ require "pincers/core/base_factory"
2
+ require "pincers/nokogiri/backend"
3
3
 
4
- module Pincers::Factories
5
- class Nokogiri < Base
4
+ module Pincers::Nokogiri
5
+ class Factory < Pincers::Core::BaseFactory
6
6
 
7
7
  def load_backend(_options)
8
8
  document = _options.delete(:document)
@@ -11,7 +11,7 @@ module Pincers::Factories
11
11
  document = ::Nokogiri::HTML document, _options[:url], _options[:encoding], _options[:flags]
12
12
  end
13
13
 
14
- ::Pincers::Backend::Nokogiri.new document
14
+ Pincers::Nokogiri::Backend.new document
15
15
  end
16
16
 
17
17
  end