http_reader 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +4 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/Guardfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +114 -0
- data/Rakefile +6 -0
- data/http_reader.gemspec +35 -0
- data/lib/http_reader/base_page_matcher.rb +23 -0
- data/lib/http_reader/base_page_parser.rb +24 -0
- data/lib/http_reader/engine.rb +78 -0
- data/lib/http_reader/hash_page_parser.rb +35 -0
- data/lib/http_reader/version.rb +3 -0
- data/lib/http_reader.rb +7 -0
- data/spec/spec_helper.rb +29 -0
- data/spec/unit/lib/http_reader/base_page_parser_spec.rb +31 -0
- data/spec/unit/lib/http_reader/engine_spec.rb +216 -0
- data/spec/unit/lib/http_reader/hash_page_parser_spec.rb +33 -0
- data/spec/unit/lib/http_reader_spec.rb +5 -0
- metadata +210 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 342e596f1ce05595886d576288111a8d873e5d06
|
4
|
+
data.tar.gz: d1bb9056c0e5ef44fa9185fbb8b97943fffa6639
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9983010f18f2bee05fadc9a7583c4c1d1740a173191a5bab70129e2d55fdc6da6da2d061a70e39c4fa6daed91337e366a058b6ccebc74a2666fb8d0ef2690d6a
|
7
|
+
data.tar.gz: af912964ef1658a387989e177c4be500a4938aca5df004fc6d203dfcdf469c4f8fe8e7614ccf89cae4d7f67dd89d13835ee7ef0d9c01258110d36275d53135a3
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Paweł Niemczyk
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
# HttpReader
|
2
|
+
|
3
|
+
Read any document on the internet and parse it into your own format :D
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'http_reader'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install http_reader
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
engine = HttpReader::Engine.new(opts)
|
22
|
+
engine.read('http://www.google.com')
|
23
|
+
|
24
|
+
### Available opts [Hash]
|
25
|
+
- **parsers:** list of document parser Classes [ default: [] ]
|
26
|
+
- **default_parser:** parser used when no parser matches the url [default: HashPageParser]
|
27
|
+
- **http_client:** HTTP client used for downloading page sources [default: HTTParty]
|
28
|
+
- **browser:** browser client used to process a page and download its source [default: Watir::Browser]
|
29
|
+
- **logger:** default: Logger
|
30
|
+
|
31
|
+
## Examples
|
32
|
+
|
33
|
+
### Using HashPageParser as the default_parser
|
34
|
+
|
35
|
+
engine = HttpReader::Engine.new
|
36
|
+
read_opts = {title: 'h1', items: '.content li;array'}
|
37
|
+
engine.read('http://example.org', read_opts)
|
38
|
+
|
39
|
+
**Where page body is:**
|
40
|
+
|
41
|
+
<h1>Information</h1>
|
42
|
+
<p>not importante</p>
|
43
|
+
<div class="content">
|
44
|
+
Items: <ul><li>A</li><li>B</li><li>C</li></ul>
|
45
|
+
</div>
|
46
|
+
|
47
|
+
**Result should be:**
|
48
|
+
|
49
|
+
{:title=>"Information", :items=>%w{A B C}}
|
50
|
+
|
51
|
+
|
52
|
+
### Using your own parser class
|
53
|
+
|
54
|
+
**Class body:**
|
55
|
+
|
56
|
+
class TestParser < HttpReader::BasePageParser
|
57
|
+
@pattern = /^((http|https):\/\/www.google.com)$/
|
58
|
+
class << self
|
59
|
+
def browse_actions_for_html(browser, opts = {})
|
60
|
+
div = browser.div(id: 'als')
|
61
|
+
raise 'Cannot find div' unless div.exists?
|
62
|
+
div.html
|
63
|
+
end
|
64
|
+
|
65
|
+
def parse(response, opts = {})
|
66
|
+
n_body = Nokogiri::HTML(response.body)
|
67
|
+
{ text: n_body.css('p').text }
|
68
|
+
end
|
69
|
+
|
70
|
+
def use_browser
|
71
|
+
true
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
**Initialization:**
|
77
|
+
|
78
|
+
engine = HttpReader::Engine.new(default_parser: TestParser)
|
79
|
+
engine.read('http://www.google.com')
|
80
|
+
|
81
|
+
**Or**
|
82
|
+
|
83
|
+
engine = HttpReader::Engine.new(parsers: [TestParser])
|
84
|
+
engine.read('http://www.google.com')
|
85
|
+
|
86
|
+
**Or**
|
87
|
+
|
88
|
+
engine = HttpReader::Engine.new
|
89
|
+
engine.read('http://www.google.com', parser: TestParser)
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
## More info about syntax
|
94
|
+
- [watir-webdriver](https://github.com/watir/watir-webdriver)
|
95
|
+
- [nokogiri](http://ruby.bastardsbook.com/chapters/html-parsing/)
|
96
|
+
|
97
|
+
## Dependencies
|
98
|
+
### Gems
|
99
|
+
- nokogiri
|
100
|
+
- httparty
|
101
|
+
- headless
|
102
|
+
- watir-webdriver
|
103
|
+
### System components
|
104
|
+
- xvfb
|
105
|
+
installation on Ubuntu: `sudo apt-get install xvfb`
|
106
|
+
|
107
|
+
|
108
|
+
## Contributing
|
109
|
+
|
110
|
+
1. Fork it
|
111
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
112
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
113
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
114
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/http_reader.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'http_reader/version'
|
5
|
+
|
6
|
+
# Gem specification for http_reader: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = 'http_reader'
  gem.version     = HttpReader::VERSION
  gem.authors     = ['Paweł Niemczyk']
  gem.email       = ['pniemczyk@o2.pl']
  gem.description = 'Read page body and parse to specific data'
  gem.summary     = 'Page parser'
  gem.homepage    = 'https://github.com/pniemczyk/http_reader'
  gem.license     = 'MIT'

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies.
  {
    'httparty'        => '~> 0.13',
    'nokogiri'        => '~> 1.6',
    'headless'        => '~> 1.0',
    'watir-webdriver' => '~> 0.6'
  }.each do |name, requirement|
    gem.add_dependency name, requirement
  end

  # Development-only dependencies.
  {
    'bundler'       => '~> 1.3',
    'rake'          => '~> 0',
    'rspec'         => '~> 3.1',
    'guard-rspec'   => '~> 0',
    'coveralls'     => '~> 0',
    'awesome_print' => '~> 0'
  }.each do |name, requirement|
    gem.add_development_dependency name, requirement
  end

  gem.post_install_message = 'Do not forget install xvfb. Have fun !'
end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module HttpReader
  # Matches URLs against a class-level regular expression.
  #
  # A subclass customises matching by assigning +@pattern+ in its class body.
  # A subclass that does not assign one uses the pattern of its nearest
  # ancestor that has one (class-level instance variables are not inherited
  # by Ruby itself, so the lookup is done explicitly in +pattern+).
  class BasePageMatcher
    # Default pattern: accepts any line of text.
    @pattern = /^.*$/

    class << self
      # Pattern used by +match+.
      #
      # @return [Regexp, nil] the pattern assigned on this class, otherwise
      #   the one resolved from the superclass chain; nil when none exists
      def pattern
        return @pattern if defined?(@pattern)

        superclass.pattern if superclass.respond_to?(:pattern)
      end

      # @param url [String, nil] text to test
      # @return [Boolean] true when +url+ matches +pattern+
      def match(url)
        !(url =~ pattern).nil?
      end
    end

    # Returns the given body unchanged.
    #
    # @param body [Object]
    # @return [Object] the same +body+
    def read(body)
      body
    end

    private

    # Instance-level shortcut to the class pattern.
    def pattern
      self.class.pattern
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module HttpReader
  # Base class for page parsers. All behaviour lives on the class itself:
  # the engine calls +match+, +use_browser+, +browse_actions_for_html+ and
  # +parse+ directly on the parser class.
  #
  # A subclass customises URL matching by assigning +@pattern+ in its class
  # body. A subclass that does not assign one uses the pattern of its nearest
  # ancestor that has one (class-level instance variables are not inherited
  # by Ruby itself, so the lookup is done explicitly in +pattern+).
  class BasePageParser
    # Default pattern: any http:// or https:// URL.
    @pattern = /^((http|https):\/\/).*$/

    class << self
      # Pattern used by +match+.
      #
      # @return [Regexp, nil] the pattern assigned on this class, otherwise
      #   the one resolved from the superclass chain; nil when none exists
      def pattern
        return @pattern if defined?(@pattern)

        superclass.pattern if superclass.respond_to?(:pattern)
      end

      # @param url [String, nil] address to test
      # @return [Boolean] true when +url+ matches +pattern+
      def match(url)
        !(url =~ pattern).nil?
      end

      # Default parsing: hand back the raw response body.
      #
      # @param response [#body] downloaded response
      # @param opts [Hash] unused by the base implementation
      # @return [Object] +response.body+
      def parse(response, opts = {})
        response.body
      end

      # Hook for browser-driven parsers; the base implementation does
      # nothing and returns nil.
      #
      # @param browser [Object] browser handle supplied by the caller
      # @param opts [Hash] unused by the base implementation
      # @return [nil]
      def browse_actions_for_html(browser, opts = {})
      end

      # @return [Boolean] false; the base parser does not need a browser
      def use_browser
        false
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require 'logger'
|
3
|
+
require 'watir-webdriver'
|
4
|
+
require 'headless'
|
5
|
+
|
6
|
+
|
7
|
+
module HttpReader
|
8
|
+
class Engine
|
9
|
+
ReadError = Class.new(StandardError)
|
10
|
+
DefaultResponse = Struct.new(:body, :code, :message, :headers)
|
11
|
+
attr_reader :parsers, :default_parser, :http_client, :browser, :logger
|
12
|
+
|
13
|
+
def initialize(config = {})
|
14
|
+
@parsers = config.fetch(:parsers, [])
|
15
|
+
@default_parser = config.fetch(:default_parser, HashPageParser)
|
16
|
+
@http_client = config.fetch(:http_client, HTTParty)
|
17
|
+
@browser = config.fetch(:browser, Watir::Browser)
|
18
|
+
@logger = config.fetch(:logger, Logger.new(STDOUT))
|
19
|
+
end
|
20
|
+
|
21
|
+
def read(url, opts = {})
|
22
|
+
parser = opts[:parser] || find_parser(url)
|
23
|
+
response = if parser.use_browser
|
24
|
+
browse_opts = opts.fetch(:browse_opts, {})
|
25
|
+
browse(url, parser, browse_opts)
|
26
|
+
else
|
27
|
+
request_opts = opts.fetch(:request_opts, {})
|
28
|
+
request(url, request_opts)
|
29
|
+
end
|
30
|
+
|
31
|
+
parse_opts = opts.fetch(:parse_opts, {})
|
32
|
+
parser.parse(response, parse_opts)
|
33
|
+
rescue => e
|
34
|
+
log_error('read', e, "url: #{url}, opts: #{opts.to_json}")
|
35
|
+
raise ReadError.new(e.message)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def find_parser(url)
|
41
|
+
parsers.each do |parser|
|
42
|
+
return parser if parser.match(url)
|
43
|
+
end
|
44
|
+
|
45
|
+
default_parser
|
46
|
+
end
|
47
|
+
|
48
|
+
def browse(url, parser, opts = {})
|
49
|
+
html = nil
|
50
|
+
headless.start
|
51
|
+
b = browser.start(url)
|
52
|
+
html = parser.browse_actions_for_html(b, opts)
|
53
|
+
b.close
|
54
|
+
headless.destroy
|
55
|
+
DefaultResponse.new(html, 200, opts[:message] || "success")
|
56
|
+
rescue => e
|
57
|
+
log_error('browse', e)
|
58
|
+
DefaultResponse.new(html, 500, e.message)
|
59
|
+
end
|
60
|
+
|
61
|
+
def request(url, opts = {})
|
62
|
+
method = opts.fetch(:method, :get)
|
63
|
+
options = opts.fetch(:options, {})
|
64
|
+
http_client.public_send(method, url, options)
|
65
|
+
rescue => e
|
66
|
+
log_error('request', e)
|
67
|
+
DefaultResponse.new(nil, 500, e.message)
|
68
|
+
end
|
69
|
+
|
70
|
+
def headless
|
71
|
+
@headless ||= Headless.new
|
72
|
+
end
|
73
|
+
|
74
|
+
def log_error(method, ex, info = nil)
|
75
|
+
logger.error("HttpReader::Engine##{method} - #{ex.message} #{info}")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module HttpReader
  # Parser that turns a page into a Hash using CSS selectors.
  #
  # +parse+ receives a mapping of result key => selector string. A selector
  # string may carry a type suffix after TYPE_SEPARATOR: "li;array" collects
  # the text of every matching node, while a plain selector yields the text
  # of the first match only.
  class HashPageParser < BasePageParser
    # Positions of key and selector within each [key, selector] pair.
    KEY_IDX = 0
    SELECTOR_IDX = 1
    # Separates the CSS selector from the optional type suffix.
    TYPE_SEPARATOR = ';'

    @pattern = /^((http|https):\/\/).*$/

    class << self
      # Builds a Hash by applying each selector in +opts+ to the page.
      #
      # @param response [#body] response whose body is HTML
      # @param opts [Hash{Object=>String}] result key => selector string
      # @return [Hash] same keys as +opts+; each value is a String (nil when
      #   nothing matched) or, for ";array" selectors, an Array of Strings
      def parse(response, opts = {})
        page = Nokogiri::HTML(response.body)
        opts.each_with_object({}) do |item, result|
          key, value = prepare_key_value(page, item)
          result[key] = value
        end
      end

      # The helpers below were written after a bare `private`, which has no
      # effect on `def self.` methods; defining them inside `class << self`
      # makes them private as intended.
      private

      # @param page [Object] parsed document answering +css+
      # @param item [Array] one [key, selector string] pair
      # @return [Array] [key, extracted value]
      def prepare_key_value(page, item)
        selector, is_array = prepare_selector(item[SELECTOR_IDX])
        texts = page.css(selector).map(&:text)
        [item[KEY_IDX], is_array ? texts : texts.first]
      end

      # Splits "selector;type" into the selector and an array flag.
      # Whitespace around the type and its letter case are ignored.
      #
      # @param value [String] selector string, optionally suffixed ";array"
      # @return [Array] [selector, Boolean]
      def prepare_selector(value)
        selector, type = value.split(TYPE_SEPARATOR)
        [selector, type.to_s.strip.downcase == 'array']
      end
    end
  end
end
|
data/lib/http_reader.rb
ADDED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'awesome_print'
|
3
|
+
Bundler.setup
|
4
|
+
|
5
|
+
require 'coveralls'
|
6
|
+
Coveralls.wear!
|
7
|
+
|
8
|
+
require 'http_reader'
|
9
|
+
|
10
|
+
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  # Run only :focus-tagged examples when any exist, otherwise everything.
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Use the documentation formatter when a single spec file is run.
  if rspec.files_to_run.one?
    rspec.default_formatter = 'doc'
  end

  # Report the ten slowest examples.
  rspec.profile_examples = 10

  # Random ordering, with Kernel's RNG seeded from the RSpec seed.
  rspec.order = :random
  Kernel.srand rspec.seed

  # Only the expect-style syntax is enabled for expectations and mocks.
  rspec.expect_with(:rspec) do |expectation_config|
    expectation_config.syntax = :expect
  end

  rspec.mock_with(:rspec) do |mock_config|
    mock_config.syntax = :expect
    mock_config.verify_partial_doubles = true
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Class-level behaviour of the base parser: pattern, URL matching and the
# pass-through parse.
describe HttpReader::BasePageParser do
  subject { described_class }

  let(:pattern) { /^((http|https):\/\/).*$/ }

  context 'self' do
    it '#pattern cover every url' do
      actual_pattern = subject.pattern
      expect(actual_pattern).to eq pattern
    end

    context '#match' do
      it 'returns true for url string' do
        matched = subject.match('http://some_url')
        expect(matched).to eq true
      end

      it 'returns false for non url string' do
        matched = subject.match('some_fake_url')
        expect(matched).to eq false
      end
    end

    context '#parse' do
      let(:body)     { 'test_body' }
      let(:response) { double('response', body: body) }

      it 'returns body' do
        parsed = subject.parse(response)
        expect(parsed).to eq body
      end
    end
  end
end
|
@@ -0,0 +1,216 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
describe HttpReader::Engine do
|
5
|
+
let(:default_parser) { double('default_parser') }
|
6
|
+
let(:parsers) { [] }
|
7
|
+
let(:http_client) { double('HTTParty') }
|
8
|
+
let(:browser) { double('Watir::Browser') }
|
9
|
+
let(:logger) { double('Logger') }
|
10
|
+
let(:headless) { double('Headless') }
|
11
|
+
|
12
|
+
let(:init_opts) do
|
13
|
+
{
|
14
|
+
parsers: parsers,
|
15
|
+
default_parser: default_parser,
|
16
|
+
http_client: http_client,
|
17
|
+
browser: browser,
|
18
|
+
logger: logger
|
19
|
+
}
|
20
|
+
end
|
21
|
+
|
22
|
+
let(:test_url) { 'http://localhost/test' }
|
23
|
+
let(:active_browser) { double('active_browser') }
|
24
|
+
|
25
|
+
subject { described_class.new(init_opts) }
|
26
|
+
context '#initialize' do
|
27
|
+
context 'init #parsers' do
|
28
|
+
context 'default' do
|
29
|
+
let(:init_opts) { {} }
|
30
|
+
it 'should eq []' do
|
31
|
+
expect(subject.parsers).to eq []
|
32
|
+
end
|
33
|
+
end
|
34
|
+
context 'optional' do
|
35
|
+
it 'can be set as array of parsers' do
|
36
|
+
expect(subject.parsers).to eq parsers
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
context 'init #default_parser' do
|
41
|
+
context 'default' do
|
42
|
+
let(:init_opts) { {} }
|
43
|
+
it 'should eq HashPageParser' do
|
44
|
+
expect(subject.default_parser).to eq HttpReader::HashPageParser
|
45
|
+
end
|
46
|
+
end
|
47
|
+
context 'optional' do
|
48
|
+
it 'can be set new default_parser' do
|
49
|
+
expect(subject.default_parser).to eq default_parser
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
context 'init #http_client' do
|
54
|
+
context 'default' do
|
55
|
+
let(:init_opts) { {} }
|
56
|
+
it 'should eq HTTParty' do
|
57
|
+
expect(subject.http_client).to eq HTTParty
|
58
|
+
end
|
59
|
+
end
|
60
|
+
context 'optional' do
|
61
|
+
it 'can be set new http_client' do
|
62
|
+
expect(subject.http_client).to eq http_client
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
context 'init #browser' do
|
67
|
+
context 'default' do
|
68
|
+
let(:init_opts) { {} }
|
69
|
+
it 'should eq Watir::Browser' do
|
70
|
+
expect(subject.browser).to eq Watir::Browser
|
71
|
+
end
|
72
|
+
end
|
73
|
+
context 'optional' do
|
74
|
+
it 'can be set new browser' do
|
75
|
+
expect(subject.browser).to eq browser
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
context 'init #logger' do
|
80
|
+
context 'default' do
|
81
|
+
let(:init_opts) { {} }
|
82
|
+
it 'should eq Logger' do
|
83
|
+
expect(subject.logger).to be_a Logger
|
84
|
+
end
|
85
|
+
end
|
86
|
+
context 'optional' do
|
87
|
+
# The description previously said "browser" (copied from the block above);
# this example verifies the injected logger.
it 'can be set new logger' do
  expect(subject.logger).to eq logger
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
context '#read' do
|
95
|
+
context 'should use parser' do
|
96
|
+
let(:parser_in_opts) { double('parser_in_opts') }
|
97
|
+
let(:response) { double('response', body: 'body')}
|
98
|
+
it 'from provided opts' do
|
99
|
+
expect(parser_in_opts).to receive(:use_browser).and_return(false)
|
100
|
+
expect(http_client).to receive(:get)
|
101
|
+
.with(test_url, {})
|
102
|
+
.and_return(response)
|
103
|
+
expect(parser_in_opts).to receive(:parse).with(response, {})
|
104
|
+
subject.read(test_url, parser: parser_in_opts)
|
105
|
+
end
|
106
|
+
|
107
|
+
it 'default when no parsers are available' do
|
108
|
+
expect(default_parser).to receive(:use_browser).and_return(false)
|
109
|
+
expect(http_client).to receive(:get)
|
110
|
+
.with(test_url, {})
|
111
|
+
.and_return(response)
|
112
|
+
expect(default_parser).to receive(:parse).with(response, {})
|
113
|
+
subject.read(test_url)
|
114
|
+
end
|
115
|
+
|
116
|
+
context 'which' do
|
117
|
+
let(:parser_one) { double('parser_one') }
|
118
|
+
let(:parser_two) { double('parser_two') }
|
119
|
+
let(:parsers) { [parser_one, parser_two] }
|
120
|
+
it 'match as first with url' do
|
121
|
+
expect(parser_one).to receive(:match).with(test_url).and_return(false)
|
122
|
+
expect(parser_two).to receive(:match).with(test_url).and_return(true)
|
123
|
+
expect(parser_two).to receive(:use_browser).and_return(false)
|
124
|
+
expect(http_client).to receive(:get)
|
125
|
+
.with(test_url, {})
|
126
|
+
.and_return(response)
|
127
|
+
expect(parser_two).to receive(:parse).with(response, {})
|
128
|
+
subject.read(test_url)
|
129
|
+
end
|
130
|
+
|
131
|
+
it 'is default_parser when no parser match' do
|
132
|
+
expect(parser_one).to receive(:match).with(test_url).and_return(false)
|
133
|
+
expect(parser_two).to receive(:match).with(test_url).and_return(false)
|
134
|
+
expect(default_parser).to receive(:use_browser).and_return(false)
|
135
|
+
expect(http_client).to receive(:get)
|
136
|
+
.with(test_url, {})
|
137
|
+
.and_return(response)
|
138
|
+
expect(default_parser).to receive(:parse).with(response, {})
|
139
|
+
subject.read(test_url)
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
it 'should provide parse_opts to #parser#parse method' do
|
145
|
+
parse_opts = { title: 'h1' }
|
146
|
+
response = double('response', body: 'body')
|
147
|
+
expect(default_parser).to receive(:use_browser).and_return(false)
|
148
|
+
expect(http_client).to receive(:get)
|
149
|
+
.with(test_url, {})
|
150
|
+
.and_return(response)
|
151
|
+
expect(default_parser).to receive(:parse).with(response, parse_opts)
|
152
|
+
subject.read(test_url, parse_opts: parse_opts)
|
153
|
+
end
|
154
|
+
|
155
|
+
it 'should provide request_opts to request method' do
|
156
|
+
http_client_method = :post
|
157
|
+
request_opts = { method: http_client_method, options: { body: {token: '123'}}}
|
158
|
+
response = double('response', body: 'body')
|
159
|
+
expect(default_parser).to receive(:use_browser).and_return(false)
|
160
|
+
expect(http_client).to receive(http_client_method)
|
161
|
+
.with(test_url, request_opts[:options])
|
162
|
+
.and_return(response)
|
163
|
+
expect(default_parser).to receive(:parse).with(response, {})
|
164
|
+
subject.read(test_url, request_opts: request_opts)
|
165
|
+
end
|
166
|
+
|
167
|
+
# Browser path: browse_opts must reach the parser's browser hook, and the
# browser and headless session must be started and released in order.
# (The description previously duplicated the request_opts example above.)
it 'should provide browse_opts to browse method' do
  message = 'done'
  browse_opts = { process: :continue, message: message}
  browser_body = "body"
  response = described_class::DefaultResponse.new(browser_body, 200, message)
  expect(Headless).to receive(:new).and_return(headless)
  expect(headless).to receive(:start)
  expect(default_parser).to receive(:use_browser).and_return(true)
  expect(browser).to receive(:start).with(test_url).and_return(active_browser)
  expect(default_parser).to receive(:browse_actions_for_html)
    .with(active_browser, browse_opts)
    .and_return(browser_body)
  expect(active_browser).to receive(:close)
  expect(headless).to receive(:destroy)
  expect(default_parser).to receive(:parse).with(response, {})

  subject.read(test_url, browse_opts: browse_opts)
end
|
185
|
+
|
186
|
+
context 'on raise errors' do
|
187
|
+
|
188
|
+
it 'raise ReadError' do
|
189
|
+
error_msg = 'HttpReader::Engine#read - Bad url: http://localhost/test, opts: {}'
|
190
|
+
expect(default_parser).to receive(:use_browser).and_raise('Bad')
|
191
|
+
expect(logger).to receive(:error).with(error_msg)
|
192
|
+
expect { subject.read(test_url) }.to raise_error(described_class::ReadError, 'Bad')
|
193
|
+
end
|
194
|
+
|
195
|
+
it 'in #request' do
|
196
|
+
error_msg = 'HttpReader::Engine#request - Bad '
|
197
|
+
response = described_class::DefaultResponse.new(nil, 500, 'Bad')
|
198
|
+
expect(default_parser).to receive(:use_browser).and_return(false)
|
199
|
+
expect(http_client).to receive(:get).with(test_url, {}).and_raise('Bad')
|
200
|
+
expect(default_parser).to receive(:parse).with(response, {})
|
201
|
+
expect(logger).to receive(:error).with(error_msg)
|
202
|
+
subject.read(test_url)
|
203
|
+
end
|
204
|
+
|
205
|
+
it 'in #browse' do
|
206
|
+
error_msg = 'HttpReader::Engine#browse - Bad '
|
207
|
+
response = described_class::DefaultResponse.new(nil, 500, 'Bad')
|
208
|
+
expect(default_parser).to receive(:use_browser).and_return(true)
|
209
|
+
expect(browser).to receive(:start).with(test_url).and_raise('Bad')
|
210
|
+
expect(default_parser).to receive(:parse).with(response, {})
|
211
|
+
expect(logger).to receive(:error).with(error_msg)
|
212
|
+
subject.read(test_url)
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
216
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Class-level behaviour of the hash parser: pattern, URL matching and
# selector-driven extraction.
describe HttpReader::HashPageParser do
  subject { described_class }

  let(:pattern) { /^((http|https):\/\/).*$/ }

  context 'self' do
    it '#pattern cover every url' do
      expect(subject.pattern).to eq pattern
    end

    context '#match' do
      it 'returns true for url string' do
        expect(subject.match('http://some_url')).to eq true
      end

      it 'returns false for non url string' do
        expect(subject.match('some_fake_url')).to eq false
      end
    end

    context '#parse' do
      let(:body) { '<h1>Information</h1><p>not importante</p><div class="content">Items: <ul><li>A</li><li>B</li><li>C</li></ul></div>' }
      let(:opts) { {title: 'h1', items: '.content li;array'} }
      let(:response) { double('response', body: body) }
      let(:result) { {:title=>"Information", :items=>%w{A B C}} }

      # Previously described as "returns body", which is what the base
      # parser does; this parser returns a hash built from the selectors.
      it 'returns hash of values extracted by selectors' do
        expect(subject.parse(response, opts)).to eq result
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: http_reader
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paweł Niemczyk
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-09-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: httparty
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.13'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.13'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.6'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.6'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: headless
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: watir-webdriver
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.6'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.6'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.3'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.3'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '3.1'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.1'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: guard-rspec
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: coveralls
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: awesome_print
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
description: Read page body and parse to specific data
|
154
|
+
email:
|
155
|
+
- pniemczyk@o2.pl
|
156
|
+
executables: []
|
157
|
+
extensions: []
|
158
|
+
extra_rdoc_files: []
|
159
|
+
files:
|
160
|
+
- ".gitignore"
|
161
|
+
- ".rspec"
|
162
|
+
- ".travis.yml"
|
163
|
+
- Gemfile
|
164
|
+
- Guardfile
|
165
|
+
- LICENSE.txt
|
166
|
+
- README.md
|
167
|
+
- Rakefile
|
168
|
+
- http_reader.gemspec
|
169
|
+
- lib/http_reader.rb
|
170
|
+
- lib/http_reader/base_page_matcher.rb
|
171
|
+
- lib/http_reader/base_page_parser.rb
|
172
|
+
- lib/http_reader/engine.rb
|
173
|
+
- lib/http_reader/hash_page_parser.rb
|
174
|
+
- lib/http_reader/version.rb
|
175
|
+
- spec/spec_helper.rb
|
176
|
+
- spec/unit/lib/http_reader/base_page_parser_spec.rb
|
177
|
+
- spec/unit/lib/http_reader/engine_spec.rb
|
178
|
+
- spec/unit/lib/http_reader/hash_page_parser_spec.rb
|
179
|
+
- spec/unit/lib/http_reader_spec.rb
|
180
|
+
homepage: https://github.com/pniemczyk/http_reader
|
181
|
+
licenses:
|
182
|
+
- MIT
|
183
|
+
metadata: {}
|
184
|
+
post_install_message: Do not forget install xvfb. Have fun !
|
185
|
+
rdoc_options: []
|
186
|
+
require_paths:
|
187
|
+
- lib
|
188
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
189
|
+
requirements:
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: '0'
|
193
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
version: '0'
|
198
|
+
requirements: []
|
199
|
+
rubyforge_project:
|
200
|
+
rubygems_version: 2.2.2
|
201
|
+
signing_key:
|
202
|
+
specification_version: 4
|
203
|
+
summary: Page parser
|
204
|
+
test_files:
|
205
|
+
- spec/spec_helper.rb
|
206
|
+
- spec/unit/lib/http_reader/base_page_parser_spec.rb
|
207
|
+
- spec/unit/lib/http_reader/engine_spec.rb
|
208
|
+
- spec/unit/lib/http_reader/hash_page_parser_spec.rb
|
209
|
+
- spec/unit/lib/http_reader_spec.rb
|
210
|
+
has_rdoc:
|