wombat 2.4.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile.lock +52 -50
- data/README.md +4 -4
- data/VERSION +1 -1
- data/examples/iterator.rb +25 -25
- data/examples/list.rb +22 -22
- data/examples/xml.rb +15 -15
- data/fixtures/vcr_cassettes/make_post_request.yml +69 -0
- data/lib/wombat/crawler.rb +13 -7
- data/lib/wombat/dsl/metadata.rb +8 -0
- data/lib/wombat/processing/parser.rb +22 -7
- data/lib/wombat/property/locators/factory.rb +28 -28
- data/lib/wombat/property/locators/follow.rb +15 -38
- data/lib/wombat/property/locators/list.rb +13 -13
- data/lib/wombat.rb +8 -8
- data/spec/crawler_spec.rb +64 -42
- data/spec/integration/crawler_inheritance_spec.rb +63 -0
- data/spec/integration/integration_spec.rb +30 -0
- data/spec/processing/parser_spec.rb +35 -0
- data/spec/property/locators/factory_spec.rb +14 -14
- data/spec/property/locators/html_spec.rb +18 -18
- data/spec/property/locators/list_spec.rb +8 -8
- data/spec/property/locators/text_spec.rb +44 -44
- data/spec/wombat_spec.rb +36 -36
- data/wombat.gemspec +7 -3
- metadata +6 -2
@@ -1,16 +1,16 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Wombat::Property::Locators::Text do
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
4
|
+
it 'should locate text property with xpath selector and namespaces' do
|
5
|
+
fake_elem = double :element
|
6
|
+
context = double :context
|
7
|
+
fake_elem.stub inner_text: "Something cool "
|
8
|
+
context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
|
9
|
+
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :text, 'boom')
|
10
|
+
|
11
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
12
|
+
|
13
|
+
locator.locate(context).should == { "data1" => "Something cool" }
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'should locate text property with xpath selector using xpath functions' do
|
@@ -23,37 +23,37 @@ describe Wombat::Property::Locators::Text do
|
|
23
23
|
locator.locate(context).should == { "data1" => "Something" }
|
24
24
|
end
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
end
|
26
|
+
it 'should locate text property with css selector' do
|
27
|
+
fake_elem = double :element
|
28
|
+
context = double :context
|
29
|
+
fake_elem.stub inner_text: "My name"
|
30
|
+
context.stub(:css).with("/def").and_return [fake_elem]
|
31
|
+
property = Wombat::DSL::Property.new('data1', 'css=/def', :text)
|
32
|
+
|
33
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
34
|
+
|
35
|
+
locator.locate(context).should == { "data1" => "My name" }
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should return plain symbols as strings' do
|
39
|
+
fake_elem = double :element
|
40
|
+
context = double :context
|
41
|
+
property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
|
42
|
+
|
43
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
44
|
+
|
45
|
+
locator.locate(context).should == { "data_2" => "hardcoded_value" }
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should invoke property callback' do
|
49
|
+
fake_elem = double :element
|
50
|
+
context = double :context
|
51
|
+
fake_elem.stub inner_text: "My name"
|
52
|
+
context.stub(:css).with("/def").and_return [fake_elem]
|
53
|
+
property = Wombat::DSL::Property.new('data1', 'css=/def', :text) { |s| s.gsub(/name/, 'ass') }
|
54
|
+
|
55
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
56
|
+
|
57
|
+
locator.locate(context).should == { "data1" => "My ass" }
|
58
|
+
end
|
59
|
+
end
|
data/spec/wombat_spec.rb
CHANGED
@@ -1,45 +1,45 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Wombat do
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
it 'should provide syntactic sugar method Wombat.crawl' do
|
5
|
+
Wombat.should respond_to(:crawl)
|
6
|
+
end
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
8
|
+
it 'should provide syntactic sugar method Wombat.scrape' do
|
9
|
+
Wombat.should respond_to(:scrape)
|
10
|
+
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
it 'should redirect .scrape to .crawl' do
|
13
|
+
fake_class = double :fake
|
14
|
+
fake_class.stub :include
|
15
|
+
fake_class.should_receive(:new).and_return(double(crawl: nil))
|
16
|
+
Class.stub :new => fake_class
|
17
|
+
Wombat.scrape
|
18
|
+
end
|
19
19
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
it 'should provide configuration method with block' do
|
21
|
+
Wombat.configure do |config|
|
22
|
+
config.set_proxy "10.0.0.1", 8080
|
23
|
+
config.set_user_agent "Wombat"
|
24
|
+
config.set_user_agent_alias 'Mac Safari'
|
25
|
+
end
|
26
|
+
Wombat.proxy_args.should == ["10.0.0.1", 8080]
|
27
|
+
Wombat.user_agent.should == 'Wombat'
|
28
|
+
Wombat.user_agent_alias.should == 'Mac Safari'
|
29
|
+
end
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
it 'should accept regular properties (non-selectors)' do
|
32
|
+
VCR.use_cassette('broken_selector') do
|
33
|
+
lambda {
|
34
|
+
Wombat.crawl do
|
35
|
+
base_url "http://www.github.com"
|
36
|
+
path "/"
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
38
|
+
source :obaoba
|
39
|
+
description 'Oba Oba'
|
40
|
+
website 'http://obaoba.com.br'
|
41
|
+
end
|
42
|
+
}.should_not raise_error
|
43
|
+
end
|
44
|
+
end
|
45
45
|
end
|
data/wombat.gemspec
CHANGED
@@ -2,19 +2,20 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: wombat 2.4.0 ruby lib
|
5
|
+
# stub: wombat 2.5.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "wombat"
|
9
|
-
s.version = "2.4.0"
|
9
|
+
s.version = "2.5.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.authors = ["Felipe Lima"]
|
14
|
-
s.date = "
|
14
|
+
s.date = "2016-01-27"
|
15
15
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
16
16
|
s.email = "felipe.lima@gmail.com"
|
17
17
|
s.extra_rdoc_files = [
|
18
|
+
"CHANGELOG.md",
|
18
19
|
"LICENSE.txt",
|
19
20
|
"README.md"
|
20
21
|
]
|
@@ -22,6 +23,7 @@ Gem::Specification.new do |s|
|
|
22
23
|
".document",
|
23
24
|
".rspec",
|
24
25
|
".travis.yml",
|
26
|
+
"CHANGELOG.md",
|
25
27
|
"Gemfile",
|
26
28
|
"Gemfile.lock",
|
27
29
|
"Guardfile",
|
@@ -41,6 +43,7 @@ Gem::Specification.new do |s|
|
|
41
43
|
"fixtures/vcr_cassettes/follow_relative_links.yml",
|
42
44
|
"fixtures/vcr_cassettes/for_each_page.yml",
|
43
45
|
"fixtures/vcr_cassettes/headers_selector.yml",
|
46
|
+
"fixtures/vcr_cassettes/make_post_request.yml",
|
44
47
|
"fixtures/vcr_cassettes/xml_with_namespace.yml",
|
45
48
|
"lib/wombat.rb",
|
46
49
|
"lib/wombat/crawler.rb",
|
@@ -64,6 +67,7 @@ Gem::Specification.new do |s|
|
|
64
67
|
"spec/crawler_spec.rb",
|
65
68
|
"spec/dsl/property_spec.rb",
|
66
69
|
"spec/helpers/sample_crawler.rb",
|
70
|
+
"spec/integration/crawler_inheritance_spec.rb",
|
67
71
|
"spec/integration/follow_relative_links_spec.rb",
|
68
72
|
"spec/integration/integration_spec.rb",
|
69
73
|
"spec/processing/parser_spec.rb",
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.4.0
|
4
|
+
version: 2.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Felipe Lima
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -169,12 +169,14 @@ email: felipe.lima@gmail.com
|
|
169
169
|
executables: []
|
170
170
|
extensions: []
|
171
171
|
extra_rdoc_files:
|
172
|
+
- CHANGELOG.md
|
172
173
|
- LICENSE.txt
|
173
174
|
- README.md
|
174
175
|
files:
|
175
176
|
- ".document"
|
176
177
|
- ".rspec"
|
177
178
|
- ".travis.yml"
|
179
|
+
- CHANGELOG.md
|
178
180
|
- Gemfile
|
179
181
|
- Gemfile.lock
|
180
182
|
- Guardfile
|
@@ -194,6 +196,7 @@ files:
|
|
194
196
|
- fixtures/vcr_cassettes/follow_relative_links.yml
|
195
197
|
- fixtures/vcr_cassettes/for_each_page.yml
|
196
198
|
- fixtures/vcr_cassettes/headers_selector.yml
|
199
|
+
- fixtures/vcr_cassettes/make_post_request.yml
|
197
200
|
- fixtures/vcr_cassettes/xml_with_namespace.yml
|
198
201
|
- lib/wombat.rb
|
199
202
|
- lib/wombat/crawler.rb
|
@@ -217,6 +220,7 @@ files:
|
|
217
220
|
- spec/crawler_spec.rb
|
218
221
|
- spec/dsl/property_spec.rb
|
219
222
|
- spec/helpers/sample_crawler.rb
|
223
|
+
- spec/integration/crawler_inheritance_spec.rb
|
220
224
|
- spec/integration/follow_relative_links_spec.rb
|
221
225
|
- spec/integration/integration_spec.rb
|
222
226
|
- spec/processing/parser_spec.rb
|