wombat 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +52 -2
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/basic_crawler_page.yml +517 -508
- data/lib/wombat/crawler.rb +2 -6
- data/lib/wombat/iterator.rb +9 -0
- data/lib/wombat/metadata.rb +13 -0
- data/lib/wombat/parser.rb +5 -3
- data/lib/wombat/property_container.rb +1 -1
- data/lib/wombat/property_locator.rb +3 -3
- data/spec/crawler_spec.rb +15 -0
- data/spec/integration/integration_spec.rb +7 -1
- data/spec/metadata_spec.rb +6 -0
- data/spec/parser_spec.rb +7 -5
- data/spec/property_container_spec.rb +3 -2
- data/spec/property_locator_spec.rb +3 -3
- data/wombat.gemspec +3 -2
- metadata +27 -26
data/lib/wombat/crawler.rb
CHANGED
@@ -29,12 +29,8 @@ module Wombat
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
def
|
33
|
-
|
34
|
-
end
|
35
|
-
|
36
|
-
def for_each selector
|
37
|
-
|
32
|
+
def for_each selector, &block
|
33
|
+
metadata.for_each(selector).instance_eval(&block) if block
|
38
34
|
end
|
39
35
|
|
40
36
|
def follow_links selector
|
data/lib/wombat/metadata.rb
CHANGED
@@ -1,8 +1,15 @@
|
|
1
1
|
#coding: utf-8
|
2
2
|
require 'wombat/property_container'
|
3
|
+
require 'wombat/iterator'
|
3
4
|
|
4
5
|
module Wombat
|
5
6
|
class Metadata < PropertyContainer
|
7
|
+
attr_accessor :iterators
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@iterators = []
|
11
|
+
end
|
12
|
+
|
6
13
|
def base_url url
|
7
14
|
self[:base_url] = url
|
8
15
|
end
|
@@ -10,5 +17,11 @@ module Wombat
|
|
10
17
|
def list_page url
|
11
18
|
self[:list_page] = url
|
12
19
|
end
|
20
|
+
|
21
|
+
def for_each selector
|
22
|
+
Iterator.new(selector).tap do |i|
|
23
|
+
iterators << i
|
24
|
+
end
|
25
|
+
end
|
13
26
|
end
|
14
27
|
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -14,10 +14,12 @@ module Wombat
|
|
14
14
|
def parse metadata
|
15
15
|
@context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
|
16
16
|
|
17
|
-
|
17
|
+
props = metadata.all_properties
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
locate props
|
20
|
+
|
21
|
+
props.each do |p|
|
22
|
+
p.result = p.callback.call(p.result) if p.callback
|
21
23
|
end
|
22
24
|
|
23
25
|
metadata.flatten
|
data/lib/wombat/property_locator.rb
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
module Wombat
|
4
4
|
module PropertyLocator
|
5
|
-
def locate
|
6
|
-
|
5
|
+
def locate properties
|
6
|
+
properties.each do |p|
|
7
7
|
p.result = locate_property(p).first
|
8
8
|
end
|
9
9
|
end
|
10
10
|
|
11
11
|
private
|
12
12
|
def locate_property property
|
13
|
-
result = locate_selector(property.selector, property.namespaces)
|
13
|
+
result = locate_selector(property.selector, property.namespaces).to_a
|
14
14
|
result.map! {|r| r.inner_html.strip } if property.format == :html
|
15
15
|
result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
|
16
16
|
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -91,4 +91,19 @@ describe Wombat::Crawler do
|
|
91
91
|
it 'should not explode if no block given' do
|
92
92
|
@crawler.event
|
93
93
|
end
|
94
|
+
|
95
|
+
it 'should iterate on elements inside for_each block' do
|
96
|
+
@crawler.for_each "css=.element" do
|
97
|
+
title "css=.title"
|
98
|
+
body "css=.body"
|
99
|
+
end
|
100
|
+
|
101
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
102
|
+
arg.iterators.first.selector.should == "css=.element"
|
103
|
+
arg.iterators.first["title"].selector.should == "css=.title"
|
104
|
+
arg.iterators.first["body"].selector.should == "css=.body"
|
105
|
+
end
|
106
|
+
|
107
|
+
@crawler_instance.crawl
|
108
|
+
end
|
94
109
|
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
require 'spec_helper'
|
2
3
|
|
3
4
|
describe 'basic crawler setup' do
|
@@ -14,12 +15,17 @@ describe 'basic crawler setup' do
|
|
14
15
|
s.twitter "css=.ctn-bar li.last"
|
15
16
|
end
|
16
17
|
|
18
|
+
crawler.subheader "css=h2.ttl-dynamic" do |h|
|
19
|
+
h.gsub("London", "Londres")
|
20
|
+
end
|
21
|
+
|
17
22
|
crawler_instance = crawler.new
|
18
23
|
|
19
24
|
results = crawler_instance.crawl
|
20
25
|
|
21
26
|
results["search"].should == "Buscar"
|
22
|
-
results["
|
27
|
+
results["subheader"].should == "Londres 2012"
|
28
|
+
results["social"]["twitter"].should == "Verão"
|
23
29
|
end
|
24
30
|
end
|
25
31
|
end
|
data/spec/metadata_spec.rb
CHANGED
@@ -11,4 +11,10 @@ describe Wombat::Metadata do
|
|
11
11
|
@metadata.list_page "/yeah"
|
12
12
|
@metadata.all_properties.should == [@metadata['another_property']]
|
13
13
|
end
|
14
|
+
|
15
|
+
it 'should store iterators' do
|
16
|
+
@metadata.for_each("some_selector").kind_of?(Wombat::Iterator).should be_true
|
17
|
+
@metadata.iterators.size.should == 1
|
18
|
+
@metadata.iterators.first.selector.should == "some_selector"
|
19
|
+
end
|
14
20
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -24,7 +24,7 @@ describe Wombat::Parser do
|
|
24
24
|
fake_parser = double :parser
|
25
25
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
26
26
|
@parser.mechanize.stub(:get).and_return fake_document
|
27
|
-
@parser.should_receive(:locate).with(@metadata)
|
27
|
+
@parser.should_receive(:locate).with(@metadata.all_properties)
|
28
28
|
@parser.parse @metadata
|
29
29
|
end
|
30
30
|
|
@@ -38,10 +38,11 @@ describe Wombat::Parser do
|
|
38
38
|
property.stub(:result)
|
39
39
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
40
40
|
property.should_receive(:callback).twice.and_return(block)
|
41
|
+
property.should_receive(:result=).with(true)
|
41
42
|
|
42
43
|
@parser.mechanize.stub(:get).and_return fake_document
|
43
|
-
@
|
44
|
-
@
|
44
|
+
@metadata.stub(:all_properties).and_return [property]
|
45
|
+
@parser.should_receive(:locate).with(@metadata.all_properties)
|
45
46
|
|
46
47
|
@parser.parse @metadata
|
47
48
|
|
@@ -61,10 +62,11 @@ describe Wombat::Parser do
|
|
61
62
|
property.should_receive(:result).and_return("blah")
|
62
63
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
63
64
|
property.should_receive(:callback).twice.and_return(block)
|
65
|
+
property.should_receive(:result=).with(true)
|
64
66
|
|
65
67
|
@parser.mechanize.stub(:get).and_return fake_document
|
66
|
-
@
|
67
|
-
@
|
68
|
+
@metadata.stub(:all_properties).and_return [property]
|
69
|
+
@parser.should_receive(:locate).with(@metadata.all_properties)
|
68
70
|
|
69
71
|
@parser.parse @metadata
|
70
72
|
|
data/spec/property_container_spec.rb
CHANGED
@@ -9,12 +9,13 @@ describe Wombat::PropertyContainer do
|
|
9
9
|
@metadata["event"] = Wombat::PropertyContainer.new
|
10
10
|
@metadata["venue"] = Wombat::PropertyContainer.new
|
11
11
|
@metadata.another_property "/some/selector", :text
|
12
|
-
@metadata["event"]
|
12
|
+
@metadata["event"]["something"] = Wombat::PropertyContainer.new
|
13
|
+
@metadata["event"]["something"].else "Wohooo"
|
13
14
|
@metadata["venue"].awesome "whooea"
|
14
15
|
|
15
16
|
all_propes = @metadata.all_properties
|
16
17
|
|
17
|
-
all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"], @metadata["venue"]["awesome"]]
|
18
|
+
all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"]["else"], @metadata["venue"]["awesome"]]
|
18
19
|
end
|
19
20
|
|
20
21
|
it 'should be able to change properties via all_properties' do
|
data/spec/property_locator_spec.rb
CHANGED
@@ -28,7 +28,7 @@ describe Wombat::PropertyLocator do
|
|
28
28
|
|
29
29
|
@locator_instance.stub(:context).and_return context
|
30
30
|
|
31
|
-
@locator_instance.locate @metadata
|
31
|
+
@locator_instance.locate @metadata.all_properties
|
32
32
|
|
33
33
|
@metadata.get_property("blah").result.should == "abc"
|
34
34
|
@metadata["event"].get_property("data1").result.should == "Something cool"
|
@@ -47,7 +47,7 @@ describe Wombat::PropertyLocator do
|
|
47
47
|
|
48
48
|
@metadata["event"].another_info "xpath=/anotherData", :html
|
49
49
|
|
50
|
-
@locator_instance.locate @metadata
|
50
|
+
@locator_instance.locate @metadata.all_properties
|
51
51
|
|
52
52
|
@metadata["event"].get_property("another_info").result.should == "some another info"
|
53
53
|
end
|
@@ -59,7 +59,7 @@ describe Wombat::PropertyLocator do
|
|
59
59
|
@locator_instance.stub(:context).and_return context
|
60
60
|
@metadata["event"].description "xpath=/event/some/description", :text, "blah"
|
61
61
|
|
62
|
-
@locator_instance.locate @metadata
|
62
|
+
@locator_instance.locate @metadata.all_properties
|
63
63
|
|
64
64
|
@metadata["event"].get_property("description").result.should == "awesome event"
|
65
65
|
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.1.5"
|
8
|
+
s.version = "0.1.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-08"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -30,6 +30,7 @@ Gem::Specification.new do |s|
|
|
30
30
|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
31
31
|
"lib/wombat.rb",
|
32
32
|
"lib/wombat/crawler.rb",
|
33
|
+
"lib/wombat/iterator.rb",
|
33
34
|
"lib/wombat/metadata.rb",
|
34
35
|
"lib/wombat/parser.rb",
|
35
36
|
"lib/wombat/property.rb",
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70163534813940 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70163534813940
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: activesupport
|
27
|
-
requirement: &
|
27
|
+
requirement: &70163534813160 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70163534813160
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: bundler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70163534812680 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70163534812680
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &70163534812200 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70163534812200
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: yard
|
60
|
-
requirement: &
|
60
|
+
requirement: &70163534811700 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70163534811700
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70163534811200 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70163534811200
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rspec
|
82
|
-
requirement: &
|
82
|
+
requirement: &70163534810720 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70163534810720
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: guard
|
93
|
-
requirement: &
|
93
|
+
requirement: &70163534810240 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70163534810240
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: guard-rspec
|
104
|
-
requirement: &
|
104
|
+
requirement: &70163535048080 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70163535048080
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: guard-bundler
|
115
|
-
requirement: &
|
115
|
+
requirement: &70163535047560 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70163535047560
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: vcr
|
126
|
-
requirement: &
|
126
|
+
requirement: &70163535047060 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - =
|
@@ -131,10 +131,10 @@ dependencies:
|
|
131
131
|
version: 2.0.0.rc1
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70163535047060
|
135
135
|
- !ruby/object:Gem::Dependency
|
136
136
|
name: fakeweb
|
137
|
-
requirement: &
|
137
|
+
requirement: &70163535046440 !ruby/object:Gem::Requirement
|
138
138
|
none: false
|
139
139
|
requirements:
|
140
140
|
- - ! '>='
|
@@ -142,7 +142,7 @@ dependencies:
|
|
142
142
|
version: '0'
|
143
143
|
type: :development
|
144
144
|
prerelease: false
|
145
|
-
version_requirements: *
|
145
|
+
version_requirements: *70163535046440
|
146
146
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
147
147
|
email: felipe.lima@gmail.com
|
148
148
|
executables: []
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
165
165
|
- lib/wombat.rb
|
166
166
|
- lib/wombat/crawler.rb
|
167
|
+
- lib/wombat/iterator.rb
|
167
168
|
- lib/wombat/metadata.rb
|
168
169
|
- lib/wombat/parser.rb
|
169
170
|
- lib/wombat/property.rb
|