wombat 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +1 -4
- data/Gemfile +4 -0
- data/Gemfile.lock +11 -0
- data/README.md +22 -0
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/basic_crawler_page.yml +600 -0
- data/lib/wombat/crawler.rb +20 -17
- data/lib/wombat/metadata.rb +7 -17
- data/lib/wombat/parser.rb +6 -4
- data/lib/wombat/property_container.rb +44 -0
- data/lib/wombat/property_locator.rb +3 -3
- data/spec/crawler_spec.rb +35 -13
- data/spec/helpers/sample_crawler.rb +27 -8
- data/spec/integration/integration_spec.rb +25 -0
- data/spec/metadata_spec.rb +7 -14
- data/spec/parser_spec.rb +20 -54
- data/spec/property_container_spec.rb +49 -0
- data/spec/property_locator_spec.rb +21 -11
- data/spec/sample_crawler_spec.rb +10 -8
- data/spec/spec_helper.rb +7 -1
- data/wombat.gemspec +20 -6
- metadata +68 -22
- data/README.rdoc +0 -20
- data/lib/wombat/properties.rb +0 -31
- data/spec/properties_spec.rb +0 -31
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::PropertyContainer do
|
4
|
+
before(:each) do
|
5
|
+
@metadata = Wombat::PropertyContainer.new
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'should return an array with all the metadata properties' do
|
9
|
+
@metadata["event"] = Wombat::PropertyContainer.new
|
10
|
+
@metadata["venue"] = Wombat::PropertyContainer.new
|
11
|
+
@metadata.another_property "/some/selector", :text
|
12
|
+
@metadata["event"].something "else"
|
13
|
+
@metadata["venue"].awesome "whooea"
|
14
|
+
|
15
|
+
all_propes = @metadata.all_properties
|
16
|
+
|
17
|
+
all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"], @metadata["venue"]["awesome"]]
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should be able to change properties via all_properties' do
|
21
|
+
@metadata.another_property "/some/selector", :text
|
22
|
+
@metadata.all_properties.first.selector = "abc"
|
23
|
+
@metadata["another_property"].selector.should == "abc"
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should return metadata in plain hash format' do
|
27
|
+
@metadata.title "/some/selector"
|
28
|
+
@metadata["title"].result = "Gogobot Inc."
|
29
|
+
@metadata["holder"] = Wombat::PropertyContainer.new
|
30
|
+
@metadata["holder"].heading "css=.heading"
|
31
|
+
@metadata["holder"]["heading"].result = 123456
|
32
|
+
@metadata["holder"]["subheader"] = Wombat::PropertyContainer.new
|
33
|
+
@metadata["holder"]["subheader"].section "/blah"
|
34
|
+
@metadata["holder"]["subheader"]["section"].result = "Lorem Ipsum"
|
35
|
+
@metadata.footer("another thing", :html) { |a| true }
|
36
|
+
@metadata["footer"].result = "bla bla bla"
|
37
|
+
|
38
|
+
@metadata.flatten.should == {
|
39
|
+
"title" => "Gogobot Inc.",
|
40
|
+
"holder" => {
|
41
|
+
"heading" => 123456,
|
42
|
+
"subheader" => {
|
43
|
+
"section" => "Lorem Ipsum"
|
44
|
+
}
|
45
|
+
},
|
46
|
+
"footer" => "bla bla bla"
|
47
|
+
}
|
48
|
+
end
|
49
|
+
end
|
@@ -6,24 +6,34 @@ describe Wombat::PropertyLocator do
|
|
6
6
|
@locator.send(:include, Wombat::PropertyLocator)
|
7
7
|
@locator_instance = @locator.new
|
8
8
|
@metadata = Wombat::Metadata.new
|
9
|
+
@metadata["event"] = Wombat::PropertyContainer.new
|
10
|
+
@metadata["venue"] = Wombat::PropertyContainer.new
|
11
|
+
@metadata["location"] = Wombat::PropertyContainer.new
|
9
12
|
end
|
10
13
|
|
11
14
|
it 'should locate metadata properties' do
|
12
15
|
context = double :context
|
13
|
-
|
16
|
+
abc = double :abc
|
17
|
+
|
18
|
+
abc.stub(:inner_text).and_return("Something cool")
|
19
|
+
|
20
|
+
context.stub(:xpath).with("/abc", nil).and_return([abc])
|
21
|
+
context.stub(:xpath).with("/bah", nil).and_return(["abc"])
|
14
22
|
context.stub(:css).with("/ghi").and_return(["Another stuff"])
|
15
23
|
|
16
|
-
@metadata.
|
17
|
-
@metadata.
|
18
|
-
@metadata.
|
24
|
+
@metadata["event"].data1 "xpath=/abc"
|
25
|
+
@metadata["venue"].data2 :farms
|
26
|
+
@metadata["location"].data3 "css=/ghi"
|
27
|
+
@metadata.blah "xpath=/bah"
|
19
28
|
|
20
29
|
@locator_instance.stub(:context).and_return context
|
21
30
|
|
22
31
|
@locator_instance.locate @metadata
|
23
32
|
|
24
|
-
@metadata.
|
25
|
-
@metadata.
|
26
|
-
@metadata.
|
33
|
+
@metadata.get_property("blah").result.should == "abc"
|
34
|
+
@metadata["event"].get_property("data1").result.should == "Something cool"
|
35
|
+
@metadata["venue"].get_property("data2").result.should == "farms"
|
36
|
+
@metadata["location"].get_property("data3").result.should == "Another stuff"
|
27
37
|
end
|
28
38
|
|
29
39
|
it 'should support properties with html format' do
|
@@ -35,11 +45,11 @@ describe Wombat::PropertyLocator do
|
|
35
45
|
|
36
46
|
@locator_instance.stub(:context).and_return context
|
37
47
|
|
38
|
-
@metadata.
|
48
|
+
@metadata["event"].another_info "xpath=/anotherData", :html
|
39
49
|
|
40
50
|
@locator_instance.locate @metadata
|
41
51
|
|
42
|
-
@metadata.
|
52
|
+
@metadata["event"].get_property("another_info").result.should == "some another info"
|
43
53
|
end
|
44
54
|
|
45
55
|
it 'should trim property contents and use namespaces if present' do
|
@@ -47,10 +57,10 @@ describe Wombat::PropertyLocator do
|
|
47
57
|
context.should_receive(:xpath).with("/event/some/description", "blah").and_return([" awesome event "])
|
48
58
|
|
49
59
|
@locator_instance.stub(:context).and_return context
|
50
|
-
@metadata.
|
60
|
+
@metadata["event"].description "xpath=/event/some/description", :text, "blah"
|
51
61
|
|
52
62
|
@locator_instance.locate @metadata
|
53
63
|
|
54
|
-
@metadata.
|
64
|
+
@metadata["event"].get_property("description").result.should == "awesome event"
|
55
65
|
end
|
56
66
|
end
|
data/spec/sample_crawler_spec.rb
CHANGED
@@ -4,17 +4,19 @@ require 'helpers/sample_crawler'
|
|
4
4
|
describe SampleCrawler do
|
5
5
|
before(:each) do
|
6
6
|
@sample_crawler = SampleCrawler.new
|
7
|
-
@sample_crawler.parser = Wombat::Parser.new
|
8
7
|
end
|
9
8
|
|
10
|
-
|
11
|
-
@sample_crawler.
|
12
|
-
args.
|
13
|
-
args.
|
14
|
-
args.
|
9
|
+
xit 'should correctly assign event metadata' do
|
10
|
+
@sample_crawler.should_receive(:parse) do |args|
|
11
|
+
args.event["title"].selector.should == "xpath=."
|
12
|
+
args.event["description"].selector.should == "css=#main-node-content"
|
13
|
+
args.event["date"].selector.should == DateTime.now.to_date
|
15
14
|
|
16
|
-
args.
|
17
|
-
args.
|
15
|
+
args.venue["name"].selector.should == "Cafe de La Musique"
|
16
|
+
args.venue["address"].selector.should == "324 Dom Pedro II Street"
|
17
|
+
|
18
|
+
args[:base_url].should == 'http://www.google.com/'
|
19
|
+
args[:list_page].should == 'shows.php'
|
18
20
|
end
|
19
21
|
|
20
22
|
@sample_crawler.crawl
|
data/spec/spec_helper.rb
CHANGED
data/wombat.gemspec
CHANGED
@@ -5,16 +5,16 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.1.
|
8
|
+
s.version = "0.1.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2012-02-06"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses event-related data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE.txt",
|
17
|
-
"README.
|
17
|
+
"README.md"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
@@ -24,21 +24,23 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Gemfile.lock",
|
25
25
|
"Guardfile",
|
26
26
|
"LICENSE.txt",
|
27
|
-
"README.
|
27
|
+
"README.md",
|
28
28
|
"Rakefile",
|
29
29
|
"VERSION",
|
30
|
+
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
30
31
|
"lib/wombat.rb",
|
31
32
|
"lib/wombat/crawler.rb",
|
32
33
|
"lib/wombat/metadata.rb",
|
33
34
|
"lib/wombat/parser.rb",
|
34
|
-
"lib/wombat/properties.rb",
|
35
35
|
"lib/wombat/property.rb",
|
36
|
+
"lib/wombat/property_container.rb",
|
36
37
|
"lib/wombat/property_locator.rb",
|
37
38
|
"spec/crawler_spec.rb",
|
38
39
|
"spec/helpers/sample_crawler.rb",
|
40
|
+
"spec/integration/integration_spec.rb",
|
39
41
|
"spec/metadata_spec.rb",
|
40
42
|
"spec/parser_spec.rb",
|
41
|
-
"spec/
|
43
|
+
"spec/property_container_spec.rb",
|
42
44
|
"spec/property_locator_spec.rb",
|
43
45
|
"spec/property_spec.rb",
|
44
46
|
"spec/sample_crawler_spec.rb",
|
@@ -63,6 +65,10 @@ Gem::Specification.new do |s|
|
|
63
65
|
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
64
66
|
s.add_development_dependency(%q<rspec>, [">= 0"])
|
65
67
|
s.add_development_dependency(%q<guard>, [">= 0"])
|
68
|
+
s.add_development_dependency(%q<guard-rspec>, [">= 0"])
|
69
|
+
s.add_development_dependency(%q<guard-bundler>, [">= 0"])
|
70
|
+
s.add_development_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
71
|
+
s.add_development_dependency(%q<fakeweb>, [">= 0"])
|
66
72
|
else
|
67
73
|
s.add_dependency(%q<bundler>, [">= 0"])
|
68
74
|
s.add_dependency(%q<rake>, [">= 0"])
|
@@ -72,6 +78,10 @@ Gem::Specification.new do |s|
|
|
72
78
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
73
79
|
s.add_dependency(%q<rspec>, [">= 0"])
|
74
80
|
s.add_dependency(%q<guard>, [">= 0"])
|
81
|
+
s.add_dependency(%q<guard-rspec>, [">= 0"])
|
82
|
+
s.add_dependency(%q<guard-bundler>, [">= 0"])
|
83
|
+
s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
84
|
+
s.add_dependency(%q<fakeweb>, [">= 0"])
|
75
85
|
end
|
76
86
|
else
|
77
87
|
s.add_dependency(%q<bundler>, [">= 0"])
|
@@ -82,6 +92,10 @@ Gem::Specification.new do |s|
|
|
82
92
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
83
93
|
s.add_dependency(%q<rspec>, [">= 0"])
|
84
94
|
s.add_dependency(%q<guard>, [">= 0"])
|
95
|
+
s.add_dependency(%q<guard-rspec>, [">= 0"])
|
96
|
+
s.add_dependency(%q<guard-bundler>, [">= 0"])
|
97
|
+
s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
98
|
+
s.add_dependency(%q<fakeweb>, [">= 0"])
|
85
99
|
end
|
86
100
|
end
|
87
101
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-02-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
16
|
-
requirement: &
|
16
|
+
requirement: &70251197006880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70251197006880
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake
|
27
|
-
requirement: &
|
27
|
+
requirement: &70251197006400 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70251197006400
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: mechanize
|
38
|
-
requirement: &
|
38
|
+
requirement: &70251197022260 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70251197022260
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: activesupport
|
49
|
-
requirement: &
|
49
|
+
requirement: &70251197021660 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70251197021660
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: yard
|
60
|
-
requirement: &
|
60
|
+
requirement: &70251197021180 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70251197021180
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70251197020700 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70251197020700
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rspec
|
82
|
-
requirement: &
|
82
|
+
requirement: &70251197020200 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70251197020200
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: guard
|
93
|
-
requirement: &
|
93
|
+
requirement: &70251197019700 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,7 +98,51 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70251197019700
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: guard-rspec
|
104
|
+
requirement: &70251197019220 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :development
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *70251197019220
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: guard-bundler
|
115
|
+
requirement: &70251197018740 !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
type: :development
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: *70251197018740
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: vcr
|
126
|
+
requirement: &70251197018260 !ruby/object:Gem::Requirement
|
127
|
+
none: false
|
128
|
+
requirements:
|
129
|
+
- - =
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: 2.0.0.rc1
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: *70251197018260
|
135
|
+
- !ruby/object:Gem::Dependency
|
136
|
+
name: fakeweb
|
137
|
+
requirement: &70251197017720 !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ! '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
type: :development
|
144
|
+
prerelease: false
|
145
|
+
version_requirements: *70251197017720
|
102
146
|
description: Generic Web crawler with a DSL that parses event-related data from web
|
103
147
|
pages
|
104
148
|
email: felipe.lima@gmail.com
|
@@ -106,7 +150,7 @@ executables: []
|
|
106
150
|
extensions: []
|
107
151
|
extra_rdoc_files:
|
108
152
|
- LICENSE.txt
|
109
|
-
- README.
|
153
|
+
- README.md
|
110
154
|
files:
|
111
155
|
- .document
|
112
156
|
- .rspec
|
@@ -115,21 +159,23 @@ files:
|
|
115
159
|
- Gemfile.lock
|
116
160
|
- Guardfile
|
117
161
|
- LICENSE.txt
|
118
|
-
- README.
|
162
|
+
- README.md
|
119
163
|
- Rakefile
|
120
164
|
- VERSION
|
165
|
+
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
121
166
|
- lib/wombat.rb
|
122
167
|
- lib/wombat/crawler.rb
|
123
168
|
- lib/wombat/metadata.rb
|
124
169
|
- lib/wombat/parser.rb
|
125
|
-
- lib/wombat/properties.rb
|
126
170
|
- lib/wombat/property.rb
|
171
|
+
- lib/wombat/property_container.rb
|
127
172
|
- lib/wombat/property_locator.rb
|
128
173
|
- spec/crawler_spec.rb
|
129
174
|
- spec/helpers/sample_crawler.rb
|
175
|
+
- spec/integration/integration_spec.rb
|
130
176
|
- spec/metadata_spec.rb
|
131
177
|
- spec/parser_spec.rb
|
132
|
-
- spec/
|
178
|
+
- spec/property_container_spec.rb
|
133
179
|
- spec/property_locator_spec.rb
|
134
180
|
- spec/property_spec.rb
|
135
181
|
- spec/sample_crawler_spec.rb
|
data/README.rdoc
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
= Wombat
|
2
|
-
|
3
|
-
Generic Web crawler with a DSL that parses event-related data from web pages.
|
4
|
-
Still under development, it is being rewritten from scratch as a gem from an already existing project.
|
5
|
-
|
6
|
-
== Contributing to Wombat
|
7
|
-
|
8
|
-
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
9
|
-
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
10
|
-
* Fork the project
|
11
|
-
* Start a feature/bugfix branch
|
12
|
-
* Commit and push until you are happy with your contribution
|
13
|
-
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
14
|
-
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
|
15
|
-
|
16
|
-
== Copyright
|
17
|
-
|
18
|
-
Copyright (c) 2011 Felipe Lima. See LICENSE.txt for
|
19
|
-
further details.
|
20
|
-
|