wombat 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +8 -1
- data/VERSION +1 -1
- data/lib/wombat/crawler.rb +13 -2
- data/lib/wombat/node_selector.rb +1 -1
- data/spec/crawler_spec.rb +32 -9
- data/wombat.gemspec +2 -2
- metadata +2 -2
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Wombat
|
2
2
|
|
3
|
-
[]
|
3
|
+
[][travis] [][gemnasium]
|
4
4
|
|
5
5
|
[travis]: http://travis-ci.org/felipecsl/wombat
|
6
6
|
[gemnasium]: https://gemnasium.com/felipecsl/wombat
|
@@ -85,6 +85,13 @@ my_crawler.crawl
|
|
85
85
|
|
86
86
|
* Felipe Lima ([@felipecsl](https://github.com/felipecsl))
|
87
87
|
* [@sigi](https://github.com/sigi)
|
88
|
+
* Daniel Naves de Carvalho ([@danielnc](https://github.com/danielnc))
|
89
|
+
|
90
|
+
## Changelog
|
91
|
+
|
92
|
+
### version 0.3.1
|
93
|
+
|
94
|
+
* Added the ability to provide a block to Crawler#crawl and override the default crawler properties for a one off run (thanks to @danielnc)
|
88
95
|
|
89
96
|
## Copyright
|
90
97
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.3.1
|
data/lib/wombat/crawler.rb
CHANGED
@@ -10,8 +10,19 @@ module Wombat
|
|
10
10
|
include Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
|
-
def crawl
|
14
|
-
|
13
|
+
def crawl(&block)
|
14
|
+
if block
|
15
|
+
@metadata_dup = self.class.send(:metadata).clone
|
16
|
+
instance_eval do
|
17
|
+
def method_missing method, *args, &block
|
18
|
+
@metadata_dup.send method, *args, &block
|
19
|
+
end
|
20
|
+
end
|
21
|
+
self.instance_eval &block
|
22
|
+
parse @metadata_dup
|
23
|
+
else
|
24
|
+
parse self.class.send(:metadata)
|
25
|
+
end
|
15
26
|
end
|
16
27
|
|
17
28
|
module ClassMethods
|
data/lib/wombat/node_selector.rb
CHANGED
data/spec/crawler_spec.rb
CHANGED
@@ -9,16 +9,16 @@ describe Wombat::Crawler do
|
|
9
9
|
|
10
10
|
it 'should call the provided block' do
|
11
11
|
event_called = false
|
12
|
-
|
12
|
+
|
13
13
|
@crawler.event { event_called = true }
|
14
|
-
|
14
|
+
|
15
15
|
event_called.should be_true
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'should provide metadata to yielded block' do
|
19
19
|
@crawler.event do |e|
|
20
20
|
e.should_not be_nil
|
21
|
-
end
|
21
|
+
end
|
22
22
|
end
|
23
23
|
|
24
24
|
it 'should store assigned metadata information' do
|
@@ -38,7 +38,7 @@ describe Wombat::Crawler do
|
|
38
38
|
arg["venue"]["name"].selector.should == "Scooba"
|
39
39
|
arg["location"]["latitude"].selector.should == -50.2323
|
40
40
|
end
|
41
|
-
|
41
|
+
|
42
42
|
@crawler_instance.crawl
|
43
43
|
end
|
44
44
|
|
@@ -57,8 +57,8 @@ describe Wombat::Crawler do
|
|
57
57
|
end
|
58
58
|
|
59
59
|
it 'should be able to assign arbitrary plain text metadata' do
|
60
|
-
@crawler.some_data("/event/list", :html, "geo") {|p| true }
|
61
|
-
|
60
|
+
@crawler.some_data("/event/list", :html, "geo") { |p| true }
|
61
|
+
|
62
62
|
@crawler_instance.should_receive(:parse) do |arg|
|
63
63
|
prop = arg['some_data']
|
64
64
|
prop.name.should == "some_data"
|
@@ -67,7 +67,7 @@ describe Wombat::Crawler do
|
|
67
67
|
prop.namespaces.should == "geo"
|
68
68
|
prop.callback.should_not be_nil
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
@crawler_instance.crawl
|
72
72
|
end
|
73
73
|
|
@@ -107,16 +107,39 @@ describe Wombat::Crawler do
|
|
107
107
|
it["title"].selector.should == "css=.title"
|
108
108
|
it["body"].selector.should == "css=.body"
|
109
109
|
it["event"]["all"].selector.should == "yeah"
|
110
|
-
end
|
110
|
+
end
|
111
111
|
|
112
112
|
@crawler_instance.crawl
|
113
113
|
end
|
114
114
|
|
115
|
-
it 'should assign metadata
|
115
|
+
it 'should assign metadata format' do
|
116
116
|
@crawler_instance.should_receive(:parse) do |arg|
|
117
117
|
arg[:format].should == :xml
|
118
118
|
end
|
119
119
|
@crawler.format :xml
|
120
120
|
@crawler_instance.crawl
|
121
121
|
end
|
122
|
+
|
123
|
+
it 'should crawl with block' do
|
124
|
+
@crawler.base_url "danielnc.com"
|
125
|
+
@crawler.list_page "/itens"
|
126
|
+
|
127
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
128
|
+
arg[:base_url].should == "danielnc.com"
|
129
|
+
arg[:list_page].should == "/itens/1"
|
130
|
+
end
|
131
|
+
|
132
|
+
@crawler_instance.crawl do
|
133
|
+
list_page "/itens/1"
|
134
|
+
end
|
135
|
+
|
136
|
+
another_instance = @crawler.new
|
137
|
+
|
138
|
+
another_instance.should_receive(:parse) do |arg|
|
139
|
+
arg[:base_url].should == "danielnc.com"
|
140
|
+
arg[:list_page].should == "/itens"
|
141
|
+
end
|
142
|
+
|
143
|
+
another_instance.crawl
|
144
|
+
end
|
122
145
|
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "0.3.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-04-12"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|