wombat 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +8 -1
- data/VERSION +1 -1
- data/lib/wombat/crawler.rb +13 -2
- data/lib/wombat/node_selector.rb +1 -1
- data/spec/crawler_spec.rb +32 -9
- data/wombat.gemspec +2 -2
- metadata +2 -2
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Wombat
|
2
2
|
|
3
|
-
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)]
|
3
|
+
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium]
|
4
4
|
|
5
5
|
[travis]: http://travis-ci.org/felipecsl/wombat
|
6
6
|
[gemnasium]: https://gemnasium.com/felipecsl/wombat
|
@@ -85,6 +85,13 @@ my_crawler.crawl
|
|
85
85
|
|
86
86
|
* Felipe Lima ([@felipecsl](https://github.com/felipecsl))
|
87
87
|
* [@sigi](https://github.com/sigi)
|
88
|
+
* Daniel Naves de Carvalho ([@danielnc](https://github.com/danielnc))
|
89
|
+
|
90
|
+
## Changelog
|
91
|
+
|
92
|
+
### version 0.3.1
|
93
|
+
|
94
|
+
* Added the ability to provide a block to Crawler#crawl and override the default crawler properties for a one-off run (thanks to @danielnc)
|
88
95
|
|
89
96
|
## Copyright
|
90
97
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.3.1
|
data/lib/wombat/crawler.rb
CHANGED
@@ -10,8 +10,19 @@ module Wombat
|
|
10
10
|
include Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
|
-
def crawl
|
14
|
-
|
13
|
+
def crawl(&block)
|
14
|
+
if block
|
15
|
+
@metadata_dup = self.class.send(:metadata).clone
|
16
|
+
instance_eval do
|
17
|
+
def method_missing method, *args, &block
|
18
|
+
@metadata_dup.send method, *args, &block
|
19
|
+
end
|
20
|
+
end
|
21
|
+
self.instance_eval &block
|
22
|
+
parse @metadata_dup
|
23
|
+
else
|
24
|
+
parse self.class.send(:metadata)
|
25
|
+
end
|
15
26
|
end
|
16
27
|
|
17
28
|
module ClassMethods
|
data/lib/wombat/node_selector.rb
CHANGED
data/spec/crawler_spec.rb
CHANGED
@@ -9,16 +9,16 @@ describe Wombat::Crawler do
|
|
9
9
|
|
10
10
|
it 'should call the provided block' do
|
11
11
|
event_called = false
|
12
|
-
|
12
|
+
|
13
13
|
@crawler.event { event_called = true }
|
14
|
-
|
14
|
+
|
15
15
|
event_called.should be_true
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'should provide metadata to yielded block' do
|
19
19
|
@crawler.event do |e|
|
20
20
|
e.should_not be_nil
|
21
|
-
end
|
21
|
+
end
|
22
22
|
end
|
23
23
|
|
24
24
|
it 'should store assigned metadata information' do
|
@@ -38,7 +38,7 @@ describe Wombat::Crawler do
|
|
38
38
|
arg["venue"]["name"].selector.should == "Scooba"
|
39
39
|
arg["location"]["latitude"].selector.should == -50.2323
|
40
40
|
end
|
41
|
-
|
41
|
+
|
42
42
|
@crawler_instance.crawl
|
43
43
|
end
|
44
44
|
|
@@ -57,8 +57,8 @@ describe Wombat::Crawler do
|
|
57
57
|
end
|
58
58
|
|
59
59
|
it 'should be able to assign arbitrary plain text metadata' do
|
60
|
-
@crawler.some_data("/event/list", :html, "geo") {|p| true }
|
61
|
-
|
60
|
+
@crawler.some_data("/event/list", :html, "geo") { |p| true }
|
61
|
+
|
62
62
|
@crawler_instance.should_receive(:parse) do |arg|
|
63
63
|
prop = arg['some_data']
|
64
64
|
prop.name.should == "some_data"
|
@@ -67,7 +67,7 @@ describe Wombat::Crawler do
|
|
67
67
|
prop.namespaces.should == "geo"
|
68
68
|
prop.callback.should_not be_nil
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
@crawler_instance.crawl
|
72
72
|
end
|
73
73
|
|
@@ -107,16 +107,39 @@ describe Wombat::Crawler do
|
|
107
107
|
it["title"].selector.should == "css=.title"
|
108
108
|
it["body"].selector.should == "css=.body"
|
109
109
|
it["event"]["all"].selector.should == "yeah"
|
110
|
-
end
|
110
|
+
end
|
111
111
|
|
112
112
|
@crawler_instance.crawl
|
113
113
|
end
|
114
114
|
|
115
|
-
it 'should assign metadata
|
115
|
+
it 'should assign metadata format' do
|
116
116
|
@crawler_instance.should_receive(:parse) do |arg|
|
117
117
|
arg[:format].should == :xml
|
118
118
|
end
|
119
119
|
@crawler.format :xml
|
120
120
|
@crawler_instance.crawl
|
121
121
|
end
|
122
|
+
|
123
|
+
it 'should crawl with block' do
|
124
|
+
@crawler.base_url "danielnc.com"
|
125
|
+
@crawler.list_page "/itens"
|
126
|
+
|
127
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
128
|
+
arg[:base_url].should == "danielnc.com"
|
129
|
+
arg[:list_page].should == "/itens/1"
|
130
|
+
end
|
131
|
+
|
132
|
+
@crawler_instance.crawl do
|
133
|
+
list_page "/itens/1"
|
134
|
+
end
|
135
|
+
|
136
|
+
another_instance = @crawler.new
|
137
|
+
|
138
|
+
another_instance.should_receive(:parse) do |arg|
|
139
|
+
arg[:base_url].should == "danielnc.com"
|
140
|
+
arg[:list_page].should == "/itens"
|
141
|
+
end
|
142
|
+
|
143
|
+
another_instance.crawl
|
144
|
+
end
|
122
145
|
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "0.3.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-04-12"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|