wombat 0.2.5 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -1
- data/Gemfile.lock +2 -2
- data/README.md +12 -2
- data/VERSION +1 -1
- data/lib/wombat/iterator.rb +26 -2
- data/lib/wombat/parser.rb +6 -16
- data/lib/wombat/property.rb +3 -3
- data/lib/wombat/property_container.rb +19 -5
- data/spec/integration/integration_spec.rb +26 -13
- data/spec/iterator_spec.rb +43 -0
- data/spec/parser_spec.rb +0 -31
- data/spec/property_container_spec.rb +1 -1
- data/wombat.gemspec +6 -5
- metadata +7 -6
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,13 +1,18 @@
|
|
1
1
|
# Wombat
|
2
2
|
|
3
|
-
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)](
|
3
|
+
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)](travis) [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)](gemnasium)
|
4
4
|
|
5
|
-
|
5
|
+
[travis]: http://travis-ci.org/felipecsl/wombat
|
6
|
+
[gemnasium]: https://gemnasium.com/felipecsl/wombat
|
7
|
+
|
8
|
+
Generic Web crawler with an elegant DSL that parses structured data from web pages.
|
6
9
|
|
7
10
|
## Usage:
|
8
11
|
|
9
12
|
``gem install wombat``
|
10
13
|
|
14
|
+
OBS: Requires ruby 1.9
|
15
|
+
|
11
16
|
Creating a crawler:
|
12
17
|
|
13
18
|
###### Create a class that includes ``Wombat::Crawler``:
|
@@ -76,6 +81,11 @@ my_crawler.crawl
|
|
76
81
|
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
77
82
|
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
78
83
|
|
84
|
+
## Contributors
|
85
|
+
|
86
|
+
* Felipe Lima ([@felipecsl](https://github.com/felipecsl))
|
87
|
+
* [@sigi](https://github.com/sigi)
|
88
|
+
|
79
89
|
## Copyright
|
80
90
|
|
81
91
|
Copyright (c) 2012 Felipe Lima. See LICENSE.txt for further details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.5
|
1
|
+
0.3.0
|
data/lib/wombat/iterator.rb
CHANGED
@@ -2,9 +2,33 @@ module Wombat
|
|
2
2
|
class Iterator < PropertyContainer
|
3
3
|
attr_accessor :selector
|
4
4
|
|
5
|
-
def initialize
|
5
|
+
def initialize(selector)
|
6
6
|
@selector = selector
|
7
7
|
super()
|
8
|
-
end
|
8
|
+
end
|
9
|
+
|
10
|
+
def parse
|
11
|
+
raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
|
12
|
+
|
13
|
+
all_properties.each do |p|
|
14
|
+
p.result ||= []
|
15
|
+
result = yield p
|
16
|
+
if result
|
17
|
+
result = p.callback ? p.callback.call(result) : result
|
18
|
+
p.result << result
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def flatten(depth = nil)
|
24
|
+
# determine the iterator length by the biggest property array that we have
|
25
|
+
length = all_properties.map(&:result).sort { |a| a.length }.last.size
|
26
|
+
|
27
|
+
Array.new.tap do |a|
|
28
|
+
length.times do |i|
|
29
|
+
a << super(i)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
9
33
|
end
|
10
34
|
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -13,35 +13,25 @@ module Wombat
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def parse metadata
|
16
|
-
self.context =
|
16
|
+
self.context = parser_for metadata
|
17
17
|
original_context = self.context
|
18
18
|
|
19
19
|
metadata.iterators.each do |it|
|
20
|
-
select_nodes(it.selector).each do |
|
21
|
-
self.context =
|
22
|
-
it.
|
23
|
-
p.result ||= []
|
24
|
-
result = locate(p)
|
25
|
-
if result
|
26
|
-
result = p.callback ? p.callback.call(result) : result
|
27
|
-
p.result << result
|
28
|
-
end
|
29
|
-
end
|
20
|
+
select_nodes(it.selector).each do |node|
|
21
|
+
self.context = node
|
22
|
+
it.parse { |p| locate p }
|
30
23
|
end
|
31
24
|
end
|
32
25
|
|
33
26
|
self.context = original_context
|
34
27
|
|
35
|
-
metadata.
|
36
|
-
result = locate p
|
37
|
-
p.result = p.callback ? p.callback.call(result) : result
|
38
|
-
end
|
28
|
+
metadata.parse { |p| locate p }
|
39
29
|
|
40
30
|
metadata.flatten
|
41
31
|
end
|
42
32
|
|
43
33
|
private
|
44
|
-
def
|
34
|
+
def parser_for metadata
|
45
35
|
url = "#{metadata[:base_url]}#{metadata[:list_page]}"
|
46
36
|
|
47
37
|
if metadata[:format] == :html
|
data/lib/wombat/property.rb
CHANGED
@@ -2,7 +2,7 @@ module Wombat
|
|
2
2
|
class Property
|
3
3
|
attr_accessor :name, :selector, :format, :namespaces, :callback, :result
|
4
4
|
|
5
|
-
def initialize
|
5
|
+
def initialize(options)
|
6
6
|
@name = options[:name]
|
7
7
|
@selector = options[:selector]
|
8
8
|
@format = options[:format]
|
@@ -10,8 +10,8 @@ module Wombat
|
|
10
10
|
@callback = options[:callback]
|
11
11
|
end
|
12
12
|
|
13
|
-
def flatten
|
14
|
-
result
|
13
|
+
def flatten(depth = nil)
|
14
|
+
depth ? result[depth] : result
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
data/lib/wombat/property_container.rb
CHANGED
@@ -37,14 +37,28 @@ module Wombat
|
|
37
37
|
}.compact
|
38
38
|
end
|
39
39
|
|
40
|
-
def
|
41
|
-
|
40
|
+
def parse
|
41
|
+
all_properties.each do |p|
|
42
|
+
result = yield p if block_given?
|
43
|
+
p.result = p.callback ? p.callback.call(result) : result
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def flatten(depth = nil)
|
48
|
+
properties = Hash.new.tap do |h|
|
42
49
|
keys.map do |k|
|
43
|
-
|
44
|
-
|
50
|
+
val = self[k]
|
51
|
+
if val.is_a?(PropertyContainer) || val.is_a?(Property)
|
52
|
+
h[k] = val.flatten depth
|
45
53
|
end
|
46
54
|
end
|
47
|
-
end
|
55
|
+
end
|
56
|
+
|
57
|
+
iters = iterators.reduce({}) do |memo, i|
|
58
|
+
memo.merge("iterator#{iterators.index(i)}" => i.flatten)
|
59
|
+
end
|
60
|
+
|
61
|
+
properties.merge iters
|
48
62
|
end
|
49
63
|
|
50
64
|
def for_each selector
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -28,7 +28,7 @@ describe 'basic crawler setup' do
|
|
28
28
|
results = crawler_instance.crawl
|
29
29
|
|
30
30
|
results["search"].should == "Buscar"
|
31
|
-
results["
|
31
|
+
results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
32
32
|
results["subheader"].should == "Londres 2012"
|
33
33
|
results["social"]["twitter"].should == "Verão"
|
34
34
|
end
|
@@ -43,21 +43,22 @@ describe 'basic crawler setup' do
|
|
43
43
|
crawler.list_page "/explore"
|
44
44
|
|
45
45
|
crawler.for_each "css=ol.ranked-repositories li" do
|
46
|
-
|
47
|
-
|
46
|
+
project do |p|
|
47
|
+
p.repo 'css=h3'
|
48
|
+
p.description('css=p.description') { |d| d.gsub(/for/, '') }
|
49
|
+
end
|
48
50
|
end
|
49
51
|
|
50
52
|
crawler_instance = crawler.new
|
51
53
|
results = crawler_instance.crawl
|
52
54
|
|
53
|
-
results
|
54
|
-
|
55
|
-
"node.js REST framework specifically meant web service APIs",
|
56
|
-
"
|
57
|
-
"
|
58
|
-
"
|
59
|
-
|
60
|
-
]
|
55
|
+
results.should == { "iterator0" => [
|
56
|
+
{ "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
|
57
|
+
{ "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
|
58
|
+
{ "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
|
59
|
+
{ "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
|
60
|
+
{ "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
|
61
|
+
]}
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
@@ -79,9 +80,21 @@ describe 'basic crawler setup' do
|
|
79
80
|
|
80
81
|
crawler_instance = crawler.new
|
81
82
|
results = crawler_instance.crawl
|
83
|
+
iterator = results['iterator0']
|
84
|
+
|
85
|
+
iterator.should == [
|
86
|
+
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
87
|
+
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
88
|
+
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
89
|
+
{"latitude"=>"37.870873", "longitude"=>"-122.269313"},
|
90
|
+
{"latitude"=>"37.782348", "longitude"=>"-122.408059"},
|
91
|
+
{"latitude"=>"37.775529", "longitude"=>"-122.437757"},
|
92
|
+
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
93
|
+
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
94
|
+
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
95
|
+
{"latitude"=>"37.788978", "longitude"=>"-122.40664"}
|
96
|
+
]
|
82
97
|
|
83
|
-
results["latitude"].should =~ ["37.807775", "37.807717", "37.869784", "37.870873", "37.782348", "37.775529", "37.771079", "37.771079", "37.784963", "37.788978"]
|
84
|
-
results["longitude"].should =~ ["-122.272736", "-122.270059", "-122.267701", "-122.269313", "-122.408059", "-122.437757", "-122.412604", "-122.412604", "-122.418871", "-122.40664"]
|
85
98
|
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
86
99
|
end
|
87
100
|
end
|
data/spec/iterator_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::Iterator do
|
4
|
+
let(:it) { Wombat::Iterator.new "it_selector" }
|
5
|
+
|
6
|
+
context 'parse' do
|
7
|
+
it 'should iterate in for_each properties' do
|
8
|
+
it.prop_1 "some_selector"
|
9
|
+
it.prop_2 "another_selector"
|
10
|
+
|
11
|
+
it['prop_1'].should_receive(:result).twice.and_return([])
|
12
|
+
it['prop_2'].should_receive(:result).twice.and_return([])
|
13
|
+
|
14
|
+
parser = double :parser
|
15
|
+
parser.should_receive(:locate).with(it['prop_1']).twice
|
16
|
+
parser.should_receive(:locate).with(it['prop_2']).twice
|
17
|
+
|
18
|
+
it.parse { |p| parser.locate p }
|
19
|
+
it.parse { |p| parser.locate p }
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should raise if no block given' do
|
23
|
+
expect{
|
24
|
+
it.parse
|
25
|
+
}.to raise_error(ArgumentError)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should flatten properties to plain hash format' do
|
30
|
+
it.prop_1 "some_selector"
|
31
|
+
it.prop_2 "another_selector"
|
32
|
+
|
33
|
+
it.parse {|p| }
|
34
|
+
it.parse {|p| }
|
35
|
+
it['prop_1'].result = ['result 1', 'result 2']
|
36
|
+
it['prop_2'].result = ['result 3', 'result 4']
|
37
|
+
|
38
|
+
it.flatten.should == [
|
39
|
+
{ "prop_1" => "result 1", "prop_2" => "result 3" },
|
40
|
+
{ "prop_1" => "result 2", "prop_2" => "result 4" }
|
41
|
+
]
|
42
|
+
end
|
43
|
+
end
|
data/spec/parser_spec.rb
CHANGED
@@ -72,10 +72,6 @@ describe Wombat::Parser do
|
|
72
72
|
block_called.should be_true
|
73
73
|
end
|
74
74
|
|
75
|
-
it 'should invoke callback inside for_each block' do
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
75
|
it 'should return hash with requested properties' do
|
80
76
|
hash = double :results
|
81
77
|
fake_parser = double :parser
|
@@ -88,33 +84,6 @@ describe Wombat::Parser do
|
|
88
84
|
@parser.parse(@metadata).should == hash
|
89
85
|
end
|
90
86
|
|
91
|
-
it 'should iterate in for_each properties' do
|
92
|
-
fake_parser = double :parser
|
93
|
-
fake_document = double :document
|
94
|
-
c1 = double :context
|
95
|
-
c2 = double :context
|
96
|
-
it = Wombat::Iterator.new "it_selector"
|
97
|
-
it.prop_1 "some_selector"
|
98
|
-
it.prop_2 "another_selector"
|
99
|
-
|
100
|
-
@parser.should_receive(:context=).ordered
|
101
|
-
@metadata.should_receive(:iterators).and_return [it]
|
102
|
-
@metadata.should_receive(:flatten)
|
103
|
-
fake_document.should_receive(:parser).and_return(fake_parser)
|
104
|
-
it['prop_1'].should_receive(:result).exactly(2).times.and_return([])
|
105
|
-
it['prop_2'].should_receive(:result).exactly(2).times.and_return([])
|
106
|
-
@parser.mechanize.stub(:get).and_return fake_document
|
107
|
-
@parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
|
108
|
-
@parser.should_receive(:context=).with(c1).ordered
|
109
|
-
@parser.should_receive(:context=).with(c2).ordered
|
110
|
-
@parser.should_receive(:context=).ordered
|
111
|
-
@parser.should_receive(:locate).with(it['prop_1']).twice
|
112
|
-
@parser.should_receive(:locate).with(it['prop_2']).twice
|
113
|
-
@parser.stub(:locate)
|
114
|
-
|
115
|
-
@parser.parse(@metadata)
|
116
|
-
end
|
117
|
-
|
118
87
|
it 'should not include null results in iterated block' do
|
119
88
|
fake_parser = double :parser
|
120
89
|
fake_document = double :document
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.2.5"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-03-
|
12
|
+
s.date = "2012-03-25"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
|
|
42
42
|
"spec/crawler_spec.rb",
|
43
43
|
"spec/helpers/sample_crawler.rb",
|
44
44
|
"spec/integration/integration_spec.rb",
|
45
|
+
"spec/iterator_spec.rb",
|
45
46
|
"spec/metadata_spec.rb",
|
46
47
|
"spec/parser_spec.rb",
|
47
48
|
"spec/property_container_spec.rb",
|
@@ -70,7 +71,7 @@ Gem::Specification.new do |s|
|
|
70
71
|
s.add_development_dependency(%q<yard>, [">= 0"])
|
71
72
|
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
72
73
|
s.add_development_dependency(%q<rspec>, [">= 0"])
|
73
|
-
s.add_development_dependency(%q<vcr>, ["
|
74
|
+
s.add_development_dependency(%q<vcr>, [">= 0"])
|
74
75
|
s.add_development_dependency(%q<fakeweb>, [">= 0"])
|
75
76
|
else
|
76
77
|
s.add_dependency(%q<mechanize>, [">= 0"])
|
@@ -81,7 +82,7 @@ Gem::Specification.new do |s|
|
|
81
82
|
s.add_dependency(%q<yard>, [">= 0"])
|
82
83
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
83
84
|
s.add_dependency(%q<rspec>, [">= 0"])
|
84
|
-
s.add_dependency(%q<vcr>, ["
|
85
|
+
s.add_dependency(%q<vcr>, [">= 0"])
|
85
86
|
s.add_dependency(%q<fakeweb>, [">= 0"])
|
86
87
|
end
|
87
88
|
else
|
@@ -93,7 +94,7 @@ Gem::Specification.new do |s|
|
|
93
94
|
s.add_dependency(%q<yard>, [">= 0"])
|
94
95
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
95
96
|
s.add_dependency(%q<rspec>, [">= 0"])
|
96
|
-
s.add_dependency(%q<vcr>, ["
|
97
|
+
s.add_dependency(%q<vcr>, [">= 0"])
|
97
98
|
s.add_dependency(%q<fakeweb>, [">= 0"])
|
98
99
|
end
|
99
100
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -144,17 +144,17 @@ dependencies:
|
|
144
144
|
requirement: !ruby/object:Gem::Requirement
|
145
145
|
none: false
|
146
146
|
requirements:
|
147
|
-
- - '
|
147
|
+
- - ! '>='
|
148
148
|
- !ruby/object:Gem::Version
|
149
|
-
version:
|
149
|
+
version: '0'
|
150
150
|
type: :development
|
151
151
|
prerelease: false
|
152
152
|
version_requirements: !ruby/object:Gem::Requirement
|
153
153
|
none: false
|
154
154
|
requirements:
|
155
|
-
- - '
|
155
|
+
- - ! '>='
|
156
156
|
- !ruby/object:Gem::Version
|
157
|
-
version:
|
157
|
+
version: '0'
|
158
158
|
- !ruby/object:Gem::Dependency
|
159
159
|
name: fakeweb
|
160
160
|
requirement: !ruby/object:Gem::Requirement
|
@@ -204,6 +204,7 @@ files:
|
|
204
204
|
- spec/crawler_spec.rb
|
205
205
|
- spec/helpers/sample_crawler.rb
|
206
206
|
- spec/integration/integration_spec.rb
|
207
|
+
- spec/iterator_spec.rb
|
207
208
|
- spec/metadata_spec.rb
|
208
209
|
- spec/parser_spec.rb
|
209
210
|
- spec/property_container_spec.rb
|