wombat 0.2.5 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -10,6 +10,6 @@ group :development, :test do
10
10
  gem 'yard'
11
11
  gem 'jeweler'
12
12
  gem 'rspec'
13
- gem 'vcr', '2.0.0.rc1'
13
+ gem 'vcr'
14
14
  gem 'fakeweb'
15
15
  end
data/Gemfile.lock CHANGED
@@ -37,7 +37,7 @@ GEM
37
37
  unf (0.0.4)
38
38
  unf_ext
39
39
  unf_ext (0.0.4)
40
- vcr (2.0.0.rc1)
40
+ vcr (2.0.0)
41
41
  webrobots (0.0.12)
42
42
  nokogiri (>= 1.4.4)
43
43
  yard (0.7.4)
@@ -54,5 +54,5 @@ DEPENDENCIES
54
54
  rake
55
55
  rest-client
56
56
  rspec
57
- vcr (= 2.0.0.rc1)
57
+ vcr
58
58
  yard
data/README.md CHANGED
@@ -1,13 +1,18 @@
1
1
  # Wombat
2
2
 
3
- [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)](http://travis-ci.org/felipecsl/wombat)
3
+ [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium]
4
4
 
5
- Generic Web crawler with a DSL that parses structured data from web pages.
5
+ [travis]: http://travis-ci.org/felipecsl/wombat
6
+ [gemnasium]: https://gemnasium.com/felipecsl/wombat
7
+
8
+ Generic Web crawler with an elegant DSL that parses structured data from web pages.
6
9
 
7
10
  ## Usage:
8
11
 
9
12
  ``gem install wombat``
10
13
 
14
+ Note: Requires Ruby 1.9
15
+
11
16
  Creating a crawler:
12
17
 
13
18
  ###### Create a class that includes ``Wombat::Crawler``:
@@ -76,6 +81,11 @@ my_crawler.crawl
76
81
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
77
82
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
78
83
 
84
+ ## Contributors
85
+
86
+ * Felipe Lima ([@felipecsl](https://github.com/felipecsl))
87
+ * [@sigi](https://github.com/sigi)
88
+
79
89
  ## Copyright
80
90
 
81
91
  Copyright (c) 2012 Felipe Lima. See LICENSE.txt for further details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.5
1
+ 0.3.0
@@ -2,9 +2,33 @@ module Wombat
2
2
  class Iterator < PropertyContainer
3
3
  attr_accessor :selector
4
4
 
5
- def initialize selector
5
+ def initialize(selector)
6
6
  @selector = selector
7
7
  super()
8
- end
8
+ end
9
+
10
+ def parse
11
+ raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
12
+
13
+ all_properties.each do |p|
14
+ p.result ||= []
15
+ result = yield p
16
+ if result
17
+ result = p.callback ? p.callback.call(result) : result
18
+ p.result << result
19
+ end
20
+ end
21
+ end
22
+
23
+ def flatten(depth = nil)
24
+ # determine the iterator length by the biggest property array that we have
25
+ length = all_properties.map(&:result).sort { |a| a.length }.last.size
26
+
27
+ Array.new.tap do |a|
28
+ length.times do |i|
29
+ a << super(i)
30
+ end
31
+ end
32
+ end
9
33
  end
10
34
  end
data/lib/wombat/parser.rb CHANGED
@@ -13,35 +13,25 @@ module Wombat
13
13
  end
14
14
 
15
15
  def parse metadata
16
- self.context = get_parser metadata
16
+ self.context = parser_for metadata
17
17
  original_context = self.context
18
18
 
19
19
  metadata.iterators.each do |it|
20
- select_nodes(it.selector).each do |n|
21
- self.context = n
22
- it.all_properties.each do |p|
23
- p.result ||= []
24
- result = locate(p)
25
- if result
26
- result = p.callback ? p.callback.call(result) : result
27
- p.result << result
28
- end
29
- end
20
+ select_nodes(it.selector).each do |node|
21
+ self.context = node
22
+ it.parse { |p| locate p }
30
23
  end
31
24
  end
32
25
 
33
26
  self.context = original_context
34
27
 
35
- metadata.all_properties.each do |p|
36
- result = locate p
37
- p.result = p.callback ? p.callback.call(result) : result
38
- end
28
+ metadata.parse { |p| locate p }
39
29
 
40
30
  metadata.flatten
41
31
  end
42
32
 
43
33
  private
44
- def get_parser metadata
34
+ def parser_for metadata
45
35
  url = "#{metadata[:base_url]}#{metadata[:list_page]}"
46
36
 
47
37
  if metadata[:format] == :html
@@ -2,7 +2,7 @@ module Wombat
2
2
  class Property
3
3
  attr_accessor :name, :selector, :format, :namespaces, :callback, :result
4
4
 
5
- def initialize options
5
+ def initialize(options)
6
6
  @name = options[:name]
7
7
  @selector = options[:selector]
8
8
  @format = options[:format]
@@ -10,8 +10,8 @@ module Wombat
10
10
  @callback = options[:callback]
11
11
  end
12
12
 
13
- def flatten
14
- result
13
+ def flatten(depth = nil)
14
+ depth ? result[depth] : result
15
15
  end
16
16
  end
17
17
  end
@@ -37,14 +37,28 @@ module Wombat
37
37
  }.compact
38
38
  end
39
39
 
40
- def flatten
41
- Hash.new.tap do |h|
40
+ def parse
41
+ all_properties.each do |p|
42
+ result = yield p if block_given?
43
+ p.result = p.callback ? p.callback.call(result) : result
44
+ end
45
+ end
46
+
47
+ def flatten(depth = nil)
48
+ properties = Hash.new.tap do |h|
42
49
  keys.map do |k|
43
- if self[k].kind_of?(PropertyContainer) || self[k].kind_of?(Property)
44
- h[k] = self[k].flatten
50
+ val = self[k]
51
+ if val.is_a?(PropertyContainer) || val.is_a?(Property)
52
+ h[k] = val.flatten depth
45
53
  end
46
54
  end
47
- end.merge(iterators.inject({}) {|memo, i| memo.merge(i.flatten) })
55
+ end
56
+
57
+ iters = iterators.reduce({}) do |memo, i|
58
+ memo.merge("iterator#{iterators.index(i)}" => i.flatten)
59
+ end
60
+
61
+ properties.merge iters
48
62
  end
49
63
 
50
64
  def for_each selector
@@ -28,7 +28,7 @@ describe 'basic crawler setup' do
28
28
  results = crawler_instance.crawl
29
29
 
30
30
  results["search"].should == "Buscar"
31
- results["menu"].should =~ ["Agenda", "Brasileiro", "Brasil", "Bolsas", "Cinema", "Galerias de Fotos", "Beleza", "Esportes", "Assine o RSS"]
31
+ results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
32
32
  results["subheader"].should == "Londres 2012"
33
33
  results["social"]["twitter"].should == "Verão"
34
34
  end
@@ -43,21 +43,22 @@ describe 'basic crawler setup' do
43
43
  crawler.list_page "/explore"
44
44
 
45
45
  crawler.for_each "css=ol.ranked-repositories li" do
46
- repo 'css=h3'
47
- description('css=p.description') { |d| d.gsub(/for/, '') }
46
+ project do |p|
47
+ p.repo 'css=h3'
48
+ p.description('css=p.description') { |d| d.gsub(/for/, '') }
49
+ end
48
50
  end
49
51
 
50
52
  crawler_instance = crawler.new
51
53
  results = crawler_instance.crawl
52
54
 
53
- results["repo"].should =~ ["jairajs89 / Touchy.js", "mcavage / node-restify", "notlion / streetview-stereographic", "twitter / bootstrap", "stolksdorf / Parallaxjs"]
54
- results["description"].should =~ [
55
- "node.js REST framework specifically meant web service APIs",
56
- "A simple light-weight JavaScript library dealing with touch events",
57
- "Shader Toy + Google Map + Panoramic Explorer",
58
- "HTML, CSS, and JS toolkit from Twitter",
59
- "a Library Javascript that allows easy page parallaxing"
60
- ]
55
+ results.should == { "iterator0" => [
56
+ { "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
57
+ { "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
58
+ { "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
59
+ { "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
60
+ { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
61
+ ]}
61
62
  end
62
63
  end
63
64
 
@@ -79,9 +80,21 @@ describe 'basic crawler setup' do
79
80
 
80
81
  crawler_instance = crawler.new
81
82
  results = crawler_instance.crawl
83
+ iterator = results['iterator0']
84
+
85
+ iterator.should == [
86
+ {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
87
+ {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
88
+ {"latitude"=>"37.869784", "longitude"=>"-122.267701"},
89
+ {"latitude"=>"37.870873", "longitude"=>"-122.269313"},
90
+ {"latitude"=>"37.782348", "longitude"=>"-122.408059"},
91
+ {"latitude"=>"37.775529", "longitude"=>"-122.437757"},
92
+ {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
93
+ {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
94
+ {"latitude"=>"37.784963", "longitude"=>"-122.418871"},
95
+ {"latitude"=>"37.788978", "longitude"=>"-122.40664"}
96
+ ]
82
97
 
83
- results["latitude"].should =~ ["37.807775", "37.807717", "37.869784", "37.870873", "37.782348", "37.775529", "37.771079", "37.771079", "37.784963", "37.788978"]
84
- results["longitude"].should =~ ["-122.272736", "-122.270059", "-122.267701", "-122.269313", "-122.408059", "-122.437757", "-122.412604", "-122.412604", "-122.418871", "-122.40664"]
85
98
  results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
86
99
  end
87
100
  end
@@ -0,0 +1,43 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Iterator do
4
+ let(:it) { Wombat::Iterator.new "it_selector" }
5
+
6
+ context 'parse' do
7
+ it 'should iterate in for_each properties' do
8
+ it.prop_1 "some_selector"
9
+ it.prop_2 "another_selector"
10
+
11
+ it['prop_1'].should_receive(:result).twice.and_return([])
12
+ it['prop_2'].should_receive(:result).twice.and_return([])
13
+
14
+ parser = double :parser
15
+ parser.should_receive(:locate).with(it['prop_1']).twice
16
+ parser.should_receive(:locate).with(it['prop_2']).twice
17
+
18
+ it.parse { |p| parser.locate p }
19
+ it.parse { |p| parser.locate p }
20
+ end
21
+
22
+ it 'should raise if no block given' do
23
+ expect{
24
+ it.parse
25
+ }.to raise_error(ArgumentError)
26
+ end
27
+ end
28
+
29
+ it 'should flatten properties to plain hash format' do
30
+ it.prop_1 "some_selector"
31
+ it.prop_2 "another_selector"
32
+
33
+ it.parse {|p| }
34
+ it.parse {|p| }
35
+ it['prop_1'].result = ['result 1', 'result 2']
36
+ it['prop_2'].result = ['result 3', 'result 4']
37
+
38
+ it.flatten.should == [
39
+ { "prop_1" => "result 1", "prop_2" => "result 3" },
40
+ { "prop_1" => "result 2", "prop_2" => "result 4" }
41
+ ]
42
+ end
43
+ end
data/spec/parser_spec.rb CHANGED
@@ -72,10 +72,6 @@ describe Wombat::Parser do
72
72
  block_called.should be_true
73
73
  end
74
74
 
75
- it 'should invoke callback inside for_each block' do
76
-
77
- end
78
-
79
75
  it 'should return hash with requested properties' do
80
76
  hash = double :results
81
77
  fake_parser = double :parser
@@ -88,33 +84,6 @@ describe Wombat::Parser do
88
84
  @parser.parse(@metadata).should == hash
89
85
  end
90
86
 
91
- it 'should iterate in for_each properties' do
92
- fake_parser = double :parser
93
- fake_document = double :document
94
- c1 = double :context
95
- c2 = double :context
96
- it = Wombat::Iterator.new "it_selector"
97
- it.prop_1 "some_selector"
98
- it.prop_2 "another_selector"
99
-
100
- @parser.should_receive(:context=).ordered
101
- @metadata.should_receive(:iterators).and_return [it]
102
- @metadata.should_receive(:flatten)
103
- fake_document.should_receive(:parser).and_return(fake_parser)
104
- it['prop_1'].should_receive(:result).exactly(2).times.and_return([])
105
- it['prop_2'].should_receive(:result).exactly(2).times.and_return([])
106
- @parser.mechanize.stub(:get).and_return fake_document
107
- @parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
108
- @parser.should_receive(:context=).with(c1).ordered
109
- @parser.should_receive(:context=).with(c2).ordered
110
- @parser.should_receive(:context=).ordered
111
- @parser.should_receive(:locate).with(it['prop_1']).twice
112
- @parser.should_receive(:locate).with(it['prop_2']).twice
113
- @parser.stub(:locate)
114
-
115
- @parser.parse(@metadata)
116
- end
117
-
118
87
  it 'should not include null results in iterated block' do
119
88
  fake_parser = double :parser
120
89
  fake_document = double :document
@@ -55,7 +55,7 @@ describe Wombat::PropertyContainer do
55
55
  "section" => "Lorem Ipsum"
56
56
  }
57
57
  },
58
- "felipe" => ["correa", "de souza", "lima"],
58
+ "iterator0"=>[{"felipe"=>"correa"}, {"felipe"=>"de souza"}, {"felipe"=>"lima"}],
59
59
  "footer" => "bla bla bla"
60
60
  }
61
61
  end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "0.2.5"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-03-21"
12
+ s.date = "2012-03-25"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
42
42
  "spec/crawler_spec.rb",
43
43
  "spec/helpers/sample_crawler.rb",
44
44
  "spec/integration/integration_spec.rb",
45
+ "spec/iterator_spec.rb",
45
46
  "spec/metadata_spec.rb",
46
47
  "spec/parser_spec.rb",
47
48
  "spec/property_container_spec.rb",
@@ -70,7 +71,7 @@ Gem::Specification.new do |s|
70
71
  s.add_development_dependency(%q<yard>, [">= 0"])
71
72
  s.add_development_dependency(%q<jeweler>, [">= 0"])
72
73
  s.add_development_dependency(%q<rspec>, [">= 0"])
73
- s.add_development_dependency(%q<vcr>, ["= 2.0.0.rc1"])
74
+ s.add_development_dependency(%q<vcr>, [">= 0"])
74
75
  s.add_development_dependency(%q<fakeweb>, [">= 0"])
75
76
  else
76
77
  s.add_dependency(%q<mechanize>, [">= 0"])
@@ -81,7 +82,7 @@ Gem::Specification.new do |s|
81
82
  s.add_dependency(%q<yard>, [">= 0"])
82
83
  s.add_dependency(%q<jeweler>, [">= 0"])
83
84
  s.add_dependency(%q<rspec>, [">= 0"])
84
- s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
85
+ s.add_dependency(%q<vcr>, [">= 0"])
85
86
  s.add_dependency(%q<fakeweb>, [">= 0"])
86
87
  end
87
88
  else
@@ -93,7 +94,7 @@ Gem::Specification.new do |s|
93
94
  s.add_dependency(%q<yard>, [">= 0"])
94
95
  s.add_dependency(%q<jeweler>, [">= 0"])
95
96
  s.add_dependency(%q<rspec>, [">= 0"])
96
- s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
97
+ s.add_dependency(%q<vcr>, [">= 0"])
97
98
  s.add_dependency(%q<fakeweb>, [">= 0"])
98
99
  end
99
100
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-21 00:00:00.000000000 Z
12
+ date: 2012-03-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -144,17 +144,17 @@ dependencies:
144
144
  requirement: !ruby/object:Gem::Requirement
145
145
  none: false
146
146
  requirements:
147
- - - '='
147
+ - - ! '>='
148
148
  - !ruby/object:Gem::Version
149
- version: 2.0.0.rc1
149
+ version: '0'
150
150
  type: :development
151
151
  prerelease: false
152
152
  version_requirements: !ruby/object:Gem::Requirement
153
153
  none: false
154
154
  requirements:
155
- - - '='
155
+ - - ! '>='
156
156
  - !ruby/object:Gem::Version
157
- version: 2.0.0.rc1
157
+ version: '0'
158
158
  - !ruby/object:Gem::Dependency
159
159
  name: fakeweb
160
160
  requirement: !ruby/object:Gem::Requirement
@@ -204,6 +204,7 @@ files:
204
204
  - spec/crawler_spec.rb
205
205
  - spec/helpers/sample_crawler.rb
206
206
  - spec/integration/integration_spec.rb
207
+ - spec/iterator_spec.rb
207
208
  - spec/metadata_spec.rb
208
209
  - spec/parser_spec.rb
209
210
  - spec/property_container_spec.rb