wombat 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock CHANGED
@@ -1,46 +1,54 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- activesupport (3.0.11)
4
+ activesupport (3.2.3)
5
+ i18n (~> 0.6)
6
+ multi_json (~> 1.0)
5
7
  diff-lcs (1.1.3)
6
- domain_name (0.5.1)
8
+ domain_name (0.5.3)
7
9
  unf (~> 0.0.3)
8
10
  fakeweb (1.3.0)
9
11
  git (1.2.5)
10
- jeweler (1.6.4)
12
+ i18n (0.6.0)
13
+ jeweler (1.8.3)
11
14
  bundler (~> 1.0)
12
15
  git (>= 1.2.5)
13
16
  rake
14
- mechanize (2.1)
17
+ rdoc
18
+ json (1.7.3)
19
+ mechanize (2.5.1)
15
20
  domain_name (~> 0.5, >= 0.5.1)
21
+ mime-types (~> 1.17, >= 1.17.2)
16
22
  net-http-digest_auth (~> 1.1, >= 1.1.1)
17
- net-http-persistent (~> 2.3, >= 2.3.2)
23
+ net-http-persistent (~> 2.5, >= 2.5.2)
18
24
  nokogiri (~> 1.4)
19
25
  ntlm-http (~> 0.1, >= 0.1.1)
20
26
  webrobots (~> 0.0, >= 0.0.9)
21
- mime-types (1.17.2)
22
- net-http-digest_auth (1.2)
23
- net-http-persistent (2.3.3)
24
- nokogiri (1.5.0)
27
+ mime-types (1.18)
28
+ multi_json (1.3.5)
29
+ net-http-digest_auth (1.2.1)
30
+ net-http-persistent (2.6)
31
+ nokogiri (1.5.2)
25
32
  ntlm-http (0.1.1)
26
33
  rake (0.9.2.2)
34
+ rdoc (3.12)
35
+ json (~> 1.4)
27
36
  rest-client (1.6.7)
28
37
  mime-types (>= 1.16)
29
- rspec (2.7.0)
30
- rspec-core (~> 2.7.0)
31
- rspec-expectations (~> 2.7.0)
32
- rspec-mocks (~> 2.7.0)
33
- rspec-core (2.7.1)
34
- rspec-expectations (2.7.0)
35
- diff-lcs (~> 1.1.2)
36
- rspec-mocks (2.7.0)
37
- unf (0.0.4)
38
+ rspec (2.10.0)
39
+ rspec-core (~> 2.10.0)
40
+ rspec-expectations (~> 2.10.0)
41
+ rspec-mocks (~> 2.10.0)
42
+ rspec-core (2.10.1)
43
+ rspec-expectations (2.10.0)
44
+ diff-lcs (~> 1.1.3)
45
+ rspec-mocks (2.10.1)
46
+ unf (0.0.5)
38
47
  unf_ext
39
48
  unf_ext (0.0.4)
40
- vcr (2.0.0)
41
- webrobots (0.0.12)
42
- nokogiri (>= 1.4.4)
43
- yard (0.7.4)
49
+ vcr (2.1.1)
50
+ webrobots (0.0.13)
51
+ yard (0.8.1)
44
52
 
45
53
  PLATFORMS
46
54
  ruby
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.1
1
+ 0.4.0
data/lib/wombat.rb CHANGED
@@ -3,4 +3,9 @@
3
3
  require 'wombat/crawler'
4
4
 
5
5
  module Wombat
6
+ def self.crawl(&block)
7
+ klass = Class.new
8
+ klass.send(:include, Wombat::Crawler)
9
+ klass.new.crawl(&block)
10
+ end
6
11
  end
@@ -14,27 +14,41 @@ module Wombat
14
14
  if block
15
15
  @metadata_dup = self.class.send(:metadata).clone
16
16
  instance_eval do
17
+ alias :old_method_missing :method_missing
17
18
  def method_missing method, *args, &block
18
19
  @metadata_dup.send method, *args, &block
19
20
  end
20
21
  end
21
22
  self.instance_eval &block
22
- parse @metadata_dup
23
+ parsed = parse @metadata_dup
24
+ instance_eval do
25
+ alias :method_missing :old_method_missing
26
+ remove_instance_variable :@metadata_dup
27
+ end
28
+ parsed
23
29
  else
24
30
  parse self.class.send(:metadata)
25
31
  end
26
32
  end
27
33
 
34
+ def method_missing(method, *args, &block)
35
+ self.class.send method, *args, &block
36
+ end
37
+
38
+ def for_each(selector, &block)
39
+ self.class.for_each selector, &block
40
+ end
41
+
28
42
  module ClassMethods
29
- def method_missing method, *args, &block
43
+ def method_missing(method, *args, &block)
30
44
  metadata.send method, *args, &block
31
45
  end
32
46
 
33
- def for_each selector, &block
47
+ def for_each(selector, &block)
34
48
  metadata.for_each(selector).instance_eval(&block) if block
35
49
  end
36
50
 
37
- def follow_links selector
51
+ def follow_links(selector)
38
52
 
39
53
  end
40
54
 
@@ -9,15 +9,15 @@ module Wombat
9
9
  super
10
10
  end
11
11
 
12
- def base_url url
12
+ def base_url(url)
13
13
  self[:base_url] = url
14
14
  end
15
15
 
16
- def list_page url
16
+ def list_page(url)
17
17
  self[:list_page] = url
18
18
  end
19
19
 
20
- def format format
20
+ def format(format)
21
21
  self[:format] = format
22
22
  end
23
23
  end
@@ -1,6 +1,6 @@
1
1
  module Wombat
2
2
  module NodeSelector
3
- def select_nodes selector, namespaces = nil
3
+ def select_nodes(selector, namespaces = nil)
4
4
  return [selector.to_s] if selector.is_a? Symbol
5
5
  return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
6
6
  return context.css selector[4..-1] if selector.start_with? "css="
data/lib/wombat/parser.rb CHANGED
@@ -12,7 +12,7 @@ module Wombat
12
12
  @mechanize = Mechanize.new
13
13
  end
14
14
 
15
- def parse metadata
15
+ def parse(metadata)
16
16
  self.context = parser_for metadata
17
17
  original_context = self.context
18
18
 
@@ -31,7 +31,7 @@ module Wombat
31
31
  end
32
32
 
33
33
  private
34
- def parser_for metadata
34
+ def parser_for(metadata)
35
35
  url = "#{metadata[:base_url]}#{metadata[:list_page]}"
36
36
 
37
37
  if metadata[:format] == :html
@@ -8,7 +8,7 @@ module Wombat
8
8
  @iterators = []
9
9
  end
10
10
 
11
- def method_missing method, *args, &block
11
+ def method_missing(method, *args, &block)
12
12
  if args.empty? && block
13
13
  self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
14
14
  block.call(self["#{method.to_s}"])
@@ -61,7 +61,7 @@ module Wombat
61
61
  properties.merge iters
62
62
  end
63
63
 
64
- def for_each selector
64
+ def for_each(selector)
65
65
  Iterator.new(selector).tap do |i|
66
66
  iterators << i
67
67
  end
@@ -5,13 +5,13 @@ module Wombat
5
5
  module PropertyLocator
6
6
  include NodeSelector
7
7
 
8
- def locate property
8
+ def locate(property)
9
9
  props = _locate property
10
10
  property.format != :list ? props.first : props
11
11
  end
12
12
 
13
13
  private
14
- def _locate property
14
+ def _locate(property)
15
15
  result = select_nodes(property.selector, property.namespaces).to_a
16
16
  result.map! {|r| r.inner_html.strip } if property.format == :html
17
17
  result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
data/spec/crawler_spec.rb CHANGED
@@ -1,145 +1,179 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Wombat::Crawler do
4
- before(:each) do
5
- @crawler = Class.new
6
- @crawler.send(:include, Wombat::Crawler)
7
- @crawler_instance = @crawler.new
8
- end
4
+ describe '#crawl' do
5
+ before(:each) do
6
+ @crawler = Class.new
7
+ @crawler.send(:include, Wombat::Crawler)
8
+ @crawler_instance = @crawler.new
9
+ end
9
10
 
10
- it 'should call the provided block' do
11
- event_called = false
11
+ it 'should call the provided block' do
12
+ event_called = false
12
13
 
13
- @crawler.event { event_called = true }
14
+ @crawler.event { event_called = true }
14
15
 
15
- event_called.should be_true
16
- end
16
+ event_called.should be_true
17
+ end
17
18
 
18
- it 'should provide metadata to yielded block' do
19
- @crawler.event do |e|
20
- e.should_not be_nil
19
+ it 'should provide metadata to yielded block' do
20
+ @crawler.event do |e|
21
+ e.should_not be_nil
22
+ end
21
23
  end
22
- end
23
24
 
24
- it 'should store assigned metadata information' do
25
- time = Time.now
25
+ it 'should store assigned metadata information' do
26
+ time = Time.now
26
27
 
27
- @crawler.event do |e|
28
- e.title 'Fulltronic Dezembro'
29
- e.time Time.now
30
- end
28
+ @crawler.event do |e|
29
+ e.title 'Fulltronic Dezembro'
30
+ e.time Time.now
31
+ end
31
32
 
32
- @crawler.venue { |v| v.name "Scooba" }
33
- @crawler.location { |v| v.latitude -50.2323 }
33
+ @crawler.venue { |v| v.name "Scooba" }
34
+ @crawler.location { |v| v.latitude -50.2323 }
34
35
 
35
- @crawler_instance.should_receive(:parse) do |arg|
36
- arg["event"]["title"].selector.should == "Fulltronic Dezembro"
37
- arg["event"]["time"].selector.to_s.should == time.to_s
38
- arg["venue"]["name"].selector.should == "Scooba"
39
- arg["location"]["latitude"].selector.should == -50.2323
36
+ @crawler_instance.should_receive(:parse) do |arg|
37
+ arg["event"]["title"].selector.should == "Fulltronic Dezembro"
38
+ arg["event"]["time"].selector.to_s.should == time.to_s
39
+ arg["venue"]["name"].selector.should == "Scooba"
40
+ arg["location"]["latitude"].selector.should == -50.2323
41
+ end
42
+
43
+ @crawler_instance.crawl
40
44
  end
41
45
 
42
- @crawler_instance.crawl
43
- end
46
+ it 'should isolate metadata between different instances' do
47
+ another_crawler = Class.new
48
+ another_crawler.send(:include, Wombat::Crawler)
49
+ another_crawler_instance = another_crawler.new
44
50
 
45
- it 'should isolate metadata between different instances' do
46
- another_crawler = Class.new
47
- another_crawler.send(:include, Wombat::Crawler)
48
- another_crawler_instance = another_crawler.new
51
+ another_crawler.event { |e| e.title 'Ibiza' }
52
+ another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
53
+ another_crawler_instance.crawl
49
54
 
50
- another_crawler.event { |e| e.title 'Ibiza' }
51
- another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
52
- another_crawler_instance.crawl
55
+ @crawler.event { |e| e.title 'Fulltronic Dezembro' }
56
+ @crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
57
+ @crawler_instance.crawl
58
+ end
53
59
 
54
- @crawler.event { |e| e.title 'Fulltronic Dezembro' }
55
- @crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
56
- @crawler_instance.crawl
57
- end
60
+ it 'should be able to assign arbitrary plain text metadata' do
61
+ @crawler.some_data("/event/list", :html, "geo") { |p| true }
58
62
 
59
- it 'should be able to assign arbitrary plain text metadata' do
60
- @crawler.some_data("/event/list", :html, "geo") { |p| true }
63
+ @crawler_instance.should_receive(:parse) do |arg|
64
+ prop = arg['some_data']
65
+ prop.name.should == "some_data"
66
+ prop.selector.should == "/event/list"
67
+ prop.format.should == :html
68
+ prop.namespaces.should == "geo"
69
+ prop.callback.should_not be_nil
70
+ end
61
71
 
62
- @crawler_instance.should_receive(:parse) do |arg|
63
- prop = arg['some_data']
64
- prop.name.should == "some_data"
65
- prop.selector.should == "/event/list"
66
- prop.format.should == :html
67
- prop.namespaces.should == "geo"
68
- prop.callback.should_not be_nil
72
+ @crawler_instance.crawl
69
73
  end
70
74
 
71
- @crawler_instance.crawl
72
- end
75
+ it 'should be able to specify arbitrary block structure more than once' do
76
+ @crawler.structure do |s|
77
+ s.data "xpath=/xyz"
78
+ end
73
79
 
74
- it 'should be able to specify arbitrary block structure more than once' do
75
- @crawler.structure do |s|
76
- s.data "xpath=/xyz"
77
- end
80
+ @crawler.structure do |s|
81
+ s.another "css=.information"
82
+ end
78
83
 
79
- @crawler.structure do |s|
80
- s.another "css=.information"
81
- end
84
+ @crawler_instance.should_receive(:parse) do |arg|
85
+ arg["structure"]["data"].selector.should == "xpath=/xyz"
86
+ arg["structure"]["another"].selector.should == "css=.information"
87
+ end
82
88
 
83
- @crawler_instance.should_receive(:parse) do |arg|
84
- arg["structure"]["data"].selector.should == "xpath=/xyz"
85
- arg["structure"]["another"].selector.should == "css=.information"
89
+ @crawler_instance.crawl
86
90
  end
87
91
 
88
- @crawler_instance.crawl
89
- end
92
+ it 'should not explode if no block given' do
93
+ @crawler.event
94
+ end
90
95
 
91
- it 'should not explode if no block given' do
92
- @crawler.event
93
- end
96
+ it 'should iterate on elements inside for_each block' do
97
+ @crawler.for_each "css=.element" do
98
+ title "css=.title"
99
+ body "css=.body"
100
+ event do |e|
101
+ e.all "yeah"
102
+ end
103
+ end
94
104
 
95
- it 'should iterate on elements inside for_each block' do
96
- @crawler.for_each "css=.element" do
97
- title "css=.title"
98
- body "css=.body"
99
- event do |e|
100
- e.all "yeah"
105
+ @crawler_instance.should_receive(:parse) do |arg|
106
+ it = arg.iterators.first
107
+ it.selector.should == "css=.element"
108
+ it["title"].selector.should == "css=.title"
109
+ it["body"].selector.should == "css=.body"
110
+ it["event"]["all"].selector.should == "yeah"
101
111
  end
112
+
113
+ @crawler_instance.crawl
102
114
  end
103
115
 
104
- @crawler_instance.should_receive(:parse) do |arg|
105
- it = arg.iterators.first
106
- it.selector.should == "css=.element"
107
- it["title"].selector.should == "css=.title"
108
- it["body"].selector.should == "css=.body"
109
- it["event"]["all"].selector.should == "yeah"
116
+ it 'should assign metadata format' do
117
+ @crawler_instance.should_receive(:parse) do |arg|
118
+ arg[:format].should == :xml
119
+ end
120
+ @crawler.format :xml
121
+ @crawler_instance.crawl
110
122
  end
111
123
 
112
- @crawler_instance.crawl
113
- end
124
+ it 'should crawl with block' do
125
+ @crawler.base_url "danielnc.com"
126
+ @crawler.list_page "/itens"
114
127
 
115
- it 'should assign metadata format' do
116
- @crawler_instance.should_receive(:parse) do |arg|
117
- arg[:format].should == :xml
118
- end
119
- @crawler.format :xml
120
- @crawler_instance.crawl
121
- end
128
+ @crawler_instance.should_receive(:parse) do |arg|
129
+ arg[:base_url].should == "danielnc.com"
130
+ arg[:list_page].should == "/itens/1"
131
+ end
122
132
 
123
- it 'should crawl with block' do
124
- @crawler.base_url "danielnc.com"
125
- @crawler.list_page "/itens"
133
+ @crawler_instance.crawl do
134
+ list_page "/itens/1"
135
+ end
126
136
 
127
- @crawler_instance.should_receive(:parse) do |arg|
128
- arg[:base_url].should == "danielnc.com"
129
- arg[:list_page].should == "/itens/1"
130
- end
137
+ another_instance = @crawler.new
131
138
 
132
- @crawler_instance.crawl do
133
- list_page "/itens/1"
139
+ another_instance.should_receive(:parse) do |arg|
140
+ arg[:base_url].should == "danielnc.com"
141
+ arg[:list_page].should == "/itens"
142
+ end
143
+
144
+ another_instance.crawl
134
145
  end
146
+
147
+ it 'should remove created method missing' do
148
+ @crawler.base_url "danielnc.com"
149
+ @crawler.list_page "/itens"
150
+
151
+ @crawler_instance.should_receive(:parse) do |arg|
152
+ arg[:base_url].should == "danielnc.com"
153
+ arg[:list_page].should == "/itens/1"
154
+ end
135
155
 
136
- another_instance = @crawler.new
156
+ @crawler_instance.crawl do
157
+ list_page "/itens/1"
158
+ end
137
159
 
138
- another_instance.should_receive(:parse) do |arg|
139
- arg[:base_url].should == "danielnc.com"
140
- arg[:list_page].should == "/itens"
160
+ lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
141
161
  end
142
162
 
143
- another_instance.crawl
163
+ it 'should remove created instance variable' do
164
+ @crawler.base_url "danielnc.com"
165
+ @crawler.list_page "/itens"
166
+
167
+ @crawler_instance.should_receive(:parse) do |arg|
168
+ arg[:base_url].should == "danielnc.com"
169
+ arg[:list_page].should == "/itens/1"
170
+ end
171
+
172
+ @crawler_instance.crawl do
173
+ list_page "/itens/1"
174
+ end
175
+
176
+ @crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
177
+ end
144
178
  end
145
179
  end
@@ -34,6 +34,65 @@ describe 'basic crawler setup' do
34
34
  end
35
35
  end
36
36
 
37
+ it 'should crawl page through block to class instance crawl method' do
38
+ VCR.use_cassette('basic_crawler_page') do
39
+ crawler = Class.new
40
+ crawler.send(:include, Wombat::Crawler)
41
+ crawler_instance = crawler.new
42
+ results = crawler_instance.crawl do
43
+ base_url "http://www.terra.com.br"
44
+ list_page '/portal'
45
+
46
+ search "css=.btn-search"
47
+
48
+ social do |s|
49
+ s.twitter "css=.ctn-bar li.last"
50
+ end
51
+
52
+ for_each "css=.ctn-links" do
53
+ menu "css=a"
54
+ end
55
+
56
+ subheader "css=h2.ttl-dynamic" do |h|
57
+ h.gsub("London", "Londres")
58
+ end
59
+ end
60
+
61
+ results["search"].should == "Buscar"
62
+ results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
63
+ results["subheader"].should == "Londres 2012"
64
+ results["social"]["twitter"].should == "Verão"
65
+ end
66
+ end
67
+
68
+ it 'should crawl page through static crawl method' do
69
+ VCR.use_cassette('basic_crawler_page') do
70
+ results = Wombat.crawl do
71
+ base_url "http://www.terra.com.br"
72
+ list_page '/portal'
73
+
74
+ search "css=.btn-search"
75
+
76
+ social do |s|
77
+ s.twitter "css=.ctn-bar li.last"
78
+ end
79
+
80
+ for_each "css=.ctn-links" do
81
+ menu "css=a"
82
+ end
83
+
84
+ subheader "css=h2.ttl-dynamic" do |h|
85
+ h.gsub("London", "Londres")
86
+ end
87
+ end
88
+
89
+ results["search"].should == "Buscar"
90
+ results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
91
+ results["subheader"].should == "Londres 2012"
92
+ results["social"]["twitter"].should == "Verão"
93
+ end
94
+ end
95
+
37
96
  it 'should iterate elements' do
38
97
  VCR.use_cassette('for_each_page') do
39
98
  crawler = Class.new
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat do
4
+ it 'should provide syntactic sugar method Wombat.crawl' do
5
+ Wombat.should respond_to(:crawl)
6
+ end
7
+ end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "0.3.1"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-04-12"
12
+ s.date = "2012-05-25"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -50,6 +50,7 @@ Gem::Specification.new do |s|
50
50
  "spec/property_spec.rb",
51
51
  "spec/sample_crawler_spec.rb",
52
52
  "spec/spec_helper.rb",
53
+ "spec/wombat_spec.rb",
53
54
  "wombat.gemspec"
54
55
  ]
55
56
  s.homepage = "http://github.com/felipecsl/wombat"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-12 00:00:00.000000000 Z
12
+ date: 2012-05-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -212,6 +212,7 @@ files:
212
212
  - spec/property_spec.rb
213
213
  - spec/sample_crawler_spec.rb
214
214
  - spec/spec_helper.rb
215
+ - spec/wombat_spec.rb
215
216
  - wombat.gemspec
216
217
  homepage: http://github.com/felipecsl/wombat
217
218
  licenses: