wombat 0.3.1 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,46 +1,54 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- activesupport (3.0.11)
4
+ activesupport (3.2.3)
5
+ i18n (~> 0.6)
6
+ multi_json (~> 1.0)
5
7
  diff-lcs (1.1.3)
6
- domain_name (0.5.1)
8
+ domain_name (0.5.3)
7
9
  unf (~> 0.0.3)
8
10
  fakeweb (1.3.0)
9
11
  git (1.2.5)
10
- jeweler (1.6.4)
12
+ i18n (0.6.0)
13
+ jeweler (1.8.3)
11
14
  bundler (~> 1.0)
12
15
  git (>= 1.2.5)
13
16
  rake
14
- mechanize (2.1)
17
+ rdoc
18
+ json (1.7.3)
19
+ mechanize (2.5.1)
15
20
  domain_name (~> 0.5, >= 0.5.1)
21
+ mime-types (~> 1.17, >= 1.17.2)
16
22
  net-http-digest_auth (~> 1.1, >= 1.1.1)
17
- net-http-persistent (~> 2.3, >= 2.3.2)
23
+ net-http-persistent (~> 2.5, >= 2.5.2)
18
24
  nokogiri (~> 1.4)
19
25
  ntlm-http (~> 0.1, >= 0.1.1)
20
26
  webrobots (~> 0.0, >= 0.0.9)
21
- mime-types (1.17.2)
22
- net-http-digest_auth (1.2)
23
- net-http-persistent (2.3.3)
24
- nokogiri (1.5.0)
27
+ mime-types (1.18)
28
+ multi_json (1.3.5)
29
+ net-http-digest_auth (1.2.1)
30
+ net-http-persistent (2.6)
31
+ nokogiri (1.5.2)
25
32
  ntlm-http (0.1.1)
26
33
  rake (0.9.2.2)
34
+ rdoc (3.12)
35
+ json (~> 1.4)
27
36
  rest-client (1.6.7)
28
37
  mime-types (>= 1.16)
29
- rspec (2.7.0)
30
- rspec-core (~> 2.7.0)
31
- rspec-expectations (~> 2.7.0)
32
- rspec-mocks (~> 2.7.0)
33
- rspec-core (2.7.1)
34
- rspec-expectations (2.7.0)
35
- diff-lcs (~> 1.1.2)
36
- rspec-mocks (2.7.0)
37
- unf (0.0.4)
38
+ rspec (2.10.0)
39
+ rspec-core (~> 2.10.0)
40
+ rspec-expectations (~> 2.10.0)
41
+ rspec-mocks (~> 2.10.0)
42
+ rspec-core (2.10.1)
43
+ rspec-expectations (2.10.0)
44
+ diff-lcs (~> 1.1.3)
45
+ rspec-mocks (2.10.1)
46
+ unf (0.0.5)
38
47
  unf_ext
39
48
  unf_ext (0.0.4)
40
- vcr (2.0.0)
41
- webrobots (0.0.12)
42
- nokogiri (>= 1.4.4)
43
- yard (0.7.4)
49
+ vcr (2.1.1)
50
+ webrobots (0.0.13)
51
+ yard (0.8.1)
44
52
 
45
53
  PLATFORMS
46
54
  ruby
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.1
1
+ 0.4.0
data/lib/wombat.rb CHANGED
@@ -3,4 +3,9 @@
3
3
  require 'wombat/crawler'
4
4
 
5
5
  module Wombat
6
+ def self.crawl(&block)
7
+ klass = Class.new
8
+ klass.send(:include, Wombat::Crawler)
9
+ klass.new.crawl(&block)
10
+ end
6
11
  end
@@ -14,27 +14,41 @@ module Wombat
14
14
  if block
15
15
  @metadata_dup = self.class.send(:metadata).clone
16
16
  instance_eval do
17
+ alias :old_method_missing :method_missing
17
18
  def method_missing method, *args, &block
18
19
  @metadata_dup.send method, *args, &block
19
20
  end
20
21
  end
21
22
  self.instance_eval &block
22
- parse @metadata_dup
23
+ parsed = parse @metadata_dup
24
+ instance_eval do
25
+ alias :method_missing :old_method_missing
26
+ remove_instance_variable :@metadata_dup
27
+ end
28
+ parsed
23
29
  else
24
30
  parse self.class.send(:metadata)
25
31
  end
26
32
  end
27
33
 
34
+ def method_missing(method, *args, &block)
35
+ self.class.send method, *args, &block
36
+ end
37
+
38
+ def for_each(selector, &block)
39
+ self.class.for_each selector, &block
40
+ end
41
+
28
42
  module ClassMethods
29
- def method_missing method, *args, &block
43
+ def method_missing(method, *args, &block)
30
44
  metadata.send method, *args, &block
31
45
  end
32
46
 
33
- def for_each selector, &block
47
+ def for_each(selector, &block)
34
48
  metadata.for_each(selector).instance_eval(&block) if block
35
49
  end
36
50
 
37
- def follow_links selector
51
+ def follow_links(selector)
38
52
 
39
53
  end
40
54
 
@@ -9,15 +9,15 @@ module Wombat
9
9
  super
10
10
  end
11
11
 
12
- def base_url url
12
+ def base_url(url)
13
13
  self[:base_url] = url
14
14
  end
15
15
 
16
- def list_page url
16
+ def list_page(url)
17
17
  self[:list_page] = url
18
18
  end
19
19
 
20
- def format format
20
+ def format(format)
21
21
  self[:format] = format
22
22
  end
23
23
  end
@@ -1,6 +1,6 @@
1
1
  module Wombat
2
2
  module NodeSelector
3
- def select_nodes selector, namespaces = nil
3
+ def select_nodes(selector, namespaces = nil)
4
4
  return [selector.to_s] if selector.is_a? Symbol
5
5
  return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
6
6
  return context.css selector[4..-1] if selector.start_with? "css="
data/lib/wombat/parser.rb CHANGED
@@ -12,7 +12,7 @@ module Wombat
12
12
  @mechanize = Mechanize.new
13
13
  end
14
14
 
15
- def parse metadata
15
+ def parse(metadata)
16
16
  self.context = parser_for metadata
17
17
  original_context = self.context
18
18
 
@@ -31,7 +31,7 @@ module Wombat
31
31
  end
32
32
 
33
33
  private
34
- def parser_for metadata
34
+ def parser_for(metadata)
35
35
  url = "#{metadata[:base_url]}#{metadata[:list_page]}"
36
36
 
37
37
  if metadata[:format] == :html
@@ -8,7 +8,7 @@ module Wombat
8
8
  @iterators = []
9
9
  end
10
10
 
11
- def method_missing method, *args, &block
11
+ def method_missing(method, *args, &block)
12
12
  if args.empty? && block
13
13
  self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
14
14
  block.call(self["#{method.to_s}"])
@@ -61,7 +61,7 @@ module Wombat
61
61
  properties.merge iters
62
62
  end
63
63
 
64
- def for_each selector
64
+ def for_each(selector)
65
65
  Iterator.new(selector).tap do |i|
66
66
  iterators << i
67
67
  end
@@ -5,13 +5,13 @@ module Wombat
5
5
  module PropertyLocator
6
6
  include NodeSelector
7
7
 
8
- def locate property
8
+ def locate(property)
9
9
  props = _locate property
10
10
  property.format != :list ? props.first : props
11
11
  end
12
12
 
13
13
  private
14
- def _locate property
14
+ def _locate(property)
15
15
  result = select_nodes(property.selector, property.namespaces).to_a
16
16
  result.map! {|r| r.inner_html.strip } if property.format == :html
17
17
  result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
data/spec/crawler_spec.rb CHANGED
@@ -1,145 +1,179 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Wombat::Crawler do
4
- before(:each) do
5
- @crawler = Class.new
6
- @crawler.send(:include, Wombat::Crawler)
7
- @crawler_instance = @crawler.new
8
- end
4
+ describe '#crawl' do
5
+ before(:each) do
6
+ @crawler = Class.new
7
+ @crawler.send(:include, Wombat::Crawler)
8
+ @crawler_instance = @crawler.new
9
+ end
9
10
 
10
- it 'should call the provided block' do
11
- event_called = false
11
+ it 'should call the provided block' do
12
+ event_called = false
12
13
 
13
- @crawler.event { event_called = true }
14
+ @crawler.event { event_called = true }
14
15
 
15
- event_called.should be_true
16
- end
16
+ event_called.should be_true
17
+ end
17
18
 
18
- it 'should provide metadata to yielded block' do
19
- @crawler.event do |e|
20
- e.should_not be_nil
19
+ it 'should provide metadata to yielded block' do
20
+ @crawler.event do |e|
21
+ e.should_not be_nil
22
+ end
21
23
  end
22
- end
23
24
 
24
- it 'should store assigned metadata information' do
25
- time = Time.now
25
+ it 'should store assigned metadata information' do
26
+ time = Time.now
26
27
 
27
- @crawler.event do |e|
28
- e.title 'Fulltronic Dezembro'
29
- e.time Time.now
30
- end
28
+ @crawler.event do |e|
29
+ e.title 'Fulltronic Dezembro'
30
+ e.time Time.now
31
+ end
31
32
 
32
- @crawler.venue { |v| v.name "Scooba" }
33
- @crawler.location { |v| v.latitude -50.2323 }
33
+ @crawler.venue { |v| v.name "Scooba" }
34
+ @crawler.location { |v| v.latitude -50.2323 }
34
35
 
35
- @crawler_instance.should_receive(:parse) do |arg|
36
- arg["event"]["title"].selector.should == "Fulltronic Dezembro"
37
- arg["event"]["time"].selector.to_s.should == time.to_s
38
- arg["venue"]["name"].selector.should == "Scooba"
39
- arg["location"]["latitude"].selector.should == -50.2323
36
+ @crawler_instance.should_receive(:parse) do |arg|
37
+ arg["event"]["title"].selector.should == "Fulltronic Dezembro"
38
+ arg["event"]["time"].selector.to_s.should == time.to_s
39
+ arg["venue"]["name"].selector.should == "Scooba"
40
+ arg["location"]["latitude"].selector.should == -50.2323
41
+ end
42
+
43
+ @crawler_instance.crawl
40
44
  end
41
45
 
42
- @crawler_instance.crawl
43
- end
46
+ it 'should isolate metadata between different instances' do
47
+ another_crawler = Class.new
48
+ another_crawler.send(:include, Wombat::Crawler)
49
+ another_crawler_instance = another_crawler.new
44
50
 
45
- it 'should isolate metadata between different instances' do
46
- another_crawler = Class.new
47
- another_crawler.send(:include, Wombat::Crawler)
48
- another_crawler_instance = another_crawler.new
51
+ another_crawler.event { |e| e.title 'Ibiza' }
52
+ another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
53
+ another_crawler_instance.crawl
49
54
 
50
- another_crawler.event { |e| e.title 'Ibiza' }
51
- another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
52
- another_crawler_instance.crawl
55
+ @crawler.event { |e| e.title 'Fulltronic Dezembro' }
56
+ @crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
57
+ @crawler_instance.crawl
58
+ end
53
59
 
54
- @crawler.event { |e| e.title 'Fulltronic Dezembro' }
55
- @crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
56
- @crawler_instance.crawl
57
- end
60
+ it 'should be able to assign arbitrary plain text metadata' do
61
+ @crawler.some_data("/event/list", :html, "geo") { |p| true }
58
62
 
59
- it 'should be able to assign arbitrary plain text metadata' do
60
- @crawler.some_data("/event/list", :html, "geo") { |p| true }
63
+ @crawler_instance.should_receive(:parse) do |arg|
64
+ prop = arg['some_data']
65
+ prop.name.should == "some_data"
66
+ prop.selector.should == "/event/list"
67
+ prop.format.should == :html
68
+ prop.namespaces.should == "geo"
69
+ prop.callback.should_not be_nil
70
+ end
61
71
 
62
- @crawler_instance.should_receive(:parse) do |arg|
63
- prop = arg['some_data']
64
- prop.name.should == "some_data"
65
- prop.selector.should == "/event/list"
66
- prop.format.should == :html
67
- prop.namespaces.should == "geo"
68
- prop.callback.should_not be_nil
72
+ @crawler_instance.crawl
69
73
  end
70
74
 
71
- @crawler_instance.crawl
72
- end
75
+ it 'should be able to specify arbitrary block structure more than once' do
76
+ @crawler.structure do |s|
77
+ s.data "xpath=/xyz"
78
+ end
73
79
 
74
- it 'should be able to specify arbitrary block structure more than once' do
75
- @crawler.structure do |s|
76
- s.data "xpath=/xyz"
77
- end
80
+ @crawler.structure do |s|
81
+ s.another "css=.information"
82
+ end
78
83
 
79
- @crawler.structure do |s|
80
- s.another "css=.information"
81
- end
84
+ @crawler_instance.should_receive(:parse) do |arg|
85
+ arg["structure"]["data"].selector.should == "xpath=/xyz"
86
+ arg["structure"]["another"].selector.should == "css=.information"
87
+ end
82
88
 
83
- @crawler_instance.should_receive(:parse) do |arg|
84
- arg["structure"]["data"].selector.should == "xpath=/xyz"
85
- arg["structure"]["another"].selector.should == "css=.information"
89
+ @crawler_instance.crawl
86
90
  end
87
91
 
88
- @crawler_instance.crawl
89
- end
92
+ it 'should not explode if no block given' do
93
+ @crawler.event
94
+ end
90
95
 
91
- it 'should not explode if no block given' do
92
- @crawler.event
93
- end
96
+ it 'should iterate on elements inside for_each block' do
97
+ @crawler.for_each "css=.element" do
98
+ title "css=.title"
99
+ body "css=.body"
100
+ event do |e|
101
+ e.all "yeah"
102
+ end
103
+ end
94
104
 
95
- it 'should iterate on elements inside for_each block' do
96
- @crawler.for_each "css=.element" do
97
- title "css=.title"
98
- body "css=.body"
99
- event do |e|
100
- e.all "yeah"
105
+ @crawler_instance.should_receive(:parse) do |arg|
106
+ it = arg.iterators.first
107
+ it.selector.should == "css=.element"
108
+ it["title"].selector.should == "css=.title"
109
+ it["body"].selector.should == "css=.body"
110
+ it["event"]["all"].selector.should == "yeah"
101
111
  end
112
+
113
+ @crawler_instance.crawl
102
114
  end
103
115
 
104
- @crawler_instance.should_receive(:parse) do |arg|
105
- it = arg.iterators.first
106
- it.selector.should == "css=.element"
107
- it["title"].selector.should == "css=.title"
108
- it["body"].selector.should == "css=.body"
109
- it["event"]["all"].selector.should == "yeah"
116
+ it 'should assign metadata format' do
117
+ @crawler_instance.should_receive(:parse) do |arg|
118
+ arg[:format].should == :xml
119
+ end
120
+ @crawler.format :xml
121
+ @crawler_instance.crawl
110
122
  end
111
123
 
112
- @crawler_instance.crawl
113
- end
124
+ it 'should crawl with block' do
125
+ @crawler.base_url "danielnc.com"
126
+ @crawler.list_page "/itens"
114
127
 
115
- it 'should assign metadata format' do
116
- @crawler_instance.should_receive(:parse) do |arg|
117
- arg[:format].should == :xml
118
- end
119
- @crawler.format :xml
120
- @crawler_instance.crawl
121
- end
128
+ @crawler_instance.should_receive(:parse) do |arg|
129
+ arg[:base_url].should == "danielnc.com"
130
+ arg[:list_page].should == "/itens/1"
131
+ end
122
132
 
123
- it 'should crawl with block' do
124
- @crawler.base_url "danielnc.com"
125
- @crawler.list_page "/itens"
133
+ @crawler_instance.crawl do
134
+ list_page "/itens/1"
135
+ end
126
136
 
127
- @crawler_instance.should_receive(:parse) do |arg|
128
- arg[:base_url].should == "danielnc.com"
129
- arg[:list_page].should == "/itens/1"
130
- end
137
+ another_instance = @crawler.new
131
138
 
132
- @crawler_instance.crawl do
133
- list_page "/itens/1"
139
+ another_instance.should_receive(:parse) do |arg|
140
+ arg[:base_url].should == "danielnc.com"
141
+ arg[:list_page].should == "/itens"
142
+ end
143
+
144
+ another_instance.crawl
134
145
  end
146
+
147
+ it 'should remove created method missing' do
148
+ @crawler.base_url "danielnc.com"
149
+ @crawler.list_page "/itens"
150
+
151
+ @crawler_instance.should_receive(:parse) do |arg|
152
+ arg[:base_url].should == "danielnc.com"
153
+ arg[:list_page].should == "/itens/1"
154
+ end
135
155
 
136
- another_instance = @crawler.new
156
+ @crawler_instance.crawl do
157
+ list_page "/itens/1"
158
+ end
137
159
 
138
- another_instance.should_receive(:parse) do |arg|
139
- arg[:base_url].should == "danielnc.com"
140
- arg[:list_page].should == "/itens"
160
+ lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
141
161
  end
142
162
 
143
- another_instance.crawl
163
+ it 'should remove created instance variable' do
164
+ @crawler.base_url "danielnc.com"
165
+ @crawler.list_page "/itens"
166
+
167
+ @crawler_instance.should_receive(:parse) do |arg|
168
+ arg[:base_url].should == "danielnc.com"
169
+ arg[:list_page].should == "/itens/1"
170
+ end
171
+
172
+ @crawler_instance.crawl do
173
+ list_page "/itens/1"
174
+ end
175
+
176
+ @crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
177
+ end
144
178
  end
145
179
  end
@@ -34,6 +34,65 @@ describe 'basic crawler setup' do
34
34
  end
35
35
  end
36
36
 
37
+ it 'should crawl page through block to class instance crawl method' do
38
+ VCR.use_cassette('basic_crawler_page') do
39
+ crawler = Class.new
40
+ crawler.send(:include, Wombat::Crawler)
41
+ crawler_instance = crawler.new
42
+ results = crawler_instance.crawl do
43
+ base_url "http://www.terra.com.br"
44
+ list_page '/portal'
45
+
46
+ search "css=.btn-search"
47
+
48
+ social do |s|
49
+ s.twitter "css=.ctn-bar li.last"
50
+ end
51
+
52
+ for_each "css=.ctn-links" do
53
+ menu "css=a"
54
+ end
55
+
56
+ subheader "css=h2.ttl-dynamic" do |h|
57
+ h.gsub("London", "Londres")
58
+ end
59
+ end
60
+
61
+ results["search"].should == "Buscar"
62
+ results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
63
+ results["subheader"].should == "Londres 2012"
64
+ results["social"]["twitter"].should == "Verão"
65
+ end
66
+ end
67
+
68
+ it 'should crawl page through static crawl method' do
69
+ VCR.use_cassette('basic_crawler_page') do
70
+ results = Wombat.crawl do
71
+ base_url "http://www.terra.com.br"
72
+ list_page '/portal'
73
+
74
+ search "css=.btn-search"
75
+
76
+ social do |s|
77
+ s.twitter "css=.ctn-bar li.last"
78
+ end
79
+
80
+ for_each "css=.ctn-links" do
81
+ menu "css=a"
82
+ end
83
+
84
+ subheader "css=h2.ttl-dynamic" do |h|
85
+ h.gsub("London", "Londres")
86
+ end
87
+ end
88
+
89
+ results["search"].should == "Buscar"
90
+ results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
91
+ results["subheader"].should == "Londres 2012"
92
+ results["social"]["twitter"].should == "Verão"
93
+ end
94
+ end
95
+
37
96
  it 'should iterate elements' do
38
97
  VCR.use_cassette('for_each_page') do
39
98
  crawler = Class.new
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat do
4
+ it 'should provide syntactic sugar method Wombat.crawl' do
5
+ Wombat.should respond_to(:crawl)
6
+ end
7
+ end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "0.3.1"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-04-12"
12
+ s.date = "2012-05-25"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -50,6 +50,7 @@ Gem::Specification.new do |s|
50
50
  "spec/property_spec.rb",
51
51
  "spec/sample_crawler_spec.rb",
52
52
  "spec/spec_helper.rb",
53
+ "spec/wombat_spec.rb",
53
54
  "wombat.gemspec"
54
55
  ]
55
56
  s.homepage = "http://github.com/felipecsl/wombat"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-12 00:00:00.000000000 Z
12
+ date: 2012-05-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -212,6 +212,7 @@ files:
212
212
  - spec/property_spec.rb
213
213
  - spec/sample_crawler_spec.rb
214
214
  - spec/spec_helper.rb
215
+ - spec/wombat_spec.rb
215
216
  - wombat.gemspec
216
217
  homepage: http://github.com/felipecsl/wombat
217
218
  licenses: