wombat 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -29,12 +29,8 @@ module Wombat
29
29
  end
30
30
  end
31
31
 
32
- def with_details_page
33
- yield metadata if block_given?
34
- end
35
-
36
- def for_each selector
37
-
32
+ def for_each selector, &block
33
+ metadata.for_each(selector).instance_eval(&block) if block
38
34
  end
39
35
 
40
36
  def follow_links selector
@@ -0,0 +1,9 @@
1
+ module Wombat
2
+ class Iterator < PropertyContainer
3
+ attr_accessor :selector
4
+
5
+ def initialize selector
6
+ @selector = selector
7
+ end
8
+ end
9
+ end
@@ -1,8 +1,15 @@
1
1
  #coding: utf-8
2
2
  require 'wombat/property_container'
3
+ require 'wombat/iterator'
3
4
 
4
5
  module Wombat
5
6
  class Metadata < PropertyContainer
7
+ attr_accessor :iterators
8
+
9
+ def initialize
10
+ @iterators = []
11
+ end
12
+
6
13
  def base_url url
7
14
  self[:base_url] = url
8
15
  end
@@ -10,5 +17,11 @@ module Wombat
10
17
  def list_page url
11
18
  self[:list_page] = url
12
19
  end
20
+
21
+ def for_each selector
22
+ Iterator.new(selector).tap do |i|
23
+ iterators << i
24
+ end
25
+ end
13
26
  end
14
27
  end
data/lib/wombat/parser.rb CHANGED
@@ -14,10 +14,12 @@ module Wombat
14
14
  def parse metadata
15
15
  @context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
16
16
 
17
- locate metadata
17
+ props = metadata.all_properties
18
18
 
19
- metadata.all_properties.each do |p|
20
- p.callback.call(p.result) if p.callback
19
+ locate props
20
+
21
+ props.each do |p|
22
+ p.result = p.callback.call(p.result) if p.callback
21
23
  end
22
24
 
23
25
  metadata.flatten
@@ -22,7 +22,7 @@ module Wombat
22
22
  def all_properties
23
23
  values.flat_map { |v|
24
24
  v.kind_of?(PropertyContainer) \
25
- ? v.values \
25
+ ? v.all_properties \
26
26
  : v.kind_of?(Property) \
27
27
  ? v \
28
28
  : nil
@@ -2,15 +2,15 @@
2
2
 
3
3
  module Wombat
4
4
  module PropertyLocator
5
- def locate metadata
6
- metadata.all_properties.each do |p|
5
+ def locate properties
6
+ properties.each do |p|
7
7
  p.result = locate_property(p).first
8
8
  end
9
9
  end
10
10
 
11
11
  private
12
12
  def locate_property property
13
- result = locate_selector(property.selector, property.namespaces)
13
+ result = locate_selector(property.selector, property.namespaces).to_a
14
14
  result.map! {|r| r.inner_html.strip } if property.format == :html
15
15
  result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
16
16
  end
data/spec/crawler_spec.rb CHANGED
@@ -91,4 +91,19 @@ describe Wombat::Crawler do
91
91
  it 'should not explode if no block given' do
92
92
  @crawler.event
93
93
  end
94
+
95
+ it 'should iterate on elements inside for_each block' do
96
+ @crawler.for_each "css=.element" do
97
+ title "css=.title"
98
+ body "css=.body"
99
+ end
100
+
101
+ @crawler_instance.should_receive(:parse) do |arg|
102
+ arg.iterators.first.selector.should == "css=.element"
103
+ arg.iterators.first["title"].selector.should == "css=.title"
104
+ arg.iterators.first["body"].selector.should == "css=.body"
105
+ end
106
+
107
+ @crawler_instance.crawl
108
+ end
94
109
  end
@@ -1,3 +1,4 @@
1
+ # coding: utf-8
1
2
  require 'spec_helper'
2
3
 
3
4
  describe 'basic crawler setup' do
@@ -14,12 +15,17 @@ describe 'basic crawler setup' do
14
15
  s.twitter "css=.ctn-bar li.last"
15
16
  end
16
17
 
18
+ crawler.subheader "css=h2.ttl-dynamic" do |h|
19
+ h.gsub("London", "Londres")
20
+ end
21
+
17
22
  crawler_instance = crawler.new
18
23
 
19
24
  results = crawler_instance.crawl
20
25
 
21
26
  results["search"].should == "Buscar"
22
- results["social"]["twitter"].should == "Terra Magazine"
27
+ results["subheader"].should == "Londres 2012"
28
+ results["social"]["twitter"].should == "Verão"
23
29
  end
24
30
  end
25
31
  end
@@ -11,4 +11,10 @@ describe Wombat::Metadata do
11
11
  @metadata.list_page "/yeah"
12
12
  @metadata.all_properties.should == [@metadata['another_property']]
13
13
  end
14
+
15
+ it 'should store iterators' do
16
+ @metadata.for_each("some_selector").kind_of?(Wombat::Iterator).should be_true
17
+ @metadata.iterators.size.should == 1
18
+ @metadata.iterators.first.selector.should == "some_selector"
19
+ end
14
20
  end
data/spec/parser_spec.rb CHANGED
@@ -24,7 +24,7 @@ describe Wombat::Parser do
24
24
  fake_parser = double :parser
25
25
  fake_document.should_receive(:parser).and_return(fake_parser)
26
26
  @parser.mechanize.stub(:get).and_return fake_document
27
- @parser.should_receive(:locate).with(@metadata)
27
+ @parser.should_receive(:locate).with(@metadata.all_properties)
28
28
  @parser.parse @metadata
29
29
  end
30
30
 
@@ -38,10 +38,11 @@ describe Wombat::Parser do
38
38
  property.stub(:result)
39
39
  fake_document.should_receive(:parser).and_return(fake_parser)
40
40
  property.should_receive(:callback).twice.and_return(block)
41
+ property.should_receive(:result=).with(true)
41
42
 
42
43
  @parser.mechanize.stub(:get).and_return fake_document
43
- @parser.should_receive(:locate).with(@metadata)
44
- @metadata.should_receive(:all_properties).and_return [property]
44
+ @metadata.stub(:all_properties).and_return [property]
45
+ @parser.should_receive(:locate).with(@metadata.all_properties)
45
46
 
46
47
  @parser.parse @metadata
47
48
 
@@ -61,10 +62,11 @@ describe Wombat::Parser do
61
62
  property.should_receive(:result).and_return("blah")
62
63
  fake_document.should_receive(:parser).and_return(fake_parser)
63
64
  property.should_receive(:callback).twice.and_return(block)
65
+ property.should_receive(:result=).with(true)
64
66
 
65
67
  @parser.mechanize.stub(:get).and_return fake_document
66
- @parser.should_receive(:locate).with(@metadata)
67
- @metadata.should_receive(:all_properties).and_return [property]
68
+ @metadata.stub(:all_properties).and_return [property]
69
+ @parser.should_receive(:locate).with(@metadata.all_properties)
68
70
 
69
71
  @parser.parse @metadata
70
72
 
@@ -9,12 +9,13 @@ describe Wombat::PropertyContainer do
9
9
  @metadata["event"] = Wombat::PropertyContainer.new
10
10
  @metadata["venue"] = Wombat::PropertyContainer.new
11
11
  @metadata.another_property "/some/selector", :text
12
- @metadata["event"].something "else"
12
+ @metadata["event"]["something"] = Wombat::PropertyContainer.new
13
+ @metadata["event"]["something"].else "Wohooo"
13
14
  @metadata["venue"].awesome "whooea"
14
15
 
15
16
  all_propes = @metadata.all_properties
16
17
 
17
- all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"], @metadata["venue"]["awesome"]]
18
+ all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"]["else"], @metadata["venue"]["awesome"]]
18
19
  end
19
20
 
20
21
  it 'should be able to change properties via all_properties' do
@@ -28,7 +28,7 @@ describe Wombat::PropertyLocator do
28
28
 
29
29
  @locator_instance.stub(:context).and_return context
30
30
 
31
- @locator_instance.locate @metadata
31
+ @locator_instance.locate @metadata.all_properties
32
32
 
33
33
  @metadata.get_property("blah").result.should == "abc"
34
34
  @metadata["event"].get_property("data1").result.should == "Something cool"
@@ -47,7 +47,7 @@ describe Wombat::PropertyLocator do
47
47
 
48
48
  @metadata["event"].another_info "xpath=/anotherData", :html
49
49
 
50
- @locator_instance.locate @metadata
50
+ @locator_instance.locate @metadata.all_properties
51
51
 
52
52
  @metadata["event"].get_property("another_info").result.should == "some another info"
53
53
  end
@@ -59,7 +59,7 @@ describe Wombat::PropertyLocator do
59
59
  @locator_instance.stub(:context).and_return context
60
60
  @metadata["event"].description "xpath=/event/some/description", :text, "blah"
61
61
 
62
- @locator_instance.locate @metadata
62
+ @locator_instance.locate @metadata.all_properties
63
63
 
64
64
  @metadata["event"].get_property("description").result.should == "awesome event"
65
65
  end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "0.1.5"
8
+ s.version = "0.1.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-02-06"
12
+ s.date = "2012-02-08"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -30,6 +30,7 @@ Gem::Specification.new do |s|
30
30
  "fixtures/vcr_cassettes/basic_crawler_page.yml",
31
31
  "lib/wombat.rb",
32
32
  "lib/wombat/crawler.rb",
33
+ "lib/wombat/iterator.rb",
33
34
  "lib/wombat/metadata.rb",
34
35
  "lib/wombat/parser.rb",
35
36
  "lib/wombat/property.rb",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-06 00:00:00.000000000 Z
12
+ date: 2012-02-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70197219823100 !ruby/object:Gem::Requirement
16
+ requirement: &70163534813940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70197219823100
24
+ version_requirements: *70163534813940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: activesupport
27
- requirement: &70197219822620 !ruby/object:Gem::Requirement
27
+ requirement: &70163534813160 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70197219822620
35
+ version_requirements: *70163534813160
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: bundler
38
- requirement: &70197219822140 !ruby/object:Gem::Requirement
38
+ requirement: &70163534812680 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70197219822140
46
+ version_requirements: *70163534812680
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &70197219821660 !ruby/object:Gem::Requirement
49
+ requirement: &70163534812200 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70197219821660
57
+ version_requirements: *70163534812200
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: yard
60
- requirement: &70197219821120 !ruby/object:Gem::Requirement
60
+ requirement: &70163534811700 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70197219821120
68
+ version_requirements: *70163534811700
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: jeweler
71
- requirement: &70197219820520 !ruby/object:Gem::Requirement
71
+ requirement: &70163534811200 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70197219820520
79
+ version_requirements: *70163534811200
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: rspec
82
- requirement: &70197219819920 !ruby/object:Gem::Requirement
82
+ requirement: &70163534810720 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *70197219819920
90
+ version_requirements: *70163534810720
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: guard
93
- requirement: &70197219819200 !ruby/object:Gem::Requirement
93
+ requirement: &70163534810240 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :development
100
100
  prerelease: false
101
- version_requirements: *70197219819200
101
+ version_requirements: *70163534810240
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: guard-rspec
104
- requirement: &70197219818500 !ruby/object:Gem::Requirement
104
+ requirement: &70163535048080 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :development
111
111
  prerelease: false
112
- version_requirements: *70197219818500
112
+ version_requirements: *70163535048080
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: guard-bundler
115
- requirement: &70197219818020 !ruby/object:Gem::Requirement
115
+ requirement: &70163535047560 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: '0'
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *70197219818020
123
+ version_requirements: *70163535047560
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: vcr
126
- requirement: &70197219817400 !ruby/object:Gem::Requirement
126
+ requirement: &70163535047060 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - =
@@ -131,10 +131,10 @@ dependencies:
131
131
  version: 2.0.0.rc1
132
132
  type: :development
133
133
  prerelease: false
134
- version_requirements: *70197219817400
134
+ version_requirements: *70163535047060
135
135
  - !ruby/object:Gem::Dependency
136
136
  name: fakeweb
137
- requirement: &70197219816680 !ruby/object:Gem::Requirement
137
+ requirement: &70163535046440 !ruby/object:Gem::Requirement
138
138
  none: false
139
139
  requirements:
140
140
  - - ! '>='
@@ -142,7 +142,7 @@ dependencies:
142
142
  version: '0'
143
143
  type: :development
144
144
  prerelease: false
145
- version_requirements: *70197219816680
145
+ version_requirements: *70163535046440
146
146
  description: Generic Web crawler with a DSL that parses structured data from web pages
147
147
  email: felipe.lima@gmail.com
148
148
  executables: []
@@ -164,6 +164,7 @@ files:
164
164
  - fixtures/vcr_cassettes/basic_crawler_page.yml
165
165
  - lib/wombat.rb
166
166
  - lib/wombat/crawler.rb
167
+ - lib/wombat/iterator.rb
167
168
  - lib/wombat/metadata.rb
168
169
  - lib/wombat/parser.rb
169
170
  - lib/wombat/property.rb