wombat 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,12 +29,8 @@ module Wombat
29
29
  end
30
30
  end
31
31
 
32
- def with_details_page
33
- yield metadata if block_given?
34
- end
35
-
36
- def for_each selector
37
-
32
+ def for_each selector, &block
33
+ metadata.for_each(selector).instance_eval(&block) if block
38
34
  end
39
35
 
40
36
  def follow_links selector
@@ -0,0 +1,9 @@
1
+ module Wombat
2
+ class Iterator < PropertyContainer
3
+ attr_accessor :selector
4
+
5
+ def initialize selector
6
+ @selector = selector
7
+ end
8
+ end
9
+ end
@@ -1,8 +1,15 @@
1
1
  #coding: utf-8
2
2
  require 'wombat/property_container'
3
+ require 'wombat/iterator'
3
4
 
4
5
  module Wombat
5
6
  class Metadata < PropertyContainer
7
+ attr_accessor :iterators
8
+
9
+ def initialize
10
+ @iterators = []
11
+ end
12
+
6
13
  def base_url url
7
14
  self[:base_url] = url
8
15
  end
@@ -10,5 +17,11 @@ module Wombat
10
17
  def list_page url
11
18
  self[:list_page] = url
12
19
  end
20
+
21
+ def for_each selector
22
+ Iterator.new(selector).tap do |i|
23
+ iterators << i
24
+ end
25
+ end
13
26
  end
14
27
  end
data/lib/wombat/parser.rb CHANGED
@@ -14,10 +14,12 @@ module Wombat
14
14
  def parse metadata
15
15
  @context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
16
16
 
17
- locate metadata
17
+ props = metadata.all_properties
18
18
 
19
- metadata.all_properties.each do |p|
20
- p.callback.call(p.result) if p.callback
19
+ locate props
20
+
21
+ props.each do |p|
22
+ p.result = p.callback.call(p.result) if p.callback
21
23
  end
22
24
 
23
25
  metadata.flatten
@@ -22,7 +22,7 @@ module Wombat
22
22
  def all_properties
23
23
  values.flat_map { |v|
24
24
  v.kind_of?(PropertyContainer) \
25
- ? v.values \
25
+ ? v.all_properties \
26
26
  : v.kind_of?(Property) \
27
27
  ? v \
28
28
  : nil
@@ -2,15 +2,15 @@
2
2
 
3
3
  module Wombat
4
4
  module PropertyLocator
5
- def locate metadata
6
- metadata.all_properties.each do |p|
5
+ def locate properties
6
+ properties.each do |p|
7
7
  p.result = locate_property(p).first
8
8
  end
9
9
  end
10
10
 
11
11
  private
12
12
  def locate_property property
13
- result = locate_selector(property.selector, property.namespaces)
13
+ result = locate_selector(property.selector, property.namespaces).to_a
14
14
  result.map! {|r| r.inner_html.strip } if property.format == :html
15
15
  result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
16
16
  end
data/spec/crawler_spec.rb CHANGED
@@ -91,4 +91,19 @@ describe Wombat::Crawler do
91
91
  it 'should not explode if no block given' do
92
92
  @crawler.event
93
93
  end
94
+
95
+ it 'should iterate on elements inside for_each block' do
96
+ @crawler.for_each "css=.element" do
97
+ title "css=.title"
98
+ body "css=.body"
99
+ end
100
+
101
+ @crawler_instance.should_receive(:parse) do |arg|
102
+ arg.iterators.first.selector.should == "css=.element"
103
+ arg.iterators.first["title"].selector.should == "css=.title"
104
+ arg.iterators.first["body"].selector.should == "css=.body"
105
+ end
106
+
107
+ @crawler_instance.crawl
108
+ end
94
109
  end
@@ -1,3 +1,4 @@
1
+ # coding: utf-8
1
2
  require 'spec_helper'
2
3
 
3
4
  describe 'basic crawler setup' do
@@ -14,12 +15,17 @@ describe 'basic crawler setup' do
14
15
  s.twitter "css=.ctn-bar li.last"
15
16
  end
16
17
 
18
+ crawler.subheader "css=h2.ttl-dynamic" do |h|
19
+ h.gsub("London", "Londres")
20
+ end
21
+
17
22
  crawler_instance = crawler.new
18
23
 
19
24
  results = crawler_instance.crawl
20
25
 
21
26
  results["search"].should == "Buscar"
22
- results["social"]["twitter"].should == "Terra Magazine"
27
+ results["subheader"].should == "Londres 2012"
28
+ results["social"]["twitter"].should == "Verão"
23
29
  end
24
30
  end
25
31
  end
@@ -11,4 +11,10 @@ describe Wombat::Metadata do
11
11
  @metadata.list_page "/yeah"
12
12
  @metadata.all_properties.should == [@metadata['another_property']]
13
13
  end
14
+
15
+ it 'should store iterators' do
16
+ @metadata.for_each("some_selector").kind_of?(Wombat::Iterator).should be_true
17
+ @metadata.iterators.size.should == 1
18
+ @metadata.iterators.first.selector.should == "some_selector"
19
+ end
14
20
  end
data/spec/parser_spec.rb CHANGED
@@ -24,7 +24,7 @@ describe Wombat::Parser do
24
24
  fake_parser = double :parser
25
25
  fake_document.should_receive(:parser).and_return(fake_parser)
26
26
  @parser.mechanize.stub(:get).and_return fake_document
27
- @parser.should_receive(:locate).with(@metadata)
27
+ @parser.should_receive(:locate).with(@metadata.all_properties)
28
28
  @parser.parse @metadata
29
29
  end
30
30
 
@@ -38,10 +38,11 @@ describe Wombat::Parser do
38
38
  property.stub(:result)
39
39
  fake_document.should_receive(:parser).and_return(fake_parser)
40
40
  property.should_receive(:callback).twice.and_return(block)
41
+ property.should_receive(:result=).with(true)
41
42
 
42
43
  @parser.mechanize.stub(:get).and_return fake_document
43
- @parser.should_receive(:locate).with(@metadata)
44
- @metadata.should_receive(:all_properties).and_return [property]
44
+ @metadata.stub(:all_properties).and_return [property]
45
+ @parser.should_receive(:locate).with(@metadata.all_properties)
45
46
 
46
47
  @parser.parse @metadata
47
48
 
@@ -61,10 +62,11 @@ describe Wombat::Parser do
61
62
  property.should_receive(:result).and_return("blah")
62
63
  fake_document.should_receive(:parser).and_return(fake_parser)
63
64
  property.should_receive(:callback).twice.and_return(block)
65
+ property.should_receive(:result=).with(true)
64
66
 
65
67
  @parser.mechanize.stub(:get).and_return fake_document
66
- @parser.should_receive(:locate).with(@metadata)
67
- @metadata.should_receive(:all_properties).and_return [property]
68
+ @metadata.stub(:all_properties).and_return [property]
69
+ @parser.should_receive(:locate).with(@metadata.all_properties)
68
70
 
69
71
  @parser.parse @metadata
70
72
 
@@ -9,12 +9,13 @@ describe Wombat::PropertyContainer do
9
9
  @metadata["event"] = Wombat::PropertyContainer.new
10
10
  @metadata["venue"] = Wombat::PropertyContainer.new
11
11
  @metadata.another_property "/some/selector", :text
12
- @metadata["event"].something "else"
12
+ @metadata["event"]["something"] = Wombat::PropertyContainer.new
13
+ @metadata["event"]["something"].else "Wohooo"
13
14
  @metadata["venue"].awesome "whooea"
14
15
 
15
16
  all_propes = @metadata.all_properties
16
17
 
17
- all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"], @metadata["venue"]["awesome"]]
18
+ all_propes.should =~ [@metadata["another_property"], @metadata["event"]["something"]["else"], @metadata["venue"]["awesome"]]
18
19
  end
19
20
 
20
21
  it 'should be able to change properties via all_properties' do
@@ -28,7 +28,7 @@ describe Wombat::PropertyLocator do
28
28
 
29
29
  @locator_instance.stub(:context).and_return context
30
30
 
31
- @locator_instance.locate @metadata
31
+ @locator_instance.locate @metadata.all_properties
32
32
 
33
33
  @metadata.get_property("blah").result.should == "abc"
34
34
  @metadata["event"].get_property("data1").result.should == "Something cool"
@@ -47,7 +47,7 @@ describe Wombat::PropertyLocator do
47
47
 
48
48
  @metadata["event"].another_info "xpath=/anotherData", :html
49
49
 
50
- @locator_instance.locate @metadata
50
+ @locator_instance.locate @metadata.all_properties
51
51
 
52
52
  @metadata["event"].get_property("another_info").result.should == "some another info"
53
53
  end
@@ -59,7 +59,7 @@ describe Wombat::PropertyLocator do
59
59
  @locator_instance.stub(:context).and_return context
60
60
  @metadata["event"].description "xpath=/event/some/description", :text, "blah"
61
61
 
62
- @locator_instance.locate @metadata
62
+ @locator_instance.locate @metadata.all_properties
63
63
 
64
64
  @metadata["event"].get_property("description").result.should == "awesome event"
65
65
  end
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "0.1.5"
8
+ s.version = "0.1.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-02-06"
12
+ s.date = "2012-02-08"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -30,6 +30,7 @@ Gem::Specification.new do |s|
30
30
  "fixtures/vcr_cassettes/basic_crawler_page.yml",
31
31
  "lib/wombat.rb",
32
32
  "lib/wombat/crawler.rb",
33
+ "lib/wombat/iterator.rb",
33
34
  "lib/wombat/metadata.rb",
34
35
  "lib/wombat/parser.rb",
35
36
  "lib/wombat/property.rb",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-06 00:00:00.000000000 Z
12
+ date: 2012-02-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70197219823100 !ruby/object:Gem::Requirement
16
+ requirement: &70163534813940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70197219823100
24
+ version_requirements: *70163534813940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: activesupport
27
- requirement: &70197219822620 !ruby/object:Gem::Requirement
27
+ requirement: &70163534813160 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70197219822620
35
+ version_requirements: *70163534813160
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: bundler
38
- requirement: &70197219822140 !ruby/object:Gem::Requirement
38
+ requirement: &70163534812680 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70197219822140
46
+ version_requirements: *70163534812680
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rake
49
- requirement: &70197219821660 !ruby/object:Gem::Requirement
49
+ requirement: &70163534812200 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70197219821660
57
+ version_requirements: *70163534812200
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: yard
60
- requirement: &70197219821120 !ruby/object:Gem::Requirement
60
+ requirement: &70163534811700 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70197219821120
68
+ version_requirements: *70163534811700
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: jeweler
71
- requirement: &70197219820520 !ruby/object:Gem::Requirement
71
+ requirement: &70163534811200 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70197219820520
79
+ version_requirements: *70163534811200
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: rspec
82
- requirement: &70197219819920 !ruby/object:Gem::Requirement
82
+ requirement: &70163534810720 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *70197219819920
90
+ version_requirements: *70163534810720
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: guard
93
- requirement: &70197219819200 !ruby/object:Gem::Requirement
93
+ requirement: &70163534810240 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :development
100
100
  prerelease: false
101
- version_requirements: *70197219819200
101
+ version_requirements: *70163534810240
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: guard-rspec
104
- requirement: &70197219818500 !ruby/object:Gem::Requirement
104
+ requirement: &70163535048080 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :development
111
111
  prerelease: false
112
- version_requirements: *70197219818500
112
+ version_requirements: *70163535048080
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: guard-bundler
115
- requirement: &70197219818020 !ruby/object:Gem::Requirement
115
+ requirement: &70163535047560 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: '0'
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *70197219818020
123
+ version_requirements: *70163535047560
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: vcr
126
- requirement: &70197219817400 !ruby/object:Gem::Requirement
126
+ requirement: &70163535047060 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - =
@@ -131,10 +131,10 @@ dependencies:
131
131
  version: 2.0.0.rc1
132
132
  type: :development
133
133
  prerelease: false
134
- version_requirements: *70197219817400
134
+ version_requirements: *70163535047060
135
135
  - !ruby/object:Gem::Dependency
136
136
  name: fakeweb
137
- requirement: &70197219816680 !ruby/object:Gem::Requirement
137
+ requirement: &70163535046440 !ruby/object:Gem::Requirement
138
138
  none: false
139
139
  requirements:
140
140
  - - ! '>='
@@ -142,7 +142,7 @@ dependencies:
142
142
  version: '0'
143
143
  type: :development
144
144
  prerelease: false
145
- version_requirements: *70197219816680
145
+ version_requirements: *70163535046440
146
146
  description: Generic Web crawler with a DSL that parses structured data from web pages
147
147
  email: felipe.lima@gmail.com
148
148
  executables: []
@@ -164,6 +164,7 @@ files:
164
164
  - fixtures/vcr_cassettes/basic_crawler_page.yml
165
165
  - lib/wombat.rb
166
166
  - lib/wombat/crawler.rb
167
+ - lib/wombat/iterator.rb
167
168
  - lib/wombat/metadata.rb
168
169
  - lib/wombat/parser.rb
169
170
  - lib/wombat/property.rb