scrapify 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/lib/scrapify/base.rb CHANGED
@@ -25,7 +25,7 @@ module Scrapify
25
25
  define_finders
26
26
  end
27
27
 
28
- def attribute(name, options={})
28
+ def attribute(name, options={}, &block)
29
29
  add_attribute(name)
30
30
  parser = options[:xpath] ? :xpath : :css
31
31
  selector = options[parser]
@@ -34,12 +34,16 @@ module Scrapify
34
34
  meta_define "#{name}_values" do
35
35
  self.doc ||= parse_html
36
36
  self.doc.send(parser, selector).map do |element|
37
- content = element.content
38
- if matcher
39
- match_data = content.scan(matcher).map &:first
40
- options[:array] ? match_data : match_data.first
37
+ if block
38
+ yield element
41
39
  else
42
- content.strip
40
+ content = element.content
41
+ if matcher
42
+ match_data = content.scan(matcher).map &:first
43
+ options[:array] ? match_data : match_data.first
44
+ else
45
+ content.strip
46
+ end
43
47
  end
44
48
  end
45
49
  end
@@ -1,3 +1,3 @@
1
1
  module Scrapify
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
data/spec/pizza.rb CHANGED
@@ -6,6 +6,11 @@ class Pizza
6
6
  attribute :image_url, xpath: "//li//input//@value"
7
7
  attribute :price, css: ".price", regex: /([\d\.]+)/
8
8
  attribute :ingredients, css: ".ingredients", regex: /contains (\w+)/, array: true
9
+ attribute :ingredient_urls, css: '.references ol li' do |element|
10
+ element.children.map do |child|
11
+ child.attributes['href'].value if child.attributes['href']
12
+ end.compact
13
+ end
9
14
 
10
15
  key :name
11
- end
16
+ end
@@ -20,6 +20,7 @@ describe Scrapify do
20
20
  <li>contains tomato</li>
21
21
  <ol>
22
22
  </span>
23
+ <span class='references'><ol><li></li></ol></span
23
24
  </li>
24
25
  <li>
25
26
  <a>veg supreme</a><input value="veg.jpg">
@@ -30,18 +31,33 @@ describe Scrapify do
30
31
  <li>contains jalapeno</li>
31
32
  <ol>
32
33
  </span>
34
+ <span class='references'><ol><li></li></ol></span
33
35
  </li>
34
36
  <li>
35
37
  <a>pepperoni</a><input value="pepperoni.jpg">
36
38
  <span class='price'>(3.45)</span>
37
39
  <span class='ingredients'></span>
40
+ <span class='references'><ol><li></li></ol></span
41
+ </li>
42
+ <li>
43
+ <a>chicken golden delight</a><input value="golden.jpg">
44
+ <span class='price'>(4.56)</span>
45
+ <span class='ingredients'/>
46
+ <span class='references'>
47
+ <ol>
48
+ <li>
49
+ <div href='chicken.html'>chicken</div>
50
+ <div href='delight.html'>delight</div>
51
+ </li>
52
+ </ol>
53
+ </span>
38
54
  </li>
39
55
  </ul>
40
56
  HTML
41
57
  end
42
58
 
43
59
  it "should return attribute names" do
44
- ::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
60
+ ::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients, :ingredient_urls]
45
61
  end
46
62
 
47
63
  describe "html" do
@@ -50,19 +66,23 @@ describe Scrapify do
50
66
  end
51
67
 
52
68
  it "should parse html and fetch attributes using css" do
53
- ::Pizza.name_values.should == ['chicken supreme', 'veg supreme', 'pepperoni']
69
+ ::Pizza.name_values.should == ['chicken supreme', 'veg supreme', 'pepperoni', 'chicken golden delight']
54
70
  end
55
71
 
56
72
  it "should parse html and fetch attributes using xpath" do
57
- ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
73
+ ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg', 'golden.jpg']
58
74
  end
59
75
 
60
76
  it "should parse html and extract attributes using regex" do
61
- ::Pizza.price_values.should == ['1.23', '2.34', '3.45']
77
+ ::Pizza.price_values.should == ['1.23', '2.34', '3.45', '4.56']
62
78
  end
63
79
 
64
80
  it "should parse html and extract multiple attributes using regex" do
65
- ::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
81
+ ::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], [], []]
82
+ end
83
+
84
+ it 'should accept block to yield attribute values' do
85
+ ::Pizza.ingredient_urls_values.should == [[], [], [], ['chicken.html', 'delight.html']]
66
86
  end
67
87
 
68
88
  it "should strip content" do
@@ -104,23 +124,23 @@ describe Scrapify do
104
124
  describe "last" do
105
125
  it "should fetch last matching element" do
106
126
  last_pizza = ::Pizza.last
107
- last_pizza.name.should == 'pepperoni'
108
- last_pizza.image_url.should == 'pepperoni.jpg'
127
+ last_pizza.name.should == 'chicken golden delight'
128
+ last_pizza.image_url.should == 'golden.jpg'
109
129
  end
110
130
  end
111
131
 
112
132
  describe "all" do
113
133
  it "should fetch all objects" do
114
134
  pizzas = ::Pizza.all
115
- pizzas.size.should == 3
116
- pizzas.map(&:name).should == ['chicken supreme', 'veg supreme', 'pepperoni']
117
- pizzas.map(&:image_url).should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
135
+ pizzas.size.should == 4
136
+ pizzas.map(&:name).should == ['chicken supreme', 'veg supreme', 'pepperoni', 'chicken golden delight']
137
+ pizzas.map(&:image_url).should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg', 'golden.jpg']
118
138
  end
119
139
  end
120
140
 
121
141
  describe "count" do
122
142
  it "should return number of matching elements" do
123
- ::Pizza.count.should == 3
143
+ ::Pizza.count.should == 4
124
144
  end
125
145
  end
126
146
 
@@ -131,7 +151,8 @@ describe Scrapify do
131
151
  name: "chicken supreme",
132
152
  image_url: "chicken.jpg",
133
153
  price: '1.23',
134
- ingredients: ['corn', 'tomato']
154
+ ingredients: ['corn', 'tomato'],
155
+ ingredient_urls: []
135
156
  }
136
157
  end
137
158
  end
@@ -143,16 +164,18 @@ describe Scrapify do
143
164
  name: "chicken supreme",
144
165
  image_url: "chicken.jpg",
145
166
  price: '1.23',
146
- ingredients: ['corn', 'tomato']
167
+ ingredients: ['corn', 'tomato'],
168
+ ingredient_urls: []
147
169
  }.to_json
148
170
  end
149
171
 
150
172
  it "should convert array to json" do
151
173
  pizzas = ::Pizza.all
152
174
  pizzas.to_json.should == [
153
- {name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
154
- {name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
155
- {name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
175
+ {name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato'], :ingredient_urls => []},
176
+ {name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno'], :ingredient_urls => []},
177
+ {name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: [], :ingredient_urls => []},
178
+ {name: "chicken golden delight", image_url: "golden.jpg", price: '4.56', ingredients: [], :ingredient_urls => ['chicken.html', 'delight.html']},
156
179
  ].to_json
157
180
  end
158
181
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapify
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-14 00:00:00.000000000Z
12
+ date: 2012-06-19 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70352195064240 !ruby/object:Gem::Requirement
16
+ requirement: &70142157555980 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70352195064240
24
+ version_requirements: *70142157555980
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: mocha
27
- requirement: &70352195063700 !ruby/object:Gem::Requirement
27
+ requirement: &70142157554660 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70352195063700
35
+ version_requirements: *70142157554660
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: fakeweb
38
- requirement: &70352195062820 !ruby/object:Gem::Requirement
38
+ requirement: &70142157538940 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70352195062820
46
+ version_requirements: *70142157538940
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70352195062120 !ruby/object:Gem::Requirement
49
+ requirement: &70142157538080 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70352195062120
57
+ version_requirements: *70142157538080
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: activesupport
60
- requirement: &70352195061500 !ruby/object:Gem::Requirement
60
+ requirement: &70142157537240 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70352195061500
68
+ version_requirements: *70142157537240
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: json
71
- requirement: &70352195060200 !ruby/object:Gem::Requirement
71
+ requirement: &70142157536340 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70352195060200
79
+ version_requirements: *70142157536340
80
80
  description: ScrApify scraps static html sites to RESTlike APIs
81
81
  email:
82
82
  - sathish316@gmail.com