scrapify 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scrapify/base.rb +10 -6
- data/lib/scrapify/version.rb +1 -1
- data/spec/pizza.rb +6 -1
- data/spec/scrapify_spec.rb +39 -16
- metadata +14 -14
data/lib/scrapify/base.rb
CHANGED
@@ -25,7 +25,7 @@ module Scrapify
|
|
25
25
|
define_finders
|
26
26
|
end
|
27
27
|
|
28
|
-
def attribute(name, options={})
|
28
|
+
def attribute(name, options={}, &block)
|
29
29
|
add_attribute(name)
|
30
30
|
parser = options[:xpath] ? :xpath : :css
|
31
31
|
selector = options[parser]
|
@@ -34,12 +34,16 @@ module Scrapify
|
|
34
34
|
meta_define "#{name}_values" do
|
35
35
|
self.doc ||= parse_html
|
36
36
|
self.doc.send(parser, selector).map do |element|
|
37
|
-
|
38
|
-
|
39
|
-
match_data = content.scan(matcher).map &:first
|
40
|
-
options[:array] ? match_data : match_data.first
|
37
|
+
if block
|
38
|
+
yield element
|
41
39
|
else
|
42
|
-
content.strip
|
40
|
+
content = element.content
|
41
|
+
if matcher
|
42
|
+
match_data = content.scan(matcher).map &:first
|
43
|
+
options[:array] ? match_data : match_data.first
|
44
|
+
else
|
45
|
+
content.strip
|
46
|
+
end
|
43
47
|
end
|
44
48
|
end
|
45
49
|
end
|
data/lib/scrapify/version.rb
CHANGED
data/spec/pizza.rb
CHANGED
@@ -6,6 +6,11 @@ class Pizza
|
|
6
6
|
attribute :image_url, xpath: "//li//input//@value"
|
7
7
|
attribute :price, css: ".price", regex: /([\d\.]+)/
|
8
8
|
attribute :ingredients, css: ".ingredients", regex: /contains (\w+)/, array: true
|
9
|
+
attribute :ingredient_urls, css: '.references ol li' do |element|
|
10
|
+
element.children.map do |child|
|
11
|
+
child.attributes['href'].value if child.attributes['href']
|
12
|
+
end.compact
|
13
|
+
end
|
9
14
|
|
10
15
|
key :name
|
11
|
-
end
|
16
|
+
end
|
data/spec/scrapify_spec.rb
CHANGED
@@ -20,6 +20,7 @@ describe Scrapify do
|
|
20
20
|
<li>contains tomato</li>
|
21
21
|
<ol>
|
22
22
|
</span>
|
23
|
+
<span class='references'><ol><li></li></ol></span
|
23
24
|
</li>
|
24
25
|
<li>
|
25
26
|
<a>veg supreme</a><input value="veg.jpg">
|
@@ -30,18 +31,33 @@ describe Scrapify do
|
|
30
31
|
<li>contains jalapeno</li>
|
31
32
|
<ol>
|
32
33
|
</span>
|
34
|
+
<span class='references'><ol><li></li></ol></span
|
33
35
|
</li>
|
34
36
|
<li>
|
35
37
|
<a>pepperoni</a><input value="pepperoni.jpg">
|
36
38
|
<span class='price'>(3.45)</span>
|
37
39
|
<span class='ingredients'></span>
|
40
|
+
<span class='references'><ol><li></li></ol></span
|
41
|
+
</li>
|
42
|
+
<li>
|
43
|
+
<a>chicken golden delight</a><input value="golden.jpg">
|
44
|
+
<span class='price'>(4.56)</span>
|
45
|
+
<span class='ingredients'/>
|
46
|
+
<span class='references'>
|
47
|
+
<ol>
|
48
|
+
<li>
|
49
|
+
<div href='chicken.html'>chicken</div>
|
50
|
+
<div href='delight.html'>delight</div>
|
51
|
+
</li>
|
52
|
+
</ol>
|
53
|
+
</span>
|
38
54
|
</li>
|
39
55
|
</ul>
|
40
56
|
HTML
|
41
57
|
end
|
42
58
|
|
43
59
|
it "should return attribute names" do
|
44
|
-
::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
|
60
|
+
::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients, :ingredient_urls]
|
45
61
|
end
|
46
62
|
|
47
63
|
describe "html" do
|
@@ -50,19 +66,23 @@ describe Scrapify do
|
|
50
66
|
end
|
51
67
|
|
52
68
|
it "should parse html and fetch attributes using css" do
|
53
|
-
::Pizza.name_values.should == ['chicken supreme', 'veg supreme', 'pepperoni']
|
69
|
+
::Pizza.name_values.should == ['chicken supreme', 'veg supreme', 'pepperoni', 'chicken golden delight']
|
54
70
|
end
|
55
71
|
|
56
72
|
it "should parse html and fetch attributes using xpath" do
|
57
|
-
::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
|
73
|
+
::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg', 'golden.jpg']
|
58
74
|
end
|
59
75
|
|
60
76
|
it "should parse html and extract attributes using regex" do
|
61
|
-
::Pizza.price_values.should == ['1.23', '2.34', '3.45']
|
77
|
+
::Pizza.price_values.should == ['1.23', '2.34', '3.45', '4.56']
|
62
78
|
end
|
63
79
|
|
64
80
|
it "should parse html and extract multiple attributes using regex" do
|
65
|
-
::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
|
81
|
+
::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], [], []]
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'should accept block to yield attribute values' do
|
85
|
+
::Pizza.ingredient_urls_values.should == [[], [], [], ['chicken.html', 'delight.html']]
|
66
86
|
end
|
67
87
|
|
68
88
|
it "should strip content" do
|
@@ -104,23 +124,23 @@ describe Scrapify do
|
|
104
124
|
describe "last" do
|
105
125
|
it "should fetch last matching element" do
|
106
126
|
last_pizza = ::Pizza.last
|
107
|
-
last_pizza.name.should == 'pepperoni'
|
108
|
-
last_pizza.image_url.should == 'pepperoni.jpg'
|
127
|
+
last_pizza.name.should == 'chicken golden delight'
|
128
|
+
last_pizza.image_url.should == 'golden.jpg'
|
109
129
|
end
|
110
130
|
end
|
111
131
|
|
112
132
|
describe "all" do
|
113
133
|
it "should fetch all objects" do
|
114
134
|
pizzas = ::Pizza.all
|
115
|
-
pizzas.size.should == 3
|
116
|
-
pizzas.map(&:name).should == ['chicken supreme', 'veg supreme', 'pepperoni']
|
117
|
-
pizzas.map(&:image_url).should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
|
135
|
+
pizzas.size.should == 4
|
136
|
+
pizzas.map(&:name).should == ['chicken supreme', 'veg supreme', 'pepperoni', 'chicken golden delight']
|
137
|
+
pizzas.map(&:image_url).should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg', 'golden.jpg']
|
118
138
|
end
|
119
139
|
end
|
120
140
|
|
121
141
|
describe "count" do
|
122
142
|
it "should return number of matching elements" do
|
123
|
-
::Pizza.count.should == 3
|
143
|
+
::Pizza.count.should == 4
|
124
144
|
end
|
125
145
|
end
|
126
146
|
|
@@ -131,7 +151,8 @@ describe Scrapify do
|
|
131
151
|
name: "chicken supreme",
|
132
152
|
image_url: "chicken.jpg",
|
133
153
|
price: '1.23',
|
134
|
-
ingredients: ['corn', 'tomato']
|
154
|
+
ingredients: ['corn', 'tomato'],
|
155
|
+
ingredient_urls: []
|
135
156
|
}
|
136
157
|
end
|
137
158
|
end
|
@@ -143,16 +164,18 @@ describe Scrapify do
|
|
143
164
|
name: "chicken supreme",
|
144
165
|
image_url: "chicken.jpg",
|
145
166
|
price: '1.23',
|
146
|
-
ingredients: ['corn', 'tomato']
|
167
|
+
ingredients: ['corn', 'tomato'],
|
168
|
+
ingredient_urls: []
|
147
169
|
}.to_json
|
148
170
|
end
|
149
171
|
|
150
172
|
it "should convert array to json" do
|
151
173
|
pizzas = ::Pizza.all
|
152
174
|
pizzas.to_json.should == [
|
153
|
-
{name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
|
154
|
-
{name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
|
155
|
-
{name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
|
175
|
+
{name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato'], :ingredient_urls => []},
|
176
|
+
{name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno'], :ingredient_urls => []},
|
177
|
+
{name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: [], :ingredient_urls => []},
|
178
|
+
{name: "chicken golden delight", image_url: "golden.jpg", price: '4.56', ingredients: [], :ingredient_urls => ['chicken.html', 'delight.html']},
|
156
179
|
].to_json
|
157
180
|
end
|
158
181
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapify
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-06-
|
12
|
+
date: 2012-06-19 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70142157555980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70142157555980
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: mocha
|
27
|
-
requirement: &
|
27
|
+
requirement: &70142157554660 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70142157554660
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: fakeweb
|
38
|
-
requirement: &
|
38
|
+
requirement: &70142157538940 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70142157538940
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70142157538080 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70142157538080
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: activesupport
|
60
|
-
requirement: &
|
60
|
+
requirement: &70142157537240 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70142157537240
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: json
|
71
|
-
requirement: &
|
71
|
+
requirement: &70142157536340 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70142157536340
|
80
80
|
description: ScrApify scraps static html sites to RESTlike APIs
|
81
81
|
email:
|
82
82
|
- sathish316@gmail.com
|