scrapify 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +36 -2
- data/lib/jsonify.rb +3 -2
- data/lib/scrapify/base.rb +33 -2
- data/lib/scrapify/version.rb +1 -1
- data/lib/scrapify.rb +2 -1
- data/spec/jsonify_spec.rb +14 -1
- data/spec/pizza.rb +2 -0
- data/spec/scrapify_spec.rb +74 -8
- metadata +14 -14
data/README.md
CHANGED
@@ -8,6 +8,12 @@ ScrApify is a library to build APIs by scraping static sites and use data as mod
|
|
8
8
|
$ gem install scrapify
|
9
9
|
```
|
10
10
|
|
11
|
+
If you're using Bundler, add this to your Gemfile:
|
12
|
+
|
13
|
+
```
|
14
|
+
gem 'scrapify'
|
15
|
+
```
|
16
|
+
|
11
17
|
### Usage
|
12
18
|
|
13
19
|
Define the HTML URL and declare attributes using XPath or CSS selectors.
|
@@ -63,6 +69,34 @@ This will respond to two urls index and show with JSON:
|
|
63
69
|
|
64
70
|
Jsonify currently has a limitation where the URLs /pizzas.json and /pizzas/1.json cannot be matched by the same map entry in Rack routes
|
65
71
|
|
66
|
-
### JSON API (Rails
|
72
|
+
### JSON API (Rails example)
|
73
|
+
|
74
|
+
Scrapify comes with a Rack application called Jsonify which can be used in Rails routes to expose scraped models as JSON.
|
75
|
+
|
76
|
+
Check out this [Rails example](https://github.com/sathish316/jsonify_rails_example) for more details:
|
77
|
+
|
78
|
+
https://github.com/sathish316/jsonify_rails_example
|
79
|
+
|
80
|
+
1. Add scrapify to your Gemfile
|
81
|
+
|
82
|
+
```
|
83
|
+
gem 'scrapify'
|
84
|
+
```
|
85
|
+
|
86
|
+
2. Define a model to scrape data in app/models
|
87
|
+
|
88
|
+
```
|
89
|
+
class Pizza
|
90
|
+
include Scrapify::Base
|
91
|
+
end
|
92
|
+
```
|
93
|
+
|
94
|
+
3. Add the index and show APIs to routes
|
95
|
+
|
96
|
+
```
|
97
|
+
pizza_api = Jsonify.new('/pizzas', Pizza)
|
98
|
+
get 'pizzas' => pizza_api
|
99
|
+
get 'pizzas/:id' => pizza_api
|
100
|
+
```
|
67
101
|
|
68
|
-
|
102
|
+
Jsonify scrapes the URL and exposes the index and show URLs as JSON APIs.
|
data/lib/jsonify.rb
CHANGED
@@ -7,7 +7,8 @@ class Jsonify
|
|
7
7
|
def call(env)
|
8
8
|
path = env['REQUEST_PATH']
|
9
9
|
response = path == @route ? all : one(find_id(path))
|
10
|
-
|
10
|
+
header = @model.http_cache_header.merge("Content-Type" => "application/json")
|
11
|
+
[200, header, [response]]
|
11
12
|
end
|
12
13
|
|
13
14
|
private
|
@@ -23,4 +24,4 @@ class Jsonify
|
|
23
24
|
def find_id(path)
|
24
25
|
path[path.rindex('/') + 1, path.size]
|
25
26
|
end
|
26
|
-
end
|
27
|
+
end
|
data/lib/scrapify/base.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module Scrapify
|
2
2
|
module Base
|
3
|
+
HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)
|
3
4
|
def self.included(klass)
|
4
5
|
klass.extend ClassMethods
|
5
6
|
klass.cattr_accessor :url, :doc, :attribute_names
|
@@ -28,9 +29,19 @@ module Scrapify
|
|
28
29
|
add_attribute(name)
|
29
30
|
parser = options[:xpath] ? :xpath : :css
|
30
31
|
selector = options[parser]
|
32
|
+
matcher = /#{options[:regex]}/ if options[:regex]
|
33
|
+
to_array = options[:array]
|
31
34
|
meta_define "#{name}_values" do
|
32
35
|
self.doc ||= parse_html
|
33
|
-
self.doc.send(parser, selector).map
|
36
|
+
self.doc.send(parser, selector).map do |element|
|
37
|
+
content = element.content
|
38
|
+
if matcher
|
39
|
+
match_data = content.scan(matcher).map &:first
|
40
|
+
options[:array] ? match_data : match_data.first
|
41
|
+
else
|
42
|
+
content.strip
|
43
|
+
end
|
44
|
+
end
|
34
45
|
end
|
35
46
|
end
|
36
47
|
|
@@ -39,6 +50,12 @@ module Scrapify
|
|
39
50
|
define_count attribute
|
40
51
|
end
|
41
52
|
|
53
|
+
def http_cache_header
|
54
|
+
http_header.select do |(k, v)|
|
55
|
+
HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
42
59
|
private
|
43
60
|
|
44
61
|
def add_attribute(name)
|
@@ -47,7 +64,21 @@ module Scrapify
|
|
47
64
|
end
|
48
65
|
|
49
66
|
def parse_html
|
50
|
-
Nokogiri::HTML(
|
67
|
+
Nokogiri::HTML(html_content)
|
68
|
+
end
|
69
|
+
|
70
|
+
def html_content
|
71
|
+
http_response.body
|
72
|
+
end
|
73
|
+
|
74
|
+
def http_response
|
75
|
+
@http_response ||= Net::HTTP.get_response URI(url)
|
76
|
+
end
|
77
|
+
|
78
|
+
def http_header
|
79
|
+
http_response.header.to_hash.each_with_object({}) do |(k,v), hash|
|
80
|
+
hash[k] = v.first
|
81
|
+
end
|
51
82
|
end
|
52
83
|
|
53
84
|
def define_finders
|
data/lib/scrapify/version.rb
CHANGED
data/lib/scrapify.rb
CHANGED
data/spec/jsonify_spec.rb
CHANGED
@@ -2,6 +2,11 @@ require 'spec_helper'
|
|
2
2
|
require 'test_models'
|
3
3
|
|
4
4
|
describe Jsonify do
|
5
|
+
before do
|
6
|
+
::Pizza.stubs(:http_cache_header).returns("cache-control" => "private")
|
7
|
+
::Pizza.stubs(:all).returns([])
|
8
|
+
end
|
9
|
+
|
5
10
|
it "should find all objects and convert to json for index url" do
|
6
11
|
pizzas = [{name: 'cheese'}, {name: 'chicken'}]
|
7
12
|
::Pizza.expects(:all).returns(pizzas)
|
@@ -25,4 +30,12 @@ describe Jsonify do
|
|
25
30
|
header['Content-Type'].should == 'application/json'
|
26
31
|
response.first.should == pizza.to_json
|
27
32
|
end
|
28
|
-
|
33
|
+
|
34
|
+
it "should forward the http cache headers" do
|
35
|
+
::Pizza.expects(:http_cache_header).returns("cache-control"=>"private")
|
36
|
+
|
37
|
+
jsonify = Jsonify.new('/pizzas', ::Pizza)
|
38
|
+
status, header, response = jsonify.call('REQUEST_PATH' => '/pizzas')
|
39
|
+
header['cache-control'].should == "private"
|
40
|
+
end
|
41
|
+
end
|
data/spec/pizza.rb
CHANGED
data/spec/scrapify_spec.rb
CHANGED
@@ -5,17 +5,43 @@ describe Scrapify do
|
|
5
5
|
|
6
6
|
before do
|
7
7
|
@pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
|
8
|
-
FakeWeb.register_uri :get, @pizza_url,
|
8
|
+
FakeWeb.register_uri :get, @pizza_url,
|
9
|
+
:cache_control => "private, s-maxage=0, max-age=0, must-revalidate",
|
10
|
+
:age => 51592,
|
11
|
+
:length => 12312,
|
12
|
+
:body => <<-HTML
|
9
13
|
<ul class="menu_lft">
|
10
|
-
<li
|
11
|
-
|
12
|
-
|
14
|
+
<li>
|
15
|
+
<a> chicken supreme </a><input value="chicken.jpg">
|
16
|
+
<span class='price'>(1.23)</span>
|
17
|
+
<span class='ingredients'>
|
18
|
+
<ol>
|
19
|
+
<li>contains corn</li>
|
20
|
+
<li>contains tomato</li>
|
21
|
+
<ol>
|
22
|
+
</span>
|
23
|
+
</li>
|
24
|
+
<li>
|
25
|
+
<a>veg supreme</a><input value="veg.jpg">
|
26
|
+
<span class='price'>(2.34)</span>
|
27
|
+
<span class='ingredients'>
|
28
|
+
<ol>
|
29
|
+
<li>contains mushroom</li>
|
30
|
+
<li>contains jalapeno</li>
|
31
|
+
<ol>
|
32
|
+
</span>
|
33
|
+
</li>
|
34
|
+
<li>
|
35
|
+
<a>pepperoni</a><input value="pepperoni.jpg">
|
36
|
+
<span class='price'>(3.45)</span>
|
37
|
+
<span class='ingredients'></span>
|
38
|
+
</li>
|
13
39
|
</ul>
|
14
40
|
HTML
|
15
41
|
end
|
16
42
|
|
17
43
|
it "should return attribute names" do
|
18
|
-
::Pizza.attribute_names.should == [:name, :image_url]
|
44
|
+
::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
|
19
45
|
end
|
20
46
|
|
21
47
|
describe "html" do
|
@@ -30,6 +56,27 @@ describe Scrapify do
|
|
30
56
|
it "should parse html and fetch attributes using xpath" do
|
31
57
|
::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
|
32
58
|
end
|
59
|
+
|
60
|
+
it "should parse html and extract attributes using regex" do
|
61
|
+
::Pizza.price_values.should == ['1.23', '2.34', '3.45']
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should parse html and extract multiple attributes using regex" do
|
65
|
+
::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should strip content" do
|
69
|
+
::Pizza.first.name.should == 'chicken supreme'
|
70
|
+
end
|
71
|
+
|
72
|
+
describe "cache headers" do
|
73
|
+
it "should return the http headers" do
|
74
|
+
::Pizza.http_cache_header.should == {
|
75
|
+
"cache-control" => "private, s-maxage=0, max-age=0, must-revalidate",
|
76
|
+
"age" => 51592,
|
77
|
+
}
|
78
|
+
end
|
79
|
+
end
|
33
80
|
end
|
34
81
|
|
35
82
|
describe "find" do
|
@@ -80,14 +127,33 @@ describe Scrapify do
|
|
80
127
|
describe "attributes" do
|
81
128
|
it "should return attributes hash" do
|
82
129
|
first_pizza = ::Pizza.first
|
83
|
-
first_pizza.attributes.should == {
|
130
|
+
first_pizza.attributes.should == {
|
131
|
+
name: "chicken supreme",
|
132
|
+
image_url: "chicken.jpg",
|
133
|
+
price: '1.23',
|
134
|
+
ingredients: ['corn', 'tomato']
|
135
|
+
}
|
84
136
|
end
|
85
137
|
end
|
86
138
|
|
87
139
|
describe "to_json" do
|
88
140
|
it "should convert attributes to json" do
|
89
141
|
first_pizza = ::Pizza.first
|
90
|
-
first_pizza.to_json.should == {
|
142
|
+
first_pizza.to_json.should == {
|
143
|
+
name: "chicken supreme",
|
144
|
+
image_url: "chicken.jpg",
|
145
|
+
price: '1.23',
|
146
|
+
ingredients: ['corn', 'tomato']
|
147
|
+
}.to_json
|
148
|
+
end
|
149
|
+
|
150
|
+
it "should convert array to json" do
|
151
|
+
pizzas = ::Pizza.all
|
152
|
+
pizzas.to_json.should == [
|
153
|
+
{name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
|
154
|
+
{name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
|
155
|
+
{name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
|
156
|
+
].to_json
|
91
157
|
end
|
92
158
|
end
|
93
|
-
end
|
159
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapify
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-14 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70352195064240 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70352195064240
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: mocha
|
27
|
-
requirement: &
|
27
|
+
requirement: &70352195063700 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70352195063700
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: fakeweb
|
38
|
-
requirement: &
|
38
|
+
requirement: &70352195062820 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70352195062820
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70352195062120 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70352195062120
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: activesupport
|
60
|
-
requirement: &
|
60
|
+
requirement: &70352195061500 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70352195061500
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: json
|
71
|
-
requirement: &
|
71
|
+
requirement: &70352195060200 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70352195060200
|
80
80
|
description: ScrApify scraps static html sites to RESTlike APIs
|
81
81
|
email:
|
82
82
|
- sathish316@gmail.com
|