scrapify 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +36 -2
- data/lib/jsonify.rb +3 -2
- data/lib/scrapify/base.rb +33 -2
- data/lib/scrapify/version.rb +1 -1
- data/lib/scrapify.rb +2 -1
- data/spec/jsonify_spec.rb +14 -1
- data/spec/pizza.rb +2 -0
- data/spec/scrapify_spec.rb +74 -8
- metadata +14 -14
data/README.md
CHANGED
@@ -8,6 +8,12 @@ ScrApify is a library to build APIs by scraping static sites and use data as mod
|
|
8
8
|
$ gem install scrapify
|
9
9
|
```
|
10
10
|
|
11
|
+
If you're using Bundler, add this to your Gemfile:
|
12
|
+
|
13
|
+
```
|
14
|
+
gem 'scrapify'
|
15
|
+
```
|
16
|
+
|
11
17
|
### Usage
|
12
18
|
|
13
19
|
Define html url and declare attributes using xpath or css selectors.
|
@@ -63,6 +69,34 @@ This will respond to two urls index and show with JSON:
|
|
63
69
|
|
64
70
|
Jsonify currently has a limitation where the URLs /pizzas.json and /pizzas/1.json cannot be matched by the same map entry in Rack routes
|
65
71
|
|
66
|
-
### JSON API (Rails
|
72
|
+
### JSON API (Rails example)
|
73
|
+
|
74
|
+
Scrapify comes with a Rack application called Jsonify which can be used in rails routes to expose scraped models as JSON.
|
75
|
+
|
76
|
+
Check out this [Rails example](https://github.com/sathish316/jsonify_rails_example) for more details:
|
77
|
+
|
78
|
+
https://github.com/sathish316/jsonify_rails_example
|
79
|
+
|
80
|
+
1 Add scrapify to Gemfile
|
81
|
+
|
82
|
+
```
|
83
|
+
gem 'scrapify'
|
84
|
+
```
|
85
|
+
|
86
|
+
2 Define model to scrap data in app/models
|
87
|
+
|
88
|
+
```
|
89
|
+
class Pizza
|
90
|
+
include Scrapify::Base
|
91
|
+
end
|
92
|
+
```
|
93
|
+
|
94
|
+
3 Add index and show API to routes
|
95
|
+
|
96
|
+
```
|
97
|
+
pizza_api = Jsonify.new('/pizzas', Pizza)
|
98
|
+
get 'pizzas' => pizza_api
|
99
|
+
get 'pizzas/:id' => pizza_api
|
100
|
+
```
|
67
101
|
|
68
|
-
|
102
|
+
Jsonify scraps url and exposes index and show urls as JSON APIs
|
data/lib/jsonify.rb
CHANGED
@@ -7,7 +7,8 @@ class Jsonify
|
|
7
7
|
def call(env)
|
8
8
|
path = env['REQUEST_PATH']
|
9
9
|
response = path == @route ? all : one(find_id(path))
|
10
|
-
|
10
|
+
header = @model.http_cache_header.merge("Content-Type" => "application/json")
|
11
|
+
[200, header, [response]]
|
11
12
|
end
|
12
13
|
|
13
14
|
private
|
@@ -23,4 +24,4 @@ class Jsonify
|
|
23
24
|
def find_id(path)
|
24
25
|
path[path.rindex('/') + 1, path.size]
|
25
26
|
end
|
26
|
-
end
|
27
|
+
end
|
data/lib/scrapify/base.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module Scrapify
|
2
2
|
module Base
|
3
|
+
HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)
|
3
4
|
def self.included(klass)
|
4
5
|
klass.extend ClassMethods
|
5
6
|
klass.cattr_accessor :url, :doc, :attribute_names
|
@@ -28,9 +29,19 @@ module Scrapify
|
|
28
29
|
add_attribute(name)
|
29
30
|
parser = options[:xpath] ? :xpath : :css
|
30
31
|
selector = options[parser]
|
32
|
+
matcher = /#{options[:regex]}/ if options[:regex]
|
33
|
+
to_array = options[:array]
|
31
34
|
meta_define "#{name}_values" do
|
32
35
|
self.doc ||= parse_html
|
33
|
-
self.doc.send(parser, selector).map
|
36
|
+
self.doc.send(parser, selector).map do |element|
|
37
|
+
content = element.content
|
38
|
+
if matcher
|
39
|
+
match_data = content.scan(matcher).map &:first
|
40
|
+
options[:array] ? match_data : match_data.first
|
41
|
+
else
|
42
|
+
content.strip
|
43
|
+
end
|
44
|
+
end
|
34
45
|
end
|
35
46
|
end
|
36
47
|
|
@@ -39,6 +50,12 @@ module Scrapify
|
|
39
50
|
define_count attribute
|
40
51
|
end
|
41
52
|
|
53
|
+
def http_cache_header
|
54
|
+
http_header.select do |(k, v)|
|
55
|
+
HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
42
59
|
private
|
43
60
|
|
44
61
|
def add_attribute(name)
|
@@ -47,7 +64,21 @@ module Scrapify
|
|
47
64
|
end
|
48
65
|
|
49
66
|
def parse_html
|
50
|
-
Nokogiri::HTML(
|
67
|
+
Nokogiri::HTML(html_content)
|
68
|
+
end
|
69
|
+
|
70
|
+
def html_content
|
71
|
+
http_response.body
|
72
|
+
end
|
73
|
+
|
74
|
+
def http_response
|
75
|
+
@http_response ||= Net::HTTP.get_response URI(url)
|
76
|
+
end
|
77
|
+
|
78
|
+
def http_header
|
79
|
+
http_response.header.to_hash.each_with_object({}) do |(k,v), hash|
|
80
|
+
hash[k] = v.first
|
81
|
+
end
|
51
82
|
end
|
52
83
|
|
53
84
|
def define_finders
|
data/lib/scrapify/version.rb
CHANGED
data/lib/scrapify.rb
CHANGED
data/spec/jsonify_spec.rb
CHANGED
@@ -2,6 +2,11 @@ require 'spec_helper'
|
|
2
2
|
require 'test_models'
|
3
3
|
|
4
4
|
describe Jsonify do
|
5
|
+
before do
|
6
|
+
::Pizza.stubs(:http_cache_header).returns("cache-control" => "private")
|
7
|
+
::Pizza.stubs(:all).returns([])
|
8
|
+
end
|
9
|
+
|
5
10
|
it "should find all objects and convert to json for index url" do
|
6
11
|
pizzas = [{name: 'cheese'}, {name: 'chicken'}]
|
7
12
|
::Pizza.expects(:all).returns(pizzas)
|
@@ -25,4 +30,12 @@ describe Jsonify do
|
|
25
30
|
header['Content-Type'].should == 'application/json'
|
26
31
|
response.first.should == pizza.to_json
|
27
32
|
end
|
28
|
-
|
33
|
+
|
34
|
+
it "should forward the http cache headers" do
|
35
|
+
::Pizza.expects(:http_cache_header).returns("cache-control"=>"private")
|
36
|
+
|
37
|
+
jsonify = Jsonify.new('/pizzas', ::Pizza)
|
38
|
+
status, header, response = jsonify.call('REQUEST_PATH' => '/pizzas')
|
39
|
+
header['cache-control'].should == "private"
|
40
|
+
end
|
41
|
+
end
|
data/spec/pizza.rb
CHANGED
data/spec/scrapify_spec.rb
CHANGED
@@ -5,17 +5,43 @@ describe Scrapify do
|
|
5
5
|
|
6
6
|
before do
|
7
7
|
@pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
|
8
|
-
FakeWeb.register_uri :get, @pizza_url,
|
8
|
+
FakeWeb.register_uri :get, @pizza_url,
|
9
|
+
:cache_control => "private, s-maxage=0, max-age=0, must-revalidate",
|
10
|
+
:age => 51592,
|
11
|
+
:length => 12312,
|
12
|
+
:body => <<-HTML
|
9
13
|
<ul class="menu_lft">
|
10
|
-
<li
|
11
|
-
|
12
|
-
|
14
|
+
<li>
|
15
|
+
<a> chicken supreme </a><input value="chicken.jpg">
|
16
|
+
<span class='price'>(1.23)</span>
|
17
|
+
<span class='ingredients'>
|
18
|
+
<ol>
|
19
|
+
<li>contains corn</li>
|
20
|
+
<li>contains tomato</li>
|
21
|
+
<ol>
|
22
|
+
</span>
|
23
|
+
</li>
|
24
|
+
<li>
|
25
|
+
<a>veg supreme</a><input value="veg.jpg">
|
26
|
+
<span class='price'>(2.34)</span>
|
27
|
+
<span class='ingredients'>
|
28
|
+
<ol>
|
29
|
+
<li>contains mushroom</li>
|
30
|
+
<li>contains jalapeno</li>
|
31
|
+
<ol>
|
32
|
+
</span>
|
33
|
+
</li>
|
34
|
+
<li>
|
35
|
+
<a>pepperoni</a><input value="pepperoni.jpg">
|
36
|
+
<span class='price'>(3.45)</span>
|
37
|
+
<span class='ingredients'></span>
|
38
|
+
</li>
|
13
39
|
</ul>
|
14
40
|
HTML
|
15
41
|
end
|
16
42
|
|
17
43
|
it "should return attribute names" do
|
18
|
-
::Pizza.attribute_names.should == [:name, :image_url]
|
44
|
+
::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
|
19
45
|
end
|
20
46
|
|
21
47
|
describe "html" do
|
@@ -30,6 +56,27 @@ describe Scrapify do
|
|
30
56
|
it "should parse html and fetch attributes using xpath" do
|
31
57
|
::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
|
32
58
|
end
|
59
|
+
|
60
|
+
it "should parse html and extract attributes using regex" do
|
61
|
+
::Pizza.price_values.should == ['1.23', '2.34', '3.45']
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should parse html and extract multiple attributes using regex" do
|
65
|
+
::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should strip content" do
|
69
|
+
::Pizza.first.name.should == 'chicken supreme'
|
70
|
+
end
|
71
|
+
|
72
|
+
describe "cache headers" do
|
73
|
+
it "should return the http headers" do
|
74
|
+
::Pizza.http_cache_header.should == {
|
75
|
+
"cache-control" => "private, s-maxage=0, max-age=0, must-revalidate",
|
76
|
+
"age" => 51592,
|
77
|
+
}
|
78
|
+
end
|
79
|
+
end
|
33
80
|
end
|
34
81
|
|
35
82
|
describe "find" do
|
@@ -80,14 +127,33 @@ describe Scrapify do
|
|
80
127
|
describe "attributes" do
|
81
128
|
it "should return attributes hash" do
|
82
129
|
first_pizza = ::Pizza.first
|
83
|
-
first_pizza.attributes.should == {
|
130
|
+
first_pizza.attributes.should == {
|
131
|
+
name: "chicken supreme",
|
132
|
+
image_url: "chicken.jpg",
|
133
|
+
price: '1.23',
|
134
|
+
ingredients: ['corn', 'tomato']
|
135
|
+
}
|
84
136
|
end
|
85
137
|
end
|
86
138
|
|
87
139
|
describe "to_json" do
|
88
140
|
it "should convert attributes to json" do
|
89
141
|
first_pizza = ::Pizza.first
|
90
|
-
first_pizza.to_json.should == {
|
142
|
+
first_pizza.to_json.should == {
|
143
|
+
name: "chicken supreme",
|
144
|
+
image_url: "chicken.jpg",
|
145
|
+
price: '1.23',
|
146
|
+
ingredients: ['corn', 'tomato']
|
147
|
+
}.to_json
|
148
|
+
end
|
149
|
+
|
150
|
+
it "should convert array to json" do
|
151
|
+
pizzas = ::Pizza.all
|
152
|
+
pizzas.to_json.should == [
|
153
|
+
{name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
|
154
|
+
{name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
|
155
|
+
{name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
|
156
|
+
].to_json
|
91
157
|
end
|
92
158
|
end
|
93
|
-
end
|
159
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapify
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-14 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70352195064240 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70352195064240
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: mocha
|
27
|
-
requirement: &
|
27
|
+
requirement: &70352195063700 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70352195063700
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: fakeweb
|
38
|
-
requirement: &
|
38
|
+
requirement: &70352195062820 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70352195062820
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70352195062120 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70352195062120
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: activesupport
|
60
|
-
requirement: &
|
60
|
+
requirement: &70352195061500 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70352195061500
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: json
|
71
|
-
requirement: &
|
71
|
+
requirement: &70352195060200 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70352195060200
|
80
80
|
description: ScrApify scraps static html sites to RESTlike APIs
|
81
81
|
email:
|
82
82
|
- sathish316@gmail.com
|