scrapify 0.0.4 → 0.0.5

data/README.md CHANGED
@@ -8,6 +8,12 @@ ScrApify is a library to build APIs by scraping static sites and use data as mod
  $ gem install scrapify
  ```

+ If you're using Bundler, add this to your Gemfile:
+
+ ```
+ gem 'scrapify'
+ ```
+
  ### Usage

  Define html url and declare attributes using xpath or css selectors.
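The Usage section touched by this hunk defines models like the sketch below, included here for orientation only: the `html` declaration for the source URL is assumed from the "Define html url" wording above, and the selectors are the ones the Pizza test model in data/spec/pizza.rb (later in this diff) uses.

```ruby
# Illustrative sketch, not diff content. Selectors come from data/spec/pizza.rb;
# the `html` class method name is assumed from the README wording above.
class Pizza
  include Scrapify::Base

  # page to scrape
  html "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"

  # each attribute is extracted with a CSS or XPath selector
  attribute :name, css: ".menu_lft li a"
  attribute :image_url, xpath: "//li//input//@value"

  # the attribute used to look up a single record
  key :name
end
```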
@@ -63,6 +69,34 @@ This will respond to two urls index and show with JSON:

  Jsonify currently has a limitation where the URLs /pizzas.json and /pizzas/1.json cannot be matched by the same map entry in Rack routes

- ### JSON API (Rails application example)
+ ### JSON API (Rails example)
+
+ Scrapify comes with a Rack application called Jsonify which can be used in Rails routes to expose scraped models as JSON.
+
+ Check out this [Rails example](https://github.com/sathish316/jsonify_rails_example) for more details:
+
+ https://github.com/sathish316/jsonify_rails_example
+
+ 1. Add scrapify to your Gemfile
+
+ ```
+ gem 'scrapify'
+ ```
+
+ 2. Define a model to scrape data in app/models
+
+ ```
+ class Pizza
+   include Scrapify::Base
+ end
+ ```
+
+ 3. Add index and show APIs to routes
+
+ ```
+ pizza_api = Jsonify.new('/pizzas', Pizza)
+ get 'pizzas' => pizza_api
+ get 'pizzas/:id' => pizza_api
+ ```

- TODO
+ Jsonify scrapes the URL and exposes the index and show URLs as JSON APIs.
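For a sense of what the index route above serves (illustrative, not diff content; the values mirror the "should convert array to json" expectation added to the Scrapify spec further down, and the show route returns a single record's attributes the same way):

```ruby
# GET /pizzas renders Pizza.all as a JSON array of attribute hashes:
Pizza.all.to_json
# => [{"name":"chicken supreme","image_url":"chicken.jpg","price":"1.23","ingredients":["corn","tomato"]},
#     {"name":"veg supreme","image_url":"veg.jpg","price":"2.34","ingredients":["mushroom","jalapeno"]},
#     {"name":"pepperoni","image_url":"pepperoni.jpg","price":"3.45","ingredients":[]}]
# GET /pizzas/:id renders one record's attribute hash as a JSON object.
```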
data/lib/jsonify.rb CHANGED
@@ -7,7 +7,8 @@ class Jsonify
    def call(env)
      path = env['REQUEST_PATH']
      response = path == @route ? all : one(find_id(path))
-     [200, {"Content-Type" => "application/json"}, [response]]
+     header = @model.http_cache_header.merge("Content-Type" => "application/json")
+     [200, header, [response]]
    end

    private
@@ -23,4 +24,4 @@ class Jsonify
    def find_id(path)
      path[path.rindex('/') + 1, path.size]
    end
- end
+ end
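The effect of the header change in call() above, sketched with the same stub the updated spec uses (the Rack env is reduced to the single key Jsonify reads):

```ruby
# Before this change only Content-Type was returned; now the model's
# http_cache_header hash (cache headers captured from the scraped site)
# is merged into the Rack response headers.
Pizza.stubs(:http_cache_header).returns("cache-control" => "private")  # as stubbed in the spec

status, headers, body = Jsonify.new('/pizzas', Pizza).call('REQUEST_PATH' => '/pizzas')
headers["Content-Type"]   # => "application/json"
headers["cache-control"]  # => "private"   (forwarded to API clients)
```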
data/lib/scrapify/base.rb CHANGED
@@ -1,5 +1,6 @@
  module Scrapify
    module Base
+     HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)
      def self.included(klass)
        klass.extend ClassMethods
        klass.cattr_accessor :url, :doc, :attribute_names
@@ -28,9 +29,19 @@ module Scrapify
          add_attribute(name)
          parser = options[:xpath] ? :xpath : :css
          selector = options[parser]
+         matcher = /#{options[:regex]}/ if options[:regex]
+         to_array = options[:array]
          meta_define "#{name}_values" do
            self.doc ||= parse_html
-           self.doc.send(parser, selector).map &:content
+           self.doc.send(parser, selector).map do |element|
+             content = element.content
+             if matcher
+               match_data = content.scan(matcher).map &:first
+               options[:array] ? match_data : match_data.first
+             else
+               content.strip
+             end
+           end
          end
        end

@@ -39,6 +50,12 @@ module Scrapify
          define_count attribute
        end

+       def http_cache_header
+         http_header.select do |(k, v)|
+           HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase)
+         end
+       end
+
        private

        def add_attribute(name)
@@ -47,7 +64,21 @@ module Scrapify
        end

        def parse_html
-         Nokogiri::HTML(open(url))
+         Nokogiri::HTML(html_content)
+       end
+
+       def html_content
+         http_response.body
+       end
+
+       def http_response
+         @http_response ||= Net::HTTP.get_response URI(url)
+       end
+
+       def http_header
+         http_response.header.to_hash.each_with_object({}) do |(k,v), hash|
+           hash[k] = v.first
+         end
        end

        def define_finders
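A sketch of what the new attribute options above do, with inputs and expected values taken from the Pizza fixture and the spec expectations later in this diff:

```ruby
# regex without :array -- the first capture of the first match becomes the value:
attribute :price, css: ".price", regex: /([\d\.]+)/
#   "<span class='price'>(1.23)</span>"                    =>  "1.23"

# regex with array: true -- the first capture of every match is collected:
attribute :ingredients, css: ".ingredients", regex: /contains (\w+)/, array: true
#   "<li>contains corn</li> <li>contains tomato</li>"      =>  ["corn", "tomato"]

# no regex -- the element's text content is returned, now stripped of whitespace:
attribute :name, css: ".menu_lft li a"
#   "<a> chicken supreme </a>"                              =>  "chicken supreme"
```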
data/lib/scrapify/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Scrapify
-   VERSION = "0.0.4"
+   VERSION = "0.0.5"
  end
data/lib/scrapify.rb CHANGED
@@ -1,7 +1,8 @@
  require 'scrapify/version'
  require 'active_support/core_ext/class/attribute_accessors'
  require 'nokogiri'
- require 'open-uri'
+ require 'uri'
+ require 'net/http'
  require 'meta_define'
  require 'scrapify/base'
  require 'json'
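The require change pairs with the new http_response helper in base.rb: Net::HTTP.get_response returns a response object, so one request yields both the body for Nokogiri and the headers behind http_cache_header. A minimal sketch of that fetch path (URL taken from the spec fixture):

```ruby
require 'net/http'
require 'uri'
require 'nokogiri'

response = Net::HTTP.get_response(URI("http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"))
doc      = Nokogiri::HTML(response.body)       # what parse_html now parses
headers  = response.header.to_hash             # downcased keys => arrays of values, e.g. "cache-control"
```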
data/spec/jsonify_spec.rb CHANGED
@@ -2,6 +2,11 @@ require 'spec_helper'
  require 'test_models'

  describe Jsonify do
+   before do
+     ::Pizza.stubs(:http_cache_header).returns("cache-control" => "private")
+     ::Pizza.stubs(:all).returns([])
+   end
+
    it "should find all objects and convert to json for index url" do
      pizzas = [{name: 'cheese'}, {name: 'chicken'}]
      ::Pizza.expects(:all).returns(pizzas)
@@ -25,4 +30,12 @@ describe Jsonify do
      header['Content-Type'].should == 'application/json'
      response.first.should == pizza.to_json
    end
- end
+
+   it "should forward the http cache headers" do
+     ::Pizza.expects(:http_cache_header).returns("cache-control"=>"private")
+
+     jsonify = Jsonify.new('/pizzas', ::Pizza)
+     status, header, response = jsonify.call('REQUEST_PATH' => '/pizzas')
+     header['cache-control'].should == "private"
+   end
+ end
data/spec/pizza.rb CHANGED
@@ -4,6 +4,8 @@ class Pizza

    attribute :name, css: ".menu_lft li a"
    attribute :image_url, xpath: "//li//input//@value"
+   attribute :price, css: ".price", regex: /([\d\.]+)/
+   attribute :ingredients, css: ".ingredients", regex: /contains (\w+)/, array: true

    key :name
  end
data/spec/scrapify_spec.rb CHANGED
@@ -5,17 +5,43 @@ describe Scrapify

    before do
      @pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
-     FakeWeb.register_uri :get, @pizza_url, :body => <<-HTML
+     FakeWeb.register_uri :get, @pizza_url,
+       :cache_control => "private, s-maxage=0, max-age=0, must-revalidate",
+       :age => 51592,
+       :length => 12312,
+       :body => <<-HTML
        <ul class="menu_lft">
-         <li><a>chicken supreme</a><input value="chicken.jpg"></li>
-         <li><a>veg supreme</a><input value="veg.jpg"></li>
-         <li><a>pepperoni</a><input value="pepperoni.jpg"></li>
+         <li>
+           <a> chicken supreme </a><input value="chicken.jpg">
+           <span class='price'>(1.23)</span>
+           <span class='ingredients'>
+             <ol>
+               <li>contains corn</li>
+               <li>contains tomato</li>
+             <ol>
+           </span>
+         </li>
+         <li>
+           <a>veg supreme</a><input value="veg.jpg">
+           <span class='price'>(2.34)</span>
+           <span class='ingredients'>
+             <ol>
+               <li>contains mushroom</li>
+               <li>contains jalapeno</li>
+             <ol>
+           </span>
+         </li>
+         <li>
+           <a>pepperoni</a><input value="pepperoni.jpg">
+           <span class='price'>(3.45)</span>
+           <span class='ingredients'></span>
+         </li>
        </ul>
      HTML
    end

    it "should return attribute names" do
-     ::Pizza.attribute_names.should == [:name, :image_url]
+     ::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
    end

    describe "html" do
@@ -30,6 +56,27 @@ describe Scrapify do
      it "should parse html and fetch attributes using xpath" do
        ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
      end
+
+     it "should parse html and extract attributes using regex" do
+       ::Pizza.price_values.should == ['1.23', '2.34', '3.45']
+     end
+
+     it "should parse html and extract multiple attributes using regex" do
+       ::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
+     end
+
+     it "should strip content" do
+       ::Pizza.first.name.should == 'chicken supreme'
+     end
+
+     describe "cache headers" do
+       it "should return the http headers" do
+         ::Pizza.http_cache_header.should == {
+           "cache-control" => "private, s-maxage=0, max-age=0, must-revalidate",
+           "age" => 51592,
+         }
+       end
+     end
    end

    describe "find" do
@@ -80,14 +127,33 @@ describe Scrapify
    describe "attributes" do
      it "should return attributes hash" do
        first_pizza = ::Pizza.first
-       first_pizza.attributes.should == {name: "chicken supreme", image_url: "chicken.jpg"}
+       first_pizza.attributes.should == {
+         name: "chicken supreme",
+         image_url: "chicken.jpg",
+         price: '1.23',
+         ingredients: ['corn', 'tomato']
+       }
      end
    end

    describe "to_json" do
      it "should convert attributes to json" do
        first_pizza = ::Pizza.first
-       first_pizza.to_json.should == {name: "chicken supreme", image_url: "chicken.jpg"}.to_json
+       first_pizza.to_json.should == {
+         name: "chicken supreme",
+         image_url: "chicken.jpg",
+         price: '1.23',
+         ingredients: ['corn', 'tomato']
+       }.to_json
+     end
+
+     it "should convert array to json" do
+       pizzas = ::Pizza.all
+       pizzas.to_json.should == [
+         {name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
+         {name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
+         {name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
+       ].to_json
      end
    end
- end
+ end
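One detail from the cache-header spec above, sketched for clarity: Net::HTTP hands back downcased header names, and http_cache_header still matches them because the whitelist comparison in base.rb upcases both sides. The header hash below is hypothetical, shaped like what http_header produces:

```ruby
HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)

# hypothetical http_header output (downcased keys, single string values)
raw = {
  "cache-control" => "private, s-maxage=0, max-age=0, must-revalidate",
  "content-type"  => "text/html",
}

raw.select { |k, _v| HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase) }
# => {"cache-control" => "private, s-maxage=0, max-age=0, must-revalidate"}
```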
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: scrapify
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.5
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-05-26 00:00:00.000000000Z
+ date: 2012-06-14 00:00:00.000000000Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rspec
- requirement: &70261744007960 !ruby/object:Gem::Requirement
+ requirement: &70352195064240 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
  version: '0'
  type: :development
  prerelease: false
- version_requirements: *70261744007960
+ version_requirements: *70352195064240
  - !ruby/object:Gem::Dependency
  name: mocha
- requirement: &70261744006240 !ruby/object:Gem::Requirement
+ requirement: &70352195063700 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
  version: '0'
  type: :development
  prerelease: false
- version_requirements: *70261744006240
+ version_requirements: *70352195063700
  - !ruby/object:Gem::Dependency
  name: fakeweb
- requirement: &70261743993080 !ruby/object:Gem::Requirement
+ requirement: &70352195062820 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
  version: '0'
  type: :development
  prerelease: false
- version_requirements: *70261743993080
+ version_requirements: *70352195062820
  - !ruby/object:Gem::Dependency
  name: nokogiri
- requirement: &70261743990880 !ruby/object:Gem::Requirement
+ requirement: &70352195062120 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70261743990880
+ version_requirements: *70352195062120
  - !ruby/object:Gem::Dependency
  name: activesupport
- requirement: &70261743989080 !ruby/object:Gem::Requirement
+ requirement: &70352195061500 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70261743989080
+ version_requirements: *70352195061500
  - !ruby/object:Gem::Dependency
  name: json
- requirement: &70261743987980 !ruby/object:Gem::Requirement
+ requirement: &70352195060200 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -76,7 +76,7 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70261743987980
+ version_requirements: *70352195060200
  description: ScrApify scraps static html sites to RESTlike APIs
  email:
  - sathish316@gmail.com